In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
data = pd.read_csv('Wine_Price_Data.csv')
data.describe()

Unnamed: 0,Vintage,Red,White,Rosé,Orange,Sparkling,Winery?,Vineyard?,Oaked?,AVA,Price/750mL,Rating,ABV %
count,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0
mean,2020.013699,0.561644,0.39726,0.059361,0.004566,0.03653,0.90411,0.506849,0.680365,0.808219,27.876393,87.917808,13.619178
std,1.444634,0.497322,0.490452,0.23684,0.067574,0.188034,0.295115,0.501098,0.467403,0.394604,29.888677,3.057942,1.287768
min,2014.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.99,72.0,6.8
25%,2019.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,12.99,86.0,13.0
50%,2021.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,18.99,88.0,13.8
75%,2021.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,33.495,90.0,14.5
max,2022.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,260.99,96.0,16.0


# Cleaning Data

In [3]:
# add log price, dropping producer, making everything uppercase
data['Log_Price'] = np.log(data['Price/750mL'])
data = data.drop(['Producer', 'Price/750mL'], axis=1) 
for col in ["Variety/Vine", "State"]:
    data[col] = data[col].str.upper()
    data[col] = data[col].str.strip()
    print(data[col].value_counts())

CABERNET SAUVIGNON                                       36
PINOT NOIR                                               35
CHARDONNAY                                               35
BLEND                                                    28
MERLOT                                                   11
SAUVIGNON BLANC                                          10
PINOT GRIGIO                                              7
PINOT GRIS                                                7
RIESLING                                                  6
RED BLEND                                                 5
ZINFANDEL                                                 5
ROSÉ                                                      4
CABARNET SAUVIGNON                                        3
PETITE SIRAH                                              3
GRENACHE                                                  2
SYRAH                                                     2
MOSCATO                                 

In [4]:
# Cleaning Data, must need at least 3 entries or else it will be classified as other

data["State"] = data["State"].str.replace("NC", "NORTH CAROLINA")

for col in ["Variety/Vine"]:
    data[col] = data[col].str.replace("CHARDONNNAY", "CHARDONNAY")
    data[col] = data[col].str.replace("BUTTERY CHARDONNAY", "CHARDONNAY")
    data[col] = data[col].str.replace("ROSE", "ROSÉ")
    data[col] = data[col].str.replace("FUME BLANC", "SAUVIGNON BLANC")
    data[col] = data[col].str.replace("ORANGE SAUVIGNON BLANC", "SAUVIGNON BLANC")
    data[col] = data[col].str.replace("CABARNET SAUVIGNON", "CABERNET SAUVIGNON")
    data[col] = data[col].str.replace("ZINFANDEL BLANC NOIR", "ZINFANDEL")
    data[col] = data[col].str.replace("PINOT NOIR BLANC", "PINOT NOIR")
    data[col] = data[col].str.replace("JOVINO PINOT NOIR", "PINOT NOIR")
    data[col] = data[col].str.replace("PINO GRIS", "PINOT GRIS")
    for blend in ["CHERRY MOSCATO","RED BLEND", "PINK MOSCATO", "CHAMPAGNE EXTRA DRY", "BLEND: SYRAH, GRENACHE, MOURVEDRE, CABERNET SAUVIGNON"]:
        data[col] = data[col].str.replace(blend, "BLEND")
    unique_entries = data[col].value_counts()
    
print(data["State"].value_counts())
print(data["Variety/Vine"].value_counts())


CALIFORNIA        158
OREGON             26
WASHINGTON         15
NORTH CAROLINA     11
NEW YORK            4
WASHINGOTN          2
INDIANA             1
NEVADA              1
WEST VIRGINIA       1
Name: State, dtype: int64
CABERNET SAUVIGNON    39
BLEND                 37
CHARDONNAY            37
PINOT NOIR            36
SAUVIGNON BLANC       12
MERLOT                11
PINOT GRIS             8
PINOT GRIGIO           7
RIESLING               6
ZINFANDEL              6
ROSÉ                   5
PETITE SIRAH           3
MOSCATO                2
SYRAH                  2
GRENACHE               2
GREEN APPLE WINE       1
NIAGARA                1
MUSCADINE              1
VIOGNEIR               1
SCUPPERNONG            1
CARIGNAN               1
Name: Variety/Vine, dtype: int64


In [5]:
# add other for states and variety/vine with less than 3 entries

for col in ["Variety/Vine", "State"]:
    print(f"Break Down of Other for {col}")
    unique_entries = data[col].value_counts()
    print(unique_entries[unique_entries < 3])
    data[col] = data[col].apply(lambda x: x if x in unique_entries[unique_entries >= 3].index else "OTHER")
    print(data[col].value_counts())

Break Down of Other for Variety/Vine
MOSCATO             2
SYRAH               2
GRENACHE            2
GREEN APPLE WINE    1
NIAGARA             1
MUSCADINE           1
VIOGNEIR            1
SCUPPERNONG         1
CARIGNAN            1
Name: Variety/Vine, dtype: int64
CABERNET SAUVIGNON    39
BLEND                 37
CHARDONNAY            37
PINOT NOIR            36
OTHER                 12
SAUVIGNON BLANC       12
MERLOT                11
PINOT GRIS             8
PINOT GRIGIO           7
ZINFANDEL              6
RIESLING               6
ROSÉ                   5
PETITE SIRAH           3
Name: Variety/Vine, dtype: int64
Break Down of Other for State
WASHINGOTN       2
INDIANA          1
NEVADA           1
WEST VIRGINIA    1
Name: State, dtype: int64
CALIFORNIA        158
OREGON             26
WASHINGTON         15
NORTH CAROLINA     11
OTHER               5
NEW YORK            4
Name: State, dtype: int64


In [6]:
# One Hot Encoding

data = pd.get_dummies(data, columns=["Variety/Vine", "State"], drop_first=True)
data.head()

Unnamed: 0,Vintage,Red,White,Rosé,Orange,Sparkling,Winery?,Vineyard?,Oaked?,AVA,...,Variety/Vine_PINOT NOIR,Variety/Vine_RIESLING,Variety/Vine_ROSÉ,Variety/Vine_SAUVIGNON BLANC,Variety/Vine_ZINFANDEL,State_NEW YORK,State_NORTH CAROLINA,State_OREGON,State_OTHER,State_WASHINGTON
0,2018,0,0,1,0,0,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,2021,1,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2,2021,0,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,2022,1,0,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,2019,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


# Cleaned Up Data Summary and Analysis

In [7]:
data.describe()

Unnamed: 0,Vintage,Red,White,Rosé,Orange,Sparkling,Winery?,Vineyard?,Oaked?,AVA,...,Variety/Vine_PINOT NOIR,Variety/Vine_RIESLING,Variety/Vine_ROSÉ,Variety/Vine_SAUVIGNON BLANC,Variety/Vine_ZINFANDEL,State_NEW YORK,State_NORTH CAROLINA,State_OREGON,State_OTHER,State_WASHINGTON
count,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0,...,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0,219.0
mean,2020.013699,0.561644,0.39726,0.059361,0.004566,0.03653,0.90411,0.506849,0.680365,0.808219,...,0.164384,0.027397,0.022831,0.054795,0.027397,0.018265,0.050228,0.118721,0.022831,0.068493
std,1.444634,0.497322,0.490452,0.23684,0.067574,0.188034,0.295115,0.501098,0.467403,0.394604,...,0.371472,0.163612,0.149707,0.2281,0.163612,0.134214,0.218916,0.324202,0.149707,0.253169
min,2014.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2019.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2021.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2021.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2022.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# print all the data
data.to_csv('Wine_Price_Data_Cleaned.csv', index=False)

# Data Analysis

In [9]:
cleaned_data = pd.read_csv('Wine_Price_Data_Cleaned.csv')
sm.add_constant(cleaned_data)
Y = cleaned_data["Log_Price"]
X = data.drop(['Log_Price'], axis=1)

In [10]:
model = sm.OLS(Y, X).fit()
predictions = model.predict(X)
print(model.summary())
print(model.params)

                                 OLS Regression Results                                
Dep. Variable:              Log_Price   R-squared (uncentered):                   0.979
Model:                            OLS   Adj. R-squared (uncentered):              0.976
Method:                 Least Squares   F-statistic:                              307.3
Date:                Mon, 29 May 2023   Prob (F-statistic):                   1.01e-143
Time:                        01:14:32   Log-Likelihood:                         -136.36
No. Observations:                 219   AIC:                                      330.7
Df Residuals:                     190   BIC:                                      429.0
Df Model:                          29                                                  
Covariance Type:            nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------

makes since that vintage is negative since higher the vintage (younger the wine) price deacreases by .38% and it was find that in the stuyd 4.8% decrease, which is off. 

# interperting data, blend is dropped and Califronia is dropped 


in study saugvginon blac was dropped, maybe should drop that indeatad of blend