# Predicting Wine Price

## Read in data

In [1]:
import pandas as pd

# Load the training data from CSV and list the dtype pandas assigned to each column.
wine = pd.read_csv(filepath_or_buffer="data/wine.csv")
print(wine.dtypes)

Year             int64
Price          float64
WinterRain       int64
AGST           float64
HarvestRain      int64
Age              int64
FrancePop      float64
dtype: object


In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
wine_summary = wine.describe()
print(wine_summary)

              Year      Price  WinterRain       AGST  HarvestRain        Age  \
count    25.000000  25.000000   25.000000  25.000000    25.000000  25.000000   
mean   1965.800000   7.067224  605.280000  16.509336   148.560000  17.200000   
std       7.691987   0.650341  132.277965   0.675397    74.419464   7.691987   
min    1952.000000   6.204900  376.000000  14.983300    38.000000   5.000000   
25%    1960.000000   6.518800  536.000000  16.200000    89.000000  11.000000   
50%    1966.000000   7.121100  600.000000  16.533300   130.000000  17.000000   
75%    1972.000000   7.495000  697.000000  17.066700   187.000000  23.000000   
max    1978.000000   8.493700  830.000000  17.650000   292.000000  31.000000   

          FrancePop  
count     25.000000  
mean   49694.436760  
std     3665.270243  
min    43183.569000  
25%    46583.995000  
50%    50254.966000  
75%    52894.183000  
max    54602.193000  


## Linear Regression (one variable)


In [10]:
from sklearn import linear_model
import numpy as np
# One-variable model: regress Price on AGST only.
# The single feature is passed as a 2-D array with one column, i.e. shape (n_rows, 1).
agst_2d = np.array(wine['AGST']).reshape(-1, 1)
model1 = linear_model.LinearRegression()
model1.fit(agst_2d, wine['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Or equivalently:

In [11]:
# Same fit as the previous cell, building the (n_rows, 1) feature array a different way:
# selecting with a list of column names keeps the result 2-D, and .values exposes it
# as a plain ndarray.
# NOTE: the earlier np.matrix(...as_matrix()) form no longer runs -- Series.as_matrix()
# was removed in pandas 1.0, and np.matrix input is discouraged by NumPy and rejected by
# recent scikit-learn releases. `.values` works on both old and new pandas.
model1.fit(wine[['AGST']].values, wine['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Sum of Squared Errors

In [12]:
# Predict on the same rows the model was trained on (in-sample predictions).
agst_train = np.array(wine['AGST'])[:, np.newaxis]
model1Pred = model1.predict(agst_train)
# Sum of squared errors: square each residual, then add them up.
model1_residuals = model1Pred - wine['Price']
print(sum(model1_residuals**2))

5.73487514702


## Linear Regression (two variables)

In [13]:
# Two-variable model: regress Price on AGST and HarvestRain.
predictors2 = ['AGST', 'HarvestRain']
model2 = linear_model.LinearRegression()
model2.fit(wine[predictors2], wine['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
# In-sample sum of squared errors for the two-variable model.
model2_residuals = model2.predict(wine[['AGST', 'HarvestRain']]) - wine['Price']
print(sum(model2_residuals**2))

2.97037334017


## Linear Regression (all variables)


In [15]:
# Model on all remaining columns: everything except Year and the target, Price.
model3 = linear_model.LinearRegression()
predictors3 = [
    'AGST',
    'HarvestRain',
    'WinterRain',
    'Age',
    'FrancePop',
]
model3.fit(wine[predictors3], wine['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
# Fitted coefficients (one per column of predictors3, in that order), then the intercept.
model3_coefficients = model3.coef_
model3_intercept = model3.intercept_
print(model3_coefficients, model3_intercept)

[  6.01223884e-01  -3.95812450e-03   1.04250681e-03   5.84748489e-04
  -4.95273038e-05] -0.450398864395


In [17]:
# In-sample sum of squared errors for the five-variable model.
model3_residuals = model3.predict(wine[predictors3]) - wine['Price']
print(sum(model3_residuals**2))

1.73211271534



## Remove FrancePop

In [18]:
# Refit with FrancePop dropped from the predictor list.
model4 = linear_model.LinearRegression()
predictors4 = [
    'AGST',
    'HarvestRain',
    'WinterRain',
    'Age',
]
model4.fit(wine[predictors4], wine['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
# Fitted coefficients (one per column of predictors4, in that order), then the intercept.
model4_coefficients = model4.coef_
model4_intercept = model4.intercept_
print(model4_coefficients, model4_intercept)

[ 0.60720935 -0.00397153  0.00107551  0.02393083] -3.42998018693



## Correlations
Pearson correlation coefficient between two variables

In [20]:
import numpy as np
# np.corrcoef returns a 2x2 correlation matrix for the two inputs; the
# off-diagonal entry [0, 1] is the correlation between WinterRain and Price.
rain_price_corr = np.corrcoef(wine['WinterRain'], wine['Price'])
print(rain_price_corr[0, 1])

0.136650547388


In [21]:
# Off-diagonal entry of the 2x2 correlation matrix: correlation between Age and FrancePop.
age_pop_corr = np.corrcoef(wine['Age'], wine['FrancePop'])
print(age_pop_corr[0, 1])

-0.994485097111


Correlation Matrix

In [22]:
# Pairwise Pearson correlations between all columns ('pearson' is the default method,
# spelled out here for clarity).
print(wine.corr(method='pearson'))

                 Year     Price  WinterRain      AGST  HarvestRain       Age  \
Year         1.000000 -0.447768    0.016970 -0.246916     0.028009 -1.000000   
Price       -0.447768  1.000000    0.136651  0.659563    -0.563322  0.447768   
WinterRain   0.016970  0.136651    1.000000 -0.321091    -0.275441 -0.016970   
AGST        -0.246916  0.659563   -0.321091  1.000000    -0.064496  0.246916   
HarvestRain  0.028009 -0.563322   -0.275441 -0.064496     1.000000 -0.028009   
Age         -1.000000  0.447768   -0.016970  0.246916    -0.028009  1.000000   
FrancePop    0.994485 -0.466862   -0.001622 -0.259162     0.041264 -0.994485   

             FrancePop  
Year          0.994485  
Price        -0.466862  
WinterRain   -0.001622  
AGST         -0.259162  
HarvestRain   0.041264  
Age          -0.994485  
FrancePop     1.000000  



## Read in test set

In [23]:
# Load the held-out test rows; the file is small, so the whole frame is printed.
wineTest = pd.read_csv(filepath_or_buffer="data/wine_test.csv")
print(wineTest)

   Year   Price  WinterRain     AGST  HarvestRain  Age  FrancePop
0  1979  6.9541         717  16.1667          122    4  54835.832
1  1980  6.4979         578  16.0000           74    3  55110.236


## Make test set predictions

In [24]:
# Apply the four-predictor model (model4) to the test rows.
test_features = wineTest[predictors4]
predictTest = model4.predict(test_features)
print(predictTest)

[ 6.76892463  6.6849104 ]


## Compute R-squared

In [25]:
# Out-of-sample R-squared computed by hand: 1 - SSE / SST, where
#   SSE = squared error of the model's test predictions, and
#   SST = squared deviation of the test prices from their own mean.
test_prices = wineTest['Price']
SSE = sum((predictTest - test_prices)**2)
SST = sum((test_prices - np.mean(test_prices))**2)
print(1 - SSE/SST)

0.334390470275


In [26]:
# Cross-check: the estimator's built-in score() on the test set; it prints the same
# value as the manual 1 - SSE/SST computation.
test_r_squared = model4.score(X=wineTest[predictors4], y=wineTest['Price'])
print(test_r_squared)

0.334390470275
