In [None]:
import acquire
import model
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from scipy import stats

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

from sklearn.feature_selection import SelectKBest, f_regression

from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

### Acquire and get wine database

In [19]:
df = acquire.get_wine()
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


### Split the database

In [20]:
X_train, y_train, X_validate, y_validate, X_test, y_test = model.train_validate_test_dummy(df, 'quality')

### Scale the X sets

In [21]:
X_train_scaled, X_validate_scaled, X_test_scaled = model.scale_data(X_train, X_validate, X_test)

In [22]:
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(9, 6))
    pd.Series({k: KMeans(k).fit(X_train_scaled[['density', 'alcohol']]).inertia_ for k in range(2, 12)}).plot(marker='x')
    pd.Series({k: KMeans(k).fit(X_train_scaled[['residual sugar', 'alcohol', 'total sulfur dioxide']]).inertia_ for k in range(2, 12)}).plot(marker='x')
    pd.Series({k: KMeans(k).fit(X_train_scaled[['volatile acidity', 'chlorides', 'density', 'alcohol']]).inertia_ for k in range(2, 12)}).plot(marker='x')
   
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

### Create three clustering models based on 2, 3 and four features

In [23]:
X_train_2_features = X_train_scaled[['density', 'alcohol']]
X_validate_2_features = X_validate_scaled[['density', 'alcohol']]


kmeans2 = KMeans(n_clusters=4)
kmeans2.fit(X_train_2_features)

X_train['2_cluster'] = kmeans2.predict(X_train_2_features)
X_validate['2_cluster'] = kmeans2.predict(X_validate_2_features)

In [24]:
X_train_3_features = X_train_scaled[['residual sugar', 'total sulfur dioxide', 'alcohol']]
X_train_3_features
X_validate_3_features = X_validate_scaled[['residual sugar', 'total sulfur dioxide', 'alcohol']]
X_validate_3_features

kmeans3 = KMeans(n_clusters=4)
kmeans3.fit(X_train_3_features)

X_train['3_cluster'] = kmeans3.predict(X_train_3_features)
X_validate['3_cluster'] = kmeans3.predict(X_validate_3_features)

In [25]:
X_train_4_features = X_train_scaled[['volatile acidity', 'chlorides', 'density', 'alcohol']]
X_train_4_features
X_validate_4_features = X_validate_scaled[['volatile acidity', 'chlorides', 'density', 'alcohol']]
X_validate_4_features

kmeans4 = KMeans(n_clusters=4)
kmeans4.fit(X_train_4_features)

X_train['4_cluster'] = kmeans4.predict(X_train_4_features)
X_validate['4_cluster'] = kmeans4.predict(X_validate_4_features)

### Take a look at the X_train with the cluster columns

In [26]:
X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,white,2_cluster,3_cluster,4_cluster
2138,5.5,0.24,0.32,8.70,0.060,19.0,102.0,0.99400,3.27,0.31,10.4,1,3,0,2
19,7.9,0.32,0.51,1.80,0.341,17.0,56.0,0.99690,3.04,1.08,9.2,0,1,1,3
601,7.2,0.24,0.40,1.40,0.045,31.0,106.0,0.99140,2.88,0.38,10.8,1,0,0,2
2540,9.0,0.29,0.34,12.10,0.030,34.0,177.0,0.99706,3.13,0.47,10.6,1,3,0,2
501,10.4,0.44,0.73,6.55,0.074,38.0,76.0,0.99900,3.17,0.85,12.0,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3319,5.1,0.21,0.28,1.40,0.047,48.0,148.0,0.99168,3.50,0.49,10.4,1,3,0,2
1943,6.3,0.25,0.44,11.60,0.041,48.0,195.0,0.99680,3.18,0.52,9.5,1,1,3,0
285,7.3,0.32,0.25,7.20,0.056,47.0,180.0,0.99610,3.08,0.47,8.8,1,1,3,0
607,7.3,0.25,0.29,7.50,0.049,38.0,158.0,0.99650,3.43,0.38,9.6,1,1,3,0


### Visualize Clusters on unscaled dataframe (even though clustered on scaled)

In [27]:
sns.pairplot(X_train, x_vars=['density', 'alcohol'], y_vars=['density', 'alcohol'], hue='2_cluster')

<seaborn.axisgrid.PairGrid at 0x7f97076bfa90>

In [28]:
sns.pairplot(X_train,  x_vars=['residual sugar', 'total sulfur dioxide', 'alcohol'] , y_vars=['residual sugar', 'total sulfur dioxide', 'alcohol'], hue='3_cluster')

<seaborn.axisgrid.PairGrid at 0x7f971fac0640>

In [29]:
sns.pairplot(X_train, x_vars=['density', 'alcohol', 'volatile acidity', 'chlorides'], y_vars=['density', 'alcohol', 'volatile acidity', 'chlorides'], hue='4_cluster')

<seaborn.axisgrid.PairGrid at 0x7f97081b64a0>

In [46]:
feat_to_dummy = ['2_cluster', '3_cluster', '4_cluster']
cluster_dummy_list = []


for feat in feat_to_dummy:
    # creating a dummy column for the current feature
    df_dummies = pd.get_dummies(X_train[feat], drop_first=True)
    
    # Concatenate the original DataFrame and the dummy variables DataFrame
    df = pd.concat([X_train, df_dummies], axis=1)

    # dropping the original feature
    df.drop(columns=feat_to_dummy, inplace=True)
    
    cluster_dummy_list.append(df)

### Create seperate train dataframes for 2, 3 and 4 features respectively to use in linear regression 

In [49]:
X_train_2, X_train_3, X_train_4 = cluster_dummy_list
X_train_2

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,white,1,2,3
2138,5.5,0.24,0.32,8.70,0.060,19.0,102.0,0.99400,3.27,0.31,10.4,1,0,0,1
19,7.9,0.32,0.51,1.80,0.341,17.0,56.0,0.99690,3.04,1.08,9.2,0,1,0,0
601,7.2,0.24,0.40,1.40,0.045,31.0,106.0,0.99140,2.88,0.38,10.8,1,0,0,0
2540,9.0,0.29,0.34,12.10,0.030,34.0,177.0,0.99706,3.13,0.47,10.6,1,0,0,1
501,10.4,0.44,0.73,6.55,0.074,38.0,76.0,0.99900,3.17,0.85,12.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3319,5.1,0.21,0.28,1.40,0.047,48.0,148.0,0.99168,3.50,0.49,10.4,1,0,0,1
1943,6.3,0.25,0.44,11.60,0.041,48.0,195.0,0.99680,3.18,0.52,9.5,1,1,0,0
285,7.3,0.32,0.25,7.20,0.056,47.0,180.0,0.99610,3.08,0.47,8.8,1,1,0,0
607,7.3,0.25,0.29,7.50,0.049,38.0,158.0,0.99650,3.43,0.38,9.6,1,1,0,0


In [50]:
feat_to_dummy = ['2_cluster', '3_cluster', '4_cluster']
cluster_dummy_list = []


for feat in feat_to_dummy:
    # creating a dummy column for the current feature
    df_dummies = pd.get_dummies(X_validate[feat], drop_first=True)
    
    # Concatenate the original DataFrame and the dummy variables DataFrame
    df = pd.concat([X_validate, df_dummies], axis=1)

    # dropping the original feature
    df.drop(columns=feat_to_dummy, inplace=True)
    
    cluster_dummy_list.append(df)

In [52]:
X_validate_2, X_validate_3, X_validate_4 = cluster_dummy_list
X_validate_2

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,white,1,2,3
497,6.8,0.23,0.32,1.6,0.026,43.0,147.0,0.99040,3.29,0.54,12.5,1,0,1,0
813,6.9,0.39,0.24,2.1,0.102,4.0,7.0,0.99462,3.44,0.58,11.4,0,0,0,0
741,9.2,0.53,0.24,2.6,0.078,28.0,139.0,0.99788,3.21,0.57,9.5,0,1,0,0
439,6.2,0.35,0.04,1.2,0.060,23.0,108.0,0.99340,3.26,0.54,9.2,1,1,0,0
200,9.6,0.32,0.47,1.4,0.056,9.0,24.0,0.99695,3.22,0.82,10.3,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1334,9.0,0.24,0.50,1.2,0.048,26.0,107.0,0.99180,3.21,0.34,12.4,1,0,1,0
4522,9.0,0.20,0.33,3.5,0.049,10.0,40.0,0.99440,3.14,0.36,9.8,1,0,0,1
1507,7.5,0.38,0.57,2.3,0.106,5.0,12.0,0.99605,3.36,0.55,11.4,0,0,0,0
1395,6.9,0.19,0.33,1.6,0.039,27.0,98.0,0.98980,3.09,0.46,12.3,1,0,1,0


In [54]:
# turn series into dataframes to append new columns with predicted values
y_train_mvp = pd.DataFrame(y_train)
y_validate_mvp = pd.DataFrame(y_validate)
y_test_mvp = pd.DataFrame(y_test)

# 1. Predict based on mean
quality_pred_mean = y_train_mvp['quality'].mean()
y_train_mvp['quality_pred_mean'] = quality_pred_mean
y_validate_mvp['quality_pred_mean'] = quality_pred_mean

# 2. Do same for median
quality_pred_median_mvp = y_train_mvp['quality'].median()
y_train_mvp['quality_pred_median'] = quality_pred_median_mvp
y_validate_mvp['quality_pred_median'] = quality_pred_median_mvp

# 3.  RMSE of tax_value_pred_mean
rmse_train = mean_squared_error(y_train_mvp.quality, y_train_mvp.quality_pred_mean) ** (1/2)
rmse_validate = mean_squared_error(y_validate_mvp.quality, y_validate_mvp.quality_pred_mean) ** (1/2)

print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2)) 

# 4.  RMSE of tax_value_pred_median
rmse_train = mean_squared_error(y_train_mvp.quality, y_train_mvp.quality_pred_median) ** (1/2)
rmse_validate = mean_squared_error(y_validate_mvp.quality, y_validate_mvp.quality_pred_median) ** (1/2)

print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

RMSE using Mean
Train/In-Sample:  0.87 
Validate/Out-of-Sample:  0.89
RMSE using Median
Train/In-Sample:  0.89 
Validate/Out-of-Sample:  0.91


## Models

### Linear Regression on 2 feature clusters

In [69]:
# X_train_2 = X_train_2.drop(columns=['fixed acidity','chlorides','free sulfur dioxide','white'])
# X_validate_2 = X_validate_2.drop(columns=['fixed acidity','chlorides','free sulfur dioxide','white'])

In [70]:
# turn series into dataframes to append new columns with predicted values
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)
y_test = pd.DataFrame(y_test)

X_train_2 = X_train_2.rename(columns={1: 'cluster_1', 2: 'cluster_2', 3: 'cluster_3'})
X_validate_2 = X_validate_2.rename(columns={1: 'cluster_1', 2: 'cluster_2', 3: 'cluster_3'})
#. Create the model object
lm = LinearRegression()

#. Fit to training and specify column in y_train since it is now a series
lm.fit(X_train_2, y_train.quality)

# predict
y_train['quality_pred_lm'] = lm.predict(X_train_2)

# RMSE
rmse_train = mean_squared_error(y_train.quality, y_train.quality_pred_lm) ** (1/2)

# predict validate
y_validate['quality_pred_lm'] = lm.predict(X_validate_2)

#Validate RMSE 
rmse_validate = mean_squared_error(y_validate.quality, y_validate.quality_pred_lm) ** (1/2)

print('RMSE for OLS using LinearRegression\nTraining/In-Sample: ', rmse_train,
     '\nValidation/Out-of-Sample: ', rmse_validate)

RMSE for OLS using LinearRegression
Training/In-Sample:  0.7321062842771571 
Validation/Out-of-Sample:  0.7440008791764751


### Linear Regression on 3 feature clusters

In [63]:
# turn series into dataframes to append new columns with predicted values
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)
y_test = pd.DataFrame(y_test)

X_train_3 = X_train_3.rename(columns={1: 'cluster_1', 2: 'cluster_2', 3: 'cluster_3'})
X_validate_3 = X_validate_3.rename(columns={1: 'cluster_1', 2: 'cluster_2', 3: 'cluster_3'})
#. Create the model object
lm = LinearRegression()

#. Fit to training and specify column in y_train since it is now a series
lm.fit(X_train_3, y_train.quality)

# predict
y_train['quality_pred_lm'] = lm.predict(X_train_3)

# RMSE
rmse_train = mean_squared_error(y_train.quality, y_train.quality_pred_lm) ** (1/2)

# predict validate
y_validate['quality_pred_lm'] = lm.predict(X_validate_3)

#Validate RMSE 
rmse_validate = mean_squared_error(y_validate.quality, y_validate.quality_pred_lm) ** (1/2)

print('RMSE for OLS using LinearRegression\nTraining/In-Sample: ', rmse_train,
     '\nValidation/Out-of-Sample: ', rmse_validate)

RMSE for OLS using LinearRegression
Training/In-Sample:  0.728515414593524 
Validation/Out-of-Sample:  0.7387132351450343


### Linear Regression on 4 feature clusters

In [64]:
# turn series into dataframes to append new columns with predicted values
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)
y_test = pd.DataFrame(y_test)

X_train_4 = X_train_4.rename(columns={1: 'cluster_1', 2: 'cluster_2', 3: 'cluster_3'})
X_validate_4 = X_validate_4.rename(columns={1: 'cluster_1', 2: 'cluster_2', 3: 'cluster_3'})
#. Create the model object
lm = LinearRegression()

#. Fit to training and specify column in y_train since it is now a series
lm.fit(X_train_4, y_train.quality)

# predict
y_train['quality_pred_lm'] = lm.predict(X_train_4)

# RMSE
rmse_train = mean_squared_error(y_train.quality, y_train.quality_pred_lm) ** (1/2)

# predict validate
y_validate['quality_pred_lm'] = lm.predict(X_validate_4)

#Validate RMSE 
rmse_validate = mean_squared_error(y_validate.quality, y_validate.quality_pred_lm) ** (1/2)

print('RMSE for OLS using LinearRegression\nTraining/In-Sample: ', rmse_train,
     '\nValidation/Out-of-Sample: ', rmse_validate)

RMSE for OLS using LinearRegression
Training/In-Sample:  0.7276921290904094 
Validation/Out-of-Sample:  0.7366509067990562


### Polynomial Regression

In [65]:
for i in range(0,5):
    # make the polynomial features to get a new set of features
    pf = PolynomialFeatures(degree=i)

    # fit and transform X_train_scaled
    X_train_degree2 = pf.fit_transform(X_train_2)

    # transform X_validate_scaled & X_test_scaled
    X_validate_degree2 = pf.transform(X_validate_2)
    #X_test_degree2_mvp = pf.transform(X_test_mvp)

    # create the model object
    lm2 = LinearRegression()

    # fit the model to our training data. We must specify the column in y_train, 
    # since we have converted it to a dataframe from a series! 
    lm2.fit(X_train_2, y_train.quality)

    # predict train
    y_train['quality_pred_poly'] = lm2.predict(X_train_2)

    # evaluate: rmse
    rmse_train = mean_squared_error(y_train.quality, y_train.quality_pred_poly)**(1/2)

    # predict validate
    y_validate['quality_pred_poly'] = lm2.predict(X_validate_2)

    # evaluate: rmse
    rmse_validate = mean_squared_error(y_validate.quality, y_validate.quality_pred_poly)**(1/2)

    print("RMSE for Polynomial Model, degrees=", i, "\nTraining/In-Sample: ", rmse_train, 
          "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for Polynomial Model, degrees= 0 
Training/In-Sample:  0.7276642994335428 
Validation/Out-of-Sample:  0.7357744545803262
RMSE for Polynomial Model, degrees= 1 
Training/In-Sample:  0.7276642994335428 
Validation/Out-of-Sample:  0.7357744545803262
RMSE for Polynomial Model, degrees= 2 
Training/In-Sample:  0.7276642994335428 
Validation/Out-of-Sample:  0.7357744545803262
RMSE for Polynomial Model, degrees= 3 
Training/In-Sample:  0.7276642994335428 
Validation/Out-of-Sample:  0.7357744545803262
RMSE for Polynomial Model, degrees= 4 
Training/In-Sample:  0.7276642994335428 
Validation/Out-of-Sample:  0.7357744545803262


In [113]:

pd.DataFrame(y_train)
recombined_train = pd.concat([X_train_2, y_train], axis=1)
#recombined_train.groupby('2_cluster').quality.median()

In [115]:
x_2 = recombined_train[recombined_train['2']==0]
y_2 = recombined_train[recombined_train['2_cluster']==1]
z_2 = recombined_train[recombined_train['2_cluster']==2]
w_2 = recombined_train[recombined_train['2_cluster']==3]

KeyError: '2'

In [86]:
stats.f_oneway(X_train_2[X_train_2['cluster_1']==0],X_train_2[X_train_2['cluster_1']==1],X_train_2[X_train_2['cluster_1']==2],X_train_2[X_train_2['cluster_1']==3])

F_onewayResult(statistic=array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]), pvalue=array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]))

In [107]:
x_2 = X_train[X_train.quality['2_cluster']==0]
y_2 = X_train[X_train.quality['2_cluster']==1]
z_2 = X_train[X_train['2_cluster']==2]
w_2 = X_train[X_train['2_cluster']==3]

x_3 = X_train[X_train['3_cluster']==0]
y_3 = X_train[X_train['3_cluster']==1]
z_3 = X_train[X_train['3_cluster']==2]
w_3 = X_train[X_train['3_cluster']==3]

x_4 = X_train[X_train['4_cluster']==0]
y_4 = X_train[X_train['4_cluster']==1]
z_4 = X_train[X_train['4_cluster']==2]
w_4 = X_train[X_train['4_cluster']==3]

AttributeError: 'DataFrame' object has no attribute 'quality'

In [106]:
F, p = stats.f_oneway(x_2,y_2,z_2,w_2, axis=)
F, p

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 851 and the array at index 1 has size 1263

Unnamed: 0,volatile acidity,citric acid,residual sugar,total sulfur dioxide,density,pH,sulphates,alcohol,cluster_1,cluster_2,cluster_3
2138,0.24,0.32,8.70,102.0,0.99400,3.27,0.31,10.4,0,0,1
601,0.24,0.40,1.40,106.0,0.99140,2.88,0.38,10.8,0,0,0
2540,0.29,0.34,12.10,177.0,0.99706,3.13,0.47,10.6,0,0,1
501,0.44,0.73,6.55,76.0,0.99900,3.17,0.85,12.0,0,0,0
1265,0.57,0.05,2.30,36.0,0.99564,3.38,0.60,10.3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1969,0.41,0.30,3.20,164.0,0.99270,3.53,0.79,11.7,0,0,0
4506,0.19,0.28,1.10,100.0,0.99040,3.22,0.69,11.2,0,0,0
2901,0.22,0.32,2.20,92.0,0.99076,3.27,0.59,11.9,0,1,0
3319,0.21,0.28,1.40,148.0,0.99168,3.50,0.49,10.4,0,0,1


In [92]:
y

Unnamed: 0,volatile acidity,citric acid,residual sugar,total sulfur dioxide,density,pH,sulphates,alcohol,cluster_1,cluster_2,cluster_3
19,0.32,0.51,1.8,56.0,0.99690,3.04,1.08,9.2,1,0,0
117,0.56,0.12,2.0,28.0,0.99700,3.37,0.50,9.4,1,0,0
175,0.50,0.04,1.5,49.0,0.99580,3.35,0.78,9.5,1,0,0
2414,0.38,0.18,7.4,195.0,0.99773,3.53,0.71,9.2,1,0,0
1961,0.21,0.47,1.3,123.0,0.99590,2.90,0.64,9.5,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
226,0.59,0.50,2.0,81.0,0.99640,3.04,1.61,9.5,1,0,0
582,0.49,0.49,2.2,15.0,1.00000,3.19,0.43,9.2,1,0,0
1943,0.25,0.44,11.6,195.0,0.99680,3.18,0.52,9.5,1,0,0
285,0.32,0.25,7.2,180.0,0.99610,3.08,0.47,8.8,1,0,0


In [93]:
z

Unnamed: 0,volatile acidity,citric acid,residual sugar,total sulfur dioxide,density,pH,sulphates,alcohol,cluster_1,cluster_2,cluster_3


In [94]:
w

Unnamed: 0,volatile acidity,citric acid,residual sugar,total sulfur dioxide,density,pH,sulphates,alcohol,cluster_1,cluster_2,cluster_3
