In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# 1. Load the dataset wineQualityReds (consider the first column as the index)

In [15]:
# read_csv
data = pd.read_csv("data/wineQualityReds.csv", index_col=0)
data

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
1,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
2,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
3,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
4,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
5,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1596,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1597,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1598,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# 2. Split the dataset in train and test considering that the target column is called 'quality'

In [16]:
data

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
1,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
2,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
3,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
4,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
5,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1596,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1597,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1598,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [17]:
df = data.copy()
del df['quality']

In [18]:
# train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, data['quality'],test_size = 0.25)

In [19]:
y_test

803     7
479     5
881     5
1352    6
511     5
       ..
1077    6
179     5
389     6
807     7
1407    6
Name: quality, Length: 400, dtype: int64

# 3. Normalize train and test data

In [20]:
# StandardScaler + fit_transform
sss = StandardScaler()

In [31]:
X_train =  sss.fit_transform(X_train,y_train)

In [32]:
X_train

array([[ 0.32958458,  0.51868897, -0.43565403, ..., -0.90945683,
         1.59244617, -1.16912524],
       [ 0.44547077,  1.43665501, -1.40287307, ...,  0.32990047,
         1.17914291, -0.98238293],
       [-0.77133423,  0.24051745, -1.30106054, ...,  0.32990047,
        -0.29694014, -0.23541368],
       ...,
       [-0.13396018,  1.65919223, -0.18112271, ...,  0.3951298 ,
         0.64775301,  0.2314421 ],
       [ 1.14078791, -0.14892269,  0.73519007, ..., -0.97468616,
         0.3525364 ,  0.97841134],
       [-0.53956185,  0.51868897, -1.14834175, ...,  0.72127646,
        -0.53311343,  0.97841134]])

In [33]:
X_train.mean()

3.1990315906766813e-15

In [34]:
X_train.std()

1.0

In [35]:
X_test =  sss.fit_transform(X_test,y_test)

# 4. Create a RandomForest classifier with 300 estimators

In [37]:
# RandomForestClassifier
rf = RandomForestClassifier(n_estimators=300)
rf

RandomForestClassifier(n_estimators=300)

# 5. Get the accuracy of the model in cross validation (with K=5)

In [38]:
# cross_val_score
scores_train = cross_val_score(rf,X_train,y_train,cv=5)
scores_train

array([0.7       , 0.61666667, 0.67916667, 0.6625    , 0.68200837])

In [39]:
scores_test = cross_val_score(rf,X_test,y_test,cv=5)
scores_test



array([0.5875, 0.6   , 0.525 , 0.625 , 0.625 ])

In [40]:
scores_all = cross_val_score(rf,df,data['quality'],cv=5)
scores_all

array([0.546875  , 0.571875  , 0.615625  , 0.575     , 0.57366771])

# 6. Apply in cross validation a GridSearch with the following parameters to be tested:
- 'n_estimators': [100, 300, 500, 800, 1000],
- 'criterion': ['gini', 'entropy'],
- 'bootstrap': [True, False]

In [41]:
# GridSearchCV
parameters =  {'n_estimators':[100, 300,500,800,1000], 'criterion':['gini','entropy'], 'bootstrap':[True,False]}
gsv = GridSearchCV(rf,parameters,cv=5)
gsv.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_estimators=300),
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'n_estimators': [100, 300, 500, 800, 1000]})

In [45]:
gsv.fit(X_test,y_test)



GridSearchCV(cv=5, estimator=RandomForestClassifier(n_estimators=300),
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'n_estimators': [100, 300, 500, 800, 1000]})

# 7. Indicate the configuration that generates the best accuracy

In [46]:
# best_params_ + best_score_
gsv.best_params_

{'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 500}

In [47]:
gsv.best_score_

0.6025

# 8. Create a pipeline with the the preprocessing component, the model component and the Grid Search Analysis.