In [25]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error , r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [26]:
# Print small floats in fixed-point form instead of scientific notation,
# which keeps the scaled-feature means/stds printed later readable.
np.set_printoptions(suppress=True)

In [27]:
# Location of the red-wine quality CSV. The original machine-specific path is
# kept as the default so existing runs behave exactly as before; set the
# WINE_DATA_PATH environment variable to point at the file on another machine.
data_path = os.environ.get("WINE_DATA_PATH",
                           "C:/Users/User/Desktop/winequality-red.csv")
data = pd.read_csv(data_path)

In [28]:
# Preview the first five rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [29]:
# (rows, columns) of the loaded frame.
data.shape

(1596, 12)

In [30]:
# Per-column dtype and non-null count, to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1596 entries, 0 to 1595
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1596 non-null   float64
 1   volatile acidity      1596 non-null   float64
 2   citric acid           1596 non-null   float64
 3   residual sugar        1596 non-null   float64
 4   chlorides             1596 non-null   float64
 5   free sulfur dioxide   1596 non-null   float64
 6   total sulfur dioxide  1596 non-null   float64
 7   density               1596 non-null   float64
 8   pH                    1596 non-null   float64
 9   sulphates             1596 non-null   float64
 10  alcohol               1596 non-null   float64
 11  quality               1596 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 149.8 KB


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1596.0,1596.0,1596.0,1596.0,1596.0,1596.0,1596.0,1596.0,1596.0,1596.0,1596.0,1596.0
mean,8.31416,0.527954,0.270276,2.535558,0.08712,15.858396,46.382206,0.996744,3.311917,0.656385,10.421147,5.637218
std,1.732203,0.179176,0.193894,1.405515,0.045251,10.460554,32.839138,0.001888,0.153346,0.163057,1.060371,0.80708
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.86,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.996745,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997833,3.4,0.73,11.1,6.0
max,15.6,1.58,0.79,15.5,0.611,72.0,289.0,1.00369,4.01,1.98,14.0,8.0


In [32]:
# Separate the regression target from the predictor columns.
target_column = "quality"
X = data.drop(columns=target_column)
Y = data[target_column]
# Show which quality scores actually occur in the data.
Y.unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [33]:
# Hold out 20% of the rows for testing. The split is stratified on the quality
# score and uses a fixed random_state so it is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=38,
    stratify=Y,
)

In [34]:
# Display the training features (pandas elides the middle rows of a long frame).
X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
943,10.2,0.440,0.58,4.1,0.092,11.0,24.0,0.99745,3.29,0.99,12.0
144,8.1,0.670,0.55,1.8,0.117,32.0,141.0,0.99680,3.17,0.62,9.4
16,8.5,0.280,0.56,1.8,0.092,35.0,103.0,0.99690,3.30,0.75,10.5
1208,6.6,0.440,0.15,2.1,0.076,22.0,53.0,0.99570,3.32,0.62,9.3
1058,9.1,0.400,0.50,1.8,0.071,7.0,16.0,0.99462,3.21,0.69,12.5
...,...,...,...,...,...,...,...,...,...,...,...
811,12.6,0.410,0.54,2.8,0.103,19.0,41.0,0.99939,3.21,0.76,11.3
1112,7.0,0.690,0.07,2.5,0.091,15.0,21.0,0.99572,3.38,0.60,11.3
938,9.9,0.270,0.49,5.0,0.082,9.0,17.0,0.99484,3.19,0.52,12.5
748,8.3,0.650,0.10,2.9,0.089,17.0,40.0,0.99803,3.29,0.55,9.5


In [35]:
# Display the held-out test features (pandas elides the middle rows of a long frame).
X_test

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
376,11.4,0.625,0.66,6.2,0.088,6.0,24.0,0.9988,3.11,0.99,13.3
427,12.8,0.840,0.63,2.4,0.088,13.0,35.0,0.9997,3.10,0.60,10.4
158,7.6,0.950,0.03,2.0,0.090,7.0,20.0,0.9959,3.20,0.56,9.6
564,8.7,0.700,0.24,2.5,0.226,5.0,15.0,0.9991,3.32,0.60,9.0
1303,9.7,0.690,0.32,2.5,0.088,22.0,91.0,0.9979,3.29,0.62,10.1
...,...,...,...,...,...,...,...,...,...,...,...
522,9.2,0.430,0.49,2.4,0.086,23.0,116.0,0.9976,3.23,0.64,9.5
1278,7.1,0.460,0.20,1.9,0.077,28.0,54.0,0.9956,3.37,0.64,10.4
902,9.2,0.580,0.20,3.0,0.081,15.0,115.0,0.9980,3.23,0.59,9.5
54,7.6,0.510,0.15,2.8,0.110,33.0,73.0,0.9955,3.17,0.63,10.2


In [36]:
# One-off standardization of the training features with preprocessing.scale.
# NOTE(review): this returns only the scaled array, so the same transformation
# cannot be re-applied to X_test from this result; the later cells overwrite
# X_train_scaled using a fitted StandardScaler instead.
X_train_scaled = preprocessing.scale(X_train)
print( X_train_scaled )

[[ 1.11195387 -0.48719061  1.6118231  ... -0.16689871  1.98034594
   1.5045866 ]
 [-0.11009173  0.82227638  1.45595843 ... -0.9427193  -0.23914587
  -0.95131328]
 [ 0.12267886 -1.39812418  1.50791332 ... -0.10224699  0.54067558
   0.08772129]
 ...
 [ 0.93737593 -1.45505752  1.14422909 ... -0.81341587 -0.83900852
   1.97687504]
 [ 0.00629356  0.70840969 -0.88201162 ... -0.16689871 -0.65904973
  -0.85685559]
 [-0.57563292  1.76167662 -1.24569585 ...  0.47961845 -1.13893985
  -0.7623979 ]]


In [37]:
# Sanity check: after standardization every column should have
# mean ~0 and standard deviation 1.
column_means = X_train_scaled.mean(axis=0)
column_stds = X_train_scaled.std(axis=0)
print( column_means )
print( column_stds )

[ 0. -0. -0.  0.  0. -0. -0. -0.  0.  0. -0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [38]:
# Fit a reusable scaler on the training features only, so the statistics it
# learns can later be applied unchanged to the test features.
scaler = preprocessing.StandardScaler().fit(X_train)

In [40]:
# Apply the fitted scaler to the training set; per-column mean should be ~0
# and standard deviation 1, since the scaler was fitted on this same data.
X_train_scaled = scaler.transform(X_train)
train_means = X_train_scaled.mean(axis=0)
train_stds = X_train_scaled.std(axis=0)
print (train_means)
print (train_stds)

[ 0. -0. -0.  0.  0. -0. -0. -0.  0.  0. -0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [41]:
# Apply the training-set scaler to the test set. The column means are not
# expected to be exactly zero here because the scaler was fitted on X_train.
X_test_scaled = scaler.transform(X_test)
test_means = X_test_scaled.mean(axis=0)
print (test_means)

[ 0.0724877   0.06764265  0.01323608  0.01075375 -0.03607542 -0.00626225
 -0.06727066 -0.02721265 -0.12568324 -0.10417677  0.06602554]


In [43]:
# Build the modelling pipeline: standardize the features, then fit a random
# forest regressor. make_pipeline names the steps after their classes
# ('standardscaler', 'randomforestregressor').
feature_scaler = preprocessing.StandardScaler()
forest = RandomForestRegressor(n_estimators=100, random_state=46)
pipeline = make_pipeline(feature_scaler, forest)

In [44]:
# List every tunable parameter of the pipeline. Nested parameters use the
# `<step name>__<parameter>` key form.
print( pipeline.get_params() )

{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor(random_state=46))], 'verbose': False, 'standardscaler': StandardScaler(), 'randomforestregressor': RandomForestRegressor(random_state=46), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'randomforestregressor__bootstrap': True, 'randomforestregressor__ccp_alpha': 0.0, 'randomforestregressor__criterion': 'squared_error', 'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__max_leaf_nodes': None, 'randomforestregressor__max_samples': None, 'randomforestregressor__min_impurity_decrease': 0.0, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__min_weight_fraction_leaf': 0.0, 'randomforestregressor__n_estimators': 100, 'randomforestregressor__n_jobs': None, 'randomforestregressor__oob_score': False, 'rand

In [45]:
# Grid of random-forest settings to search. Keys follow the pipeline's
# `<step name>__<parameter>` convention.
# max_features: 1.0 means "consider all features at each split", which is what
# 'auto' meant for RandomForestRegressor. 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises an error.
hyperparameters = { 'randomforestregressor__max_features' : [1.0, 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [47]:
# Search the hyperparameter grid with 10-fold cross-validation on the training
# data. The scaler is part of the pipeline, so it is fitted per fold.
clf = GridSearchCV(estimator=pipeline, param_grid=hyperparameters, cv=10)
clf.fit(X_train, Y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor(random_state=46))]),
             param_grid={'randomforestregressor__max_depth': [None, 5, 3, 1],
                         'randomforestregressor__max_features': ['auto', 'sqrt',
                                                                 'log2']})

In [49]:
# Score the fitted search object on the held-out test set:
# R^2 first, then mean squared error.
pred = clf.predict(X_test)
test_r2 = r2_score(Y_test, pred)
test_mse = mean_squared_error(Y_test, pred)
print( test_r2 )
print( test_mse )

0.547841681093666
0.2958634375


In [50]:
# Persist the fitted grid-search object (pipeline included) to disk.
joblib.dump(value=clf, filename='rf_regressor.pkl')

['rf_regressor.pkl']