In [1]:
# Importing pandas module
import pandas as pd
# Importing numpy module
import numpy as np
# Importing train_test_split method from sklearn.model_selection
from sklearn.model_selection import train_test_split
# Importing preprocsessing method from sklearn
from sklearn import preprocessing
# Importing RandomForest module from sklearn.ensemble
from sklearn.ensemble import RandomForestRegressor
# Importing cross-validation pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
# Importing evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score
# Importing module to save sklearn models
from sklearn.externals import joblib



In [2]:
# Loading wine data using url
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

In [3]:
# printing sample data
data.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5


In [4]:
# Cleaning the data
data = pd.read_csv(dataset_url, sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
# shape of the data
data.shape

(1599, 12)

In [6]:
# describing the data
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [7]:
# Features of the data
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [8]:
# target feature of the data
y = data.quality
# input features of the data
x = data.drop('quality', axis = 1)

In [9]:
# spliting data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 123, stratify = y)

In [10]:
# Scaling the trained data
x_train_scaled = preprocessing.scale(x_train)
print(x_train_scaled)

[[ 0.51358886  2.19680282 -0.164433   ...  1.08415147 -0.69866131
  -0.58608178]
 [-1.73698885 -0.31792985 -0.82867679 ...  1.46964764  1.2491516
   2.97009781]
 [-0.35201795  0.46443143 -0.47100705 ... -0.13658641 -0.35492962
  -0.20843439]
 ...
 [-0.98679628  1.10708533 -0.93086814 ...  0.24890976 -0.98510439
   0.35803669]
 [-0.69826067  0.46443143 -1.28853787 ...  1.08415147 -0.35492962
  -0.68049363]
 [ 3.1104093  -0.62528606  2.08377675 ... -1.61432173  0.79084268
  -0.39725809]]


In [11]:
# verifying the scaled data
print(x_train_scaled.mean(axis = 0))
print(x_train_scaled.std(axis = 0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [12]:
# Fitting the transformer API
scaler = preprocessing.StandardScaler().fit(x_train)
print(scaler)

StandardScaler(copy=True, with_mean=True, with_std=True)


In [13]:
# Applying Transformer to trained data
x_train_scaled = scaler.transform(x_train)
print(x_train_scaled.mean(axis = 0))
print(x_train_scaled.std(axis = 0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [14]:
# Applying Transformer to test data
x_test_scaled = scaler.transform(x_test)
print(x_test_scaled.mean(axis = 0))
print(x_test_scaled.std(axis = 0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [15]:
# Creating the pipeline
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators = 100))
print(pipeline)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=None,
                                       oob_score=False, random_state=None,
                                       verbose=0, warm_start=False))],
         verbose=False)


In [16]:
# printing tunable hyper parameters
print(pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False))], 'verbose': False, 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
           

In [17]:
# Tuning the hyperparameters
hyperparameters = {'randomforestregressor__max_features':['auto','sqrt','log2'],
                  'randomforestregressor__max_depth':[None,5,3,1]}
print(hyperparameters)

{'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]}


In [18]:
# Cross validation with pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv = 10)
clf.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split

In [19]:
# printing the best parameters
print(clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'log2'}


In [20]:
# retraining the model
print(clf.refit)

True


In [21]:
# Predicting new set of data
y_pred = clf.predict(x_test)
print(y_pred)

[6.43 5.77 5.08 5.51 6.23 5.56 5.   4.95 5.02 5.77 5.36 5.6  5.59 5.1
 5.82 5.55 6.57 5.76 5.75 6.96 5.53 5.68 5.03 6.02 5.96 5.06 5.23 5.19
 6.02 5.89 5.82 6.63 6.01 5.02 4.95 5.92 5.08 6.04 5.06 6.   4.83 5.75
 6.47 5.16 6.1  5.34 5.47 5.58 5.05 6.41 5.99 5.38 5.89 5.23 5.63 5.72
 5.35 5.31 5.   5.27 5.29 5.13 5.05 5.85 5.94 5.41 6.35 5.03 5.17 6.63
 5.76 6.01 5.14 5.01 5.39 5.97 5.41 5.11 5.13 5.3  6.44 5.71 6.1  6.18
 5.09 5.97 6.52 6.16 5.87 5.76 5.9  5.33 6.33 5.76 5.7  5.75 6.74 6.66
 5.58 6.76 5.07 5.53 5.17 6.52 5.08 4.81 5.69 5.07 5.58 5.93 6.05 5.4
 6.04 5.43 5.23 5.25 6.04 5.07 5.01 6.   5.81 5.1  5.76 6.3  5.3  5.26
 5.42 6.08 5.57 5.38 5.76 6.05 5.12 5.42 5.1  6.36 5.   5.12 6.56 5.52
 5.18 5.12 5.65 6.06 5.31 5.46 5.13 6.63 5.75 5.15 5.52 5.12 4.88 4.99
 5.21 5.95 5.39 5.69 5.73 5.24 5.61 5.27 5.27 6.02 5.   6.   5.17 5.39
 5.55 5.19 6.11 5.13 5.64 5.08 5.6  5.49 5.01 5.51 5.58 5.05 6.02 5.64
 5.07 5.03 5.21 6.17 5.16 5.66 5.24 4.86 5.34 6.66 5.9  5.8  5.49 5.17
 5.32 5.

In [23]:
# Evaluating model performance
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))

0.4588786246140808
0.34917125000000004


In [24]:
# Saving the model
joblib.dump(clf,'rf_regressor.pkl')

['rf_regressor.pkl']