In [2]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [7]:
#1. Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

#To help us perform crossvalidation
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
#To import evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.externals import joblib


In [4]:
#2. Load red wine data
dataset_url='http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data=pd.read_csv(dataset_url)

In [6]:
print(data.head)

<bound method NDFrame.head of      fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0      7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1      7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2     7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3     11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4      7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                  

In [8]:
data=pd.read_csv(dataset_url, sep=';')

In [9]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [11]:
data.shape

(1599, 12)

In [55]:
data.describe()
data.style.background_gradient(cmap='Blues')

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [None]:
#All of the features are numeric, which is convenient.
#However, they have some very different scales, so let's make a mental note to standardize the data later.

In [54]:
#3. Split data into training and test sets
y=data.quality
X=data.drop('quality', axis=1)

In [22]:
#Split data into train and test sets
#we have set aside 20% of the data as a test set for evaluating our model
# We also set an arbitrary "random state" (a.k.a. seed) so that we can reproduce our results.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=123, stratify=y)

In [24]:
#4. Declare Data preprocessing steps
#insert your preprocessing steps into a cross-validation pipeline
#We'll be standardizing here
#Standardization is the process of subtracting the means from each feature and then dividing by the feature standard deviations.

#STEP 1: Fitting the transformer API using Scikit


scaler = preprocessing.StandardScaler().fit(X_train)

#STEP 2:Applying transformer to training data
X_test_scaled=scaler.transform(X_test)

In [28]:
print (X_test_scaled.mean(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]


In [31]:
print(X_test_scaled.std(axis=0))

[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [32]:
#4. Declare Data preprocessing
#Pipeline with preprocessing and model
#a modeling pipeline that first transforms the data using StandardScaler()
#and then fits a model using a random forest regressor.
pipeline = make_pipeline(preprocessing.StandardScaler(),RandomForestRegressor(n_estimators=100))

In [36]:
#5. Declare hyperparameters to tune
#HYPERPARAMETERS
#There are two types of parameters we need to worry about: model parameters and hyperparameters. 
#Models parameters can be learned directly from the data (i.e. regression coefficients), while hyperparameters cannot 

hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],'randomforestregressor__max_depth': [None, 5, 3, 1]}


In [37]:
#Tune model using crosss validation pipeline
'''Cross-validation is a process for reliably estimating
the performance of a method for building a model by training and evaluating your 
model multiple times using the same method.

Practically, that "method" is simply a set of hyperparameters in this context.

The best practice when performing CV is to include your data preprocessing steps inside the cross-validation loop.
This prevents accidentally tainting your training folds with influential data from your test fold.'''

#Scikit-Learn makes it simple to set this up:

clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

'''GridSearchCV essentially performs cross-validation across the entire "grid" (all possible permutations) of hyperparameters
It takes in your model (in this case, we're using a model pipeline), 
the hyperparameters you want to tune, and the number of folds to create.


GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                            

In [39]:
print (clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto'}


In [40]:
#Refit the entire training set

'''Conveniently, GridSearchCV from sklearn will
automatically refit the model with the best set of hyperparameters using the entire training set.
'''
#7. Refit the entire raiining set
#to confirm
print (clf.refit)

True


In [41]:
#Evaluate model pipeline on test data
#Predict a new set of data
y_pred=clf.predict(X_test)

In [44]:
print (r2_score(y_test,y_pred))

0.45828294690961946


In [46]:
print(mean_squared_error(y_test, y_pred))

0.3495556249999999


In [47]:
#Save model to a pkf file
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [48]:
#Save model for future
#Load model from pkl file
clf2 = joblib.load('rf_regressor.pkl')
 
# Predict data set using loaded model
clf2.predict(X_test)

array([6.55, 5.81, 5.01, 5.4 , 6.4 , 5.74, 4.91, 4.7 , 5.  , 5.98, 5.21,
       5.69, 5.68, 5.02, 5.85, 5.74, 6.66, 5.76, 5.69, 6.99, 5.44, 5.6 ,
       5.03, 6.01, 5.91, 5.03, 5.51, 5.09, 5.93, 5.87, 5.88, 6.56, 5.96,
       5.  , 4.86, 5.9 , 5.03, 5.85, 5.02, 5.87, 4.78, 5.97, 6.86, 5.08,
       6.08, 5.4 , 5.38, 5.57, 5.06, 6.52, 5.97, 5.29, 5.94, 5.1 , 5.73,
       6.02, 5.16, 5.4 , 4.97, 5.25, 5.31, 5.16, 5.01, 5.71, 5.95, 5.18,
       6.43, 5.04, 5.16, 6.58, 5.66, 5.45, 5.07, 5.02, 5.27, 5.95, 5.26,
       5.03, 5.23, 5.24, 6.71, 5.65, 6.12, 6.64, 5.08, 5.86, 6.54, 5.99,
       5.67, 5.92, 5.85, 5.26, 6.53, 5.61, 5.68, 5.75, 6.7 , 6.74, 5.57,
       6.8 , 5.1 , 5.47, 5.14, 6.61, 5.02, 4.57, 5.6 , 5.07, 5.71, 5.99,
       5.72, 5.48, 6.13, 5.48, 5.09, 5.2 , 5.86, 5.03, 5.05, 6.04, 5.84,
       5.14, 5.79, 6.08, 5.24, 5.21, 5.4 , 5.85, 5.26, 5.31, 5.92, 6.34,
       5.13, 5.31, 5.05, 6.57, 5.  , 5.19, 6.84, 5.47, 5.14, 5.02, 5.77,
       6.13, 5.4 , 5.39, 5.13, 6.56, 5.56, 5.03, 5.