In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib



In [2]:
# Load wine data from remote URLPython
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

In [3]:
# Read CSV with semicolon separator
data = pd.read_csv(dataset_url, sep=';')

In [4]:
data.shape

(1599, 12)

In [5]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [6]:
# Separate target feature from training features or input features
y = data.quality
X = data.drop('quality', axis=1)

In [7]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)

In [8]:
# Declare data preprocessing steps
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100))

In [10]:
# List tunable hyperparameters
pipeline.get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('randomforestregressor',
   RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                         max_depth=None, max_features='auto', max_leaf_nodes=None,
                         max_samples=None, min_impurity_decrease=0.0,
                         min_impurity_split=None, min_samples_leaf=1,
                         min_samples_split=2, min_weight_fraction_leaf=0.0,
                         n_estimators=100, n_jobs=None, oob_score=False,
                         random_state=None, verbose=0, warm_start=False))],
 'verbose': False,
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'randomforestregressor': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
 

In [11]:
# Declare hyperparameters to tune
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                    'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [17]:
# Sklearn cross-validation with pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                            

In [19]:
clf.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'sqrt'}

In [20]:
clf.refit

True

In [21]:
# Predict a new set of data
y_pred = clf.predict(X_test)

In [24]:
r2_score(y_test, y_pred)

0.47415218838912776

In [25]:
mean_squared_error(y_test, y_pred)

0.339315625

In [26]:
# Save model to a .pkl file
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [27]:
# Load model from .pkl file
clf2 = joblib.load('rf_regressor.pkl')
 
# Predict data set using loaded model
clf2.predict(X_test)

array([6.5 , 5.56, 5.02, 5.49, 6.47, 5.63, 4.88, 4.76, 5.  , 6.1 , 5.3 ,
       5.71, 5.89, 5.13, 5.78, 5.79, 6.71, 5.72, 5.69, 7.  , 5.52, 5.59,
       5.04, 6.03, 5.91, 5.08, 5.5 , 5.12, 5.85, 5.98, 5.89, 6.47, 5.96,
       5.06, 4.93, 5.84, 5.05, 6.05, 4.97, 5.87, 4.87, 5.91, 6.49, 5.14,
       6.12, 5.36, 5.54, 5.43, 5.16, 6.59, 5.99, 5.27, 5.83, 5.16, 5.56,
       5.72, 5.33, 5.37, 4.94, 5.3 , 5.33, 5.24, 5.04, 5.85, 5.89, 5.22,
       6.38, 5.02, 5.21, 6.75, 5.64, 5.66, 5.17, 5.03, 5.27, 5.95, 5.28,
       5.15, 5.29, 5.21, 6.27, 5.54, 6.1 , 6.38, 5.1 , 5.93, 6.49, 6.37,
       5.74, 5.71, 5.89, 5.3 , 6.36, 5.65, 5.7 , 5.68, 6.7 , 6.65, 5.43,
       6.84, 5.03, 5.46, 5.17, 6.47, 5.05, 4.76, 5.63, 4.96, 5.67, 6.01,
       5.94, 5.48, 6.07, 5.3 , 5.23, 5.16, 5.9 , 5.12, 4.98, 5.81, 5.9 ,
       5.07, 5.76, 6.16, 5.21, 5.4 , 5.41, 6.04, 5.51, 5.44, 5.71, 6.22,
       5.23, 5.36, 5.06, 6.36, 5.02, 5.08, 6.57, 5.48, 5.18, 5.07, 5.52,
       6.08, 5.31, 5.35, 5.05, 6.47, 5.8 , 5.09, 5.