In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score

# load dataset

In [3]:
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

In [4]:
data.head(5)

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5


In [5]:
data.shape

(1599, 1)

In [6]:

#read dataset by separating semicolon
data = pd.read_csv(dataset_url, sep=';')

In [7]:
data.shape

(1599, 12)

In [8]:
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [9]:
data['quality'].head(150)

0      5
1      5
2      5
3      6
4      5
5      5
6      5
7      7
8      7
9      5
10     5
11     5
12     5
13     5
14     5
15     5
16     7
17     5
18     4
19     6
20     6
21     5
22     5
23     5
24     6
25     5
26     5
27     5
28     5
29     6
      ..
120    5
121    6
122    5
123    5
124    5
125    5
126    5
127    5
128    7
129    5
130    5
131    5
132    5
133    6
134    6
135    5
136    5
137    5
138    5
139    5
140    5
141    5
142    6
143    5
144    6
145    5
146    5
147    5
148    6
149    6
Name: quality, Length: 150, dtype: int64

In [10]:
data.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# data preprocessing

In [11]:
data.isnull().count()

fixed acidity           1599
volatile acidity        1599
citric acid             1599
residual sugar          1599
chlorides               1599
free sulfur dioxide     1599
total sulfur dioxide    1599
density                 1599
pH                      1599
sulphates               1599
alcohol                 1599
quality                 1599
dtype: int64

In [12]:
data.isna().count()

fixed acidity           1599
volatile acidity        1599
citric acid             1599
residual sugar          1599
chlorides               1599
free sulfur dioxide     1599
total sulfur dioxide    1599
density                 1599
pH                      1599
sulphates               1599
alcohol                 1599
quality                 1599
dtype: int64

In [13]:
data.index.is_categorical()

False

In [23]:
#split data into training & testing
#declare independet and dependent variabls

y = data.quality
x = data.drop('quality', axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123, stratify=y)

In [24]:
#pipeline with preprosessing model-parameters
pipeline = make_pipeline(preprocessing.StandardScaler(),RandomForestRegressor(n_estimators=100))


#declare random forest hyper parametrs to tune
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]}

#sklearn cross-validaion pipeline
model = GridSearchCV(pipeline, hyperparameters, cv=10)
model.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split

In [25]:
print(model.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto'}


In [26]:

#Conveniently, GridSearchCV from sklearn will automatically refit the model with the best set of hyperparameters using the entire training set.
print(model.refit)


True


In [27]:
#prediction

y_pred = model.predict(x_test)
y_pred

array([6.6 , 5.94, 5.  , 5.38, 6.45, 5.88, 5.04, 4.65, 5.02, 5.97, 5.24,
       5.74, 5.8 , 5.05, 5.76, 5.72, 6.67, 5.73, 5.78, 6.98, 5.49, 5.64,
       4.99, 6.04, 5.97, 5.04, 5.58, 5.13, 5.87, 6.  , 5.83, 6.56, 5.98,
       5.02, 4.95, 5.9 , 5.02, 5.68, 5.06, 5.79, 4.89, 5.97, 6.76, 5.09,
       6.1 , 5.39, 5.65, 5.46, 5.08, 6.52, 6.  , 5.33, 5.93, 5.17, 5.73,
       5.87, 5.13, 5.37, 4.95, 5.31, 5.28, 5.  , 5.01, 5.8 , 5.98, 5.29,
       6.28, 5.01, 5.14, 6.59, 5.71, 5.47, 5.07, 5.04, 5.3 , 5.95, 5.28,
       5.14, 5.28, 5.2 , 6.56, 5.67, 6.25, 6.52, 5.12, 5.96, 6.56, 6.08,
       5.6 , 5.81, 5.88, 5.31, 6.46, 5.55, 5.72, 5.82, 6.65, 6.71, 5.46,
       6.83, 5.  , 5.34, 5.07, 6.57, 5.09, 4.51, 5.77, 5.11, 5.86, 5.94,
       5.61, 5.54, 6.13, 5.43, 4.98, 5.22, 5.9 , 5.09, 5.  , 6.03, 5.81,
       5.11, 5.84, 5.93, 5.2 , 5.48, 5.38, 5.87, 5.53, 5.45, 5.81, 6.33,
       5.09, 5.27, 5.02, 6.42, 5.  , 5.15, 6.75, 5.45, 5.22, 5.06, 5.85,
       6.09, 5.39, 5.49, 5.14, 6.56, 5.63, 5.04, 5.

In [32]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [34]:
#evaluating model

print(mean_squared_error(y_pred, y_test))

0.3519396875
