In [3]:
#This is a kernel for setting benchmarks on Wine Quality Dataset. This dataset will proceed in the following way -
#Pre-Process data using - Missing values imputation, Duplicates removal
#Feature Selection using PCA
#Setting Model Benchmarks by Cross Validation & Hyperparametrization when required
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
data = pd.read_csv('../../NumericData/winequality.csv')
data.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
data.corr

<bound method DataFrame.corr of        type  fixed acidity  volatile acidity  citric acid  residual sugar  \
0     white            7.0             0.270         0.36           20.70   
1     white            6.3             0.300         0.34            1.60   
2     white            8.1             0.280         0.40            6.90   
3     white            7.2             0.230         0.32            8.50   
4     white            7.2             0.230         0.32            8.50   
5     white            8.1             0.280         0.40            6.90   
6     white            6.2             0.320         0.16            7.00   
7     white            7.0             0.270         0.36           20.70   
8     white            6.3             0.300         0.34            1.60   
9     white            8.1             0.220         0.43            1.50   
10    white            8.1             0.270         0.41            1.45   
11    white            8.6             0.230

In [6]:
data.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
type                    6497 non-null object
fixed acidity           6487 non-null float64
volatile acidity        6489 non-null float64
citric acid             6494 non-null float64
residual sugar          6495 non-null float64
chlorides               6495 non-null float64
free sulfur dioxide     6497 non-null float64
total sulfur dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6488 non-null float64
sulphates               6493 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
dtypes: float64(11), int64(1), object(1)
memory usage: 659.9+ KB


In [8]:
data['quality'].unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [9]:
#count of each target variable
from collections import Counter
Counter(data['quality'])

Counter({6: 2836, 5: 2138, 7: 1079, 8: 193, 4: 216, 3: 30, 9: 5})

In [10]:
#next we shall create a new column called Review. This column will contain the values of 1,2, and 3. 
#1 - Bad
#2 - Average
#3 - Excellent
#This will be split in the following way. 
#1,2,3 --> Bad
#4,5,6,7 --> Average
#8,9,10 --> Excellent
#Create an empty list called Reviews
reviews = []
for i in data['quality']:
    if i >= 1 and i <= 3:
        reviews.append('1')
    elif i >= 4 and i <= 7:
        reviews.append('2')
    elif i >= 8 and i <= 10:
        reviews.append('3')
data['Reviews'] = reviews

In [11]:
#view final data
data.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'Reviews'],
      dtype='object')

In [12]:
data['Reviews'].unique()

array(['2', '3', '1'], dtype=object)

In [13]:
Counter(data['Reviews'])

Counter({'2': 6269, '3': 198, '1': 30})

In [14]:
x = data.iloc[:,1:11]
y = data['Reviews']

In [15]:
x.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47
7,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45
8,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49
9,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45


In [16]:
y.head(10)

0    2
1    2
2    2
3    2
4    2
5    2
6    2
7    2
8    2
9    2
Name: Reviews, dtype: object

In [17]:
#Scaling using standard scalar for PCA
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

In [18]:
#Find missing values
from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
data.dropna(inplace=True)

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """
        Impute missing values:
        - Columns of dtype object are imputed with the most frequent value in column.
        - Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

wines = DataFrameImputer().fit_transform(data)

In [19]:
# importing model for feature importance
from sklearn.ensemble import ExtraTreesClassifier

# passing the model
model = ExtraTreesClassifier(random_state = 53)

# feeding all our features to var 'X'
X = data.iloc[:,1:11]
# feeding our target variable to var 'y'
y = data['Reviews']

# training the model
model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
ETC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['ETC']).sort_values('ETC', ascending=False)

# removing traces of this model
model = None

# show top 10 features
ETC_feature_importances.head(8)



Unnamed: 0,ETC
density,0.112072
volatile acidity,0.107933
free sulfur dioxide,0.106279
sulphates,0.102915
residual sugar,0.102288
chlorides,0.10153
fixed acidity,0.095435
pH,0.094691


In [20]:
# importing model for feature importance
from sklearn.ensemble import RandomForestClassifier

# passing the model
model = RandomForestClassifier(random_state = 53)

# training the model
model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
RFC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['RFC']).sort_values('RFC', ascending=False)

# removing traces of this model
model = None

# show top 10 features
RFC_feature_importances.head(8)



Unnamed: 0,RFC
residual sugar,0.118661
density,0.116441
sulphates,0.115823
volatile acidity,0.108757
free sulfur dioxide,0.105719
fixed acidity,0.102517
total sulfur dioxide,0.088364
citric acid,0.082946


In [21]:
# importing model for feature importance
from sklearn.ensemble import AdaBoostClassifier

# passing the model
model = AdaBoostClassifier(random_state = 53)

model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
ADB_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['ADB']).sort_values('ADB', ascending=False)

# removing traces of this model
model = None

ADB_feature_importances.head(10)

Unnamed: 0,ADB
sulphates,0.18
residual sugar,0.14
free sulfur dioxide,0.14
total sulfur dioxide,0.14
density,0.14
fixed acidity,0.08
volatile acidity,0.06
citric acid,0.06
pH,0.04
chlorides,0.02


In [22]:
data.columns
#cols = list(cols.drop(['type', 'reviews', 'quality']))
#X_train, X_test, y_train, y_test = train_test_split(data.loc[:, cols], y_ql.values, test_size=0.20, random_state=101)

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'Reviews'],
      dtype='object')

In [23]:
from sklearn.model_selection import train_test_split
cols=data.columns
cols = list(cols.drop(['type', 'Reviews', 'quality','pH','chlorides']))
X_train, X_test, y_train, y_test = train_test_split(data.loc[:, cols], y, test_size=0.20, random_state=101)

In [24]:
data.loc[:, cols].shape

(6463, 9)

In [25]:
y.shape

(6463,)

In [22]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((5170, 9), (5170,), (1293, 9), (1293,))

In [23]:
# Spot Check Algorithms

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
models = []
models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('SVM_rbf', SVC()))
#models.append(('SVM_linear', SVC(kernel='linear')))

In [24]:
# Evaluate each model in turn
#import warnings
#warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
train_results = []
test_results = []
names = []
for name, model in models:
    cv_train_results = cross_val_score(model, X_train, y_train, 
                                       cv=10, scoring='accuracy')
    train_results.append(cv_train_results)
    clf = model.fit(X_train, y_train)
    cv_test_results = accuracy_score(y_test, clf.predict(X_test))
    test_results.append(cv_test_results)
    names.append(name)
    result = "%s: %f (%f) %f" % (name, cv_train_results.mean(), cv_train_results.std(), 
                                cv_test_results)
    print(result)



LR: 0.964799 (0.001105) 0.965197




RF: 0.970019 (0.002638) 0.973705
DT: 0.945070 (0.006533) 0.962104




SVM_rbf: 0.968471 (0.001249) 0.966744
