In [1]:
#This is a kernel for setting benchmarks on Wine Quality Dataset. This dataset will proceed in the following way -
#Pre-Process data using - Missing values imputation, Duplicates removal
#Feature Selection using PCA
#Setting Model Benchmarks by Cross Validation & Hyperparametrization when required
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data_original = pd.read_csv('../../NumericData/winequality.csv')
data_original.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
data_noise = pd.read_csv('../../NumericData/50_noisy_winequality.csv')
data_noise.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,red,15.56,0.27,1.43,20.7,0.19,45.0,205.42,1.04,3.0,1.78,9.24,6
1,red,13.56,0.42,0.79,20.81,0.049,14.0,132.0,0.99,3.3,0.68,13.75,6
2,red,8.1,0.28,1.41,6.9,0.15,77.37,268.52,1.02,3.1,1.25,12.68,6
3,white,12.9,0.32,0.32,22.88,0.058,47.0,395.79,0.9956,3.19,1.55,9.9,6
4,red,10.19,0.92,1.38,49.49,0.06,20.39,60.12,1.03,3.19,0.63,14.21,6


In [4]:
data_noise.corr

<bound method DataFrame.corr of        type  fixed acidity  volatile acidity  citric acid  residual sugar  \
0       red          15.56             0.270         1.43           20.70   
1       red          13.56             0.420         0.79           20.81   
2       red           8.10             0.280         1.41            6.90   
3     white          12.90             0.320         0.32           22.88   
4       red          10.19             0.920         1.38           49.49   
5     white           8.10             0.280         1.48           50.42   
6       red           6.20             0.320         1.48           41.50   
7     white           9.17             0.320         0.36           27.56   
8       red          14.32             0.530         0.86            1.60   
9       red          12.82             0.090         0.39           21.55   
10    white           6.72             0.270         0.41            1.74   
11      red           6.22             0.230

In [5]:
data_noise.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [6]:
data_noise.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
type                    6497 non-null object
fixed acidity           6495 non-null float64
volatile acidity        6497 non-null float64
citric acid             6497 non-null float64
residual sugar          6497 non-null float64
chlorides               6497 non-null float64
free sulfur dioxide     6497 non-null float64
total sulfur dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null float64
sulphates               6496 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
dtypes: float64(11), int64(1), object(1)
memory usage: 659.9+ KB


In [7]:
data_noise['quality'].unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [8]:
#count of each target variable
from collections import Counter
Counter(data_noise['quality'])

Counter({6: 2836, 5: 2138, 7: 1079, 8: 193, 4: 216, 3: 30, 9: 5})

In [9]:
#next we shall create a new column called Review. This column will contain the values of 1,2, and 3. 
#1 - Bad
#2 - Average
#3 - Excellent
#This will be split in the following way. 
#1,2,3 --> Bad
#4,5,6,7 --> Average
#8,9,10 --> Excellent
#Create an empty list called Reviews
reviews = []
for i in data_noise['quality']:
    if i >= 1 and i <= 3:
        reviews.append('1')
    elif i >= 4 and i <= 7:
        reviews.append('2')
    elif i >= 8 and i <= 10:
        reviews.append('3')
data_noise['Reviews'] = reviews

In [10]:
#next we shall create a new column called Review. This column will contain the values of 1,2, and 3. 
#1 - Bad
#2 - Average
#3 - Excellent
#This will be split in the following way. 
#1,2,3 --> Bad
#4,5,6,7 --> Average
#8,9,10 --> Excellent
#Create an empty list called Reviews
reviews = []
for i in data_original['quality']:
    if i >= 1 and i <= 3:
        reviews.append('1')
    elif i >= 4 and i <= 7:
        reviews.append('2')
    elif i >= 8 and i <= 10:
        reviews.append('3')
data_original['Reviews'] = reviews

In [11]:
#chkpoint
data_original.shape

(6497, 14)

In [12]:
#view final data
data_noise.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'Reviews'],
      dtype='object')

In [13]:
data_noise['Reviews'].unique()

array(['2', '3', '1'], dtype=object)

In [14]:
Counter(data_noise['Reviews'])

Counter({'2': 6269, '3': 198, '1': 30})

In [15]:
x_noise = data_noise.iloc[:,1:11]
y_noise = data_noise['Reviews']

In [16]:
x_original = data_noise.iloc[:,1:11]
y_original = data_noise['Reviews']

In [17]:
#Find missing values
from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
data_noise.dropna(inplace=True)

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """
        Impute missing values:
        - Columns of dtype object are imputed with the most frequent value in column.
        - Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

data_noise = DataFrameImputer().fit_transform(data_noise)

In [23]:
data_noise.shape

(6494, 14)

In [18]:
#Find missing values
from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
data_original.dropna(inplace=True)

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """
        Impute missing values:
        - Columns of dtype object are imputed with the most frequent value in column.
        - Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

data_original = DataFrameImputer().fit_transform(data_original)

In [22]:
#Chkpt
data_original.shape

(6463, 14)

In [19]:
# importing model for feature importance
from sklearn.ensemble import ExtraTreesClassifier

# passing the model
model_noise = ExtraTreesClassifier(random_state = 53)

# feeding all our features to var 'X'
X_noise = data_noise.iloc[:,1:11]
# feeding our target variable to var 'y'
y_noise = data_noise['Reviews']

# training the model
model_noise.fit(X_noise, y_noise)

# extracting feature importance from model and making a dataframe of it in descending order
ETC_feature_importances = pd.DataFrame(model_noise.feature_importances_, index = X_noise.columns, columns=['ETC']).sort_values('ETC', ascending=False)

# removing traces of this model
model_noise = None

# show top 10 features
ETC_feature_importances.head(10)



Unnamed: 0,ETC
chlorides,0.108405
volatile acidity,0.105844
fixed acidity,0.10405
residual sugar,0.103755
pH,0.099082
free sulfur dioxide,0.098404
citric acid,0.097799
sulphates,0.09687
total sulfur dioxide,0.095943
density,0.089847


In [20]:
# importing model for feature importance
from sklearn.ensemble import ExtraTreesClassifier

# passing the model
model_original = ExtraTreesClassifier(random_state = 53)

# feeding all our features to var 'X'
X_original = data_original.iloc[:,1:11]
# feeding our target variable to var 'y'
y_original = data_original['Reviews']

# training the model
model_original.fit(X_original, y_original)

# extracting feature importance from model and making a dataframe of it in descending order
ETC_feature_importances = pd.DataFrame(model_original.feature_importances_, index = X_original.columns, columns=['ETC']).sort_values('ETC', ascending=False)

# removing traces of this model
model_original = None

# show top 10 features
ETC_feature_importances.head(10)



Unnamed: 0,ETC
density,0.112072
volatile acidity,0.107933
free sulfur dioxide,0.106279
sulphates,0.102915
residual sugar,0.102288
chlorides,0.10153
fixed acidity,0.095435
pH,0.094691
citric acid,0.088685
total sulfur dioxide,0.088171


In [42]:
y_original.shape

(6463,)

In [21]:
data_original.shape

(6463, 14)

In [24]:
data_noise.shape

(6494, 14)

In [None]:
#data_noise = data_noise.iloc[0:6463,:]

In [44]:
from sklearn.model_selection import train_test_split
cols=data_noise.columns
cols = list(cols.drop(['type', 'Reviews', 'quality','pH','chlorides']))
X_train_noise, X_test_noise, y_train_noise, y_test_noise = train_test_split(data_noise.loc[0:6463, cols],data_noise.loc[0:6463, 'Reviews'], test_size=0.20, random_state=101)

In [45]:
y_test_noise.shape

(1293,)

In [46]:
from sklearn.model_selection import train_test_split
cols=data_original.columns
cols = list(cols.drop(['type', 'Reviews', 'quality','pH','chlorides']))
X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(data_original.loc[:, cols], y_original, test_size=0.20, random_state=101)

In [47]:
y_test_original.shape

(1293,)

In [48]:
# Spot Check Algorithms

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
models = []
models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('SVM_rbf', SVC()))
#models.append(('SVM_linear', SVC(kernel='linear')))

In [50]:
# Evaluate each model in turn
#import warnings
#warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
train_results = []
test_results = []
names = []
for name, model in models:
    cv_train_results = cross_val_score(model, X_train_noise, y_train_noise, 
                                       cv=10, scoring='accuracy')
    train_results.append(cv_train_results)
    clf = model.fit(X_train_noise, y_train_noise)
    cv_test_results = accuracy_score(y_test_noise, clf.predict(X_test_original))
    test_results.append(cv_test_results)
    names.append(name)
    result = "%s: %f (%f) %f" % (name, cv_train_results.mean(), cv_train_results.std(), 
                                cv_test_results)
    print(result)



LR: 0.964785 (0.001107) 0.964424
RF: 0.964591 (0.001187) 0.964424
DT: 0.923765 (0.005574) 0.960557




SVM_rbf: 0.964785 (0.001107) 0.964424
