In [1]:
#This is a kernel for setting benchmarks on Wine Quality Dataset. This dataset will proceed in the following way -
#Pre-Process data using - Missing values imputation, Duplicates removal
#Feature Selection using PCA
#Setting Model Benchmarks by Cross Validation & Hyperparametrization when required
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data_noise = pd.read_csv('../../NumericData/10_noisy_winequality.csv')
data_noise.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,57.46,0.3,14.0,172.82,0.994,3.16,0.49,9.5,6
2,white,8.1,0.28,0.75,44.0,0.05,60.5,97.0,0.9951,3.26,0.44,10.1,6
3,red,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
data_original = pd.read_csv('../../NumericData/winequality.csv')
data_original.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
data_noise.corr

<bound method DataFrame.corr of        type  fixed acidity  volatile acidity  citric acid  residual sugar  \
0     white           7.00             0.270         0.36           20.70   
1     white           6.30             0.300         0.34           57.46   
2     white           8.10             0.280         0.75           44.00   
3       red           7.20             0.230         0.32            8.50   
4     white           7.20             0.230         0.32            8.50   
5     white           8.10             0.280         0.40            6.90   
6     white           6.20             1.500         0.16            7.00   
7     white           7.00             0.270         0.36           15.62   
8     white           6.30             0.390         0.34            1.60   
9     white           8.10             0.220         0.43           26.75   
10    white           8.10             0.270         0.41            1.45   
11      red          15.46             0.630

In [5]:
data_noise.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [6]:
data_noise.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
type                    6497 non-null object
fixed acidity           6488 non-null float64
volatile acidity        6489 non-null float64
citric acid             6494 non-null float64
residual sugar          6495 non-null float64
chlorides               6496 non-null float64
free sulfur dioxide     6497 non-null float64
total sulfur dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6488 non-null float64
sulphates               6493 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
dtypes: float64(11), int64(1), object(1)
memory usage: 659.9+ KB


In [7]:
data_noise['quality'].unique()

array([6, 5, 7, 8, 4, 3, 9])

In [8]:
#count of each target variable
from collections import Counter
Counter(data_noise['quality'])

Counter({6: 2836, 5: 2138, 7: 1079, 8: 193, 4: 216, 3: 30, 9: 5})

In [9]:
#next we shall create a new column called Review. This column will contain the values of 1,2, and 3. 
#1 - Bad
#2 - Average
#3 - Excellent
#This will be split in the following way. 
#1,2,3 --> Bad
#4,5,6,7 --> Average
#8,9,10 --> Excellent
#Create an empty list called Reviews
reviews = []
for i in data_noise['quality']:
    if i >= 1 and i <= 3:
        reviews.append('1')
    elif i >= 4 and i <= 7:
        reviews.append('2')
    elif i >= 8 and i <= 10:
        reviews.append('3')
data_noise['Reviews'] = reviews

In [10]:
#next we shall create a new column called Review. This column will contain the values of 1,2, and 3. 
#1 - Bad
#2 - Average
#3 - Excellent
#This will be split in the following way. 
#1,2,3 --> Bad
#4,5,6,7 --> Average
#8,9,10 --> Excellent
#Create an empty list called Reviews
reviews = []
for i in data_original['quality']:
    if i >= 1 and i <= 3:
        reviews.append('1')
    elif i >= 4 and i <= 7:
        reviews.append('2')
    elif i >= 8 and i <= 10:
        reviews.append('3')
data_original['Reviews'] = reviews

In [11]:
#view final data
data_noise.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'Reviews'],
      dtype='object')

In [12]:
data_noise['Reviews'].unique()

array(['2', '3', '1'], dtype=object)

In [13]:
Counter(data_noise['Reviews'])

Counter({'2': 6269, '3': 198, '1': 30})

In [14]:
x_noise = data_noise.iloc[:,1:11]
y_noise = data_noise['Reviews']

In [15]:
x_noise.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45
1,6.3,0.3,0.34,57.46,0.3,14.0,172.82,0.994,3.16,0.49
2,8.1,0.28,0.75,44.0,0.05,60.5,97.0,0.9951,3.26,0.44
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44
6,6.2,1.5,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,1.82
7,7.0,0.27,0.36,15.62,0.4,45.0,33.65,1.001,3.0,0.45
8,6.3,0.39,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49
9,8.1,0.22,0.43,26.75,0.044,28.0,129.0,0.9938,2.9,0.45


In [16]:
y_noise.head(10)

0    2
1    2
2    2
3    2
4    2
5    2
6    2
7    2
8    2
9    2
Name: Reviews, dtype: object

In [17]:
#Find missing values
from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
data_noise.dropna(inplace=True)

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """
        Impute missing values:
        - Columns of dtype object are imputed with the most frequent value in column.
        - Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

data_noise = DataFrameImputer().fit_transform(data_noise)

In [18]:
#Find missing values
from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
data_original.dropna(inplace=True)

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """
        Impute missing values:
        - Columns of dtype object are imputed with the most frequent value in column.
        - Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

data_original = DataFrameImputer().fit_transform(data_original)

In [19]:
# importing model for feature importance
from sklearn.ensemble import ExtraTreesClassifier

# passing the model
model = ExtraTreesClassifier(random_state = 53)

# feeding all our features to var 'X'
X = data_noise.iloc[:,1:11]
# feeding our target variable to var 'y'
y = data_noise['Reviews']

# training the model
model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
ETC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['ETC']).sort_values('ETC', ascending=False)

# removing traces of this model
model = None

# show top 10 features
ETC_feature_importances.head(8)



Unnamed: 0,ETC
density,0.108638
sulphates,0.108539
chlorides,0.104799
fixed acidity,0.100919
volatile acidity,0.099729
total sulfur dioxide,0.099099
pH,0.099065
residual sugar,0.096325


In [20]:
# importing model for feature importance
from sklearn.ensemble import RandomForestClassifier

# passing the model
model = RandomForestClassifier(random_state = 53)

# training the model
model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
RFC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['RFC']).sort_values('RFC', ascending=False)

# removing traces of this model
model = None

# show top 10 features
RFC_feature_importances.head(8)



Unnamed: 0,RFC
total sulfur dioxide,0.120165
volatile acidity,0.108349
density,0.107388
free sulfur dioxide,0.103277
chlorides,0.100365
pH,0.097289
fixed acidity,0.095348
sulphates,0.094919


In [21]:
# importing model for feature importance
from sklearn.ensemble import AdaBoostClassifier

# passing the model
model = AdaBoostClassifier(random_state = 53)

model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
ADB_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['ADB']).sort_values('ADB', ascending=False)

# removing traces of this model
model = None

ADB_feature_importances.head(10)

Unnamed: 0,ADB
total sulfur dioxide,0.2
citric acid,0.16
free sulfur dioxide,0.16
residual sugar,0.14
density,0.08
pH,0.08
sulphates,0.06
fixed acidity,0.04
volatile acidity,0.04
chlorides,0.04


In [22]:
from sklearn.model_selection import train_test_split
cols=data_noise.columns
cols = list(cols.drop(['type', 'Reviews', 'quality']))
X_train_noise, X_test_noise, y_train_noise, y_test_noise = train_test_split(data_noise.loc[:, cols], y, test_size=0.20, random_state=31)

In [23]:
y_org=data_original.iloc[:, 13]

In [24]:
from sklearn.model_selection import train_test_split
cols=data_original.columns
cols = list(cols.drop(['type', 'Reviews', 'quality']))
X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(data_original.loc[:, cols],y_org, test_size=0.20, random_state=31)

In [25]:
#Modelling with Noisy Training Data and Clean Test data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

model.fit(X_train_noise, y_train_noise.ravel())
y_pred = model.predict(X_test_original)



In [26]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
cm = confusion_matrix(y_test_original, y_pred)
print(cm)

[[   0    5    0]
 [   0 1252    0]
 [   0   36    0]]


In [27]:
print(f1_score(y_test_original, y_pred, average='weighted'))
print(accuracy_score(y_test_original, y_pred))

0.9526916128404876
0.9682907965970611


  'precision', 'predicted', average, warn_for)


In [28]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

model.fit(X_train_noise, y_train_noise.ravel())
y_pred = model.predict(X_test_original)



In [29]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
cm = confusion_matrix(y_test_original, y_pred)
print(cm)
print(f1_score(y_test_original, y_pred, average='weighted'))
print(accuracy_score(y_test_original, y_pred))

[[   0    5    0]
 [   0 1252    0]
 [   0   26   10]]
0.9685550814107721
0.9760247486465584


  'precision', 'predicted', average, warn_for)
