In [1]:
#This is a kernel for setting benchmarks on Wine Quality Dataset. This dataset will proceed in the following way -
#Pre-Process data using - Missing values imputation, Duplicates removal
#Feature Selection using PCA
#Setting Model Benchmarks by Cross Validation & Hyperparametrization when required
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data_noise = pd.read_csv('../../NumericData/20_noisy_winequality.csv')
data_noise.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,1.5,20.7,0.045,45.0,170.0,1.001,3.0,0.45,10.29,6
1,white,6.3,0.3,0.92,20.81,0.049,14.0,132.0,0.994,3.3,0.68,9.5,6
2,white,8.1,0.28,0.4,6.9,0.46,30.0,97.0,0.9951,3.1,0.44,10.1,6
3,white,7.2,0.83,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
data_noise.shape

(6497, 13)

In [4]:
data_original = pd.read_csv('../../NumericData/winequality.csv')
data_original.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
data_original.shape

(6497, 13)

In [6]:
data_noise.corr

<bound method DataFrame.corr of        type  fixed acidity  volatile acidity  citric acid  residual sugar  \
0     white           7.00             0.270         1.50           20.70   
1     white           6.30             0.300         0.92           20.81   
2     white           8.10             0.280         0.40            6.90   
3     white           7.20             0.830         0.32            8.50   
4     white           7.20             0.230         0.32            8.50   
5     white           8.10             0.280         0.40            6.90   
6       red           6.20             0.320         0.16            7.00   
7     white           6.26             0.270         0.36           20.70   
8     white          14.32             0.530         0.34            1.60   
9     white           8.10             1.350         1.58            1.50   
10      red           8.10             0.270         0.41            1.45   
11    white           8.60             0.230

In [7]:
data_noise.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [8]:
data_noise.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
type                    6497 non-null object
fixed acidity           6490 non-null float64
volatile acidity        6490 non-null float64
citric acid             6495 non-null float64
residual sugar          6496 non-null float64
chlorides               6496 non-null float64
free sulfur dioxide     6497 non-null float64
total sulfur dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6490 non-null float64
sulphates               6493 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
dtypes: float64(11), int64(1), object(1)
memory usage: 659.9+ KB


In [9]:
data_noise['quality'].unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [10]:
#count of each target variable
from collections import Counter
Counter(data_noise['quality'])

Counter({6: 2836, 5: 2138, 7: 1079, 8: 193, 4: 216, 3: 30, 9: 5})

In [11]:
#next we shall create a new column called Review. This column will contain the values of 1,2, and 3. 
#1 - Bad
#2 - Average
#3 - Excellent
#This will be split in the following way. 
#1,2,3 --> Bad
#4,5,6,7 --> Average
#8,9,10 --> Excellent
#Create an empty list called Reviews
reviews = []
for i in data_noise['quality']:
    if i >= 1 and i <= 3:
        reviews.append('1')
    elif i >= 4 and i <= 7:
        reviews.append('2')
    elif i >= 8 and i <= 10:
        reviews.append('3')
data_noise['Reviews'] = reviews

In [12]:
#next we shall create a new column called Review. This column will contain the values of 1,2, and 3. 
#1 - Bad
#2 - Average
#3 - Excellent
#This will be split in the following way. 
#1,2,3 --> Bad
#4,5,6,7 --> Average
#8,9,10 --> Excellent
#Create an empty list called Reviews
reviews = []
for i in data_original['quality']:
    if i >= 1 and i <= 3:
        reviews.append('1')
    elif i >= 4 and i <= 7:
        reviews.append('2')
    elif i >= 8 and i <= 10:
        reviews.append('3')
data_original['Reviews'] = reviews

In [13]:
#view final data
data_noise.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'Reviews'],
      dtype='object')

In [14]:
data_noise['Reviews'].unique()

array(['2', '3', '1'], dtype=object)

In [15]:
Counter(data_noise['Reviews'])

Counter({'2': 6269, '3': 198, '1': 30})

In [16]:
x_noise = data_noise.iloc[:,1:11]
y_noise = data_noise['Reviews']

In [17]:
data_noise.shape

(6497, 14)

In [18]:
x_noise.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates
0,7.0,0.27,1.5,20.7,0.045,45.0,170.0,1.001,3.0,0.45
1,6.3,0.3,0.92,20.81,0.049,14.0,132.0,0.994,3.3,0.68
2,8.1,0.28,0.4,6.9,0.46,30.0,97.0,0.9951,3.1,0.44
3,7.2,0.83,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47
7,6.26,0.27,0.36,20.7,0.045,45.0,170.0,1.0,2.73,0.45
8,14.32,0.53,0.34,1.6,0.049,52.49,337.22,1.01,3.3,0.49
9,8.1,1.35,1.58,1.5,0.044,235.78,129.0,1.03,2.82,0.83


In [19]:
y_noise.head(10)

0    2
1    2
2    2
3    2
4    2
5    2
6    2
7    2
8    2
9    2
Name: Reviews, dtype: object

In [20]:
#Find missing values
from sklearn.base import BaseEstimator, TransformerMixin, clone, ClassifierMixin
data_noise.dropna(inplace=True)

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """
        Impute missing values:
        - Columns of dtype object are imputed with the most frequent value in column.
        - Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

data_noise = DataFrameImputer().fit_transform(data_noise)

In [21]:
data_noise.shape

(6472, 14)

In [24]:
#Find missing values
data_original=data_original.iloc[0:6472, :]

In [25]:
data_original.shape

(6472, 14)

In [26]:
# importing model for feature importance
from sklearn.ensemble import ExtraTreesClassifier

# passing the model
model = ExtraTreesClassifier(random_state = 53)

# feeding all our features to var 'X'
X = data_noise.iloc[:,1:11]
# feeding our target variable to var 'y'
y = data_noise['Reviews']

# training the model
model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
ETC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['ETC']).sort_values('ETC', ascending=False)

# removing traces of this model
model = None

# show top 10 features
ETC_feature_importances.head(8)



Unnamed: 0,ETC
total sulfur dioxide,0.106847
density,0.106171
volatile acidity,0.104495
sulphates,0.100934
chlorides,0.099881
residual sugar,0.099473
fixed acidity,0.098635
pH,0.096015


In [27]:
# importing model for feature importance
from sklearn.ensemble import RandomForestClassifier

# passing the model
model = RandomForestClassifier(random_state = 53)

# training the model
model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
RFC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['RFC']).sort_values('RFC', ascending=False)

# removing traces of this model
model = None

# show top 10 features
RFC_feature_importances.head(8)



Unnamed: 0,RFC
total sulfur dioxide,0.12929
density,0.120844
fixed acidity,0.106754
citric acid,0.10067
sulphates,0.093302
free sulfur dioxide,0.092661
pH,0.091698
chlorides,0.08878


In [28]:
# importing model for feature importance
from sklearn.ensemble import AdaBoostClassifier

# passing the model
model = AdaBoostClassifier(random_state = 53)

model.fit(X, y)

# extracting feature importance from model and making a dataframe of it in descending order
ADB_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['ADB']).sort_values('ADB', ascending=False)

# removing traces of this model
model = None

ADB_feature_importances.head(10)

Unnamed: 0,ADB
free sulfur dioxide,0.18
citric acid,0.16
residual sugar,0.16
total sulfur dioxide,0.14
sulphates,0.12
volatile acidity,0.08
chlorides,0.06
density,0.04
pH,0.04
fixed acidity,0.02


In [29]:
loop = 150

In [30]:
y.shape

(6472,)

In [31]:
data_noise.shape

(6472, 14)

In [32]:
data_original.shape

(6472, 14)

In [33]:
cols=data_noise.columns
cols = list(cols.drop(['type', 'Reviews', 'quality','pH','chlorides']))
data_noise.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Reviews
0,white,7.0,0.27,1.5,20.7,0.045,45.0,170.0,1.001,3.0,0.45,10.29,6,2
1,white,6.3,0.3,0.92,20.81,0.049,14.0,132.0,0.994,3.3,0.68,9.5,6,2
2,white,8.1,0.28,0.4,6.9,0.46,30.0,97.0,0.9951,3.1,0.44,10.1,6,2
3,white,7.2,0.83,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,2
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,2


In [34]:
from sklearn.model_selection import train_test_split
cols=data_noise.columns
cols = list(cols.drop(['type', 'Reviews', 'quality','pH','chlorides']))
X_train_noise = []
X_test_noise = []
y_train_noise = []
y_test_noise = []
for i in range(0, loop):
    X_train_noisey, X_test_noisey, y_train_noisey, y_test_noisey = train_test_split(data_noise.loc[:, cols], data_noise.iloc[:, 13], test_size=0.20, random_state=i)
    X_train_noise.append(X_train_noisey)
    X_test_noise.append(X_test_noisey)
    y_train_noise.append(y_train_noisey)
    y_test_noise.append(y_test_noisey)

In [35]:
data_original.loc[:, cols].shape

(6472, 9)

In [36]:
y.shape

(6472,)

In [37]:
data_original.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Reviews
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,2
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,2
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,2
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,2
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,2


In [38]:
from sklearn.model_selection import train_test_split
cols=data_original.columns
cols = list(cols.drop(['type', 'Reviews', 'quality','pH','chlorides']))
X_train_original = []
X_test_original = []
y_train_original = []
y_test_original = []
for i in range(0, loop):
    X_train_originalx, X_test_originalx, y_train_originalx, y_test_originalx = train_test_split(data_original.loc[:, cols], data_original.iloc[:, 13], test_size=0.20, random_state=i)
    X_train_noise.append(X_train_originalx)
    X_test_noise.append(X_test_originalx)
    y_train_noise.append(y_train_originalx)
    y_test_noise.append(y_test_originalx)

In [39]:
from sklearn.linear_model import LogisticRegression

y_pred = []
for i in range(0, loop):
    model = LogisticRegression()
    model.fit(X_train_noise[i], y_train_noise[i].ravel())
    y_predi = model.predict(X_test_original[i])
    y_pred.append(y_predi)



IndexError: list index out of range

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
accuracy = []
f1 = []
for i in range(0, loop):
    cm = confusion_matrix(y_test_original[i], y_pred[i])
    #print(cm)
    #print(accuracy_score(y_test_original[i], y_pred[i]))
    #print(f1_score(y_test_original[i], y_pred[i],average='weighted'))
    accuracy.append(accuracy_score(y_test_original[i], y_pred[i]))
    f1.append(f1_score(y_test_original[i], y_pred[i],average='weighted'))

print("Mean Accuracy is : "+str(np.mean(accuracy)))
print("Mean F1 is : "+str(np.mean(f1)))