# Simple Mushroom Classifier on the Mushroom Dataset

In [1]:
# Import important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Statistical Visualization
import seaborn as sns

# Classification or Regression imports
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

#Model Selection Specific

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

# Load Dataset

In [2]:
# Read the mushroom CSV from the notebook-relative data directory and preview the first rows.
df = pd.read_csv('./data/mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


# Analyze Dataset

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(8124, 23)

In [4]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
class                       8124 non-null object
cap-shape                   8124 non-null object
cap-surface                 8124 non-null object
cap-color                   8124 non-null object
bruises                     8124 non-null object
odor                        8124 non-null object
gill-attachment             8124 non-null object
gill-spacing                8124 non-null object
gill-size                   8124 non-null object
gill-color                  8124 non-null object
stalk-shape                 8124 non-null object
stalk-root                  8124 non-null object
stalk-surface-above-ring    8124 non-null object
stalk-surface-below-ring    8124 non-null object
stalk-color-above-ring      8124 non-null object
stalk-color-below-ring      8124 non-null object
veil-type                   8124 non-null object
veil-color                  8124 non-null object
ring-number

In [39]:
# Summary statistics; the output below reports count / unique / top / freq per column.
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


### Count the number of instances of each class

In [40]:
# Row count per distinct value of the 'class' column.
df['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

# Encode The Data Set

## Label Encoder

In [41]:
# Integer-encode every column of df (the features and the 'class' column alike).
# The same LabelEncoder instance is refit on each column in turn, so after this
# cell `le` only holds the classes of the last column that was encoded.
le = LabelEncoder()

df_label = df.apply(le.fit_transform)

# Only the last expression of a cell is rendered, so the shape is the single
# value shown here.
df_label.shape

(8124, 23)

# Create Features and Label Splits

In [53]:
# Separate the encoded frame into the feature matrix X (every column except
# 'class') and the target vector y (the 'class' column), then preview y.
X= df_label.drop(['class'], axis=1)
y = df_label['class']
y.head()

0    1
1    0
2    0
3    1
4    0
Name: class, dtype: int64

## One Hot Encoding

In [54]:
# One-hot encode the features.
# NOTE(review): `ohe` is instantiated but not used in this cell; the encoding
# itself is done by pd.get_dummies below.

ohe = OneHotEncoder()

# Expand every column of X into one indicator column per distinct value.
X_oh = pd.get_dummies(X, columns = X.columns)
X_oh.head()

Unnamed: 0,cap-shape_0,cap-shape_1,cap-shape_2,cap-shape_3,cap-shape_4,cap-shape_5,cap-surface_0,cap-surface_1,cap-surface_2,cap-surface_3,...,population_3,population_4,population_5,habitat_0,habitat_1,habitat_2,habitat_3,habitat_4,habitat_5,habitat_6
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


# Train Test Split

In [55]:
# Hold out 20% of the rows as a test set. The fixed random_state makes the
# shuffle, and therefore every score computed from X_train / y_train,
# repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_oh, y, test_size = 0.2, random_state = 42)

print(X_train.shape, X_test.shape)

(6499, 117) (1625, 117)


# Feature Scaling of X_train and X_test (currently disabled)

In [19]:
# Scaling is currently disabled: the lines below are left commented out, so
# X_train and X_test stay unscaled.
# fsc = StandardScaler()
# X_train = fsc.fit_transform(X_train)
# X_test = fsc.transform(X_test)

In [56]:
# Registry of (label, estimator) pairs that run_models() iterates over.
# The tree, forest, MLP and boosting estimators are seeded with random_state=0
# (matching the SVC entry) so that repeated runs produce the same scores.
models = [
    ("Logistic Regression:", LogisticRegression()),
    ("K-Nearest Neighbour:", KNeighborsClassifier(n_neighbors=3)),
    ("Decision Tree Classifier:", DecisionTreeClassifier(random_state=0)),
    ("Random Forest Classifier:", RandomForestClassifier(n_estimators=64, random_state=0)),
    (
        "MLP:",
        MLPClassifier(
            hidden_layer_sizes=(45, 30, 15),
            solver='sgd',
            learning_rate_init=0.01,
            max_iter=500,
            random_state=0,
        ),
    ),
    ("GradientBoostingClassifier:", GradientBoostingClassifier(random_state=0)),
    ("SVC:", SVC(kernel='rbf', random_state=0)),
]

print('Models appended...')

Models appended...


In [57]:
def run_models():
    """Cross-validate each registered model and print its mean accuracy.

    Reads the notebook-level ``models`` list of (label, estimator) pairs and
    the ``X_train`` / ``y_train`` training data. Every estimator is scored
    with ``cross_val_score`` using ``cv=10`` and accuracy scoring. Once all
    estimators have been scored, one line per model is printed: the label
    followed by the mean of its fold scores as a percentage.

    Returns:
        None. The results are only printed.
    """
    # Score everything first so that printing starts only after all models ran.
    scored = [
        (
            label,
            cross_val_score(
                estimator,
                X_train,
                y_train.values.ravel(),
                cv=10,
                scoring="accuracy",
            ),
        )
        for label, estimator in models
    ]

    for label, fold_scores in scored:
        print(label, fold_scores.mean() * 100)

# Function to run the Models with Cross Validation

In [58]:
# Cross-validate every registered model and print its mean accuracy.
run_models()

Logistic Regression: 99.9846153846
K-Nearest Neighbour: 100.0
Decision Tree Classifier: 99.9846153846
Random Forest Classifier: 100.0
MLP: 99.9692307692
GradientBoostingClassifier: 99.9692544015
SVC: 99.8461537733


## Accuracy Outputs

### With LabelEncoder Encoding Data Shape (8124, 23)

- Logistic Regression: 95.1068796339
- K Nearest Neighbour: 99.8922602093
- Decision Tree Classifier: 100.0
- Random Forest Classifier: 100.0
- MLP: 99.6153609103
- GradientBoostingClassifier: 100.0
- SVC: 100.0

### With One-Hot Encoding, Data Shape (8124, 117)

- Logistic Regression: 99.9846153846
- K-Nearest Neighbour: 100.0
- Decision Tree Classifier: 99.9846153846
- Random Forest Classifier: 100.0
- MLP: 99.9692307692
- GradientBoostingClassifier: 99.9692544015
- SVC: 99.8461537733

## Analysis

- Label encoding reaches a best accuracy of 100% with fewer dimensions, which is good, although **Logistic Regression reaches only about 95%**.
- One-hot encoding also reaches a best accuracy of 100%, with all classifiers falling in the **99-100%** range, but at the cost of roughly **5 times more dimensions, i.e. 117 instead of the original 23**.