# Sample Wine Classifier on Wine Dataset

In [134]:
# Import important libraries
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

%matplotlib inline

# Load Dataset

In [135]:
df = pd.read_csv('../data/wine.csv')
#Adding Columns

df.columns = [  'name'
                 ,'alcohol'
                 ,'malicAcid'
                 ,'ash'
                 ,'ashalcalinity'
                 ,'magnesium'
                 ,'totalPhenols'
                 ,'flavanoids'
                 ,'nonFlavanoidPhenols'
                 ,'proanthocyanins'
                 ,'colorIntensity'
                 ,'hue'
                 ,'od280_od315'
                 ,'proline'
                ]

# Analyze Train Dataset

In [136]:
df.head(15)

Unnamed: 0,name,alcohol,malicAcid,ash,ashalcalinity,magnesium,totalPhenols,flavanoids,nonFlavanoidPhenols,proanthocyanins,colorIntensity,hue,od280_od315,proline
0,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
1,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
2,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
3,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735
4,1,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450
5,1,14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290
6,1,14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295
7,1,14.83,1.64,2.17,14.0,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045
8,1,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045
9,1,14.1,2.16,2.3,18.0,105,2.95,3.32,0.22,2.38,5.75,1.25,3.17,1510


In [137]:
df.shape

(129, 14)

In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 14 columns):
name                   129 non-null int64
alcohol                129 non-null float64
malicAcid              129 non-null float64
ash                    129 non-null float64
ashalcalinity          129 non-null float64
magnesium              129 non-null int64
totalPhenols           129 non-null float64
flavanoids             129 non-null float64
nonFlavanoidPhenols    129 non-null float64
proanthocyanins        129 non-null float64
colorIntensity         129 non-null float64
hue                    129 non-null float64
od280_od315            129 non-null float64
proline                129 non-null int64
dtypes: float64(11), int64(3)
memory usage: 14.2 KB


In [139]:
df.isnull().sum()

name                   0
alcohol                0
malicAcid              0
ash                    0
ashalcalinity          0
magnesium              0
totalPhenols           0
flavanoids             0
nonFlavanoidPhenols    0
proanthocyanins        0
colorIntensity         0
hue                    0
od280_od315            0
proline                0
dtype: int64

In [140]:
df.describe()

Unnamed: 0,name,alcohol,malicAcid,ash,ashalcalinity,magnesium,totalPhenols,flavanoids,nonFlavanoidPhenols,proanthocyanins,colorIntensity,hue,od280_od315,proline
count,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0
mean,1.550388,12.934109,1.970078,2.339767,18.810078,99.689922,2.520543,2.485581,0.33062,1.748217,4.183566,1.059039,2.946899,787.96124
std,0.499394,0.884909,0.883102,0.298215,3.401995,15.25297,0.546967,0.738913,0.109421,0.539571,1.627075,0.169339,0.469261,353.045686
min,1.0,11.03,0.74,1.36,10.6,70.0,1.1,0.57,0.13,0.41,1.28,0.69,1.59,278.0
25%,1.0,12.22,1.51,2.17,16.6,88.0,2.13,2.0,0.26,1.42,2.85,0.94,2.73,472.0
50%,2.0,12.99,1.73,2.32,18.6,98.0,2.56,2.55,0.3,1.7,3.84,1.05,2.96,714.0
75%,2.0,13.72,2.06,2.53,21.0,108.0,2.95,2.99,0.4,1.99,5.3,1.17,3.3,1060.0
max,2.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,8.9,1.71,4.0,1680.0


# Create Test and Train Splits

In [141]:
X= df.drop(['name'], axis=1)

X.head()

Unnamed: 0,alcohol,malicAcid,ash,ashalcalinity,magnesium,totalPhenols,flavanoids,nonFlavanoidPhenols,proanthocyanins,colorIntensity,hue,od280_od315,proline
0,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
2,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
3,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735
4,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450


In [142]:
y = df['name']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: name, dtype: int64

In [143]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.2)

print(train_X.shape, valid_X.shape)

(103, 13) (26, 13)


## Description on Algorithm Used

I have used multiple Sci-kit learn algorithms to compare the testing accuracy for the dataset. And then I used RandomForestClassifier to predict the labels.

In [144]:
models = []

models.append(("Logistic Regression:",LogisticRegression()))
models.append(("Naive Bayes:",GaussianNB()))
models.append(("K-Nearest Neighbour:",KNeighborsClassifier(n_neighbors=3)))
models.append(("Decision Tree:",DecisionTreeClassifier()))
models.append(("Random Forest:",RandomForestClassifier()))
models.append(("AdaBoostClassifier:",AdaBoostClassifier()))
models.append(("GradientBoostingClassifier:",GradientBoostingClassifier()))

print('Models appended...')

Models appended...


In [145]:
results = []
names = []
for name,model in models:
    kfold = KFold(n_splits=10, random_state=0)
    cv_result = cross_val_score(model,train_X,train_y.values.ravel(), cv = kfold,scoring = "accuracy")
    names.append(name)
    results.append(cv_result)
for i in range(len(names)):
    print(names[i],results[i].mean()*100)

Logistic Regression: 97.0
Naive Bayes: 98.0
K-Nearest Neighbour: 93.0909090909
Decision Tree: 95.0
Random Forest: 98.0
AdaBoostClassifier: 98.0909090909
GradientBoostingClassifier: 95.0909090909


In [146]:
clf = RandomForestClassifier()

In [147]:
clf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [148]:
y_pred = clf.predict(valid_X)

## Confusion matrix on Train

In [149]:
confusion_matrix(valid_y, y_pred)

array([[11,  0],
       [ 0, 15]])