In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import pylab as pl

In [83]:
from sklearn.datasets import load_wine
data = load_wine()

In [84]:
print(data['data'])

[[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]


In [85]:
print(data['target'])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [86]:
print(data.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [87]:
print(data.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [88]:
print(data.target_names)

['class_0' 'class_1' 'class_2']


In [89]:
x = data.data
x[0:5]

array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
        3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02, 2.650e+00,
        2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, 1.860e+01, 1.010e+02, 2.800e+00,
        3.240e+00, 3.000e-01, 2.810e+00, 5.680e+00, 1.030e+00, 3.170e+00,
        1.185e+03],
       [1.437e+01, 1.950e+00, 2.500e+00, 1.680e+01, 1.130e+02, 3.850e+00,
        3.490e+00, 2.400e-01, 2.180e+00, 7.800e+00, 8.600e-01, 3.450e+00,
        1.480e+03],
       [1.324e+01, 2.590e+00, 2.870e+00, 2.100e+01, 1.180e+02, 2.800e+00,
        2.690e+00, 3.900e-01, 1.820e+00, 4.320e+00, 1.040e+00, 2.930e+00,
        7.350e+02]])

In [90]:
y = data.target
y[0:5]

array([0, 0, 0, 0, 0])

In [91]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 4)

In [92]:
x_train.shape

(124, 13)

In [93]:
x_test.shape

(54, 13)

In [94]:
y_train.shape

(124,)

In [95]:
y_test.shape

(54,)

In [96]:
from sklearn.linear_model import LogisticRegression
regr = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', C = 0.01, max_iter = 100)
regr.fit(x_train, y_train)



LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [97]:
y_test[0:20]

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0, 1, 0, 1, 1, 2])

In [98]:
ypred = regr.predict(x_test)
ypred[0:20]

array([2, 1, 0, 0, 1, 2, 0, 0, 0, 1, 1, 0, 2, 2, 0, 1, 0, 1, 1, 2])

In [99]:
a_1 = regr.score(x_train, y_train)
a_1

0.9435483870967742

In [100]:
a_2 = regr.score(x_test, y_test)
a_2

0.9074074074074074

In [101]:
a_3 = regr.score(x_test, ypred)
a_3

1.0

In [102]:
from sklearn.metrics import jaccard_similarity_score, accuracy_score
a_4 = jaccard_similarity_score(y_test, ypred)
a_4



0.9074074074074074

In [103]:
a_5 = accuracy_score(y_test, ypred)
a_5

0.9074074074074074

In [104]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [105]:
y_test[0:15]

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0])

In [106]:
yhat = knn.predict(x_test)
yhat[0:15]

array([2, 2, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2, 2, 0])

In [107]:
b_1 = knn.score(x_train, y_train)
b_1

1.0

In [108]:
b_2 = knn.score(x_test, y_test)
b_2

0.7962962962962963

In [109]:
b_3 = knn.score(x_test, yhat)
b_3

1.0

In [110]:
b_4 = jaccard_similarity_score(y_test, yhat)
b_4



0.7962962962962963

In [111]:
b_5 = accuracy_score(y_test, yhat)
b_5

0.7962962962962963

In [112]:
from sklearn import svm
clf = svm.SVC(kernel = 'rbf', gamma = 'auto')
clf.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [113]:
y_test[0:15]

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0])

In [114]:
ypred_1 = clf.predict(x_test)
ypred_1[0:15]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [115]:
c_1 = clf.score(x_train, y_train)
c_1

1.0

In [116]:
c_2 = clf.score(x_test, y_test)
c_2

0.37037037037037035

In [117]:
c_3 = clf.score(x_test, ypred_1)
c_3

1.0

In [118]:
c_4 = jaccard_similarity_score(y_test, ypred_1)
c_4



0.37037037037037035

In [119]:
c_5 = accuracy_score(y_test, ypred_1)
c_5

0.37037037037037035

In [120]:
from sklearn import tree 
dt = tree.DecisionTreeClassifier()
dt.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [121]:
y_test[0:15]

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0])

In [122]:
yhat_1 = dt.predict(x_test)
yhat_1[0:15]

array([2, 2, 0, 0, 0, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0])

In [123]:
d_1 = dt.score(x_train, y_train)
d_1

1.0

In [124]:
d_2 = dt.score(x_test, y_test)
d_2

0.9259259259259259

In [125]:
d_3 = dt.score(x_test, yhat_1)
d_3

1.0

In [126]:
d_4 = jaccard_similarity_score(y_test, yhat_1)
d_4



0.9259259259259259

In [127]:
d_5 = accuracy_score(y_test, yhat_1)
d_5

0.9259259259259259

In [128]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 1000)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [129]:
y_test[0:15]

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0])

In [130]:
ypred_2 = rf.predict(x_test)
ypred_2

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0, 1, 0, 1, 1, 2, 1, 2,
       1, 2, 0, 2, 1, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2,
       1, 0, 1, 2, 0, 2, 1, 1, 1, 0])

In [131]:
e_1 = rf.score(x_train, y_train)
e_1

1.0

In [132]:
e_2 = rf.score(x_test, y_test)
e_2

0.9814814814814815

In [133]:
e_3 = rf.score(x_test, ypred_2)
e_3

1.0

In [134]:
e_4 = jaccard_similarity_score(y_test, ypred_2)
e_4



0.9814814814814815

In [135]:
e_5 = accuracy_score(y_test, ypred_2)
e_5

0.9814814814814815

In [136]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
gsn = GaussianNB()
gsn.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [137]:
y_test[0:15]

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0])

In [138]:
yhat_2 = gsn.predict(x_test)
yhat_2[0:15]

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0])

In [139]:
f_1 = gsn.score(x_train, y_train)
f_1

0.9919354838709677

In [140]:
f_2 = gsn.score(x_test, y_test)
f_2

0.9814814814814815

In [141]:
f_3 = gsn.score(x_test, yhat_2)
f_3

1.0

In [142]:
f_4 = jaccard_similarity_score(y_test, yhat_2)
f_4



0.9814814814814815

In [143]:
f_5 = accuracy_score(y_test, yhat_2)
f_5

0.9814814814814815

In [144]:
mul = MultinomialNB()
mul.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [145]:
y_test[0:15]

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0])

In [146]:
ypred_3 = mul.predict(x_test)
ypred_3[0:15]

array([2, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 2, 0])

In [147]:
g_1 = mul.score(x_train, y_train)
g_1

0.9032258064516129

In [148]:
g_2 = mul.score(x_test, y_test)
g_2

0.8888888888888888

In [149]:
g_3 = mul.score(x_test, ypred_3)
g_3

1.0

In [150]:
g_4 = jaccard_similarity_score(y_test, ypred_3)
g_4



0.8888888888888888

In [151]:
g_5 = accuracy_score(y_test, ypred_3)
g_5

0.8888888888888888

In [152]:
ber = BernoulliNB()
ber.fit(x_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [153]:
y_test[0:15]

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0])

In [154]:
yhat_3 = ber.predict(x_test)
yhat_3[0:15]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [155]:
h_1 = ber.score(x_train, y_train)
h_1

0.41935483870967744

In [156]:
h_2 = ber.score(x_test, y_test)
h_2

0.35185185185185186

In [157]:
h_3 = ber.score(x_test, yhat_3)
h_3

1.0

In [158]:
h_4 = jaccard_similarity_score(y_test, yhat_3)
h_4



0.35185185185185186

In [159]:
h_5 = accuracy_score(y_test, yhat_3)
h_5

0.35185185185185186

In [161]:
df = pd.DataFrame({'Training Score' : [a_1, b_1, c_1, d_1, e_1, f_1, g_1, h_1],
                  'Testing Score' : [a_2, b_2, c_2, d_2, e_2, f_2, g_2, h_2],
                  'Predicted Score' : [a_3, b_3, c_3, d_3, e_3, f_3, g_3, h_3],
                  'Accuracy Score' : [a_4, b_4, c_4, d_4, e_4, f_4, g_4, h_4],
                  'Jaccard Similarity Score' : [a_5, b_5, c_5, d_5, e_5, f_5, g_5, h_5]}, index = ['Logistic Regression', 'KNN', 'SVM', 'Decision Tree', 'Random Forest', 'Gaussian NB', 'Multinomial NB', 'Bernoulli NB'])
df

Unnamed: 0,Training Score,Testing Score,Predicted Score,Accuracy Score,Jaccard Similarity Score
Logistic Regression,0.943548,0.907407,1.0,0.907407,0.907407
KNN,1.0,0.796296,1.0,0.796296,0.796296
SVM,1.0,0.37037,1.0,0.37037,0.37037
Decision Tree,1.0,0.925926,1.0,0.925926,0.925926
Random Forest,1.0,0.981481,1.0,0.981481,0.981481
Gaussian NB,0.991935,0.981481,1.0,0.981481,0.981481
Multinomial NB,0.903226,0.888889,1.0,0.888889,0.888889
Bernoulli NB,0.419355,0.351852,1.0,0.351852,0.351852
