# Dataset: load_wine

## 1. Dataset analysis

In [2]:
from sklearn.datasets import load_wine
import pandas as pandas
import seaborn as seaborn
import matplotlib.pyplot as matplot
import numpy as np

In [3]:
# Load the wine dataset, then print its description and the attributes
# exposed by the returned object.
wine = load_wine()

print("BRIEF DESCRIPTION OF THE DATASET: ")
print(wine.DESCR, end="\n\n")

print("ATTRIBUTES: ")
print(dir(wine), end="\n\n")

# Feature matrix, target vector and feature names.
X, y, fn = wine.data, wine.target, wine.feature_names

print("DATA:")
# One row per sample: every feature column followed by a 'class' column
# holding the target label.
show_data = pandas.DataFrame(data=X, columns=fn).assign(
    **{'class': pandas.Series(wine.target)}
)
show_data

BRIEF DESCRIPTION OF THE DATASET: 
.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:            

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


**Further data analysis**

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of show_data, including the numeric 'class' label column.
show_data.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,0.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,0.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,0.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,1.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,2.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,2.0


## 2. Model fitting

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=23,
)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

**PERCEPTRON**

In [5]:
def perceptron(X, y, margin=0.1, learning_rate=1.0, max_iters=200):
    """Train a multi-class perceptron with a margin.

    Parameters
    ----------
    X : 2-D array of shape (N, D) holding one sample per row.
    y : sequence of N labels; the distinct values (sorted by ``np.unique``)
        define the classes.
    margin : a rival class triggers an update when its score plus ``margin``
        reaches the score of the true class.
    learning_rate : factor applied to the augmented sample in each update.
    max_iters : maximum number of passes over the samples.

    Returns
    -------
    Array of shape (1 + D, C). Column ``k`` holds the weights of the k-th
    class of ``np.unique(y)``; row 0 multiplies the constant 1 that is
    prepended to every sample.
    """
    n_samples, n_features = X.shape
    labels = np.unique(y)
    n_classes = labels.size
    weights = np.zeros((n_features + 1, n_classes))

    for _ in range(max_iters):
        n_mistakes = 0

        for n in range(n_samples):
            # Augmented sample: a leading 1 followed by the features.
            x_aug = np.array([1, *X[n, :]])
            true_idx = np.squeeze(np.where(labels == y[n]))
            # Score of the correct class, evaluated before any update
            # triggered by this sample.
            true_score = weights[:, true_idx] @ x_aug
            step = learning_rate * x_aug
            violated = False

            for rival in range(n_classes):
                if rival == true_idx:
                    continue
                if weights[:, rival] @ x_aug + margin >= true_score:
                    # Push the offending class away from this sample.
                    weights[:, rival] -= step
                    violated = True

            if violated:
                # Pull the correct class towards the sample, once per sample.
                weights[:, true_idx] += step
                n_mistakes += 1

        # A full pass without any margin violation: stop early.
        if n_mistakes == 0:
            break

    return weights

In [6]:
%%timeit -n1 -r1

best_precision = 0
best_params = [0,0,0]

for margin in (0.01, 0.1, 0.3, 0.5, 0.7):
    for learning_rate in (0.1, 0.5, 1, 2):
        for max_iters in (100, 200, 500, 1000):
            W = perceptron(X_train, y_train, margin, learning_rate, max_iters)

            X_testh = np.hstack([np.ones((len(X_test), 1)), X_test])
            y_test_pred  = np.argmax(X_testh @ W, axis=1).reshape(-1, 1)
            err_test = np.count_nonzero(y_test_pred != y_test) / len(X_test)
            precision = 100 - err_test
            
            if precision > best_precision:
                best_precision = precision
                best_params = [margin, learning_rate, max_iters]

print(f'Accuracy: {best_precision:.1f}% with margin {best_params[0]}, learning rate {best_params[1]} and {best_params[2]} maximum iterations')

Accuracy: 77.5% with margin 0.01, learning rate 0.1 and 1000 maximum iterations
2min 26s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**NAIVE BAYES** 

In [9]:
%%timeit -n1 -r1
nb = GaussianNB()

Gnb = {"var_smoothing": [1e-9, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSnb = GridSearchCV(nb, Gnb, scoring='accuracy', refit=True, cv=5)

acc = GSnb.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSnb.best_params_}')

Accuracy: 100.0% con {'var_smoothing': 1e-09}
104 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**LINEAR DISCRIMINANT ANALYSIS**

In [10]:
%%timeit -n1 -r1
lda = LinearDiscriminantAnalysis()

Glda = {"solver": ['svd', 'lsqr', 'eigen'], "n_components": [1,2], "tol": [1e-5, 1e-4, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSlda = GridSearchCV(lda, Glda, scoring='accuracy', refit=True, cv=5)

acc = GSlda.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSlda.best_params_}')

Accuracy: 100.0% con {'n_components': 1, 'solver': 'svd', 'tol': 1e-05}
1.32 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**QUADRATIC DISCRIMINANT ANALYSIS**

In [11]:
%%timeit -n1 -r1
import warnings; warnings.filterwarnings('ignore')

qda = QuadraticDiscriminantAnalysis()

Gqda = {"tol": [1e-5, 1e-4, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSqda = GridSearchCV(qda, Gqda, scoring='accuracy', refit=True, cv=5)

acc = GSqda.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSqda.best_params_}')

Accuracy: 97.2% con {'tol': 1e-05}
108 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**LOGISTIC REGRESSION**

In [12]:
%%timeit -n1 -r1

log_reg = LogisticRegression(random_state=23)

Glogreg = {"penalty": ['l1', 'l2', None], "tol": [0.001, 0.01, 0.1], "solver": ['lbfgs', 'liblinear', 'newton-cg'], "multi_class": ['auto', 'multinomial'], "max_iter": [10, 50, 100], "n_jobs": [1,2,4], "C": [0.001, 0.1, 1, 10, 100]}
GSlogreg = GridSearchCV(log_reg, Glogreg, scoring='accuracy', refit=True, cv=5)

acc = GSlogreg.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSlogreg.best_params_}')

Accuracy: 100.0% con {'C': 100, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': 1, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
4min 36s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**DECISION TREE CLASSIFIER**

In [13]:
%%timeit -n1 -r1

dtc = DecisionTreeClassifier(random_state=23)

Gdtc = {"criterion": ['gini', 'entropy', 'log_loss'], "max_depth": [1,3,5,7,10,20], "min_samples_split": [2,3,4,5]}
GSdtc = GridSearchCV(dtc, Gdtc, scoring='accuracy', refit=True, cv=5)

acc = GSdtc.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSdtc.best_params_}')

Accuracy: 97.2% con {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2}
1.47 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**RANDOM FOREST**

In [14]:
%%timeit -n1 -r1

rfc = RandomForestClassifier(random_state=23)

Grfc = {"n_estimators": [1,5,10,25,50], "criterion": ['gini', 'entropy', 'log_loss'], "bootstrap": [True,False], "max_depth": [1,3,5,7,10,20], "min_samples_split": [2,3,4,5]}
GSrfc = GridSearchCV(rfc, Grfc, scoring='accuracy', refit=True, cv=5)

acc = GSrfc.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSrfc.best_params_}')

Accuracy: 100.0% con {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 3, 'n_estimators': 50}
1min 23s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**ADABOOST CLASSIFIER**

In [15]:
%%timeit -n1 -r1

abc = AdaBoostClassifier(random_state=23)

Gabc = {"n_estimators": [2,5,10,20,50,100], "learning_rate": [0.01,0.1,0.3,0.5,0.7,0.9,0.99,1]}
GSabc = GridSearchCV(abc, Gabc, scoring='accuracy', refit=True, cv=5)

acc = (GSabc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSabc.best_params_}')

Accuracy: 100.0% con {'learning_rate': 0.7, 'n_estimators': 50}
15.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**GRADIENT BOOSTING CLASSIFIER**

In [16]:
%%timeit -n1 -r1

gbc = GradientBoostingClassifier(random_state=23)

Ggbc = {"n_estimators": [2,5,10,20,50], "learning_rate": [0.01,0.1,0.3,0.5,0.7,0.9], "criterion": ['friedman_mse', 'squared_error'], "max_depth": [1,3,5,7,10], "min_samples_split": [2,3,4]}
GSgbc = GridSearchCV(gbc, Ggbc, scoring='accuracy', refit=True, cv=5)

acc = (GSgbc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSgbc.best_params_}')

Accuracy: 100.0% con {'criterion': 'friedman_mse', 'learning_rate': 0.5, 'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 20}
6min 27s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**KNEIGHBORS CLASSIFIER**

In [17]:
%%timeit -n1 -r1

knc = KNeighborsClassifier()

Gknc = {'n_neighbors': [1,2,5,10,20], 'weights':['uniform', 'distance'], 'leaf_size': [5,10,20,30,50], 'p': [1,2,3,4,5]}
GSknc = GridSearchCV(knc, Gknc, scoring='accuracy', refit=True, cv=5)

acc = (GSknc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSknc.best_params_}')

Accuracy: 80.6% con {'leaf_size': 5, 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
4.89 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
