# proglie

In [None]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneOut

import pandas as pd
import numpy as np
from statistics import mean

from IPython.core.interactiveshell import InteractiveShell
# make the notebook echo the value of every top-level expression in a cell,
# not only the last one
InteractiveShell.ast_node_interactivity = 'all'


# column names, in file order: the class label and the specimen number come
# first, followed by the 14 numeric descriptors used as features
labels = (
    'class', 'spec_num',
    'eccentr', 'asp_ratio', 'elong', 'solidity', 'stoch_conv', 'iso_factor',
    'max_ind_depth', 'lobedness',
    'av_intensity', 'av_contr', 'smooth', 'third_mom', 'unif', 'entropy',
)

# the file is read as having no header row; the names above are assigned on load
df = pd.read_csv(r'./leaf/leaf.csv', names=labels, header=None)

### random forest

we first use grid search with k-fold cross validation: for n_var (`max_features` in the grid) we try sqrt(p) and p/3, where p is the total number of variables

In [None]:
k = 5   # number of folds

# hyper-parameter grid: forest size, split criterion and how many features
# are examined at each split ('sqrt' of the feature count, or a fixed 5)
grid_param_rf = {
    'n_estimators': (100, 200, 500, 700),
    'criterion': ('gini', 'entropy'),
    'max_features': ('sqrt', 5),
}

# shuffle the rows and renumber them from 0, discarding the old index
df = df.sample(frac=1).reset_index(drop=True)

# target is the first column; features are the columns after the specimen number
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

# k-fold grid search, candidates ranked by balanced accuracy
rf_cv = GridSearchCV(
    RandomForestClassifier(),
    grid_param_rf,
    cv=k,
    scoring='balanced_accuracy',
)
rf_cv.fit(X, y)

In [None]:
# best mean balanced accuracy found by the k-fold search, then the
# hyper-parameters that produced it (one per line)
print(rf_cv.best_score_, rf_cv.best_params_, sep='\n')

we now try with leave-one-out cross validation (pretty slow, but acceptable with fewer parameters)

In [None]:
# reduced grid (the split criterion is fixed to 'gini' below) to keep the
# leave-one-out search affordable
grid_param_rf_less = {
    'n_estimators': (100, 500),
    'max_features': ('sqrt', 5),
}

# shuffle the rows and renumber them from 0, discarding the old index
df = df.sample(frac=1).reset_index(drop=True)

# target is the first column; features are the columns after the specimen number
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

# leave-one-out grid search, run in parallel (n_jobs=-2)
rf_loocv = GridSearchCV(
    RandomForestClassifier(criterion='gini'),
    grid_param_rf_less,
    cv=LeaveOneOut(),
    scoring='accuracy',
    n_jobs=-2,
)
rf_loocv.fit(X, y)

In [None]:
# best leave-one-out accuracy, then the hyper-parameters that produced it
# (one per line)
print(rf_loocv.best_score_, rf_loocv.best_params_, sep='\n')

### single tree

grid search with k-fold cross validation

In [None]:
k = 5   # number of folds

# parameter grid to search over: split criterion and the minimum number of
# samples a node needs before it may be split (2..39)
grid_param_tree = {
    'criterion': ('gini', 'entropy'),
    'min_samples_split': np.arange(2, 40),
}

# shuffle the rows and renumber them from 0, discarding the old index
df = df.sample(frac=1).reset_index(drop=True)

# target is the first column; features are the columns after the specimen number
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

# k-fold grid search, candidates ranked by balanced accuracy
tree_cv = GridSearchCV(
    tree.DecisionTreeClassifier(),
    grid_param_tree,
    cv=k,
    scoring='balanced_accuracy',
)
tree_cv.fit(X, y)

In [None]:
# best mean balanced accuracy found by the k-fold search, then the
# hyper-parameters that produced it (one per line)
print(tree_cv.best_score_, tree_cv.best_params_, sep='\n')

grid search with leave-one-out cross validation

In [None]:
# shuffle the rows and renumber them from 0, discarding the old index
df = df.sample(frac=1).reset_index(drop=True)

# target is the first column; features are the columns after the specimen number
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

# leave-one-out grid search; reuses grid_param_tree from the k-fold tree cell
tree_loocv = GridSearchCV(
    tree.DecisionTreeClassifier(),
    grid_param_tree,
    cv=LeaveOneOut(),
    scoring='accuracy',
)
tree_loocv.fit(X, y)

In [None]:
# best leave-one-out accuracy, then the hyper-parameters that produced it
# (one per line)
print(tree_loocv.best_score_, tree_loocv.best_params_, sep='\n')

### SVM

In [None]:
# packages for pipelining and scaling

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

grid search with k-fold cross validation: we use various kernels and a wide range for the C parameter; degree indicates the degree of the polynomial used in the kernel (if the kernel is not polynomial this parameter is ignored); decision_function_shape indicates the way in which the binary classifier technique is adapted to multiclass classification; gamma is a parameter used by some of the kernels (it is a rough measure of the distance of the significant data points)

before using the SVC classifier we perform a standard scaling on the input data

In [None]:
# building the pipeline: standard scaling first, then the SVC
pipe = Pipeline([('scaling', StandardScaler()),
                 ('SVM', svm.SVC(decision_function_shape='ovo'))])

k = 5   # number of folds

# candidate values for the regularization parameter (C) and for gamma
reg_param = np.logspace(-10, 11, 22)
gamm = np.logspace(-9, 3, 13)

# settings explored for every kernel
svm_common_params = {'SVM__C': reg_param,
                     'SVM__decision_function_shape': ('ovo', 'ovr')}

# One sub-grid per kernel family, so a parameter is only varied for the
# kernels that actually use it: `degree` only matters for 'poly', and `gamma`
# is not used by 'linear'. A single crossed grid would refit identical models
# many times (6864 candidates instead of the 2904 distinct ones below).
grid_param_svm = [
    {**svm_common_params,
     'SVM__kernel': ('linear',)},
    {**svm_common_params,
     'SVM__kernel': ('poly',),
     'SVM__degree': np.arange(2, 5),
     'SVM__gamma': gamm},
    {**svm_common_params,
     'SVM__kernel': ('rbf', 'sigmoid'),
     'SVM__gamma': gamm},
]

# shuffle the rows and renumber them from 0, discarding the old index
df = df.sample(frac=1).reset_index(drop=True)

# target is the first column; features are the columns after the specimen number
X = df.iloc[:, 2:16]
y = df.iloc[:, 0]

# k-fold grid search, candidates ranked by balanced accuracy
svm_cv = GridSearchCV(pipe, grid_param_svm, cv=k, scoring='balanced_accuracy', n_jobs=-2)
svm_cv.fit(X, y)

In [None]:
# best mean balanced accuracy found by the k-fold search, then the
# hyper-parameters that produced it (one per line)
print(svm_cv.best_score_, svm_cv.best_params_, sep='\n')

we now try with leave-one-out cross validation

In [None]:
# shuffle the rows and renumber them from 0, discarding the old index
df = df.sample(frac=1).reset_index(drop=True)

# target is the first column; features are the columns after the specimen number
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

# leave-one-out grid search; reuses pipe and grid_param_svm from the k-fold SVM cell
svm_loocv = GridSearchCV(
    pipe,
    grid_param_svm,
    cv=LeaveOneOut(),
    scoring='accuracy',
    n_jobs=-2,
)
svm_loocv.fit(X, y)

In [None]:
# printing accuracy and best parameters of the leave-one-out search
# (this cell previously re-printed the k-fold results held in svm_cv)

print(svm_loocv.best_score_)
print(svm_loocv.best_params_)

TODO: CONSIDER ALSO ADDING A GRID SEARCH WITH MORE SPECIFIC (NARROWER) VALUES

### naive bayes

in this case we don't use grid search because we don't have any hyper-parameter to choose;
we use instead a k-fold cross validation and a leave-one-out cross validation to evaluate the technique

In [None]:
k = 5   # number of folds

# NOTE: the dataframe is not reshuffled in this cell; it keeps the row order
# it already has

# target is the first column; features are the columns after the specimen number
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

# k-fold cross validation of a Gaussian naive Bayes classifier
effect_NB_cv = cross_validate(
    GaussianNB(),
    X,
    y,
    cv=k,
    scoring='balanced_accuracy',
)

In [None]:
# average of the per-fold balanced-accuracy scores
nb_cv_fold_scores = effect_NB_cv['test_score']
print(mean(nb_cv_fold_scores))

In [None]:
# shuffle the rows and renumber them from 0, discarding the old index
df = df.sample(frac=1).reset_index(drop=True)

# target is the first column; features are the columns after the specimen number
X = df.iloc[:, 2:16]
y = df.iloc[:, 0]

# leave-one-out cross validation
# Scored with plain accuracy, like the other leave-one-out cells: every test
# fold holds a single sample, so a per-class averaged metric such as balanced
# accuracy has nothing to average and only reduces to hit/miss per fold.
effect_NB_loocv = cross_validate(GaussianNB(), X, y, cv=LeaveOneOut(), scoring='accuracy')

In [None]:
# average of the per-sample leave-one-out scores
nb_loocv_fold_scores = effect_NB_loocv['test_score']
print(mean(nb_loocv_fold_scores))

### KNN with 5-fold CV

In [None]:
from sklearn.neighbors import KNeighborsClassifier

rownum = len(df)   # number of rows of the dataframe
k = 5              # number of folds

# parameter grid to search over; n_neighbors runs from 1 up to (but not
# including) the row count minus one fold's worth of rows
grid_param_cv = {
    'n_neighbors': np.arange(1, rownum - rownum // k),
    'weights': ('uniform', 'distance'),
    'metric': ('cosine', 'euclidean', 'manhattan'),
}

# shuffle the rows and renumber them from 0, discarding the old index
df = df.sample(frac=1).reset_index(drop=True)

# target is the first column; features are the columns after the specimen number
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

# k-fold grid search, candidates ranked by balanced accuracy
knn_cv = GridSearchCV(
    KNeighborsClassifier(),
    grid_param_cv,
    cv=k,
    scoring='balanced_accuracy',
    return_train_score=False,
    n_jobs=2,
    verbose=1,
)
knn_cv.fit(X, y)

print("Best scores after CV are: ", knn_cv.best_score_)
print("Best params after CV are: ", knn_cv.best_params_)

### KNN with LOOCV

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# parameter grid to search over
# TODO: document why this particular n_neighbors range (1..29) was chosen
grid_param_less = { 'n_neighbors': np.arange(1, 30),
                    'weights': ('uniform', 'distance'),
                    'metric': ('cosine', 'euclidean', 'manhattan')}

# shuffle the rows and renumber them from 0, discarding the old index
df = df.sample(frac=1).reset_index(drop=True)
df.shape

# target is the first column; features are the columns after the specimen number
X = df.iloc[:, 2:16]
y = df.iloc[:, 0]

# leave-one-out grid search
# Scored with plain accuracy, like the other leave-one-out cells: every test
# fold holds a single sample, so a per-class averaged metric such as balanced
# accuracy has nothing to average and only reduces to hit/miss per fold.
knn_loocv = GridSearchCV(KNeighborsClassifier(), grid_param_less, cv=LeaveOneOut(),
                         scoring='accuracy', return_train_score=False, n_jobs=-2)
knn_loocv.fit(X, y)
print("Best scores after LOOCV are: ", knn_loocv.best_score_)
print("Best params after LOOCV are: ", knn_loocv.best_params_)