# ML - Supervised - Classification

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split as TTS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier as RF

In /home/q-pi/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/q-pi/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/q-pi/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /home/q-pi/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/q-pi/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_c

## Labelled Data

In [2]:
# Load the labelled dataset (seaborn's bundled iris sample).
# To work on your own data, read it from disk instead:
#   df = pd.read_csv("data/filename.csv")
df = sns.load_dataset("iris")

### Training

In [3]:
# Separate the target column (Y) from the feature columns (X).
# Replace 'species' with the name of your own target column when reusing
# this notebook on another dataset.
target_feature_name = 'species'
X = df.drop(columns=target_feature_name)
Y = df[target_feature_name]

In [4]:
# Distinct class labels present in the target, in categorical order.
classes = pd.Categorical(Y).categories.tolist()

In [5]:
# Train/test split, stratified on the target and seeded so that the same
# rows land in each subset on every run.
X_tr, X_te, Y_tr, Y_te = TTS(
    X,
    Y,
    random_state=314,
    stratify=Y,
)

In [6]:
# Pipeline: PCA projection followed by a linear SVM classifier.
# Both steps are seeded: LinearSVC shuffles the data in its dual solver, so
# leaving it unseeded makes the grid-search scores vary from run to run even
# though the split and the PCA are fixed.
pipeline_details = [
    ('PCA', PCA(random_state=20)),
    ('LinearSVC', LinearSVC(random_state=20)),
]
pipeline = Pipeline(steps=pipeline_details)

In [15]:
# Grid search over the pipeline's hyperparameters.
# - PCA__n_components: every value from 1 up to the number of feature columns.
# - LinearSVC__tol: one tolerance per decade, from 1e-10 up to 1e-2.
hyperparameters = {
    'PCA__n_components': list(range(1, X_tr.shape[1] + 1)),
    'LinearSVC__tol': [10**-exponent for exponent in range(10, 1, -1)],
}

# Each parameter combination is scored by accuracy with cv=5.
hyperparameter_search = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparameters,
    cv=5,
    scoring='accuracy',
)
hyperparameter_search.fit(X_tr, Y_tr)

# Report the best cross-validated score and the parameters that produced it.
print(f'Meilleur score : {hyperparameter_search.best_score_:.5f}')
print(f'Meilleur paramètres : {hyperparameter_search.best_params_}')



Meilleur score : 0.96482
Meilleur paramètres : {'LinearSVC__tol': 1e-10, 'PCA__n_components': 2}


In [16]:
# Refit a standalone PCA on the training features, using the number of
# components selected by the grid search.
N = hyperparameter_search.best_params_['PCA__n_components']
pca = PCA(n_components=N, random_state=20).fit(X_tr)

In [17]:
# Project both subsets with the PCA fitted on the training data only.
X_tr_PCA, X_te_PCA = (pca.transform(subset) for subset in (X_tr, X_te))

In [19]:
# Refit a standalone LinearSVC on the PCA-projected training data with the
# tolerance selected by the grid search.
# The tolerance gets its own name instead of overwriting N, which holds the
# PCA component count. The classifier is seeded so the fitted model (and the
# accuracies reported below) are reproducible across runs.
best_tol = hyperparameter_search.best_params_['LinearSVC__tol']
lSvc = LinearSVC(tol=best_tol, random_state=20)
lSvc.fit(X_tr_PCA, Y_tr);

In [21]:
# Training accuracy
def accuracy(p, y):
    """Return the fraction of predictions `p` that equal the labels `y`."""
    return (p == y).sum() / len(y)


train_preds = lSvc.predict(X_tr_PCA)
print('Accuracy : {}'.format(accuracy(train_preds, Y_tr)))

Accuracy : 0.9821428571428571


## Unlabelled Data

In [22]:
# Load the data to classify. Here the PCA-projected test features stand in
# for new, unlabelled data.
# To classify your own data, read it from disk instead:
#   df = pd.read_csv("data/filename.csv")
df = pd.DataFrame(data=X_te_PCA)

### Prediction

In [24]:
# Predict a class label for every row with the refitted LinearSVC.
preds = lSvc.predict(X=df)
# Per-class probabilities are left disabled:
#proba = lSvc.predict_proba(df)

## Result

In [26]:
# Result table: the original (non-PCA) test features plus the predicted label
# for each row.
# DataFrame.assign returns a new frame, so X_te itself is never modified.
# Wrapping X_te in pd.DataFrame(...) and adding a column in place does not
# copy the data: it can raise SettingWithCopyWarning and, on some pandas
# versions, leak the new column into X_te, which would break a later
# pca.transform(X_te).
df = X_te.assign(Predictions=preds)

# Optional per-class probability columns (requires the `proba` array, which
# is disabled in the prediction cell):
#for i in range(0, len(proba[0])):
#    df[classes[i]] = proba[:, i]

#for k in range(0, len(proba)):
#    print('iris[', k, ']')
#    for i in range(0, len(proba[0])):
#        print('{} {}= {}'.format(classes[i], (10-len(classes[i]))*' ', proba[k][i]))

df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Predictions
93,5.0,2.3,3.3,1.0,versicolor
90,5.5,2.6,4.4,1.2,versicolor
83,6.0,2.7,5.1,1.6,virginica
48,5.3,3.7,1.5,0.2,setosa
35,5.0,3.2,1.2,0.2,setosa
50,7.0,3.2,4.7,1.4,virginica
42,4.4,3.2,1.3,0.2,setosa
106,4.9,2.5,4.5,1.7,versicolor
34,4.9,3.1,1.5,0.2,setosa
66,5.6,3.0,4.5,1.5,versicolor


## Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
def show_cm(cm, labels):
    """Display a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    cm : array-like
        Matrix of counts, e.g. the output of ``confusion_matrix``.
    labels : list
        Class names used for both the row labels and the column labels.
    """
    labelled_cm = pd.DataFrame(data=cm, index=labels, columns=labels)
    sns.heatmap(labelled_cm, annot=True)
    plt.show()

In [None]:
# Training confusion matrix.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones. Passing the predictions first
# transposes the matrix, so the arguments are given by keyword here.
cm_train = confusion_matrix(y_true=Y_tr, y_pred=train_preds, labels=classes)
show_cm(cm_train, classes)
# Training accuracy (accuracy() is defined in the training-accuracy cell).
print('Accuracy : {}'.format(accuracy(train_preds, Y_tr)))

In [None]:
# Test confusion matrix.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones. Passing the predictions first
# transposes the matrix, so the arguments are given by keyword here.
cm_test = confusion_matrix(y_true=Y_te, y_pred=preds, labels=classes)
show_cm(cm_test, classes)
# Test accuracy (accuracy() is defined in the training-accuracy cell).
print('Accuracy : {}'.format(accuracy(preds, Y_te)))