# ML - Supervised - Classification

In [302]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split as TTS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier as KNN

## Labelised Data

In [303]:
#Chargement
#df = pd.read_csv("data/filename.csv")
df = sns.load_dataset('iris')

### Training

In [304]:
#Isolation
#target_feature_name = 'target_feature'
target_feature_name = 'species'
Y = df[target_feature_name]
X = df.drop(columns=target_feature_name)

In [305]:
#TTS
X_tr, X_te, Y_tr, Y_te = TTS(X, Y, stratify=Y, random_state=314)

In [306]:
#Pipeline
pipeline_details = [('PCA', PCA(random_state=42)),
                    ('KNN', KNN())]
pipeline = Pipeline(steps=pipeline_details)

In [307]:
#GridSearchCV
print([i for i in range(1, X_tr.shape[1] + 1)])
print([i for i in range(1, 21 + 1, 2)])

hyperparameters = {}
hyperparameters['PCA__n_components'] = [i for i in range(1, X_tr.shape[1]+1)]
hyperparameters['KNN__n_neighbors']  = [i for i in range(1, 21 +1, 2)]

print(hyperparameters)

hyperparameter_search = GridSearchCV(pipeline,
                                     hyperparameters,
                                     scoring='accuracy', 
                                     cv=5)

hyperparameter_search.fit(X_tr, Y_tr)

print('Meilleur score : {:.5f}'.format(hyperparameter_search.best_score_))
print('Meilleur paramètres : {}'.format(hyperparameter_search.best_params_))

[1, 2, 3, 4]
[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]
{'PCA__n_components': [1, 2, 3, 4], 'KNN__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
Meilleur score : 0.99091
Meilleur paramètres : {'KNN__n_neighbors': 15, 'PCA__n_components': 4}


In [308]:
#PCA
N=hyperparameter_search.best_params_['PCA__n_components']
pca = PCA(n_components=N)
pca.fit(X_tr)
X_tr_PCA = pca.transform(X_tr)
X_te_PCA = pca.transform(X_te)

In [309]:
#KNN
N=hyperparameter_search.best_params_['KNN__n_neighbors']
knn = KNN(n_neighbors=N)
knn.fit(X_tr_PCA, Y_tr)

KNeighborsClassifier(n_neighbors=15)

In [310]:
#Accuracy
preds = knn.predict(X_te)
accuracy = lambda p, y : (p==y).sum()/len(y)
print('Accuracy : {}'.format(accuracy(preds, Y_te)))

Accuracy : 0.3157894736842105


## Unlabelised Data

In [311]:
#Chargement
#df = pd.read_csv("data/filename.csv")
df = pd.DataFrame(X_te_PCA)

### Prediction

In [312]:
#KNN
preds = knn.predict(df)
df['Predictions'] = preds

## Result

In [313]:
#df

In [314]:
df = df.drop(columns='Predictions')

In [315]:
unknow_iris = df[2:10]
print(unknow_iris)
proba = knn.predict_proba(unknow_iris)#[0]
print(proba)
print("setosa \t\t= {}\nversicolor \t= {}\nvirginica \t= {}".format(proba[0], proba[1], proba[2]))

          0         1         2         3
2  1.363608 -0.404683  0.045669  0.198818
3 -2.567370  0.552244 -0.066586  0.028245
4 -2.886608  0.021055 -0.160340 -0.164462
5  1.257516  0.644915 -0.489204 -0.027015
6 -3.013032 -0.344942  0.239463  0.080936
7  0.514276 -1.126970  0.656221  0.134302
8 -2.655977 -0.160104 -0.119918 -0.014939
9  0.645019 -0.310415  0.353463  0.201254
[[0.         0.26666667 0.73333333]
 [1.         0.         0.        ]
 [1.         0.         0.        ]
 [0.         0.66666667 0.33333333]
 [1.         0.         0.        ]
 [0.         0.8        0.2       ]
 [1.         0.         0.        ]
 [0.         0.93333333 0.06666667]]
setosa 		= [0.         0.26666667 0.73333333]
versicolor 	= [1. 0. 0.]
virginica 	= [1. 0. 0.]
