# ML - Supervised - Classification

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split as TTS
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier as KNN

## Labeled Data

In [3]:
# Load the mushroom data set from the local data/ folder.
# Source: https://www.kaggle.com/uciml/mushroom-classification
filename = "mushrooms.csv"
path = "data/{}".format(filename)
df = pd.read_csv(filepath_or_buffer=path)

### Cleaning

In [4]:
# True if at least one cell of the raw frame is missing.
df.isna().to_numpy().any()

False

### Isolation

In [5]:
# Separate the label column (Y) from the feature columns (X).
target_feature_name = 'class'
Y = df.loc[:, target_feature_name]
X = df.drop(target_feature_name, axis='columns')

### Feature Engineering

In [6]:
# Encode every single-character category as its 1-based alphabet position
# ('a' -> 1, 'b' -> 2, ...).
# The encoded frame is rebuilt from `df` instead of from `X` itself so that
# this cell is idempotent: re-running it no longer calls ord() on the integers
# produced by a previous run (which raised TypeError).
# NOTE(review): any character that is not a lowercase letter (e.g. a '?'
# missing-value marker, if the data contains one) maps to a non-positive code.
# Also, this ordinal encoding imposes an arbitrary order on nominal categories,
# which affects PCA/KNN distances -- consider one-hot encoding.
X = df.drop(columns=target_feature_name).apply(
    lambda column: column.map(lambda letter: ord(letter) - ord('a') + 1)
)

### Labels

In [7]:
# Sorted list of the distinct target labels.
classes = pd.Categorical(Y).categories.tolist()

## Model Building

In [8]:
# Stratified train/test split (library-default proportions), seeded for
# reproducibility.
X_tr, X_te, Y_tr, Y_te = TTS(
    X,
    Y,
    random_state=314,
    stratify=Y,
)

In [9]:
# Two-step model: PCA projection followed by a k-nearest-neighbours classifier.
# The step names ('PCA', 'KNN') are the prefixes used in the search grid.
pipeline_details = [
    ('PCA', PCA(random_state=20)),
    ('KNN', KNN()),
]
pipeline = Pipeline(pipeline_details)

In [29]:
# Search grid for GridSearchCV, keyed as '<pipeline step>__<parameter>'.
#   - PCA components: 1 up to (but excluding) half the number of columns of df
#   - KNN neighbours: odd values from 1 to 15
hyperparameters = {
    'PCA__n_components': list(range(1, df.shape[1] // 2)),
    'KNN__n_neighbors': list(range(1, 16, 2)),
}

In [30]:
# Display the grid to check the candidate values before launching the search.
hyperparameters

{'KNN__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15],
 'PCA__n_components': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

In [31]:
# Exhaustive search over the grid, scored by accuracy with 3-fold
# cross-validation.
hyperparameter_search = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparameters,
    cv=3,
    scoring='accuracy',
)

In [32]:
# Run the grid search on the training split only, then report the best
# cross-validated score and the parameter combination that produced it.
hyperparameter_search.fit(X_tr, Y_tr)

print(
    'Meilleur score : {:.5f}'.format(hyperparameter_search.best_score_),
    'Meilleur paramètres : {}'.format(hyperparameter_search.best_params_),
    sep='\n',
)

Meilleur score : 0.99787
Meilleur paramètres : {'KNN__n_neighbors': 1, 'PCA__n_components': 10}


In [19]:
# Refit a standalone PCA on the training split with the component count
# selected by the grid search.
N = hyperparameter_search.best_params_['PCA__n_components']
pca = PCA(random_state=20, n_components=N).fit(X_tr)

In [20]:
# Project both splits with the PCA fitted on the training data.
X_tr_PCA, X_te_PCA = (pca.transform(split) for split in (X_tr, X_te))

In [22]:
# Fit the final KNN classifier on the PCA-projected training data, using the
# neighbour count selected by the grid search.
N = hyperparameter_search.best_params_['KNN__n_neighbors']
knn = KNN(n_neighbors=N).fit(X_tr_PCA, Y_tr)

In [23]:
# Training accuracy: predict on the data the classifier was fitted on.
train_preds = knn.predict(X_tr_PCA)


def accuracy(p, y):
    """Return the fraction of predictions `p` that equal the labels `y`."""
    return (p == y).sum() / len(y)


print('Accuracy : {}'.format(accuracy(train_preds, Y_tr)))

Accuracy : 1.0


## Unlabeled Data

In [24]:
# Stand-in for real unlabeled data: the PCA-projected test split is used as
# the frame to predict on. (For genuinely new data, load it here and apply the
# same encoding and PCA transform first.)
unlabeled_df = pd.DataFrame(data=X_te_PCA)

### Prediction

In [27]:
# KNN predictions: hard labels and per-class probabilities for each row.
preds, proba = knn.predict(unlabeled_df), knn.predict_proba(unlabeled_df)

## Result

In [28]:
# Result table: the original (pre-PCA) test features, the predicted label and
# one probability column per class.
# Take an explicit copy: pd.DataFrame(X_te) does not copy DataFrame input by
# default, so adding columns to it could write into / alias X_te.
unlabeled_df = X_te.copy()
unlabeled_df['Predictions'] = preds

# predict_proba orders its columns like knn.classes_; make sure the labels we
# attach follow that same order instead of silently mislabelling columns.
assert list(knn.classes_) == classes, "class order differs from knn.classes_"
# Iterating over proba.T (one row per class) also works for an empty test set,
# where indexing proba[0] would raise IndexError.
for label, class_proba in zip(classes, proba.T):
    unlabeled_df[label] = class_proba

unlabeled_df

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Predictions,e,p
4577,6,6,7,6,6,6,3,2,16,5,...,16,23,15,12,8,22,7,p,0.0,1.0
30,2,19,25,20,12,6,3,2,7,5,...,16,23,15,16,14,14,13,e,1.0,0.0
4551,6,6,25,6,6,6,3,2,16,5,...,16,23,15,12,8,22,16,p,0.0,1.0
3155,6,6,7,20,14,6,3,2,21,20,...,16,23,15,16,11,25,4,e,1.0,0.0
2331,24,25,14,20,14,6,3,2,23,20,...,16,23,15,16,11,22,4,e,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1447,6,6,7,6,14,6,23,2,16,20,...,16,23,15,5,14,19,7,e,1.0,0.0
1810,6,19,14,20,16,6,3,14,14,5,...,16,23,15,16,14,19,7,p,0.0,1.0
4117,24,6,25,6,6,6,3,2,7,5,...,16,23,15,12,8,25,16,p,0.0,1.0
109,24,19,25,20,1,6,3,2,11,5,...,16,23,15,16,11,14,13,e,1.0,0.0


## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
def show_cm(cm, labels):
    """Display a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm : array-like of shape (n_labels, n_labels)
        The confusion matrix to plot.
    labels : list
        Class labels, used for both the row and the column ticks, in the same
        order as the rows/columns of `cm`.
    """
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    # seaborn's default annotation format ('.2g') renders counts >= 100 in
    # scientific notation (e.g. 1e+02); print integer counts in full and keep
    # the default for non-integer (e.g. normalised) matrices.
    is_integer = np.issubdtype(df_cm.to_numpy().dtype, np.integer)
    sns.heatmap(df_cm, annot=True, fmt='d' if is_integer else '.2g')
    plt.show()

In [None]:
# Confusion matrix on the training split.
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously swapped, which
# transposed the matrix.
cm_train = confusion_matrix(Y_tr, train_preds, labels=classes)
show_cm(cm_train, classes)


# Training accuracy
def accuracy(p, y):
    """Return the fraction of predictions `p` that equal the labels `y`."""
    return (p == y).sum() / len(y)


print('Accuracy : {}'.format(accuracy(train_preds, Y_tr)))

In [None]:
# Confusion matrix on the held-out test split.
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The arguments were previously swapped, which
# transposed the matrix.
cm_test = confusion_matrix(Y_te, preds, labels=classes)
show_cm(cm_test, classes)


# Test accuracy (the earlier comment wrongly said "training")
def accuracy(p, y):
    """Return the fraction of predictions `p` that equal the labels `y`."""
    return (p == y).sum() / len(y)


print('Accuracy : {}'.format(accuracy(preds, Y_te)))