# ML - Supervised - Classification

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split as TTS
from sklearn.pipeline import Pipeline

## Labelled Data

In [3]:
# Load the labelled mushroom dataset from the local data/ directory.
# Source: https://www.kaggle.com/uciml/mushroom-classification
filename = "mushrooms.csv"
path = "data/{}".format(filename)
df = pd.read_csv(filepath_or_buffer=path)

### Cleaning

In [4]:
# True if any cell of the frame is missing.
df.isna().to_numpy().any()

False

### Isolation

In [5]:
# Separate the target column from the predictor columns.
target_feature_name = 'class'
X = df.drop(columns=[target_feature_name])
Y = df[target_feature_name]

### Feature Engineering

In [6]:
def letter_to_ordinal(code):
    """Map a one-character category code to its 1-based offset from 'a' ('a' -> 1)."""
    return ord(code) - ord('a') + 1


# Encode every feature cell as an integer. The encoding starts from the raw frame
# rather than from X itself so the cell can be re-run safely: applying ord() to the
# integers produced by a previous run would raise a TypeError.
# NOTE(review): codes outside 'a'..'z' would map outside 1..26 — confirm the
# dataset contains none, or that such values are acceptable.
X = df.drop(columns=target_feature_name).apply(
    lambda column: column.map(letter_to_ordinal)
)

### Labels

In [7]:
# Distinct target labels, in pandas' category order.
classes = pd.Categorical(Y).categories.tolist()

## Model Building

In [8]:
# Hold-out split, stratified on the target so both parts keep the class balance;
# the fixed seed makes the partition reproducible.
X_tr, X_te, Y_tr, Y_te = TTS(
    X,
    Y,
    stratify=Y,
    random_state=314,
)

In [8]:
# Pipeline: PCA projection followed by a random forest classifier.
# The classifier step is named 'RF' so that it matches the 'RF__' prefix used by
# the hyper-parameter grid, and it uses the RandomForestClassifier imported as RF.
# Both steps are seeded so the grid search gives the same result on every run.
pipeline_details = [('PCA', PCA(random_state=20)),
                    ('RF', RF(random_state=20))]
pipeline = Pipeline(steps=pipeline_details)

In [11]:
# Search grid for GridSearchCV; keys are '<pipeline step name>__<parameter>'.
hyperparameters = {
    # Every possible component count, from 1 up to the number of feature columns.
    # The bound is taken from X (features only) so it stays correct even if the
    # raw frame gains or loses non-feature columns.
    'PCA__n_components': list(range(1, X.shape[1] + 1)),
    # 80, 90, ..., 120 trees.
    'RF__n_estimators': list(range(80, 120 + 1, 10)),
}

In [12]:
# Display the search grid for inspection.
hyperparameters

{'PCA__n_components': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22],
 'RF__n_estimators': [80, 90, 100, 110, 120]}

In [13]:
# Exhaustive grid search over the pipeline: 3-fold cross-validation, accuracy scoring.
hyperparameter_search = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparameters,
    scoring='accuracy',
    cv=3,
)

In [None]:
# Run the grid search on the training split, then report the best
# cross-validated score and the parameter combination that produced it.
hyperparameter_search.fit(X_tr, Y_tr)

best_score = hyperparameter_search.best_score_
best_params = hyperparameter_search.best_params_
print('Meilleur score : {:.5f}'.format(best_score))
print('Meilleur paramètres : {}'.format(best_params))

In [None]:
# Fit a standalone PCA on the training split, using the component count
# selected by the grid search.
best_n_components = hyperparameter_search.best_params_['PCA__n_components']
pca = PCA(n_components=best_n_components, random_state=20).fit(X_tr)

In [None]:
# Project both splits with the PCA that was fitted on the training split.
X_tr_PCA, X_te_PCA = (pca.transform(part) for part in (X_tr, X_te))

In [None]:
# Final random forest, trained on the PCA-projected training split with the tree
# count selected by the grid search. The seed is fixed so that re-running the
# notebook rebuilds the same forest and therefore reports the same metrics.
n_estimators = hyperparameter_search.best_params_['RF__n_estimators']
rf = RF(n_estimators=n_estimators, random_state=20)
rf.fit(X_tr_PCA, Y_tr);

In [None]:
def accuracy(p, y):
    """Return the fraction of positions where predictions `p` equal labels `y`."""
    return (p == y).sum() / len(y)


# Accuracy on the training split.
train_preds = rf.predict(X_tr_PCA)
print('Accuracy : {}'.format(accuracy(train_preds, Y_tr)))

## Unlabelled Data

In [None]:
# Loading step. Real unlabelled data would be read here, e.g.
#   df = pd.read_csv("data/filename.csv")
# In this notebook the PCA-projected test split stands in for it.
unlabeled_df = pd.DataFrame(data=X_te_PCA)

### Prediction

In [None]:
# Predict labels and per-class probabilities for the held-out split.
# The forest was fitted on PCA components, so it has to be fed X_te_PCA; the raw
# `df` frame holds untransformed letter codes and still contains the target column.
preds = rf.predict(X_te_PCA)
proba = rf.predict_proba(X_te_PCA)

## Result

In [None]:
# Result table: the test-split features plus the predicted label and one
# probability column per class.
# Work on an explicit copy so the added columns can never leak into X_te.
unlabeled_df = X_te.copy()
unlabeled_df['Predictions'] = preds
# predict_proba's columns are ordered like rf.classes_, so each probability
# column is paired with the label the classifier itself reports for it.
for label, class_proba in zip(rf.classes_, proba.T):
    unlabeled_df[label] = class_proba

unlabeled_df

## Confusion Matrix

In [None]:
def show_cm(cm, labels):
    """Plot a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    cm : array-like of shape (len(labels), len(labels))
        Confusion matrix values.
    labels : list
        Class labels used, in this order, for both the rows and the columns.

    Notes
    -----
    Integer matrices are annotated as plain integers; seaborn's default
    annotation format abbreviates large counts (e.g. ``3.2e+03``). Any other
    dtype keeps seaborn's default format.
    """
    df_cm = pd.DataFrame(cm, labels, labels)
    is_integer = np.issubdtype(np.asarray(cm).dtype, np.integer)
    sns.heatmap(df_cm, annot=True, fmt='d' if is_integer else '.2g')
    plt.show()

In [None]:
# Training-set confusion matrix. confusion_matrix takes (y_true, y_pred), so the
# true labels go first: rows are true classes, columns are predicted classes.
cm_train = confusion_matrix(Y_tr, train_preds, labels=classes)
show_cm(cm_train, classes)
# Training accuracy: share of training samples whose prediction matches the label.
print('Accuracy : {}'.format((train_preds == Y_tr).mean()))

In [None]:
# Test-set confusion matrix. confusion_matrix takes (y_true, y_pred), so the
# true labels go first: rows are true classes, columns are predicted classes.
cm_test = confusion_matrix(Y_te, preds, labels=classes)
show_cm(cm_test, classes)
# Test accuracy: share of held-out samples whose prediction matches the label.
print('Accuracy : {}'.format((preds == Y_te).mean()))