In [2]:
# Machine Learning Importation
import pandas as pd
import numpy as np
import matplotlib as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler

# Models importation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [3]:
# Importation of the dataset
data = pd.read_csv ('bacteria.csv')

In [4]:
# delete all incomplete data
data.dropna(axis=0, inplace=True)

# replacement of all bacteria names with numbers
data['target'].replace(['Streptococcus_pyogenes',
                        'Salmonella_enterica',
                        'Enterococcus_hirae',
                        'Streptococcus_pneumoniae',
                        'Staphylococcus_aureus',
                        'Klebsiella_pneumoniae',
                        'Bacteroides_fragilis',
                        'Escherichia_coli',
                        'Campylobacter_jejuni',
                        'Escherichia_fergusonii'],
                      [0,1,2,3,4,5,6,7,8,9], inplace=True)
data.head()

Unnamed: 0,row_id,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,...,A8T0G1C1,A8T0G2C0,A8T1G0C1,A8T1G1C0,A8T2G0C0,A9T0G0C1,A9T0G1C0,A9T1G0C0,A10T0G0C0,target
0,0,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,0
1,1,-9.536743e-07,-1e-05,-4.3e-05,0.000886,-0.0002,0.00076,-0.0002,-0.000114,-4.3e-05,...,-8.6e-05,-4.3e-05,0.000914,0.000914,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,1
2,2,-9.536743e-07,-2e-06,7e-06,0.000129,0.000268,0.00027,0.000243,0.000125,1e-06,...,8.4e-05,4.8e-05,8.1e-05,0.000106,7.2e-05,1e-05,8e-06,1.9e-05,1.046326e-06,1
3,3,4.632568e-08,-6e-06,1.2e-05,0.000245,0.000492,0.000522,0.000396,0.000197,-3e-06,...,0.000151,0.0001,0.00018,0.000202,0.000153,2.1e-05,1.5e-05,4.6e-05,-9.536743e-07,1
4,4,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,2


In [5]:
X = data.drop(['target', 'row_id'], axis=1)
y = data['target']

In [6]:
# Split dataset into two parts: training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Split training set into two parts: training set and validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

In [7]:
# Scale X_train, X_val, X_test using Z-score normalization
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_val_scaled = sc.fit_transform(X_val)
X_test_scaled = sc.fit_transform(X_test)

In [15]:
# Adding the models to be tested
models_list = []
models_list.append(KNeighborsClassifier(n_neighbors=1, weights='uniform', p=2))
models_list.append(DecisionTreeClassifier())
models_list.append(RandomForestClassifier())
#models_list.append(OneVsRestClassifier(SVC()))
models_list.append(MLPClassifier(random_state=42, max_iter=300))
models_list.append(MLPClassifier(random_state=42, max_iter=2000))
models_list.append(MLPClassifier(random_state=42, max_iter=2000, hidden_layer_sizes=(100,100,100)))

In [16]:
import time

score_list_train = []
score_list_val = []
execution_time = []
models_tested = ['KNN', 
                 'Decision Tree', 
                 'Random Forest',
#                 'SVC'
                 'MLP',
                 'MLP',
                 'MLP'
                 ]

for model in models_list:
    st = time.time()
    model.fit(X_train_scaled, y_train)
    et = time.time()
    elapsed_time = et - st
    score_list_train.append(model.score(X_train_scaled, y_train))
    score_list_val.append(model.score(X_val_scaled, y_val))
    execution_time.append(elapsed_time)
    

In [17]:
result = pd.DataFrame({'Model':models_tested, 'Train score':score_list_train, 
                       'Test score':score_list_val, 'Execution Time':execution_time})

In [18]:
result

Unnamed: 0,Model,Train score,Test score,Execution Time
0,KNN,1.0,0.987156,0.03111
1,Decision Tree,1.0,0.949875,25.069964
2,Random Forest,1.0,0.991437,82.739659
3,MLP,0.99793,0.98525,109.419274
4,MLP,0.99793,0.98525,109.812881
5,MLP,0.997305,0.986281,121.21615


In [16]:
n_neighbors = [1,2,3]
weights = ['uniform', 'distance']
p = [1,2]

score_list_train_all = []
score_list_val_all = []
execution_time_all = []
n_neighbors_all = []
weights_all = []
p_all = []

for n in n_neighbors:
    for w in weights:
        for pp in p:
            st = time.time()
            model = KNeighborsClassifier(n_neighbors=n, weights=w, p=pp)
            model.fit(X_train_scaled, y_train)
            et = time.time()
            elapsed_time = et - st
            score_list_train_all.append(model.score(X_train_scaled, y_train))
            score_list_val_all.append(model.score(X_val_scaled, y_val))
            execution_time_all.append(elapsed_time)
            n_neighbors_all.append(n)
            weights_all.append(w)
            p_all.append(pp)
            result_all = pd.DataFrame({'Model':'KKN', 'Train score':score_list_train_all,
                                   'Test score':score_list_val_all, 'Execution Time':execution_time_all, 'n_neighbors':n_neighbors_all, 'weights':weights_all, 'p': p_all})
            print(result_all)

  Model  Train score  Test score  Execution Time  n_neighbors  weights  p
0   KKN          1.0     0.82625        0.003181            1  uniform  1
  Model  Train score  Test score  Execution Time  n_neighbors  weights  p
0   KKN          1.0    0.826250        0.003181            1  uniform  1
1   KKN          1.0    0.808125        0.001650            1  uniform  2
  Model  Train score  Test score  Execution Time  n_neighbors   weights  p
0   KKN          1.0    0.826250        0.003181            1   uniform  1
1   KKN          1.0    0.808125        0.001650            1   uniform  2
2   KKN          1.0    0.826250        0.001678            1  distance  1
  Model  Train score  Test score  Execution Time  n_neighbors   weights  p
0   KKN          1.0    0.826250        0.003181            1   uniform  1
1   KKN          1.0    0.808125        0.001650            1   uniform  2
2   KKN          1.0    0.826250        0.001678            1  distance  1
3   KKN          1.0    0.8081

In [18]:
#Méthode automatique de recherche de la meilleure combinaison de paramètres afin d'avoir le modèle le plus précis
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors' : [1,2,3,4,5,6,7],
              'weights' : ['uniform', 'distance'],
              'p' : [1, 2]
              }

grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(X_train_scaled, y_train)

print("Meilleur score:", grid.best_score_)
print("Param:", grid.best_params_)



Meilleur score: 0.84546875
Param: {'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}


In [19]:
#score_list_train_ovr = []
#score_list_val_ovr = []
#execution_time_ovr = []

#for model in models_list:
#    st = time.time()
#    ovr = OneVsRestClassifier(model)
#    ovr.fit(X_train_scaled, y_train)
#    et = time.time()
#    elapsed_time = et - st
#    score_list_train_ovr.append(ovr.score(X_train_scaled, y_train))
#    score_list_val_ovr.append(ovr.score(X_val_scaled, y_val))
#    execution_time_ovr.append(elapsed_time)

In [20]:
#score_list_train_ovo = []
#score_list_val_ovo = []
#execution_time_ovo = []

#for model in models_list:
#    st = time.time()
#    ovo = OneVsOneClassifier(model)
#    ovo.fit(X_train_scaled, y_train)
#    et = time.time()
#    elapsed_time = et - st
#    score_list_train_ovo.append(ovo.score(X_train_scaled, y_train))
#    score_list_val_ovo.append(ovo.score(X_val_scaled, y_val))
#    execution_time_ovo.append(elapsed_time)

In [22]:
#result_ovr = pd.DataFrame({'Model':models_tested, 'Train score':score_list_train_ovr,
#                           'Test score':score_list_val_ovr, 'Execution Time':execution_time_ovr})
#result_ovr

In [23]:
#result_ovo = pd.DataFrame({'Model':models_tested, 'Train score':score_list_train_ovo,
#                           'Test score':score_list_val_ovo, 'Execution Time':execution_time_ovo})
#result_ovo

In [15]:
model_f_knn = KNeighborsClassifier(n_neighbors=1, weights='uniform', p=2)
model_f_knn.fit(X_train_scaled,y_train)
ypredict = model_f_knn.predict(X_test_scaled)

print(accuracy_score(y_test.values,ypredict))
print(model_f_knn.score(X_test_scaled,y_test))

0.986525
0.986525
