<div style="background-color: #eee3d3">
<h1> 5-first_classification_models.ipynb </h1>
</div>

---

### The purpose of this notebook is to test a few different models on our dataset

The ultimate goal of this project is to find potential biomarkers, i.e. which variables (peak table columns) have the strongest ability to separate the two sample groups (Incident vs. Non-case).

To find potential biomarkers, you can look for a subset of the peak table columns that strongly predicts the __sample group__, Incident or Non-case (classification). To that end, this notebook contains a first draft of a few machine learning models tested on our data.

---

In this notebook, we expect you to:
- use the methods proposed below
- use other classification methods (a logistic regression model, for example)
- choose a quantitative criterion to assess the classification quality and explain why you made this choice
- create plots

---

After this notebook, you can create another one presenting which variables (peak table columns) could be potential biomarkers, based on your analysis in these notebooks.

Once again, feel free to add anything that you find relevant for finding potential biomarkers.

__Good luck with this project :)__

---

## Import modules

In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
np.random.seed(12345678)

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression

In [19]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report, recall_score
from sklearn.model_selection import learning_curve

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
import glob
import os

## Define useful functions for the analysis

In [22]:
def evaluation(model, X_train, X_test, y_train, y_test,show=True):
    """Fit ``model`` on the training split and report its test-set performance.

    Prints scikit-learn's ``classification_report`` computed on the test split
    and, when ``show`` is true, plots the learning curve of the model
    (F1 score, 4-fold cross-validation on the training split, 10 training
    sizes from 10% to 100%).

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, X_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        Labels of the training and test splits.
    show : bool, default True
        Whether to compute and plot the learning curve.

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))

    if show:
        N, train_score, val_score = learning_curve(
            model, X_train, y_train,
            cv=4, scoring='f1', train_sizes=np.linspace(0.1, 1, 10),
        )
        fig, ax = plt.subplots(figsize=(12, 8))
        # Scores have one column per CV fold: average them for each training size.
        ax.plot(N, train_score.mean(axis=1), label='train score')
        ax.plot(N, val_score.mean(axis=1), label='validation score')
        ax.set_xlabel('Number of training samples')
        ax.set_ylabel('F1 score')
        ax.set_title('Learning curve')
        # The legend must be attached before the figure is rendered,
        # otherwise it is missing from the displayed plot.
        ax.legend()
        plt.show()

In [23]:
def pipeline(missing_cols,file_in_dir):
    """Benchmark the baseline classifiers on every scaled peak table of a directory.

    For each ``*.csv`` file found in ``file_in_dir`` (processed in sorted
    order), the table is loaded, its first column is dropped, ``missing_cols``
    is prepended, and four models (RandomForest, AdaBoost, SVM, KNN) are
    fitted and evaluated with ``evaluation`` on a stratified 80/20 split.
    The file name (without extension, split on ``'-'``) is printed as a
    header before the reports of each table.

    Parameters
    ----------
    missing_cols : pandas.DataFrame
        Columns prepended to every table. After concatenation the table must
        expose a ``'Groups'`` column holding ``'Incident'`` / ``'Non-case'``,
        and its first two columns are excluded from the features.
    file_in_dir : str
        Directory containing the scaled peak tables (with or without a
        trailing path separator).

    Raises
    ------
    ValueError
        If ``'Groups'`` contains a value other than ``'Incident'`` or
        ``'Non-case'`` (including missing values caused by misaligned rows).
    """
    # We need to have the tables with missing values/normalisation (and dimensional
    # reduction if possible) done with the best methods each time.

    def _make_preprocessor():
        # A fresh instance per model: make_pipeline does not clone its steps, so a
        # single shared preprocessor would be refitted by every model in turn.
        return make_pipeline(PolynomialFeatures(2, include_bias=False), SelectKBest(f_classif, k=10))

    code = {'Incident':1,'Non-case':0}

    # os.path.join copes with a missing trailing separator; sorting makes the
    # order of the reports reproducible (glob returns files in arbitrary order).
    csv_files = sorted(glob.glob(os.path.join(file_in_dir, "*.csv")))
    if not csv_files:
        print("No .csv file found in " + str(file_in_dir))
        return

    for normalized_peakTable_file in csv_files:

        filename = os.path.splitext(os.path.basename(normalized_peakTable_file))[0]
        filename = filename.split("-")

        print(20*'-')
        print(filename)

        # Import normalized peakTable
        peakTable_normalized = pd.read_csv(normalized_peakTable_file, sep=',')
        peakTable_normalized = peakTable_normalized.iloc[: , 1:] # We had a column with the index appear on normalisation step

        # Add the first columns that were missing
        peakTable_normalized = pd.concat([missing_cols, peakTable_normalized], axis=1)

        # Encode the target and fail early on labels that cannot be encoded
        y = peakTable_normalized['Groups'].map(code)
        if y.isna().any():
            raise ValueError(
                "'Groups' must only contain 'Incident' or 'Non-case' "
                "(check that the rows of missing_cols match the peak table): "
                + str(normalized_peakTable_file)
            )
        X = peakTable_normalized.iloc[:,2:]

        # Split into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,test_size=0.2, random_state=0, stratify=y)

        # Create and evaluate models
        dict_models = {
            'RandomForest': make_pipeline(_make_preprocessor(), RandomForestClassifier(random_state=0)),
            'AdaBoost': make_pipeline(_make_preprocessor(), AdaBoostClassifier(random_state=0)),
            'SVM': make_pipeline(_make_preprocessor(), StandardScaler(), SVC(random_state=0)),
            'KNN': make_pipeline(_make_preprocessor(), StandardScaler(), KNeighborsClassifier()),
            # TODO: LogisticRegression is not wired in yet
        }

        for name, model in dict_models.items():
            print(name)
            evaluation(model, X_train, X_test, y_train, y_test,False)

        print(3*'\n')

In [24]:
# The scaled tables miss the first columns that the pipeline needs,
# so we take them from the original peak table.
# The path is built with os.path so that it does not depend on '/' being the separator.
path_peakTable = os.path.join(
    os.path.dirname(os.getcwd()),
    'data', 'peakTable', 'original_peak_table', 'peakTable_HILIC_POS.csv',
)
peakTable = pd.read_csv(path_peakTable, sep=',', decimal='.', na_values='NA')
first_cols = peakTable.iloc[:,:2]

In [67]:
# Evaluate every model on every scaled peak table of the directory.
pipeline(missing_cols=first_cols, file_in_dir="../data/peakTable/scaled_peak_tables/")

--------------------
['autoscaling', 'X_min_f']
RandomForest
              precision    recall  f1-score   support

           0       0.68      0.89      0.77        19
           1       0.85      0.58      0.69        19

    accuracy                           0.74        38
   macro avg       0.76      0.74      0.73        38
weighted avg       0.76      0.74      0.73        38

AdaBoost
              precision    recall  f1-score   support

           0       0.64      0.84      0.73        19
           1       0.77      0.53      0.62        19

    accuracy                           0.68        38
   macro avg       0.70      0.68      0.68        38
weighted avg       0.70      0.68      0.68        38

SVM
              precision    recall  f1-score   support

           0       0.62      0.95      0.75        19
           1       0.89      0.42      0.57        19

    accuracy                           0.68        38
   macro avg       0.75      0.68      0.66        38


              precision    recall  f1-score   support

           0       0.56      0.79      0.65        19
           1       0.64      0.37      0.47        19

    accuracy                           0.58        38
   macro avg       0.60      0.58      0.56        38
weighted avg       0.60      0.58      0.56        38

AdaBoost
              precision    recall  f1-score   support

           0       0.57      0.63      0.60        19
           1       0.59      0.53      0.56        19

    accuracy                           0.58        38
   macro avg       0.58      0.58      0.58        38
weighted avg       0.58      0.58      0.58        38

SVM
              precision    recall  f1-score   support

           0       0.60      0.95      0.73        19
           1       0.88      0.37      0.52        19

    accuracy                           0.66        38
   macro avg       0.74      0.66      0.63        38
weighted avg       0.74      0.66      0.63        38

KNN
  

              precision    recall  f1-score   support

           0       0.68      0.89      0.77        19
           1       0.85      0.58      0.69        19

    accuracy                           0.74        38
   macro avg       0.76      0.74      0.73        38
weighted avg       0.76      0.74      0.73        38

AdaBoost
              precision    recall  f1-score   support

           0       0.58      0.74      0.65        19
           1       0.64      0.47      0.55        19

    accuracy                           0.61        38
   macro avg       0.61      0.61      0.60        38
weighted avg       0.61      0.61      0.60        38

SVM
              precision    recall  f1-score   support

           0       0.64      0.95      0.77        19
           1       0.90      0.47      0.62        19

    accuracy                           0.71        38
   macro avg       0.77      0.71      0.69        38
weighted avg       0.77      0.71      0.69        38

KNN
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.50      1.00      0.67        19
           1       0.00      0.00      0.00        19

    accuracy                           0.50        38
   macro avg       0.25      0.50      0.33        38
weighted avg       0.25      0.50      0.33        38

SVM
              precision    recall  f1-score   support

           0       0.62      0.95      0.75        19
           1       0.89      0.42      0.57        19

    accuracy                           0.68        38
   macro avg       0.75      0.68      0.66        38
weighted avg       0.75      0.68      0.66        38

KNN
              precision    recall  f1-score   support

           0       0.58      0.79      0.67        19
           1       0.67      0.42      0.52        19

    accuracy                           0.61        38
   macro avg       0.62      0.61      0.59        38
weighted avg       0.62      0.61      0.59        38





-------

              precision    recall  f1-score   support

           0       0.62      0.79      0.70        19
           1       0.71      0.53      0.61        19

    accuracy                           0.66        38
   macro avg       0.67      0.66      0.65        38
weighted avg       0.67      0.66      0.65        38

SVM
              precision    recall  f1-score   support

           0       0.64      0.95      0.77        19
           1       0.90      0.47      0.62        19

    accuracy                           0.71        38
   macro avg       0.77      0.71      0.69        38
weighted avg       0.77      0.71      0.69        38

KNN
              precision    recall  f1-score   support

           0       0.62      0.84      0.71        19
           1       0.75      0.47      0.58        19

    accuracy                           0.66        38
   macro avg       0.68      0.66      0.65        38
weighted avg       0.68      0.66      0.65        38





-------

              precision    recall  f1-score   support

           0       0.67      0.84      0.74        19
           1       0.79      0.58      0.67        19

    accuracy                           0.71        38
   macro avg       0.73      0.71      0.71        38
weighted avg       0.73      0.71      0.71        38

SVM
              precision    recall  f1-score   support

           0       0.67      0.95      0.78        19
           1       0.91      0.53      0.67        19

    accuracy                           0.74        38
   macro avg       0.79      0.74      0.72        38
weighted avg       0.79      0.74      0.72        38

KNN
              precision    recall  f1-score   support

           0       0.62      0.84      0.71        19
           1       0.75      0.47      0.58        19

    accuracy                           0.66        38
   macro avg       0.68      0.66      0.65        38
weighted avg       0.68      0.66      0.65        38





-------

              precision    recall  f1-score   support

           0       0.62      0.84      0.71        19
           1       0.75      0.47      0.58        19

    accuracy                           0.66        38
   macro avg       0.68      0.66      0.65        38
weighted avg       0.68      0.66      0.65        38

SVM
              precision    recall  f1-score   support

           0       0.67      0.95      0.78        19
           1       0.91      0.53      0.67        19

    accuracy                           0.74        38
   macro avg       0.79      0.74      0.72        38
weighted avg       0.79      0.74      0.72        38

KNN
              precision    recall  f1-score   support

           0       0.67      0.95      0.78        19
           1       0.91      0.53      0.67        19

    accuracy                           0.74        38
   macro avg       0.79      0.74      0.72        38
weighted avg       0.79      0.74      0.72        38





-------

              precision    recall  f1-score   support

           0       0.70      0.84      0.76        19
           1       0.80      0.63      0.71        19

    accuracy                           0.74        38
   macro avg       0.75      0.74      0.73        38
weighted avg       0.75      0.74      0.73        38

SVM
              precision    recall  f1-score   support

           0       0.62      0.95      0.75        19
           1       0.89      0.42      0.57        19

    accuracy                           0.68        38
   macro avg       0.75      0.68      0.66        38
weighted avg       0.75      0.68      0.66        38

KNN
              precision    recall  f1-score   support

           0       0.65      0.89      0.76        19
           1       0.83      0.53      0.65        19

    accuracy                           0.71        38
   macro avg       0.74      0.71      0.70        38
weighted avg       0.74      0.71      0.70        38





-------

              precision    recall  f1-score   support

           0       0.73      0.84      0.78        19
           1       0.81      0.68      0.74        19

    accuracy                           0.76        38
   macro avg       0.77      0.76      0.76        38
weighted avg       0.77      0.76      0.76        38

SVM
              precision    recall  f1-score   support

           0       0.64      0.95      0.77        19
           1       0.90      0.47      0.62        19

    accuracy                           0.71        38
   macro avg       0.77      0.71      0.69        38
weighted avg       0.77      0.71      0.69        38

KNN
              precision    recall  f1-score   support

           0       0.64      0.95      0.77        19
           1       0.90      0.47      0.62        19

    accuracy                           0.71        38
   macro avg       0.77      0.71      0.69        38
weighted avg       0.77      0.71      0.69        38





-------

              precision    recall  f1-score   support

           0       0.52      0.74      0.61        19
           1       0.55      0.32      0.40        19

    accuracy                           0.53        38
   macro avg       0.53      0.53      0.50        38
weighted avg       0.53      0.53      0.50        38

SVM
              precision    recall  f1-score   support

           0       0.64      0.84      0.73        19
           1       0.77      0.53      0.62        19

    accuracy                           0.68        38
   macro avg       0.70      0.68      0.68        38
weighted avg       0.70      0.68      0.68        38

KNN
              precision    recall  f1-score   support

           0       0.64      0.84      0.73        19
           1       0.77      0.53      0.62        19

    accuracy                           0.68        38
   macro avg       0.70      0.68      0.68        38
weighted avg       0.70      0.68      0.68        38





-------

              precision    recall  f1-score   support

           0       0.60      0.79      0.68        19
           1       0.69      0.47      0.56        19

    accuracy                           0.63        38
   macro avg       0.65      0.63      0.62        38
weighted avg       0.65      0.63      0.62        38

SVM
              precision    recall  f1-score   support

           0       0.60      0.79      0.68        19
           1       0.69      0.47      0.56        19

    accuracy                           0.63        38
   macro avg       0.65      0.63      0.62        38
weighted avg       0.65      0.63      0.62        38

KNN
              precision    recall  f1-score   support

           0       0.54      0.68      0.60        19
           1       0.57      0.42      0.48        19

    accuracy                           0.55        38
   macro avg       0.56      0.55      0.54        38
weighted avg       0.56      0.55      0.54        38





-------

              precision    recall  f1-score   support

           0       0.58      0.79      0.67        19
           1       0.67      0.42      0.52        19

    accuracy                           0.61        38
   macro avg       0.62      0.61      0.59        38
weighted avg       0.62      0.61      0.59        38

SVM
              precision    recall  f1-score   support

           0       0.62      0.95      0.75        19
           1       0.89      0.42      0.57        19

    accuracy                           0.68        38
   macro avg       0.75      0.68      0.66        38
weighted avg       0.75      0.68      0.66        38

KNN
              precision    recall  f1-score   support

           0       0.65      0.89      0.76        19
           1       0.83      0.53      0.65        19

    accuracy                           0.71        38
   macro avg       0.74      0.71      0.70        38
weighted avg       0.74      0.71      0.70        38





-------

              precision    recall  f1-score   support

           0       0.61      0.89      0.72        19
           1       0.80      0.42      0.55        19

    accuracy                           0.66        38
   macro avg       0.70      0.66      0.64        38
weighted avg       0.70      0.66      0.64        38

SVM
              precision    recall  f1-score   support

           0       0.66      1.00      0.79        19
           1       1.00      0.47      0.64        19

    accuracy                           0.74        38
   macro avg       0.83      0.74      0.72        38
weighted avg       0.83      0.74      0.72        38

KNN
              precision    recall  f1-score   support

           0       0.58      0.74      0.65        19
           1       0.64      0.47      0.55        19

    accuracy                           0.61        38
   macro avg       0.61      0.61      0.60        38
weighted avg       0.61      0.61      0.60        38





-------

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.50      1.00      0.67        19
           1       0.00      0.00      0.00        19

    accuracy                           0.50        38
   macro avg       0.25      0.50      0.33        38
weighted avg       0.25      0.50      0.33        38

SVM
              precision    recall  f1-score   support

           0       0.63      0.89      0.74        19
           1       0.82      0.47      0.60        19

    accuracy                           0.68        38
   macro avg       0.72      0.68      0.67        38
weighted avg       0.72      0.68      0.67        38

KNN
              precision    recall  f1-score   support

           0       0.71      0.79      0.75        19
           1       0.76      0.68      0.72        19

    accuracy                           0.74        38
   macro avg       0.74      0.74      0.74        38
weighted avg       0.74      0.74      0.74        38





-------

              precision    recall  f1-score   support

           0       0.73      0.84      0.78        19
           1       0.81      0.68      0.74        19

    accuracy                           0.76        38
   macro avg       0.77      0.76      0.76        38
weighted avg       0.77      0.76      0.76        38

SVM
              precision    recall  f1-score   support

           0       0.64      0.95      0.77        19
           1       0.90      0.47      0.62        19

    accuracy                           0.71        38
   macro avg       0.77      0.71      0.69        38
weighted avg       0.77      0.71      0.69        38

KNN
              precision    recall  f1-score   support

           0       0.64      0.95      0.77        19
           1       0.90      0.47      0.62        19

    accuracy                           0.71        38
   macro avg       0.77      0.71      0.69        38
weighted avg       0.77      0.71      0.69        38





-------

              precision    recall  f1-score   support

           0       0.64      0.84      0.73        19
           1       0.77      0.53      0.62        19

    accuracy                           0.68        38
   macro avg       0.70      0.68      0.68        38
weighted avg       0.70      0.68      0.68        38

SVM
              precision    recall  f1-score   support

           0       0.67      0.95      0.78        19
           1       0.91      0.53      0.67        19

    accuracy                           0.74        38
   macro avg       0.79      0.74      0.72        38
weighted avg       0.79      0.74      0.72        38

KNN
              precision    recall  f1-score   support

           0       0.69      0.95      0.80        19
           1       0.92      0.58      0.71        19

    accuracy                           0.76        38
   macro avg       0.80      0.76      0.75        38
weighted avg       0.80      0.76      0.75        38





-------

              precision    recall  f1-score   support

           0       0.64      0.84      0.73        19
           1       0.77      0.53      0.62        19

    accuracy                           0.68        38
   macro avg       0.70      0.68      0.68        38
weighted avg       0.70      0.68      0.68        38

SVM
              precision    recall  f1-score   support

           0       0.62      0.95      0.75        19
           1       0.89      0.42      0.57        19

    accuracy                           0.68        38
   macro avg       0.75      0.68      0.66        38
weighted avg       0.75      0.68      0.66        38

KNN
              precision    recall  f1-score   support

           0       0.65      0.89      0.76        19
           1       0.83      0.53      0.65        19

    accuracy                           0.71        38
   macro avg       0.74      0.71      0.70        38
weighted avg       0.74      0.71      0.70        38





-------

              precision    recall  f1-score   support

           0       0.63      0.89      0.74        19
           1       0.82      0.47      0.60        19

    accuracy                           0.68        38
   macro avg       0.72      0.68      0.67        38
weighted avg       0.72      0.68      0.67        38

SVM
              precision    recall  f1-score   support

           0       0.67      0.95      0.78        19
           1       0.91      0.53      0.67        19

    accuracy                           0.74        38
   macro avg       0.79      0.74      0.72        38
weighted avg       0.79      0.74      0.72        38

KNN
              precision    recall  f1-score   support

           0       0.64      0.95      0.77        19
           1       0.90      0.47      0.62        19

    accuracy                           0.71        38
   macro avg       0.77      0.71      0.69        38
weighted avg       0.77      0.71      0.69        38





-------

              precision    recall  f1-score   support

           0       0.63      0.89      0.74        19
           1       0.82      0.47      0.60        19

    accuracy                           0.68        38
   macro avg       0.72      0.68      0.67        38
weighted avg       0.72      0.68      0.67        38

SVM
              precision    recall  f1-score   support

           0       0.56      0.95      0.71        19
           1       0.83      0.26      0.40        19

    accuracy                           0.61        38
   macro avg       0.70      0.61      0.55        38
weighted avg       0.70      0.61      0.55        38

KNN
              precision    recall  f1-score   support

           0       0.54      0.74      0.62        19
           1       0.58      0.37      0.45        19

    accuracy                           0.55        38
   macro avg       0.56      0.55      0.54        38
weighted avg       0.56      0.55      0.54        38





-------

              precision    recall  f1-score   support

           0       0.68      0.89      0.77        19
           1       0.85      0.58      0.69        19

    accuracy                           0.74        38
   macro avg       0.76      0.74      0.73        38
weighted avg       0.76      0.74      0.73        38

SVM
              precision    recall  f1-score   support

           0       0.64      0.95      0.77        19
           1       0.90      0.47      0.62        19

    accuracy                           0.71        38
   macro avg       0.77      0.71      0.69        38
weighted avg       0.77      0.71      0.69        38

KNN
              precision    recall  f1-score   support

           0       0.67      0.95      0.78        19
           1       0.91      0.53      0.67        19

    accuracy                           0.74        38
   macro avg       0.79      0.74      0.72        38
weighted avg       0.79      0.74      0.72        38





-------

              precision    recall  f1-score   support

           0       0.61      0.74      0.67        19
           1       0.67      0.53      0.59        19

    accuracy                           0.63        38
   macro avg       0.64      0.63      0.63        38
weighted avg       0.64      0.63      0.63        38

SVM
              precision    recall  f1-score   support

           0       0.63      0.89      0.74        19
           1       0.82      0.47      0.60        19

    accuracy                           0.68        38
   macro avg       0.72      0.68      0.67        38
weighted avg       0.72      0.68      0.67        38

KNN
              precision    recall  f1-score   support

           0       0.57      0.63      0.60        19
           1       0.59      0.53      0.56        19

    accuracy                           0.58        38
   macro avg       0.58      0.58      0.58        38
weighted avg       0.58      0.58      0.58        38





-------

              precision    recall  f1-score   support

           0       0.57      0.68      0.62        19
           1       0.60      0.47      0.53        19

    accuracy                           0.58        38
   macro avg       0.58      0.58      0.57        38
weighted avg       0.58      0.58      0.57        38

SVM
              precision    recall  f1-score   support

           0       0.64      0.95      0.77        19
           1       0.90      0.47      0.62        19

    accuracy                           0.71        38
   macro avg       0.77      0.71      0.69        38
weighted avg       0.77      0.71      0.69        38

KNN
              precision    recall  f1-score   support

           0       0.58      0.58      0.58        19
           1       0.58      0.58      0.58        19

    accuracy                           0.58        38
   macro avg       0.58      0.58      0.58        38
weighted avg       0.58      0.58      0.58        38





-------

              precision    recall  f1-score   support

           0       0.64      0.84      0.73        19
           1       0.77      0.53      0.62        19

    accuracy                           0.68        38
   macro avg       0.70      0.68      0.68        38
weighted avg       0.70      0.68      0.68        38

SVM
              precision    recall  f1-score   support

           0       0.64      0.95      0.77        19
           1       0.90      0.47      0.62        19

    accuracy                           0.71        38
   macro avg       0.77      0.71      0.69        38
weighted avg       0.77      0.71      0.69        38

KNN
              precision    recall  f1-score   support

           0       0.60      0.79      0.68        19
           1       0.69      0.47      0.56        19

    accuracy                           0.63        38
   macro avg       0.65      0.63      0.62        38
weighted avg       0.65      0.63      0.62        38





-------

              precision    recall  f1-score   support

           0       0.64      0.84      0.73        19
           1       0.77      0.53      0.62        19

    accuracy                           0.68        38
   macro avg       0.70      0.68      0.68        38
weighted avg       0.70      0.68      0.68        38

SVM
              precision    recall  f1-score   support

           0       0.62      0.95      0.75        19
           1       0.89      0.42      0.57        19

    accuracy                           0.68        38
   macro avg       0.75      0.68      0.66        38
weighted avg       0.75      0.68      0.66        38

KNN
              precision    recall  f1-score   support

           0       0.65      0.89      0.76        19
           1       0.83      0.53      0.65        19

    accuracy                           0.71        38
   macro avg       0.74      0.71      0.70        38
weighted avg       0.74      0.71      0.70        38





-------

KeyboardInterrupt: 

###  PCA + Classification models

In [25]:
def pipeline_pca(data_pca):
    """Benchmark the baseline classifiers on a single, already assembled table.

    The table is split into a stratified 80/20 train/test partition and four
    models (RandomForest, AdaBoost, SVM, KNN) are fitted and evaluated with
    ``evaluation``.

    Parameters
    ----------
    data_pca : pandas.DataFrame
        Table exposing a ``'Groups'`` column holding ``'Incident'`` /
        ``'Non-case'``. Its first two columns are excluded from the features;
        every column from the third one onward is used as a feature.

    Raises
    ------
    ValueError
        If ``'Groups'`` contains a value other than ``'Incident'`` or
        ``'Non-case'`` (including missing values).
    """
    def _make_preprocessor():
        # A fresh instance per model: make_pipeline does not clone its steps, so a
        # single shared preprocessor would be refitted by every model in turn.
        return make_pipeline(PolynomialFeatures(2, include_bias=False), SelectKBest(f_classif, k=10))

    # Encode the target and fail early on labels that cannot be encoded
    code = {'Incident':1,'Non-case':0}
    y = data_pca['Groups'].map(code)
    if y.isna().any():
        raise ValueError("'Groups' must only contain 'Incident' or 'Non-case'")
    X = data_pca.iloc[:,2:]

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,test_size=0.2, random_state=0, stratify=y)

    # Create and evaluate models
    dict_models = {
        'RandomForest': make_pipeline(_make_preprocessor(), RandomForestClassifier(random_state=0)),
        'AdaBoost': make_pipeline(_make_preprocessor(), AdaBoostClassifier(random_state=0)),
        'SVM': make_pipeline(_make_preprocessor(), StandardScaler(), SVC(random_state=0)),
        'KNN': make_pipeline(_make_preprocessor(), StandardScaler(), KNeighborsClassifier()),
        # TODO: LogisticRegression is not wired in yet
    }

    for name, model in dict_models.items():
        print(name)
        evaluation(model, X_train, X_test, y_train, y_test,False)

    print(3*'\n')

In [26]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import ListedColormap
import cimcb_lite as cb


In [79]:
# Directory of the scaled peak tables, located next to the notebook directory.
# Built with os.path so that it does not depend on '/' being the separator;
# the empty last component keeps the trailing separator of the original value.
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'peakTable', 'scaled_peak_tables', '')

# Candidate tables; only the one selected in `filename` is used below.
f1="autoscaling-subset_multival_MICE_DecisionTreeRegressor.csv"
f2="l1_normalisation-X_KNN_features.csv"
f3="pareto_scaling-subset_multival_MICE_KNeighborsRegressor.csv"
f4="log2-X_python_MICE_KNeighborsRegressor.csv"

filename = os.path.join(path, f4)
data_nm = pd.read_csv(filename, index_col=0)

# Keep the scores of the samples on the first 5 principal components.
pca = PCA(n_components=5)
scores = pca.fit_transform(data_nm)
scores = pd.DataFrame(scores)

In [73]:
# Put the first columns of the original peak table in front of the PCA scores.
peakTable_pca = pd.concat(objs=[first_cols, scores], axis="columns")

In [74]:
# Evaluate every model on the PCA scores.
pipeline_pca(data_pca=peakTable_pca)

RandomForest
              precision    recall  f1-score   support

           0       0.60      0.79      0.68        19
           1       0.69      0.47      0.56        19

    accuracy                           0.63        38
   macro avg       0.65      0.63      0.62        38
weighted avg       0.65      0.63      0.62        38

AdaBoost
              precision    recall  f1-score   support

           0       0.60      0.79      0.68        19
           1       0.69      0.47      0.56        19

    accuracy                           0.63        38
   macro avg       0.65      0.63      0.62        38
weighted avg       0.65      0.63      0.62        38

SVM
              precision    recall  f1-score   support

           0       0.56      0.74      0.64        19
           1       0.62      0.42      0.50        19

    accuracy                           0.58        38
   macro avg       0.59      0.58      0.57        38
weighted avg       0.59      0.58      0.57     

### KNN on one dataset

In [80]:
# Directory of the scaled peak tables, located next to the notebook directory.
# Built with os.path so that it does not depend on '/' being the separator;
# the empty last component keeps the trailing separator of the original value.
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'peakTable', 'scaled_peak_tables', '')

# Candidate tables; only the one selected in `filename` is used below.
f1="autoscaling-subset_multival_MICE_DecisionTreeRegressor.csv"
f2="l1_normalisation-X_KNN_features.csv"
f3="pareto_scaling-subset_multival_MICE_KNeighborsRegressor.csv"
f4="log10-X_KNN_samples.csv"
filename = os.path.join(path, f2)
data_nm = pd.read_csv(filename, index_col=0)
# Put the first columns of the original peak table in front of the scaled table
data_nm = pd.concat([first_cols, data_nm], axis=1)

# Encode the target; the first two columns are excluded from the features
code = {'Incident':1,'Non-case':0}
y = data_nm['Groups']
y = y.map(code)
X = data_nm.iloc[:,2:]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,test_size=0.2, random_state=0, stratify=y)

# Create and evaluate the model
prepocessor = make_pipeline(PolynomialFeatures(2, include_bias=False), SelectKBest(f_classif, k=10))
KNN = make_pipeline(prepocessor, StandardScaler(), KNeighborsClassifier())

KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.79      0.68        19
           1       0.69      0.47      0.56        19

    accuracy                           0.63        38
   macro avg       0.65      0.63      0.62        38
weighted avg       0.65      0.63      0.62        38

