# Importation des librairies

In [None]:
# NUMPY
import numpy as np

# STATS
import scipy.stats as stats
from scipy.stats import norm, skew

# MATPLOTLIB
import matplotlib as mlp
import matplotlib.pyplot as plt
%matplotlib inline 
# plt.style.use('fivethirtyeight') 

# PANDAS
import pandas as pd 
pd.set_option("display.max_rows", None, "display.max_columns", None) 

# SEABORN
import seaborn as sns

# SCIKIT-LEARN: PRE-PROCESSING
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder # encodage des variables catégorielles ordinales
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder # encodage des variables catégorielles nominales
from sklearn.preprocessing import StandardScaler # standardisation des variables numériques
from sklearn.preprocessing import MinMaxScaler # normalisation des variables numériques
from sklearn.preprocessing import RobustScaler # normalisation des variables numériques
from sklearn.impute import SimpleImputer # imputation des valeurs manquantes
from sklearn.impute import KNNImputer # imputation des valeurs manquantes par la méthode KNN
from sklearn.feature_selection  import SelectKBest # sélectionner 
from sklearn. preprocessing import PolynomialFeatures 

# MODELES PREDICTIFS

## REGRESSION
from sklearn.linear_model import LogisticRegression # régréssion logistique

## SVM 
from sklearn.svm import LinearSVC # machines à vecteurs de support (linéaire)
from sklearn.svm import SVC # machines à vecteurs de support (non-linéaire)

## SGD
from sklearn.linear_model import SGDClassifier #  classifieurs (SVM, régression logistique, etc.) avec un algorithme SGD

## ARBRES, FORETS, APPRENTISSAGE D'ENSEMBLE
from sklearn.tree import DecisionTreeClassifier # arbres classification
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier 

## KNN
from sklearn.neighbors import KNeighborsClassifier # KPP voisins

# VALIDATION CROISEE + OPTIMISATION
from sklearn.model_selection import train_test_split # séparation des données en train et test set
from sklearn.model_selection import cross_val_score # validation croisée pour comparaison entre modèles
from sklearn.model_selection import validation_curve # courbe de validation: visulaisr les scores lors du choix d'un hyperparamétre
from sklearn.model_selection import GridSearchCV # tester plusieurs hyperparamètres
from sklearn.model_selection import RandomizedSearchCV # tester arbitrairement plusieurs hyperparamètres
from sklearn.model_selection import learning_curve # courbe d'apprentissage: visualisation les scores du train et du validation sets en fonction des quanitiés des données
 
## EVALUATION: METRIQUES DE CLASSIFICATION
from sklearn.metrics import accuracy_score # exactitude (accuracy)
from sklearn.metrics import f1_score # F1-score
from sklearn.metrics import confusion_matrix # matrice de confusion
from sklearn.metrics import plot_confusion_matrix # graphique de la matrice de confusion
from sklearn.metrics import classification_report # rapport pour le modèle de classification

## EVALUATION: COURBE ROC
from sklearn.metrics import auc # aire sous la courbe 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import precision_recall_curve #
from sklearn.metrics import f1_score, recall_score


# PIPELINE
from sklearn.pipeline import make_pipeline

# TRANSFORMATEUR COMPOSITE (PRE-PROCESSOR + MODELE)
from sklearn.compose import make_column_transformer


# WARNINGS
import warnings
warnings.filterwarnings('ignore')

# Chargement des données


In [None]:
data_covid = pd.read_excel('../data/covid-19.xlsx', engine='openpyxl')
df = data_covid.copy()

In [None]:
df.head()

In [None]:
# information
df.info()

# Nettoyage général

In [None]:
# Per-column fraction of missing values: NaN count divided by the row count.
missing_rate = df.isna().sum() / len(df)
# missing_rate.sort_values(ascending=True)

In [None]:
# Columns whose missing-value rate lies strictly between 75% and 80%
# (kept as the viral-test group).
cols_tests_viraux = df.columns[(missing_rate > 0.75) & (missing_rate < 0.80)].tolist()
# Columns whose missing-value rate lies strictly between 88% and 90%
# (kept as the blood-measurement group).
cols_taux_sanguins = df.columns[(missing_rate > 0.88) & (missing_rate < 0.9)].tolist()

In [None]:
cols_age_cible = ['Patient age quantile', 'SARS-Cov-2 exam result']

In [None]:
# Restrict the dataframe to the age/target columns plus the two feature groups
# selected above (same as list concatenation: ['a'] + ['b'] == ['a', 'b']).
df = df[[*cols_age_cible, *cols_taux_sanguins, *cols_tests_viraux]]
df.info()

# Démarches de travail  pour tester différentes idées du pre-processing: TrainTest - Nettoyage - Encodage (Test 0)


## Train set , Test set 

In [None]:
trainset, testset = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
trainset.shape, testset.shape

In [None]:
trainset['SARS-Cov-2 exam result'].value_counts()

In [None]:
testset['SARS-Cov-2 exam result'].value_counts()

## Encodage

In [None]:
for col in df.select_dtypes('object'):
    print(f'{col :-<50} {df[col].unique()}') # créer un sytème de marge

In [None]:
code = {'positive': 1,
       'negative': 0,
       'detected': 1,
       'not_detected': 0}

In [None]:
for col in df.select_dtypes('object'):
    df[col] = df[col].map(code)

In [None]:
df.head()

In [None]:
df.dtypes.value_counts() # pas de variables de type object dans notre dataset

In [None]:
# Encoding helper
def encodage(df):  # works for either the train set or the test set
    """Return a copy of ``df`` with its object-dtype columns encoded as 0/1.

    The labels 'positive' and 'detected' become 1; 'negative' and
    'not_detected' become 0. Any other value found in an object column is
    not in the mapping and therefore becomes NaN (``Series.map`` behaviour).

    The input dataframe is left untouched: the encoding is applied to a copy,
    so the caller's train/test frames are never modified as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose object-dtype columns hold the labels above.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same columns, object columns encoded.
    """
    code = {'positive': 1,
            'negative': 0,
            'detected': 1,
            'not_detected': 0}
    # Work on a copy so the dataframe passed by the caller is not mutated.
    df = df.copy()
    for col in df.select_dtypes('object'):
        df[col] = df[col].map(code)
    return df

In [None]:
# Cleaning helper
def nettoyage(df):
    """Return ``df`` without the rows that contain at least one missing value.

    Deliberately the simplest possible cleaning strategy: no imputation,
    incomplete rows are just discarded.
    """
    cleaned = df.dropna(axis='index', how='any')
    return cleaned

In [None]:
# Preprocessing helper
def preprocessing(df):
    """Encode and clean ``df``, then split it into features and target.

    Runs ``encodage`` followed by ``nettoyage``, prints the class counts of
    the target column and returns ``(X, y)`` where ``y`` is the
    'SARS-Cov-2 exam result' column and ``X`` holds every other column.
    """
    target = 'SARS-Cov-2 exam result'
    prepared = nettoyage(encodage(df))
    y = prepared[target]
    X = prepared.drop(columns=target)
    # Show the class balance left after the incomplete rows were dropped.
    print(y.value_counts())
    return X, y

In [None]:
X_train, y_train = preprocessing(trainset)

In [None]:
X_test, y_test = preprocessing(testset)

## Modélisation - Évaluation 

In [None]:
treeModel = DecisionTreeClassifier(random_state=0)

In [None]:
def evaluation(model):
    """Fit ``model``, print its test-set metrics and plot its learning curve.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The model is fitted in place on the training data, the
    confusion matrix and classification report are printed for the test
    predictions, and a 5-fold learning curve (F1 score, 10 training sizes
    from 10% to 100%) is drawn on a new figure.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    for report in (confusion_matrix, classification_report):
        print(report(y_test, predictions))

    sizes, train_scores, val_scores = learning_curve(
        model,
        X_train,
        y_train,
        cv=5,
        scoring='f1',
        train_sizes=np.linspace(0.1, 1, 10),
    )

    plt.figure(figsize=(12, 8))
    # Average the per-fold scores to get one point per training size.
    for scores, label in ((train_scores, 'train_score'),
                          (val_scores, 'validation score')):
        plt.plot(sizes, scores.mean(axis=1), label=label)
    plt.legend()

In [None]:
evaluation(treeModel)


In [None]:
treeModel.feature_importances_

In [None]:
pd.DataFrame(treeModel.feature_importances_, index=X_train.columns).plot.bar(figsize=(12,8))
plt.show()

# TrainTest - Nettoyage - Encodage (Test 1)

In [None]:
df1 = df[cols_age_cible + cols_taux_sanguins]# + cols_tests_viraux] # ['a'] + ['b'] = ['a', 'b']

In [None]:
trainset1, testset1 = train_test_split(df1, test_size=0.2, random_state=0)

In [None]:
X_train1, y_train1 = preprocessing(trainset1)
X_test1, y_test1 = preprocessing(testset1)

In [None]:
def evaluation1(model):
    """Same report as ``evaluation`` but on the "Test 1" data split.

    Uses the notebook-level ``X_train1``, ``y_train1``, ``X_test1`` and
    ``y_test1``: fits ``model`` in place, prints the confusion matrix and the
    classification report for the test predictions, then plots a 5-fold
    learning curve scored with F1 over 10 training sizes (10% to 100%).
    """
    model.fit(X_train1, y_train1)
    predicted = model.predict(X_test1)
    print(confusion_matrix(y_test1, predicted))
    print(classification_report(y_test1, predicted))

    curve_settings = dict(cv=5,
                          scoring='f1',
                          train_sizes=np.linspace(0.1, 1, 10))
    n_samples, fold_train, fold_val = learning_curve(model, X_train1, y_train1,
                                                     **curve_settings)

    # One mean score per training size (mean over the CV folds).
    mean_train = fold_train.mean(axis=1)
    mean_val = fold_val.mean(axis=1)

    plt.figure(figsize=(12, 8))
    plt.plot(n_samples, mean_train, label='train_score')
    plt.plot(n_samples, mean_val, label='validation score')
    plt.legend()

In [None]:
# Evaluate on the "Test 1" split (X_train1 / X_test1, without the viral-test
# columns). Calling evaluation() here would just repeat the Test 0 run.
evaluation1(treeModel)

In [None]:
rdforestModel = RandomForestClassifier(random_state=0) 

In [None]:
evaluation(rdforestModel)

In [None]:
rdforestModel.feature_importances_

In [None]:
# on va injecter ce tableau dans un dataframe
pd.DataFrame(rdforestModel.feature_importances_, index=X_train.columns).plot.bar(figsize=(12,8))
plt.show()

In [None]:
def feature_engineering(df):
    """Replace the viral-test columns with a single 'etre_malde' flag.

    The flag is True when the row-wise sum of the ``cols_tests_viraux``
    columns is at least 1, i.e. the patient has at least one detected viral
    infection. Missing values are skipped by the sum (pandas default), so a
    row with no viral test result gets False rather than NaN.

    The input dataframe is not modified; a new dataframe is returned with the
    viral-test columns removed and 'etre_malde' added.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded dataframe containing every column in ``cols_tests_viraux``.

    Returns
    -------
    pandas.DataFrame
        New dataframe without the viral-test columns, with 'etre_malde'.
    """
    has_viral_infection = df[cols_tests_viraux].sum(axis=1) >= 1
    # drop() and assign() both return new frames, so the caller's dataframe
    # does not silently gain an extra column.
    return df.drop(cols_tests_viraux, axis=1).assign(etre_malde=has_viral_infection)


In [None]:
def preprocessing(df):
    """Encode, feature-engineer and clean ``df``, then split features/target.

    Pipeline: ``encodage`` -> ``feature_engineering`` -> ``nettoyage``.
    Prints the class counts of the target and returns ``(X, y)`` where ``y``
    is the 'SARS-Cov-2 exam result' column and ``X`` is every other column.
    """
    target = 'SARS-Cov-2 exam result'
    prepared = df
    # Order matters: the viral-test flag is built before incomplete rows are
    # dropped.
    for step in (encodage, feature_engineering, nettoyage):
        prepared = step(prepared)
    y = prepared[target]
    X = prepared.drop(columns=target)
    print(y.value_counts())
    return X, y

In [None]:
X_train, y_train = preprocessing(trainset)
X_test, y_test = preprocessing(testset)

In [None]:
evaluation(rdforestModel)

In [None]:
pd.DataFrame(rdforestModel.feature_importances_, index=X_train.columns).plot.bar(figsize=(12,8))
plt.show()

In [None]:
# on va faire un pipeline
from sklearn.feature_selection  import SelectKBest, f_classif

In [None]:
model = make_pipeline(SelectKBest(f_classif, k=10), 
                     RandomForestClassifier(random_state=0))


In [None]:
evaluation(model)

In [None]:
model = make_pipeline(SelectKBest(f_classif, k=5), 
                     RandomForestClassifier(random_state=0))
evaluation(model)

In [None]:
# include_bias=False: skip the constant all-ones column, which carries no
# information for f_classif / SelectKBest (same setting as the shared
# `preprocessor` pipeline used for the model comparison).
model = make_pipeline(PolynomialFeatures(2, include_bias=False),
                      SelectKBest(f_classif, k=10),
                      RandomForestClassifier(random_state=0))
evaluation(model)

In [None]:
preprocessor = make_pipeline(PolynomialFeatures(2, include_bias=False), SelectKBest(f_classif, k=10))

# on laisse les modèles sur leurs hyperpramètres de base
RandomForest = make_pipeline(preprocessor, RandomForestClassifier(random_state=0))
AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))
KNN = make_pipeline(preprocessor, StandardScaler(), KNeighborsClassifier())

In [None]:
list_of_models = {'RandomForest':RandomForest,
                  'AdaBoost':AdaBoost,
                  'SVM':SVM,
                  'KNN':KNN}

In [None]:
for name, model in list_of_models.items():
    print('*' *90)
    print(name)
    print('*' *90)
    evaluation(model)

## Optimisation


In [None]:
SVM

In [None]:
hyper_params = {'svc__gamma':[1e-3, 1e-4], 
                'svc__C':[1, 10, 100, 1000]} # attention ne mettre pas d'espace

In [None]:
grid = GridSearchCV(SVM, hyper_params, scoring='recall', cv=5)


In [None]:
grid.fit(X_train, y_train)
print(grid.best_params_)
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred)) 

In [None]:
evaluation(grid.best_estimator_)

In [None]:
hyper_params = {'svc__gamma':[1e-3, 1e-4], 
                'svc__C':[1, 10, 100, 1000],
                'pipeline__polynomialfeatures__degree':[2, 3],
                'pipeline__selectkbest__k':range(50, 70)} 

In [None]:
# Seed the sampler so the 50 sampled hyper-parameter combinations (and hence
# the selected model) are identical on every run, like the other estimators
# in this notebook that use random_state=0.
random_grid = RandomizedSearchCV(SVM, hyper_params, scoring='recall', cv=5,
                                 n_iter=50, random_state=0)
random_grid.fit(X_train, y_train)
print(random_grid.best_params_)
y_pred = random_grid.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
evaluation(random_grid.best_estimator_)

# Courbe Précision / Recall
# seuil de la frontière de décision pour notre modèle 


In [None]:
precision, recall, threshold = \
precision_recall_curve(y_test, random_grid.best_estimator_.decision_function(X_test))

In [None]:
plt.figure(figsize=(10,6))
plt.plot(threshold, precision[:-1], label='precision')
plt.plot(threshold, recall[:-1], label='recall')
plt.legend()

In [None]:
def model_final(model, X, threshold=0):
    """Predict with a custom decision threshold.

    Computes ``model.decision_function(X)`` and flags as positive every
    sample whose score is strictly greater than ``threshold``.

    Parameters
    ----------
    model : fitted estimator exposing ``decision_function``
    X : samples to score
    threshold : cut-off applied to the decision scores (default 0)

    Returns
    -------
    The element-wise result of ``scores > threshold``.
    """
    scores = model.decision_function(X)
    return scores > threshold

In [None]:
y_pred = model_final(random_grid.best_estimator_, X_test, threshold=-1)

In [None]:
f1_score(y_test, y_pred)

In [None]:
recall_score(y_test, y_pred)

In [None]:
random_grid.best_estimator_.decision_function(X_test).shape

## Optimisation Adaboost 

In [None]:
AdaBoost

In [None]:
# Example AdaBoost configuration:
# AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200,
#                    algorithm="SAMME.R", learning_rate=0.5, random_state=42)

# Default signature:
# AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0,
#                    algorithm='SAMME.R', random_state=None)

In [None]:
hyper_params = {'adaboostclassifier__n_estimators':[60, 100], 
                'adaboostclassifier__learning_rate':[1e-1, 1]} # attention ne mettre pas d'espace

In [None]:
grid = GridSearchCV(AdaBoost, hyper_params, scoring='recall', cv=5)

In [None]:
grid.fit(X_train, y_train)
print(grid.best_params_)
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred)) # l'ordre est tres important sinon tous les calculs sont inversés
# dans la matrice de confusion 

In [None]:
evaluation(grid.best_estimator_)

In [None]:
AdaBoost

In [None]:
hyper_params = { 
                'adaboostclassifier__base_estimator__max_depth':[1,2],
                'adaboostclassifier__n_estimators':[1, 2, 3, 4, 5], 
                'adaboostclassifier__learning_rate':[2],
                'pipeline__polynomialfeatures__degree':[2, 3],
                'pipeline__selectkbest__k':range(50, 60)} # attention ne mettre pas d'espace  # 50, 70
# si on fait çà avec gridsearch cv on est mort

In [None]:
# Seed the sampler so the 50 sampled hyper-parameter combinations (and hence
# the selected model) are identical on every run, like the other estimators
# in this notebook that use random_state=0.
random_grid = RandomizedSearchCV(AdaBoost, hyper_params, scoring='recall', cv=5,
                                 n_iter=50, random_state=0)
random_grid.fit(X_train, y_train)
print(random_grid.best_params_)
y_pred = random_grid.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# AdaBoost.get_params()

In [None]:
evaluation(random_grid.best_estimator_)