In [175]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import scipy

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, confusion_matrix, classification_report


In [177]:
url = 'Covid Data.csv'
covid = pd.read_csv("Covid Data.csv", low_memory = False)
covid.head()


Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97


In [178]:
covid.shape

(1048575, 21)

In [179]:
columns= [
    "SEX", "USMER", "PATIENT_TYPE", "PNEUMONIA", "DIABETES", 
    "COPD", "ASTHMA", "INMSUPR", "HIPERTENSION", "OTHER_DISEASE",
    "CARDIOVASCULAR", "OBESITY", "RENAL_CHRONIC", "TOBACCO"
]
covid = covid.loc[covid.CLASIFFICATION_FINAL < 4]
for column in columns:
    covid = covid.loc[(covid[column] == 1) | (covid[column] == 2)]

In [180]:
columns = [
    "SEX", "USMER", "PNEUMONIA", "DIABETES", "COPD", 
    "ASTHMA", "INMSUPR", "HIPERTENSION", "OTHER_DISEASE", 
    "CARDIOVASCULAR", "OBESITY", "RENAL_CHRONIC", "TOBACCO", 
    "PREGNANT", "INTUBED", "ICU"
]
for column in columns:
    covid[column] = covid[column].apply(lambda x: x if x == 1 else 0)
covid["PATIENT_TYPE"] = covid["PATIENT_TYPE"].apply(lambda x: 0 if x == 1 else 1)
covid["DATE_DIED"] = covid["DATE_DIED"].apply(lambda x: 0 if x == "9999-99-99" else 1)

In [181]:
covid.shape

(388878, 21)

In [182]:


covid['AT_RISK'] = covid['DATE_DIED'] + covid['INTUBED'] + covid['ICU']
covid.AT_RISK = covid.AT_RISK.apply(lambda x: 1 if x > 0 else 0) 

# Drop a few columns which are intuitively not longer useful
covid.drop(columns = ['CLASIFFICATION_FINAL', 'INTUBED', 'ICU', 'DATE_DIED'], inplace=True)



In [185]:
print(covid.info())

<class 'pandas.core.frame.DataFrame'>
Index: 388878 entries, 0 to 1047937
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   USMER           388878 non-null  int64
 1   MEDICAL_UNIT    388878 non-null  int64
 2   SEX             388878 non-null  int64
 3   PATIENT_TYPE    388878 non-null  int64
 4   PNEUMONIA       388878 non-null  int64
 5   AGE             388878 non-null  int64
 6   PREGNANT        388878 non-null  int64
 7   DIABETES        388878 non-null  int64
 8   COPD            388878 non-null  int64
 9   ASTHMA          388878 non-null  int64
 10  INMSUPR         388878 non-null  int64
 11  HIPERTENSION    388878 non-null  int64
 12  OTHER_DISEASE   388878 non-null  int64
 13  CARDIOVASCULAR  388878 non-null  int64
 14  OBESITY         388878 non-null  int64
 15  RENAL_CHRONIC   388878 non-null  int64
 16  TOBACCO         388878 non-null  int64
 17  AT_RISK         388878 non-null  int64
dtypes: int64

In [191]:
covid['AT_RISK'].value_counts()

AT_RISK
0    328899
1     59979
Name: count, dtype: int64

In [193]:
train, validation = train_test_split(covid, test_size=0.20, shuffle=True)
validation, test = train_test_split(validation, test_size=0.5, shuffle=True)

validation_y = validation.AT_RISK.to_numpy()
validation_x = validation.drop(columns = ['AT_RISK']).to_numpy()

test_y = test.AT_RISK.to_numpy()
test_x = test.drop(columns = ['AT_RISK']).to_numpy()


In [195]:
train.shape

(311102, 18)

In [197]:
validation.shape

(38888, 18)

In [202]:
part_train, _ = train_test_split(train, test_size=0.9, shuffle=True)


In [204]:
atrisk = part_train[part_train.AT_RISK==1][:2000]
nonrisk = part_train[part_train.AT_RISK==0][:2000]
part_traint = pd.concat([nonrisk, atrisk])
train_y = part_traint.AT_RISK.to_numpy()
train_x = part_traint.drop(columns = ['AT_RISK']).to_numpy()

In [206]:
train_y.shape

(4000,)

In [208]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

In [210]:
def evaluation(model):
    
    model.fit(train_x, train_y)
    ypred = model.predict(test_x)
    
    print(confusion_matrix(test_y, ypred))
    print(classification_report(test_y, ypred))
   
    
    
    
   
    

In [213]:
preprocessor = make_pipeline(PolynomialFeatures(2, include_bias=False), SelectKBest(f_classif, k=10))

In [216]:
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))

In [218]:
evaluation(SVM)

  f = msb / msw


[[27024  5954]
 [  250  5660]]
              precision    recall  f1-score   support

           0       0.99      0.82      0.90     32978
           1       0.49      0.96      0.65      5910

    accuracy                           0.84     38888
   macro avg       0.74      0.89      0.77     38888
weighted avg       0.91      0.84      0.86     38888



In [220]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [222]:
SVM

In [230]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Création du pipeline de prétraitement
preprocessor = make_pipeline(
    PolynomialFeatures(2, include_bias=False), 
    SelectKBest(f_classif, k=10)
)

# Création du pipeline complet avec SVM
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))

# Définition de la grille des hyperparamètres à tester
param_grid = {
    
    'svc__C': [0.1, 1, 10],  # Hyperparamètre pour SVC
    'svc__kernel': ['linear', 'rbf'],  # Type de noyau pour SVC
}

# Configuration de GridSearchCV
grid_search = GridSearchCV(SVM, param_grid, cv=5, scoring='f1', n_jobs=-1)

# Entraînement du modèle avec GridSearchCV
grid_search.fit(train_x, train_y)

# Meilleur modèle et ses paramètres
best_model = grid_search.best_estimator_
 
# Affichage des meilleurs paramètres
print("Meilleurs paramètres ( SVM ) : ", grid_search.best_params_)
y_pred = grid_search.predict(test_x)
print("\nMatrice de confusion :\n", confusion_matrix(test_y, y_pred))


print(classification_report(test_y, y_pred))
print("\nF1-Score : ", f1_score(test_y, y_pred, average='weighted'))

  f = msb / msw


Meilleurs paramètres ( SVM ) :  {'svc__C': 10, 'svc__kernel': 'rbf'}

Matrice de confusion :
 [[27436  5542]
 [  303  5607]]
              precision    recall  f1-score   support

           0       0.99      0.83      0.90     32978
           1       0.50      0.95      0.66      5910

    accuracy                           0.85     38888
   macro avg       0.75      0.89      0.78     38888
weighted avg       0.92      0.85      0.87     38888


F1-Score :  0.8662919472105224


In [225]:
evaluation(grid_search.best_estimator_)

  f = msb / msw


[[27436  5542]
 [  303  5607]]
              precision    recall  f1-score   support

           0       0.99      0.83      0.90     32978
           1       0.50      0.95      0.66      5910

    accuracy                           0.85     38888
   macro avg       0.75      0.89      0.78     38888
weighted avg       0.92      0.85      0.87     38888



In [226]:

# Création du pipeline complet
pipeline = Pipeline([
    ('selectkbest', SelectKBest(score_func=f_classif)),              
    ('scaler', StandardScaler()),                                  
    ('naive_bayes', GaussianNB())                                    
])
# Définition de la grille des hyperparamètres
param_grid = {
    'selectkbest__k': [1,3,2,5, 10, 15],            # Nombre de meilleures caractéristiques à sélectionner
    'naive_bayes__priors': [[0.6, 0.4], [0.5, 0.5], [0.4, 0.6]]  # Probabilités a priori pour les classes
}

# Configuration de GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline, 
    param_grid=param_grid, 
    cv=5,  # Validation croisée à 5 plis
    scoring='f1',  # Métrique F1
    n_jobs=-1      # Utilisation de tous les cœurs disponibles
)

# Entraînement du modèle avec GridSearchCV
grid_search.fit(train_x, train_y)

# Meilleur modèle et ses paramètres
best_model = grid_search.best_estimator_

# Affichage des meilleurs paramètres
print("Meilleurs paramètres (Naïve Bayes) : ", grid_search.best_params_)

# Prédictions avec le meilleur modèle
y_pred = best_model.predict(test_x)

# Évaluation des performances
print("\nF1-Score : ", f1_score(test_y, y_pred, average='weighted'))
print("\nMatrice de confusion :\n", confusion_matrix(test_y, y_pred))
print("\nRapport de classification :\n", classification_report(test_y, y_pred))


Meilleurs paramètres (Naïve Bayes) :  {'naive_bayes__priors': [0.4, 0.6], 'selectkbest__k': 5}

F1-Score :  0.8747351282742788

Matrice de confusion :
 [[28065  4913]
 [  500  5410]]

Rapport de classification :
               precision    recall  f1-score   support

           0       0.98      0.85      0.91     32978
           1       0.52      0.92      0.67      5910

    accuracy                           0.86     38888
   macro avg       0.75      0.88      0.79     38888
weighted avg       0.91      0.86      0.87     38888

