In [25]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import scipy

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

In [27]:
url = 'Covid Data.csv'
covid = pd.read_csv("Covid Data.csv", low_memory = False)
covid.head()


Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97


In [28]:
covid.shape

(1048575, 21)

# Encodage et suppression des valeurs manquante

In [30]:
columns= [
    "SEX", "USMER", "PATIENT_TYPE", "PNEUMONIA", "DIABETES", 
    "COPD", "ASTHMA", "INMSUPR", "HIPERTENSION", "OTHER_DISEASE",
    "CARDIOVASCULAR", "OBESITY", "RENAL_CHRONIC", "TOBACCO"
]
covid = covid.loc[covid.CLASIFFICATION_FINAL < 4]
for column in columns:
    covid = covid.loc[(covid[column] == 1) | (covid[column] == 2)]

In [31]:
columns = [
    "SEX", "USMER", "PNEUMONIA", "DIABETES", "COPD", 
    "ASTHMA", "INMSUPR", "HIPERTENSION", "OTHER_DISEASE", 
    "CARDIOVASCULAR", "OBESITY", "RENAL_CHRONIC", "TOBACCO", 
    "PREGNANT", "INTUBED", "ICU"
]
for column in columns:
    covid[column] = covid[column].apply(lambda x: x if x == 1 else 0)
covid["PATIENT_TYPE"] = covid["PATIENT_TYPE"].apply(lambda x: 0 if x == 1 else 1)
covid["DATE_DIED"] = covid["DATE_DIED"].apply(lambda x: 0 if x == "9999-99-99" else 1)

In [32]:
covid.shape

(388878, 21)

# feature engeniring


In [36]:


covid['AT_RISK'] = covid['DATE_DIED'] + covid['INTUBED'] + covid['ICU']
covid.AT_RISK = covid.AT_RISK.apply(lambda x: 1 if x > 0 else 0) 

# Drop a few columns which are intuitively not longer useful
covid.drop(columns = ['CLASIFFICATION_FINAL', 'INTUBED', 'ICU', 'DATE_DIED'], inplace=True)



In [37]:
print(covid.info())

<class 'pandas.core.frame.DataFrame'>
Index: 388878 entries, 0 to 1047937
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   USMER           388878 non-null  int64
 1   MEDICAL_UNIT    388878 non-null  int64
 2   SEX             388878 non-null  int64
 3   PATIENT_TYPE    388878 non-null  int64
 4   PNEUMONIA       388878 non-null  int64
 5   AGE             388878 non-null  int64
 6   PREGNANT        388878 non-null  int64
 7   DIABETES        388878 non-null  int64
 8   COPD            388878 non-null  int64
 9   ASTHMA          388878 non-null  int64
 10  INMSUPR         388878 non-null  int64
 11  HIPERTENSION    388878 non-null  int64
 12  OTHER_DISEASE   388878 non-null  int64
 13  CARDIOVASCULAR  388878 non-null  int64
 14  OBESITY         388878 non-null  int64
 15  RENAL_CHRONIC   388878 non-null  int64
 16  TOBACCO         388878 non-null  int64
 17  AT_RISK         388878 non-null  int64
dtypes: int64

In [38]:
covid['AT_RISK'].value_counts()

AT_RISK
0    328899
1     59979
Name: count, dtype: int64

In [39]:
train, validation = train_test_split(covid, test_size=0.1, shuffle=True)
validation, test = train_test_split(validation, test_size=0.5, shuffle=True)

validation_y = validation.AT_RISK.to_numpy()
validation_x = validation.drop(columns = ['AT_RISK']).to_numpy()

test_y = test.AT_RISK.to_numpy()
test_x = test.drop(columns = ['AT_RISK']).to_numpy()


## Calculate results
### function that returns F-measure (F1-score) for given data and labels

In [42]:
# Fonction d'evaluation
def get_f_measure(predictions, labels):
    """
    Calculate the F-measure (F1-score) based on predictions and true labels.

    Parameters:
    - predictions: numpy array of predicted labels (binary: 0 or 1).
    - labels: numpy array of true labels (binary: 0 or 1).

    Returns:
    - f_measure: The F1-score, a harmonic mean of precision and recall.
    """
    # Calculate true positives, false positives, and false negatives
    true_positive = np.sum((predictions == 1) & (labels == 1))
    false_positive = np.sum((predictions == 1) & (labels == 0))
    false_negative = np.sum((predictions == 0) & (labels == 1))

    # Avoid division by zero with small epsilon
    epsilon = 1e-10

    # Calculate precision and recall
    precision = true_positive / (true_positive + false_positive + epsilon)
    recall = true_positive / (true_positive + false_negative + epsilon)

    # Calculate F1-score
    f_measure = 2 * (precision * recall) / (precision + recall + epsilon)

    return f_measure


### Umdersamplinfg the train set

In [44]:
atrisk = train[train.AT_RISK==1][:2000]
nonrisk = train[train.AT_RISK==0][:2000]
part_train = pd.concat([nonrisk, atrisk])
train_y = part_train.AT_RISK.to_numpy()
train_x = part_train.drop(columns = ['AT_RISK']).to_numpy()

In [47]:
train_y.shape

(4000,)

In [52]:
train_x.shape

(4000, 17)

## Implementation de KNN

In [70]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import classification_report, accuracy_score

# # Initialiser le classifieur KNN avec 5 voisins
# knn_model = KNeighborsClassifier(n_neighbors=5)

# # Entraîner le modèle sur l'ensemble d'entraînement
# knn_model.fit(train_x, train_y)

# # Prédire sur l'ensemble de validation
# validation_predictions = knn_model.predict(validation_x)

# # Évaluer les performances sur l'ensemble de validation
# print("Rapport de classification (validation) :")
# print(classification_report(validation_y, validation_predictions))
# print("Précision globale (validation) :", accuracy_score(validation_y, validation_predictions))

# # Prédire sur l'ensemble de test
# test_predictions = knn_model.predict(test_x)

# # Évaluer les performances sur l'ensemble de test
# print("Rapport de classification (test) :")
# print(classification_report(test_y, test_predictions))
# print("Précision globale (test) :", accuracy_score(test_y, test_predictions))


### Implementation 2 de KNN avec une fonction pour savoir la meilleure valeur de K possible pour l'algo

In [77]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import classification_report, accuracy_score, f1_score
# import numpy as np

# # Standardisation des données
# scaler = StandardScaler()

# # Ajuster le scaler sur les données d'entraînement et transformer
# train_x = scaler.fit_transform(train_x)

# # Transformer les ensembles de validation et de test
# validation_x = scaler.transform(validation_x)
# test_x = scaler.transform(test_x)

# # Fonction pour tester différentes valeurs de k
# def find_best_k(train_x, train_y, validation_x, validation_y, k_values):
#     best_k = None
#     best_accuracy = 0
#     accuracies = []

#     print("Test des différentes valeurs de k :")
#     for k in k_values:
#         # Initialiser le modèle KNN avec k voisins
#         knn_model = KNeighborsClassifier(n_neighbors=k)
        
#         # Entraîner le modèle
#         knn_model.fit(train_x, train_y)
        
#         # Prédire sur l'ensemble de validation
#         validation_predictions = knn_model.predict(validation_x)
        
#         # Calculer la précision
#         accuracy = accuracy_score(validation_y, validation_predictions)
#         accuracies.append(accuracy)

#         print(f"k = {k}, Précision = {accuracy:.4f}")

#         # Mettre à jour la meilleure valeur de k si nécessaire
#         if accuracy > best_accuracy:
#             best_accuracy = accuracy
#             best_k = k

#     return best_k, best_accuracy, accuracies

# # Liste des valeurs de k à tester
# k_values = range(1, 21)  # Par exemple, tester de 1 à 20 voisins

# # Trouver la meilleure valeur de k
# best_k, best_accuracy, accuracies = find_best_k(train_x, train_y, validation_x, validation_y, k_values)

# print(f"\nLa meilleure valeur de k est {best_k} avec une précision de {best_accuracy:.4f}.")

# # Entraîner le modèle KNN avec la meilleure valeur de k et évaluer sur l'ensemble de test
# knn_model = KNeighborsClassifier(n_neighbors=best_k)
# knn_model.fit(train_x, train_y)

# # Prédictions sur l'ensemble de test
# test_predictions = knn_model.predict(test_x)

# # Évaluation des performances
# print("\nRapport de classification (test) :")
# print(classification_report(test_y, test_predictions))
# print("Précision globale (test) :", accuracy_score(test_y, test_predictions))
# print("F1-score (test) :", f1_score(test_y, test_predictions))

# # Tracer les précisions en fonction de k (optionnel)
# plt.figure(figsize=(10, 6))
# plt.plot(k_values, accuracies, marker='o', linestyle='-', color='b')
# plt.title("Précision sur l'ensemble de validation en fonction de k")
# plt.xlabel("Nombre de voisins (k)")
# plt.ylabel("Précision")
# plt.grid(True)
# plt.show()


## Implementation  de KNN avec GridSearchCV

In [65]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Standardisation des données
scaler = StandardScaler()

# Ajuster le scaler sur les données d'entraînement et transformer
train_x = scaler.fit_transform(train_x)

# Transformer les ensembles de validation et de test
validation_x = scaler.transform(validation_x)
test_x = scaler.transform(test_x)

# Définir le modèle KNN
knn_model = KNeighborsClassifier()

# Définir les paramètres à tester (ici les valeurs de k)
param_grid = {'n_neighbors': range(1, 21)}  # Tester les k de 1 à 20

# Initialiser GridSearchCV
grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Effectuer la recherche sur l'ensemble d'entraînement
grid_search.fit(train_x, train_y)

# Meilleure valeur de k et meilleure précision
best_k = grid_search.best_params_['n_neighbors']
best_accuracy = grid_search.best_score_

print(f"Meilleure valeur de k : {best_k}")
print(f"Précision moyenne sur la validation (cross-validation) : {best_accuracy:.4f}")

# Évaluer le modèle final sur l'ensemble de test
final_model = grid_search.best_estimator_
test_predictions = final_model.predict(test_x)

print("Matrice de confusion\n", confusion_matrix(test_y, test_predictions))

print("\nRapport de classification (test) :")
print(classification_report(test_y, test_predictions))
print("Précision globale (test) :", accuracy_score(test_y, test_predictions))
print("F1-score (test) :", f1_score(test_y, test_predictions))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Meilleure valeur de k : 11
Précision moyenne sur la validation (cross-validation) : 0.8820
Matrice de confusion
 [[13624  2871]
 [  209  2740]]

Rapport de classification (test) :
              precision    recall  f1-score   support

           0       0.98      0.83      0.90     16495
           1       0.49      0.93      0.64      2949

    accuracy                           0.84     19444
   macro avg       0.74      0.88      0.77     19444
weighted avg       0.91      0.84      0.86     19444

Précision globale (test) : 0.8415963793458137
F1-score (test) : 0.6401869158878505


## Implementation Regression Logistique 

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Standardisation des données
scaler = StandardScaler()

train_x = scaler.fit_transform(train_x)
validation_x = scaler.transform(validation_x)
test_x = scaler.transform(test_x)

# Initialisation du modèle
logistic_model = LogisticRegression(random_state=42, max_iter=1000)

# Entraînement du modèle
logistic_model.fit(train_x, train_y)

# Prédiction sur l'ensemble de test
test_predictions = logistic_model.predict(test_x)

# Évaluation
print("Rapport de classification (test) :")
print(classification_report(test_y, test_predictions))

print("Précision globale (test) :", accuracy_score(test_y, test_predictions))


Rapport de classification (test) :
              precision    recall  f1-score   support

           0       0.98      0.85      0.91     16445
           1       0.53      0.91      0.67      2999

    accuracy                           0.86     19444
   macro avg       0.75      0.88      0.79     19444
weighted avg       0.91      0.86      0.87     19444

Précision globale (test) : 0.8602653774943427


## Implementation de la regression lineaireavec gridSearchCV pour justifier le choix des hyperparametres

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Créez un pipeline avec standardisation et régression logistique
pipeline = make_pipeline(StandardScaler(), LogisticRegression())

# Définissez la grille de recherche des paramètres
param_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],  # Valeurs possibles de C
    'logisticregression__solver': ['liblinear', 'newton-cg', 'lbfgs', 'saga'],  # Algorithmes d'optimisation
    'logisticregression__penalty': ['l1', 'l2', 'elasticnet'],  # Types de régularisation
    'logisticregression__max_iter': [100, 200, 300]  # Nombre maximal d'itérations
}

# Initialiser GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Entraînement avec les données d'entraînement
grid_search.fit(train_x, train_y)

# Affichage des meilleurs paramètres
print("Meilleurs paramètres :", grid_search.best_params_)

# Affichage du meilleur modèle
best_model = grid_search.best_estimator_

# Évaluation du modèle sur le jeu de test
test_predictions = best_model.predict(test_x)

print("Matrice de confusion\n", confusion_matrix(test_y, test_predictions))

from sklearn.metrics import classification_report
print("Rapport de classification :")
print(classification_report(test_y, test_predictions))

# Précision globale
from sklearn.metrics import accuracy_score
print("Précision globale :", accuracy_score(test_y, test_predictions))
print("F1-score (test) :", f1_score(test_y, test_predictions))


Meilleurs paramètres : {'logisticregression__C': 0.1, 'logisticregression__max_iter': 100, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'liblinear'}
Matrice de confusion
 [[14068  2427]
 [  244  2705]]
Rapport de classification :
              precision    recall  f1-score   support

           0       0.98      0.85      0.91     16495
           1       0.53      0.92      0.67      2949

    accuracy                           0.86     19444
   macro avg       0.76      0.89      0.79     19444
weighted avg       0.91      0.86      0.88     19444

Précision globale : 0.8626311458547624
F1-score (test) : 0.6694716000494988


450 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\NEW\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\NEW\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\NEW\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\NEW\anaconda3\Lib\site-packages\sklearn\base.py", line 147

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Créez un pipeline avec standardisation et régression logistique
pipeline = make_pipeline(StandardScaler(), LogisticRegression())

# Définissez la grille de recherche des paramètres avec des combinaisons compatibles
param_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],  # Valeurs possibles de C
    'logisticregression__solver': ['liblinear', 'saga'],  # Choisir des solveurs compatibles
    'logisticregression__penalty': ['l1', 'l2'],  # Choisir les pénalités compatibles
    'logisticregression__max_iter': [100, 200, 300]  # Nombre maximal d'itérations
}

# Initialiser GridSearchCV avec `error_score='raise'` pour détecter les erreurs
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', error_score='raise')

# Entraînement avec les données d'entraînement
grid_search.fit(train_x, train_y)

# Affichage des meilleurs paramètres
print("Meilleurs paramètres :", grid_search.best_params_)

# Affichage du meilleur modèle
best_model = grid_search.best_estimator_

# Évaluation du modèle sur le jeu de test
test_predictions = best_model.predict(test_x)

print("Matrice de confusion\n", confusion_matrix(test_y, test_predictions))

from sklearn.metrics import classification_report
print("Rapport de classification :")
print(classification_report(test_y, test_predictions))

# Précision globale
from sklearn.metrics import accuracy_score
print("Précision globale :", accuracy_score(test_y, test_predictions))
print("F1-score (test) :", f1_score(test_y, test_predictions))


Meilleurs paramètres : {'logisticregression__C': 0.1, 'logisticregression__max_iter': 100, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'liblinear'}
Matrice de confusion
 [[14068  2427]
 [  244  2705]]
Rapport de classification :
              precision    recall  f1-score   support

           0       0.98      0.85      0.91     16495
           1       0.53      0.92      0.67      2949

    accuracy                           0.86     19444
   macro avg       0.76      0.89      0.79     19444
weighted avg       0.91      0.86      0.88     19444

Précision globale : 0.8626311458547624
F1-score (test) : 0.6694716000494988
