In [None]:
## Chargement des librairies 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
## Importation des données 
cardio = pd.read_csv("data_final.csv")
cardio.head()

Unnamed: 0,age,gender,height,weight,imc,ap_hi,ap_lo,gluc,smoke,alco,active,hypertendu,cardio,chol_nor,chol_sp_nor,chol_trs_sp_nor
0,-0.493716,1,0.46793,-0.904326,-1.133974,-1.042765,-0.221458,1,0,0,1,0,0,1,0,0
1,0.245531,0,-1.095502,0.866165,1.618418,0.841858,0.964393,1,0,0,1,1,1,0,0,1
2,-0.198017,0,0.077072,-0.750371,-0.806915,0.213651,-1.407309,1,0,0,0,0,1,0,0,1
3,-0.789415,1,0.598216,0.635232,0.29744,1.470066,2.150245,1,0,0,1,1,1,1,0,0
4,-0.789415,0,-1.095502,-1.366194,-0.913103,-1.670972,-1.407309,1,0,0,0,0,0,1,0,0


In [None]:
## Infots table
cardio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              70000 non-null  float64
 1   gender           70000 non-null  int64  
 2   height           70000 non-null  float64
 3   weight           70000 non-null  float64
 4   imc              70000 non-null  float64
 5   ap_hi            70000 non-null  float64
 6   ap_lo            70000 non-null  float64
 7   gluc             70000 non-null  int64  
 8   smoke            70000 non-null  int64  
 9   alco             70000 non-null  int64  
 10  active           70000 non-null  int64  
 11  hypertendu       70000 non-null  int64  
 12  cardio           70000 non-null  int64  
 13  chol_nor         70000 non-null  int64  
 14  chol_sp_nor      70000 non-null  int64  
 15  chol_trs_sp_nor  70000 non-null  int64  
dtypes: float64(6), int64(10)
memory usage: 8.5 MB


In [None]:
## Séparation des données 
X = cardio.drop("cardio", axis=1)
y = cardio["cardio"]

## Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2, stratify=y, random_state = 234)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(56000, 15) (14000, 15) (56000,) (14000,)


### **VOTING CLASSIFIERS**

In [None]:
## Nos premiers modeles : VotingClassifier
model_1 = SGDClassifier(random_state = 23)
model_2 = DecisionTreeClassifier(random_state=23)
model_3 = KNeighborsClassifier(n_neighbors=3)

model_4 = VotingClassifier([("SDG", model_1), ("TREE", model_2), ("KNN", model_3)], voting="hard")

In [None]:
## La justesse des modèles : Le meilleur performance revient au modele SGDClassifier 
for model in (model_1, model_2, model_3, model_4): 
    model.fit(X_train, y_train)
    print(model.__class__.__name__, model.score(X_test, y_test))

SGDClassifier 0.7227857142857143
DecisionTreeClassifier 0.6395
KNeighborsClassifier 0.6810714285714285
VotingClassifier 0.7059285714285715


### **BAGGING METHODS**

In [None]:
## Importation des modeles de bagging 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier 

In [None]:
## Entrainement du modele BaggingClassifier
model_bag = BaggingClassifier(SGDClassifier(), n_estimators=200)
model_bag.fit(X_train, y_train)

## Justesse du modèle 
print("BaggingClassifier :", model_bag.score( X_test, y_test))

BaggingClassifier : 0.7227857142857143


In [None]:
## Entrainement du modele RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=200)
model_rf.fit(X_train, y_train)

## Justesse du modèle
print("RandomForestClassifier :", model_rf.score(X_test, y_test))

RandomForestClassifier : 0.7068571428571429


### **BOOSTING METHODS**

In [None]:
## Importation des modeles de boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

In [None]:
## Entrainement du modele AdaBoostClassifier
model_ada = AdaBoostClassifier(n_estimators = 200)
model_ada.fit(X_train, y_train)

## Justesse du modèle
print("AdaBoostClassifier :", model_ada.score(X_test, y_test))

AdaBoostClassifier : 0.7292142857142857


In [None]:
## Entrainement du modele GradientBoostingClassifier
model_grad = GradientBoostingClassifier(n_estimators=200, random_state=0)
model_grad.fit(X_train, y_train)

## Justesse du modèle
print("GradientBoostingClassifier :", model_grad.score(X_test, y_test))

GradientBoostingClassifier : 0.7395714285714285


In [None]:
## Entrainement du modele HistGradientBoostingClassifier
model_histgrad = HistGradientBoostingClassifier(random_state=0, max_iter=500, learning_rate=0.01)
model_histgrad.fit(X_train, y_train)

## Justesse du modèle
print("HistGradientBoostingClassifier :", model_histgrad.score(X_test, y_test))

HistGradientBoostingClassifier : 0.7388571428571429


### **STACKING METHODS**

In [None]:
## Importation du modele StackingClassifier
from sklearn.ensemble import StackingClassifier

In [None]:
## Entrainement du modele StackingClassifier
model_stack = StackingClassifier([("SDG", model_1), ("TREE", model_2), ("KNN", model_3)], 
final_estimator = GradientBoostingClassifier(n_estimators = 200))
model_stack.fit(X_train, y_train)

## Justesse du modèle
print("StackingClassifier :", model_stack.score(X_test, y_test))

GradientBoostingClassifier : 0.7261428571428571


### **CHOIX DE MODELE**

Nous avons entrainé plusieurs modèles de machine learning. 
De tous, nous retenons les modeles ***GradientBoostingClassifier*** et ***HistGradientBoostingClassifier*** pour leur 
justesse sur l'ensemble d'entrainement plus élevée. Nous passerons à la phase d'évaluation. 

### **EVALUATION MODELS : HistGradientBoostingClassifier et GradientBoostingClassifier**

In [None]:
## Importation de GridSearchCV pour la séléection des meilleurs parametres 
from sklearn.model_selection import GridSearchCV

In [1]:
## Importation de classification_report
from sklearn.metrics import classification_report

In [2]:
## Importation metrics 
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score

#### **Modèle HistGradientBoostingClassifier**

In [None]:
# Définition du modèle
hist_gradient = HistGradientBoostingClassifier(random_state=0)

# Grille des hyperparamètres à tester
param_grid = {
    "learning_rate": [0.001, 0.01, 0.5, 0.1],
    "max_iter": [100, 150, 200, 300],
    "max_leaf_nodes": [10, 15, 20, 31], 
    "min_samples_leaf": [5, 10, 20]
}

# Grid Search avec validation croisée
grid_search = GridSearchCV(
    estimator=hist_gradient,
    param_grid=param_grid,
    scoring="accuracy",         # optimise le F1 score
    cv=5,                 # Cross-validation
    n_jobs=-1,            # Utilisation de tous les CPU
    verbose=2
)

grid_search.fit(X_train, y_train)

# Meilleurs hyperparamètres
print("\nBest Parameters:")
print(grid_search.best_params_)

# Meilleur modèle optimisé
best_hist_gradient = grid_search.best_estimator_

# Évaluation sur les données test
y_pred_test = best_hist_gradient.predict(X_test)

print("\n===== Best Model Performance on Test Set =====")
print(classification_report(y_test, y_pred_test))

In [None]:

# Prédictions
histboost_train_pred = best_hist_gradient.predict(X_train)
histboost_test_pred = best_hist_gradient.predict(X_test)

# Scores Training
metrics_train = {
    "Accuracy": accuracy_score(y_train, histboost_train_pred),
    "Recall": recall_score(y_train, histboost_train_pred),
    "Precision": precision_score(y_train, histboost_train_pred),
    "ROC AUC": roc_auc_score(y_train, histboost_train_pred),
    "F1 Score": f1_score(y_train, histboost_train_pred)
}

# Scores Test
metrics_test = {
    "Accuracy": accuracy_score(y_test, histboost_test_pred),
    "Recall": recall_score(y_test, histboost_test_pred),
    "Precision": precision_score(y_test, histboost_test_pred),
    "ROC AUC": roc_auc_score(y_test, histboost_test_pred),
    "F1 Score": f1_score(y_test, histboost_test_pred)
}

print("===== Performance Train =====")
for k, v in metrics_train.items():
    print(f"{k:10s}: {v:.4f}")

print("\n===== Performance Test =====")
for k, v in metrics_test.items():
    print(f"{k:10s}: {v:.4f}")

===== Performance Train =====
Accuracy  : 0.7401
Recall    : 0.7032
Precision : 0.7590
ROC AUC   : 0.7401
F1 Score  : 0.7300

===== Performance Test =====
Accuracy  : 0.7386
Recall    : 0.6970
Precision : 0.7600
ROC AUC   : 0.7385
F1 Score  : 0.7271


#### **Modèle GradientBoostingClassifier**

In [None]:
# Définition du modèle
model_grad = GradientBoostingClassifier(random_state=42)

# Grille des hyperparamètres à tester
param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [2, 3, 4],
    "subsample": [0.7, 0.8, 1.0],
    "max_features": ["sqrt", "log2"]
}

# Grid Search avec validation croisée
grid_search = GridSearchCV(
    estimator=model_grad,
    param_grid=param_grid,
    scoring="accuracy",         # optimise le F1 score
    cv=5,                 # Cross-validation
    n_jobs=-1,            # Utilisation de tous les CPU
    verbose=2
)

grid_search.fit(X_train, y_train)

# Meilleurs hyperparamètres
print("\n✅ Best Parameters:")
print(grid_search.best_params_)

# Meilleur modèle optimisé
best_gb = grid_search.best_estimator_

# Évaluation sur les données test
y_pred_test = best_gb.predict(X_test)

print("\n===== Best Model Performance on Test Set =====")
print(classification_report(y_test, y_pred_test))


In [None]:
# Prédictions
gradient_train_pred = best_gb.predict(X_train)
gradient_test_pred = best_gb.predict(X_test)

# Scores Training
metrics_train = {
    "Accuracy": accuracy_score(y_train, gradient_train_pred),
    "Recall": recall_score(y_train, gradient_train_pred),
    "Precision": precision_score(y_train, gradient_train_pred),
    "ROC AUC": roc_auc_score(y_train, gradient_train_pred),
    "F1 Score": f1_score(y_train, gradient_train_pred)
}

# Scores Test
metrics_test = {
    "Accuracy": accuracy_score(y_test, gradient_test_pred),
    "Recall": recall_score(y_test, gradient_test_pred),
    "Precision": precision_score(y_test, gradient_test_pred),
    "ROC AUC": roc_auc_score(y_test, gradient_test_pred),
    "F1 Score": f1_score(y_test, gradient_test_pred)
}

print("===== Performance Train =====")
for k, v in metrics_train.items():
    print(f"{k:10s}: {v:.4f}")

print("\n===== Performance Test =====")
for k, v in metrics_test.items():
    print(f"{k:10s}: {v:.4f}")

===== Performance Train =====
Accuracy  : 0.7415
Recall    : 0.7025
Precision : 0.7617
ROC AUC   : 0.7415
F1 Score  : 0.7309

===== Performance Test =====
Accuracy  : 0.7396
Recall    : 0.6984
Precision : 0.7608
ROC AUC   : 0.7395
F1 Score  : 0.7283
