# Import du Dataset

In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_csv('heart_data.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


# Preprocessing

In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

X = df.copy()
y = X.pop('HeartDisease')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

numeric_features = list(X.select_dtypes('int64', 'float64').columns)
categorical_features = list(X.select_dtypes('object').columns)

transformer_num = make_pipeline(
    SimpleImputer(strategy="mean"),
    RobustScaler(),
)

transformer_cat = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", transformer_num, numeric_features),
        ("cat", transformer_cat, categorical_features),
    ]
)

preprocessor

# Baseline

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", model)]
)

clf = clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report =  classification_report(y_test,y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(conf_matrix)
print(report)

Accuracy: 0.8845
[[113  10]
 [ 25 155]]
              precision    recall  f1-score   support

           0       0.82      0.92      0.87       123
           1       0.94      0.86      0.90       180

    accuracy                           0.88       303
   macro avg       0.88      0.89      0.88       303
weighted avg       0.89      0.88      0.89       303



# Cross Validation

In [5]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X, y, cv = 5)
print(scores, 'Moyenne :',scores.mean())

[0.88043478 0.80434783 0.82608696 0.83060109 0.75956284] Moyenne : 0.8202066999287242


# Model Tuning

In [6]:
from sklearn.model_selection import GridSearchCV

X = df.copy()
y = X.pop('HeartDisease')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

numeric_features = list(X.select_dtypes('int64', 'float64').columns)
categorical_features = list(X.select_dtypes('object').columns)

transformer_num = make_pipeline(
    SimpleImputer(strategy="constant"),
    RobustScaler(),
)

transformer_cat = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", transformer_num, numeric_features),
        ("cat", transformer_cat, categorical_features),
    ]
)

pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", model)]
)

scores = cross_val_score(pipeline, X, y, cv = 5)
print('CrossVal avec model de base :',scores.mean())

n_estimators = [100, 500, 1000]
max_depth = [1, 5, 10, 25]
max_features = [*np.arange(0.1, 1.1, 0.1)]

parameters = {
    "model__n_estimators": n_estimators,
    "model__max_depth": max_depth,
    "model__max_features": max_features,
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5, scoring='accuracy', verbose=1)

# Exécution de la recherche sur grille sur les données d'entraînement
grid_search.fit(X_train, y_train)

# Affichage des meilleurs paramètres et du meilleur score
print("Meilleurs paramètres trouvés : ", grid_search.best_params_)
print("Meilleur score de validation croisée : ", grid_search.best_score_)

CrossVal avec model de base : 0.8180327868852458
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Meilleurs paramètres trouvés :  {'model__max_depth': 5, 'model__max_features': np.float64(0.1), 'model__n_estimators': 100}
Meilleur score de validation croisée :  0.8617886178861788


In [7]:
# Évaluation du meilleur modèle sur l'ensemble de test
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Précision sur l'ensemble de test : ", test_accuracy)

Précision sur l'ensemble de test :  0.9075907590759076


In [8]:
tuned_rdf_pipe = grid_search.best_estimator_
print("CrossVal sur l'ensemble de test : ",cross_val_score(tuned_rdf_pipe, X_train, y_train, cv=5, scoring='accuracy').mean())

CrossVal sur l'ensemble de test :  0.8617886178861788


# Export du model

In [10]:
from sklearn.ensemble import RandomForestRegressor
import joblib

# Exemple de création et d'entraînement du modèle
model = tuned_rdf_pipe
model.fit(X_train, y_train)  # X_train et y_train doivent être définis dans votre script

# Sauvegarde du modèle entraîné
joblib.dump(model, 'model_heart.pkl')

['final_model.pkl']