# Predicting Heart Disease

In [None]:
# Import all packages necessary for the project
import pandas as pd
import time
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

In [None]:
# Load the training and test sets from CSV files in the working directory
# (train.csv is read first, then test.csv).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

In [None]:
# Preview the first rows of the training data (head() returns 5 rows by default).
# Left as the bare last expression so the notebook renders it as a table.
train_data.head()

In [None]:
# Print a structural summary of the training data: column names,
# non-null counts, dtypes and memory usage.
train_data.info()

No exploratory data analysis (EDA) is performed here because the data is already clean and ready for modeling, so we move directly to model training and evaluation.

## Modeling

In [None]:
# Split the training frame into the feature matrix and a binary target.
# The target is 1 where "Heart Disease" equals "Presence" and 0 for any other value.
X_train = train_data.drop(columns=["Heart Disease"])
y_train = train_data["Heart Disease"].eq("Presence").astype(int)

In [None]:
# Candidate classifiers to tune, keyed by display name. The same keys are used
# to look up each model's hyperparameter grid in `param_grids`.
#
# Every estimator is given a fixed seed (42, the same value used for the
# StratifiedKFold splitter in the training cell) so that re-running the
# notebook reproduces the same fits and scores:
#   - Random Forest and XGBoost draw random row/feature subsamples,
#   - LogisticRegression uses the seed for the stochastic "saga" solver,
#   - SVC uses it for the internal cross-validation behind probability=True.
#
# NOTE(review): these are bare estimators with no scaling step; logistic
# regression and the RBF SVM are sensitive to feature scale, so confirm the
# input features are already on comparable scales.
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    # probability=True enables predict_proba
    "SVM": SVC(probability=True, random_state=42)
}

In [None]:
# Hyperparameter grids for GridSearchCV, keyed by the same names as `models`.
# Each grid concentrates on the hyperparameters that matter most for that model.
param_grids = {
    "Logistic Regression": {
        "C": [0.1, 1, 10],
        "solver": ["lbfgs", "saga"],
        # 100 is scikit-learn's default iteration cap. No scaling step is
        # visible before fitting, and saga in particular converges slowly on
        # unscaled features, so the cap is raised to give both solvers room to
        # converge (convergence warnings are silenced in the training cell, so
        # a too-low cap would otherwise go unnoticed).
        "max_iter": [1000]
    },
    "Random Forest": {
        "n_estimators": [100, 150, 200],
        "max_depth": [15, 25, None],
        "min_samples_split": [5, 10],
        "min_samples_leaf": [2, 4],
        # Not a tuned value: a single setting that enables the forest's
        # internal parallelism.
        "n_jobs": [-1]
    },
    "XGBoost": {
        "n_estimators": [100, 150, 200],
        "max_depth": [4, 6, 8],
        "learning_rate": [0.05, 0.1, 0.2],
        "subsample": [0.8, 0.9, 1.0],
        "colsample_bytree": [0.8, 0.9, 1.0],
        "tree_method": ["hist"],
        # Not a tuned value: a single setting that enables XGBoost's
        # internal parallelism.
        "n_jobs": [-1]
    },
    "SVM": {
        "C": [0.1, 1, 10, 100],
        "kernel": ["rbf"],
        "gamma": ["scale", "auto"]
    }
}

In [None]:
# Silence all warnings for cleaner output.
# NOTE: this filter is global, so it stays active for every later cell and
# also hides any convergence warnings raised in this process.
warnings.filterwarnings('ignore')

# y_train is already a 0/1 integer series (built in the feature/target cell);
# this alias performs no conversion and only makes the binary intent explicit.
y_train_binary = y_train

# Per-model outputs, keyed by the model's display name:
#   best_models[name]  -> dict with "model", "best_score", "best_params", "cv_results"
#   search_times[name] -> seconds spent in that model's grid search
best_models = {}
search_times = {}

print("\n" + "="*70)
print("GRID SEARCH - ENTRAÎNEMENT OPTIMISÉ")
print("="*70)

# Tune the models one after another; each search parallelises internally.
for model_name, model in models.items():
    print(f"\n[{model_name}]")
    print("-" * 70)

    # perf_counter() is monotonic, so the measured duration cannot be
    # distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # Exhaustive search over this model's grid, scored by ROC AUC on a
    # seeded, shuffled, stratified 5-fold split.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring="roc_auc",
        n_jobs=-1,  # use all available CPU cores
        verbose=1,
        error_score='raise'  # fail loudly instead of recording a NaN score
    )

    # Fit every parameter combination; with the default refit=True the best
    # one is then refitted on the full training set.
    grid_search.fit(X_train, y_train_binary)

    elapsed_time = time.perf_counter() - start_time

    # Keep everything later cells need for this model.
    best_models[model_name] = {
        "model": grid_search.best_estimator_,
        "best_score": grid_search.best_score_,
        "best_params": grid_search.best_params_,
        "cv_results": grid_search.cv_results_
    }

    search_times[model_name] = elapsed_time

    # Report the outcome of this search.
    print(f"✓ Meilleur score AUC-ROC: {grid_search.best_score_:.4f}")
    print(f"  Temps total: {elapsed_time:.1f}s")
    print("  Meilleurs paramètres:")
    for param, value in grid_search.best_params_.items():
        # n_jobs is only a parallelism setting carried in the grid, not a
        # tuned hyperparameter, so it is left out of the report.
        if param != 'n_jobs':
            print(f"    - {param}: {value}")

print("\n" + "="*70)
print(f"Temps total d'entraînement: {sum(search_times.values()):.1f}s")
print("="*70)

In [None]:
# Summary of results: one row per tuned model, ranked by cross-validated ROC AUC.
# Column labels are reproduced exactly as in the original cell so that any
# later cell indexing `results_summary` by label keeps working.
results_summary = pd.DataFrame([
    {
        "Mod√®le": model_name,
        "AUC-ROC": result["best_score"],
        "Temps (s)": search_times[model_name],
        # The training cell stores the search results under "cv_results"
        # (no trailing underscore); looking up "cv_results_" raised KeyError.
        # "params" holds one entry per parameter combination evaluated.
        "Nb param√®tres test√©s": len(result["cv_results"]["params"])
    }
    for model_name, result in best_models.items()
])

# Best model first; reset the index so that position 0 is the winner.
results_summary = results_summary.sort_values("AUC-ROC", ascending=False).reset_index(drop=True)

print("\n" + "="*70)
print("RÉSUMÉ DES RÉSULTATS")
print("="*70)

# Pick the top-ranked model and its refitted estimator for later use.
best_row = results_summary.iloc[0]
best_model_name = best_row["Mod√®le"]
best_model_obj = best_models[best_model_name]["model"]

print(f"\n🏆 Meilleur modèle: {best_model_name}")
print(f"   Score AUC-ROC: {best_row['AUC-ROC']:.4f}")
print(f"   Temps d'entraînement: {best_row['Temps (s)']:.1f}s")

# Last expression: render the full ranking table.
results_summary