# In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
import os
warnings.filterwarnings('ignore')

# Make sure every output directory exists before anything is written to it.
for output_dir in (
    '../models/pima',
    '../results/metrics',
    '../results/visualizations',
):
    os.makedirs(output_dir, exist_ok=True)

# ============================================
# 1. LOAD THE PIMA DATA
# ============================================
print("=" * 80)
print("CHARGEMENT DATASET PIMA INDIANS")
print("=" * 80)

# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv('../data/Dataset_Pregnancies/pima_ready_for_ml.csv')

print(f"Shape: {df.shape}")
print(f"\nColonnes ({len(df.columns)}):")
print(df.columns.tolist())
print("\nAperçu des données:")
print(df.head())
print("\nInfo:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()

# ============================================
# 2. SEPARATE X AND y
# ============================================
print("\n" + "=" * 80)
print("SÉPARATION FEATURES ET TARGET")
print("=" * 80)

# For the Pima dataset the target column is 'Outcome'; every other column of
# the frame is kept as a feature.
X = df.drop(columns='Outcome')
y = df['Outcome']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print("\nDistribution de la variable cible:")
print(y.value_counts())
print("\nPourcentages:")
print(y.value_counts(normalize=True) * 100)

# ============================================
# 3. TRAIN/VAL/TEST SPLIT (70/15/15)
# ============================================
print("\n" + "=" * 80)
print("SPLIT DES DONNÉES")
print("=" * 80)

# First cut: 70% train, 30% held out, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Second cut: halve the held-out 30% into validation and test (15% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# Report the size of each split and the sum of its target column
# (the count of positive rows, assuming a 0/1 target -- TODO confirm).
# The leading "\n" on the later labels keeps a blank line between entries.
split_report = [
    ("Train", X_train, y_train),
    ("\nValidation", X_val, y_val),
    ("\nTest", X_test, y_test),
]
for split_label, X_part, y_part in split_report:
    print(f"{split_label}: {len(X_part)} samples ({len(X_part)/len(X)*100:.1f}%)")
    print(f"  Diabétiques: {y_part.sum()} ({y_part.sum()/len(y_part)*100:.1f}%)")

# ============================================
# 4. FONCTION D'ÉVALUATION
# ============================================
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Score a fitted classifier on a dataset and print a report.

    Args:
        model: Fitted estimator exposing ``predict``. ``predict_proba`` is
            used when the estimator has it.
        X_test: Features to score.
        y_test: True labels matching ``X_test``.
        model_name: Label used in the printed report and stored under the
            ``'Model'`` key of the returned metrics.

    Returns:
        A ``(metrics, y_pred, y_proba)`` tuple. ``metrics`` is a dict with
        the keys ``'Model'``, ``'Accuracy'``, ``'Precision'``, ``'Recall'``,
        ``'F1-Score'`` and ``'ROC-AUC'``. ``y_proba`` is column 1 of
        ``predict_proba`` (presumably the positive class), or ``None`` when
        the model has no ``predict_proba``; ``'ROC-AUC'`` is ``None`` in
        that case as well.
    """
    # Predictions
    y_pred = model.predict(X_test)
    has_proba = hasattr(model, 'predict_proba')
    y_proba = model.predict_proba(X_test)[:, 1] if has_proba else None

    # Metrics; zero_division=0 is passed so that a class with no predicted
    # samples yields 0 for the affected metric.
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_test, y_proba) if y_proba is not None else None
    }

    print(f"\n{'='*80}")
    print(f"RÉSULTATS - {model_name}")
    print(f"{'='*80}")
    for metric_name, value in metrics.items():
        # Skip the label entry and any metric that could not be computed.
        if value is not None and metric_name != 'Model':
            print(f"{metric_name}: {value:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("\nMatrice de Confusion:")
    print(cm)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    return metrics, y_pred, y_proba

# ============================================
# 5. TRAIN SEVERAL MODELS
# ============================================
print("\n" + "=" * 80)
print("ENTRAÎNEMENT DES MODÈLES - DATASET PIMA")
print("=" * 80)

# Validation metrics of every trained model, in training order.
results = []


def _fit_evaluate_save(model, step, banner, display_name, filename, total=8):
    """Fit one model, score it on the validation split and persist it.

    The model is fitted on ``X_train``/``y_train``, evaluated with
    ``evaluate_model`` on ``X_val``/``y_val``, its metrics are appended to
    the module-level ``results`` list, and the fitted estimator is dumped
    with joblib to ``../models/pima/<filename>``.

    Args:
        model: Unfitted estimator.
        step: 1-based position shown in the progress banner.
        banner: Upper-case title shown in the progress banner.
        display_name: Name recorded in the metrics.
        filename: File name of the pickle inside ``../models/pima/``.
        total: Total number of models shown in the progress banner.

    Returns:
        The metrics dict returned by ``evaluate_model``.
    """
    print("\n" + "🔷" * 40)
    print(f"{step}/{total} - {banner}")
    print("🔷" * 40)
    model.fit(X_train, y_train)
    model_metrics, _, _ = evaluate_model(model, X_val, y_val, display_name)
    results.append(model_metrics)
    joblib.dump(model, f'../models/pima/{filename}')
    print(f"✅ Modèle sauvegardé: {filename}")
    return model_metrics


# NOTE(review): the models below are fitted on X_train exactly as loaded; no
# feature scaling happens in the visible code. Confirm the CSV is already
# standardized, since LogisticRegression, SVC and KNN are scale-sensitive.

# 5.1 LOGISTIC REGRESSION
lr = LogisticRegression(random_state=42, max_iter=1000)
metrics_lr = _fit_evaluate_save(
    lr, 1, "LOGISTIC REGRESSION", "Logistic Regression", 'logistic_regression.pkl'
)

# 5.2 DECISION TREE
dt = DecisionTreeClassifier(max_depth=8, min_samples_split=15, random_state=42)
metrics_dt = _fit_evaluate_save(
    dt, 2, "DECISION TREE", "Decision Tree", 'decision_tree.pkl'
)

# 5.3 RANDOM FOREST
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=12,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1
)
metrics_rf = _fit_evaluate_save(
    rf, 3, "RANDOM FOREST", "Random Forest", 'random_forest.pkl'
)

# Feature importances of the fitted forest, largest first.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nTop 10 Features Important (Random Forest):")
print(feature_importance.head(10))

# 5.4 GRADIENT BOOSTING
gb = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    random_state=42
)
metrics_gb = _fit_evaluate_save(
    gb, 4, "GRADIENT BOOSTING", "Gradient Boosting", 'gradient_boosting.pkl'
)

# 5.5 XGBOOST
# NOTE(review): use_label_encoder is kept as in the original; recent XGBoost
# releases deprecate it -- confirm against the installed version.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)
metrics_xgb = _fit_evaluate_save(
    xgb_model, 5, "XGBOOST", "XGBoost", 'xgboost.pkl'
)

# 5.6 SVM
# probability=True is what gives SVC a predict_proba for the ROC-AUC metric.
svm = SVC(kernel='rbf', C=1.0, probability=True, random_state=42)
metrics_svm = _fit_evaluate_save(svm, 6, "SVM", "SVM", 'svm.pkl')

# 5.7 KNN
knn = KNeighborsClassifier(n_neighbors=9)
metrics_knn = _fit_evaluate_save(
    knn, 7, "K-NEAREST NEIGHBORS", "KNN", 'knn.pkl'
)

# 5.8 XGBOOST, SECOND HYPER-PARAMETER SET
xgb_opt = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)
metrics_xgb_opt = _fit_evaluate_save(
    xgb_opt, 8, "XGBOOST OPTIMISÉ", "XGBoost Optimized", 'xgboost_optimized.pkl'
)

# ============================================
# 6. COMPARE ALL MODELS
# ============================================
print("\n" + "=" * 80)
print("📊 COMPARAISON FINALE - DATASET PIMA")
print("=" * 80)

# One row per model, best F1-Score first.
# NOTE(review): the metrics in `results` are computed on the validation split;
# X_test / y_test are never scored in the visible code.
results_df = pd.DataFrame(results).sort_values('F1-Score', ascending=False)
print(results_df.to_string(index=False))

# The first row after sorting is the model with the highest F1-Score.
best_model = results_df.iloc[0]
print(f"\n🏆 MEILLEUR MODÈLE: {best_model['Model']}")
print(f"   F1-Score: {best_model['F1-Score']:.4f}")
print(f"   Accuracy: {best_model['Accuracy']:.4f}")
print(f"   ROC-AUC: {best_model['ROC-AUC']:.4f}")

# Persist the comparison table.
results_df.to_csv('../results/metrics/pima_models_comparison.csv', index=False)
print("\n✅ Résultats sauvegardés: pima_models_comparison.csv")

# ============================================
# 7. VISUALIZATIONS
# ============================================
print("\n" + "=" * 80)
print("📈 CRÉATION DES VISUALISATIONS")
print("=" * 80)

# Figure 1: one horizontal bar chart per metric on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']

# axes.flat walks the grid row by row, matching the order of the metrics.
for ax, metric, color in zip(axes.flat, metrics_to_plot, colors):
    bars = ax.barh(results_df['Model'], results_df[metric], color=color)
    ax.set_xlabel(metric, fontsize=12, fontweight='bold')
    ax.set_title(f'Comparaison - {metric}', fontsize=14, fontweight='bold')
    ax.set_xlim([0, 1])
    ax.grid(axis='x', alpha=0.3)

    # Write each value at the tip of its bar.
    for bar in bars:
        width = bar.get_width()
        ax.text(width, bar.get_y() + bar.get_height()/2,
                f'{width:.3f}',
                ha='left', va='center', fontsize=9, fontweight='bold')

fig.suptitle('Comparaison des Modèles - Dataset Pima Indians',
             fontsize=16, fontweight='bold', y=1.00)
fig.tight_layout()
fig.savefig('../results/visualizations/pima_models_comparison.png', dpi=300, bbox_inches='tight')
print("✅ Graphique sauvegardé: pima_models_comparison.png")
plt.close(fig)

# Figure 2: ROC-AUC per model, sorted ascending so the highest score is
# drawn last (barh stacks bars from the bottom up).
roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
results_df_sorted = results_df.sort_values('ROC-AUC', ascending=True)
bars = roc_ax.barh(results_df_sorted['Model'], results_df_sorted['ROC-AUC'], color='#9b59b6')
roc_ax.set_xlabel('ROC-AUC Score', fontsize=12, fontweight='bold')
roc_ax.set_title('Comparaison ROC-AUC - Dataset Pima', fontsize=14, fontweight='bold')
roc_ax.set_xlim([0, 1])
roc_ax.grid(axis='x', alpha=0.3)

for bar in bars:
    width = bar.get_width()
    roc_ax.text(width, bar.get_y() + bar.get_height()/2,
                f'{width:.3f}',
                ha='left', va='center', fontsize=10, fontweight='bold')

roc_fig.tight_layout()
roc_fig.savefig('../results/visualizations/pima_roc_auc_comparison.png', dpi=300, bbox_inches='tight')
print("✅ Graphique sauvegardé: pima_roc_auc_comparison.png")
plt.close(roc_fig)

# ============================================
# 8. FINAL SUMMARY
# ============================================
print("\n" + "=" * 80)
print("✅ ENTRAÎNEMENT TERMINÉ - DATASET PIMA")
print("=" * 80)
print("📁 Modèles sauvegardés dans: models/pima/")
print("📊 Métriques sauvegardées dans: results/metrics/pima_models_comparison.csv")
print("📈 Visualisations dans: results/visualizations/")
# best_model is the top row of the comparison table (highest F1-Score).
print(f"\n🏆 Champion: {best_model['Model']} (F1={best_model['F1-Score']:.4f})")
print("=" * 80)

# [notebook output] SyntaxError: invalid syntax (1232241867.py, line 4)