In [1]:
"""
NOTEBOOK 7: CORRECTION DATA LEAKAGE ET RÉENTRAÎNEMENT
======================================================
Le Diabetes_Score cause un data leakage (97.9% importance!)
Ce score est probablement calculé À PARTIR du diabète, donc le modèle "triche"

Solution:
1. Supprimer Diabetes_Score
2. Réentraîner le modèle Clinical
3. Réévaluer la performance RÉELLE
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
import os
warnings.filterwarnings('ignore')

rule = "=" * 80

# Notebook banner.
print(rule, "üîß CORRECTION DATA LEAKAGE - DATASET CLINICAL", rule, sep="\n")

# ============================================
# PART 1: IDENTIFY THE PROBLEM
# ============================================
print("\n" + rule, "üîç PARTIE 1: ANALYSE DU PROBL√àME", rule, sep="\n")

# Load the prepared clinical dataset (path is relative to the notebook).
df_clinical = pd.read_csv('../data/Dataset_10000_Lignes/dataset_clinical_ready.csv')

print("\nüìä Dataset original:")
print(f"   Shape: {df_clinical.shape}")
print(f"   Colonnes: {df_clinical.columns.tolist()}")

# Correlate Diabetes_Score with the Diabetes target, but only when the
# column is actually present in the loaded frame.
score_present = 'Diabetes_Score' in df_clinical.columns
if score_present:
    correlation = df_clinical['Diabetes_Score'].corr(df_clinical['Diabetes'])
    print("\n‚ö†Ô∏è  PROBL√àME D√âTECT√â:")
    print(f"   Corr√©lation Diabetes_Score ‚Üî Diabetes: {correlation:.4f}")

    # An absolute correlation above 0.7 is what this notebook reports as
    # confirmed leakage.
    if abs(correlation) > 0.7:
        print(
            "   üö® DATA LEAKAGE CONFIRM√â!",
            "   Le Diabetes_Score est probablement calcul√© depuis la variable cible",
            "   ‚Üí Le mod√®le a 100% accuracy parce qu'il 'triche'",
            sep="\n",
        )

# ============================================
# PART 2: BUILD THE LEAKAGE-FREE DATASET
# ============================================
print("\n" + "="*80)
print("üîß PARTIE 2: SUPPRESSION DIABETES_SCORE")
print("="*80)

# Remove the leaking feature. The correlation check earlier in the script
# treats Diabetes_Score as optional, so the drop must tolerate its absence
# too: errors='ignore' turns the drop into a no-op instead of raising
# KeyError when the column is not in the frame (e.g. an already-cleaned CSV).
df_clinical_fixed = df_clinical.drop(columns=['Diabetes_Score'], errors='ignore')

print(f"\n‚úÖ Dataset corrig√©:")
print(f"   Shape: {df_clinical_fixed.shape}")
print(f"   Colonnes restantes: {df_clinical_fixed.shape[1]}")

# Persist the cleaned frame next to the original dataset (no index column).
df_clinical_fixed.to_csv('../data/Dataset_10000_Lignes/dataset_clinical_ready_fixed.csv', index=False)
print(f"‚úÖ Sauvegard√©: dataset_clinical_ready_fixed.csv")

# ============================================
# PART 3: MODEL RETRAINING
# ============================================
rule = "=" * 80
print("\n" + rule, "ü§ñ PARTIE 3: R√âENTRA√éNEMENT DU MOD√àLE CLINICAL", rule, sep="\n")

# Target is the Diabetes column; every other remaining column is a feature.
y = df_clinical_fixed['Diabetes']
X = df_clinical_fixed.drop(columns='Diabetes')

print("\nüìä Donn√©es pour entra√Ænement:")
print(f"   Features: {X.shape[1]}")
print(f"   Samples: {X.shape[0]}")
print("   Distribution classe:")
for class_label, class_value in (("Non-diab√©tiques", 0), ("Diab√©tiques", 1)):
    class_count = (y == class_value).sum()
    print(f"   - {class_label}: {class_count} ({class_count/len(y)*100:.1f}%)")

# Two-stage stratified split: hold out 30% of the rows, then cut that
# hold-out in half to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp,
)

print("\nSplit:")
for part_name, part in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    print(f"   {part_name}: {len(part)} ({len(part)/len(X)*100:.1f}%)")

# Fonction d'√©valuation
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Score a fitted classifier on one data split and print a report.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X_test: Feature matrix to predict on.
        y_test: True labels matching ``X_test``.
        model_name: Label stored under the ``'Model'`` key and shown in the
            printed header.

    Returns:
        A 4-tuple ``(metrics, y_pred, y_proba, cm)``:
        ``metrics`` is a dict with keys ``Model``, ``Accuracy``,
        ``Precision``, ``Recall``, ``F1-Score`` and ``ROC-AUC``;
        ``y_pred`` holds the predicted labels; ``y_proba`` holds column 1 of
        ``predict_proba`` (or ``None`` when the model has no such method);
        ``cm`` is the confusion matrix.
    """
    y_pred = model.predict(X_test)

    # Probabilities are needed for ROC-AUC only; estimators without
    # predict_proba get None for both the scores and the metric.
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = None

    # Insertion order is kept deliberately: it drives the print order below
    # and the column order when callers build a DataFrame from this dict.
    metrics = {'Model': model_name}
    metrics['Accuracy'] = accuracy_score(y_test, y_pred)
    metrics['Precision'] = precision_score(y_test, y_pred, zero_division=0)
    metrics['Recall'] = recall_score(y_test, y_pred, zero_division=0)
    metrics['F1-Score'] = f1_score(y_test, y_pred, zero_division=0)
    metrics['ROC-AUC'] = None if y_proba is None else roc_auc_score(y_test, y_proba)

    rule = '=' * 80
    print(f"\n{rule}")
    print(f"R√âSULTATS - {model_name}")
    print(rule)
    for metric_name, metric_value in metrics.items():
        # Skip the label entry and any metric that could not be computed.
        if metric_name == 'Model' or metric_value is None:
            continue
        print(f"{metric_name}: {metric_value:.4f}")

    cm = confusion_matrix(y_test, y_pred)
    print("\nMatrice de Confusion:")
    print(cm)

    return metrics, y_pred, y_proba, cm

# ============================================
# TRAIN SEVERAL MODELS
# ============================================
rule = "=" * 80
print("\n" + rule, "üéØ ENTRA√éNEMENT DES MOD√àLES (SANS DATA LEAKAGE)", rule, sep="\n")

# Baseline XGBoost configuration.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)

# Deeper, slower-learning XGBoost with row and column subsampling.
xgb_opt = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)

# Random forest using all available cores (n_jobs=-1).
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1,
)

# (progress line, report name, estimator), in training order.
candidates = [
    ("\nüî∑ 1/3 - XGBoost", "XGBoost", xgb_model),
    ("\nüî∑ 2/3 - XGBoost Optimis√©", "XGBoost Optimized", xgb_opt),
    ("\nüî∑ 3/3 - Random Forest", "Random Forest", rf_model),
]

# Fit each candidate on the training split and score it on the validation
# split; the test split is not used here.
results = []
for progress_line, report_name, candidate in candidates:
    print(progress_line)
    candidate.fit(X_train, y_train)
    candidate_metrics, _, _, _ = evaluate_model(candidate, X_val, y_val, report_name)
    results.append(candidate_metrics)

# Keep the per-model metric dicts reachable under their individual names.
metrics_xgb, metrics_xgb_opt, metrics_rf = results

# ============================================
# COMPARISON
# ============================================
rule = "=" * 80
print("\n" + rule, "üìä COMPARAISON DES MOD√àLES (SANS LEAKAGE)", rule, sep="\n")

# Rank the candidates by validation F1-score, best first.
results_df = pd.DataFrame(results).sort_values('F1-Score', ascending=False)
print(results_df.to_string(index=False))

top_row = results_df.iloc[0]
best_model_name = top_row['Model']
print(f"\nüèÜ MEILLEUR MOD√àLE: {best_model_name}")
print(f"   F1-Score: {top_row['F1-Score']:.4f}")

# Map the winning report name back to its fitted estimator; any name that
# is not one of the two XGBoost variants resolves to the random forest.
fitted_by_name = {
    "XGBoost": xgb_model,
    "XGBoost Optimized": xgb_opt,
}
best_model = fitted_by_name.get(best_model_name, rf_model)

# Persist the winner.
# NOTE(review): the file is called xgboost_fixed.pkl even when the random
# forest wins; confirm downstream loaders do not assume an XGBoost object.
os.makedirs('../models/clinical', exist_ok=True)
joblib.dump(best_model, '../models/clinical/xgboost_fixed.pkl')
print("\n‚úÖ Mod√®le sauvegard√©: xgboost_fixed.pkl")

# ============================================
# TEST SET EVALUATION
# ============================================
rule = "=" * 80
print("\n" + rule, "üéØ √âVALUATION FINALE SUR TEST SET", rule, sep="\n")

# Score the selected model on the held-out test split.
metrics_test, y_pred_test, y_proba_test, cm_test = evaluate_model(
    best_model, X_test, y_test, f"{best_model_name} (TEST SET)"
)

# Confusion-matrix heatmap: rows are true classes, columns predicted ones.
class_labels = ['Non-Diab√©tique', 'Diab√©tique']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_test,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title(
    f'Matrice de Confusion - {best_model_name}\n(Sans Data Leakage)',
    fontsize=14,
    fontweight='bold',
)
ax.set_ylabel('Vraie Classe')
ax.set_xlabel('Classe Pr√©dite')
fig.tight_layout()

# Make sure the output folder exists before writing the image.
os.makedirs('../results/visualizations', exist_ok=True)
fig.savefig('../results/visualizations/clinical_fixed_confusion_matrix.png', dpi=300)
print("\n‚úÖ Sauvegard√©: clinical_fixed_confusion_matrix.png")
plt.close(fig)

# ============================================
# FEATURE IMPORTANCE (WITHOUT THE LEAKING FEATURE)
# ============================================
print("\n" + "="*80)
print("üìä FEATURE IMPORTANCE (SANS DIABETES_SCORE)")
print("="*80)

# Pair each training column with the importance reported by the selected
# model, most important first.
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_model.feature_importances_
}).sort_values('Importance', ascending=False)

print(f"\nüîù TOP 10 FEATURES R√âELLES:")
print(feature_importance.head(10).to_string(index=False))

# Both output folders are created here so that neither the PNG nor the CSV
# write below can fail on a missing directory.
os.makedirs('../results/visualizations', exist_ok=True)
os.makedirs('../results/interpretation', exist_ok=True)

# Horizontal bar chart of the leading features. The count is capped at the
# number of available features so bar positions and values always have the
# same length.
plt.figure(figsize=(12, 8))
top_n = min(15, len(feature_importance))
top_features = feature_importance.head(top_n)
colors = plt.cm.viridis(np.linspace(0, 1, top_n))
bars = plt.barh(range(top_n), top_features['Importance'], color=colors)
plt.yticks(range(top_n), top_features['Feature'])
plt.xlabel('Importance', fontsize=12, fontweight='bold')
plt.title(f'Top {top_n} Features - Clinical Dataset (SANS DATA LEAKAGE)',
          fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()  # highest importance at the top
plt.grid(axis='x', alpha=0.3)

# Annotate every bar with its numeric importance.
for i, val in enumerate(top_features['Importance']):
    plt.text(val, i, f' {val:.3f}', va='center', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig('../results/visualizations/clinical_fixed_feature_importance.png', dpi=300)
print("\n‚úÖ Sauvegard√©: clinical_fixed_feature_importance.png")
plt.close()

# Full ranking (all features, not only the plotted ones) as CSV.
feature_importance.to_csv('../results/interpretation/clinical_fixed_feature_importance.csv', index=False)

# ============================================
# BEFORE / AFTER COMPARISON
# ============================================
rule = "=" * 80
print("\n" + rule, "üìä COMPARAISON: AVEC vs SANS DATA LEAKAGE", rule, sep="\n")

best_val_row = results_df.iloc[0]           # best model on the validation split
leading_feature = feature_importance.iloc[0]  # top-ranked feature after the fix

# One tuple per table row: (metric, value with leakage, value after the fix).
# The "with leakage" figures are hard-coded literals, not recomputed here.
comparison_rows = [
    ('Validation Accuracy', '100%', f"{best_val_row['Accuracy']*100:.1f}%"),
    ('Test Accuracy', '100%', f"{metrics_test['Accuracy']*100:.1f}%"),
    ('F1-Score', '100%', f"{metrics_test['F1-Score']*100:.1f}%"),
    (
        'Top Feature',
        'Diabetes_Score (97.9%)',
        f"{leading_feature['Feature']} ({leading_feature['Importance']:.3f})",
    ),
]
comparison = pd.DataFrame(
    comparison_rows,
    columns=[
        'M√©trique',
        'AVEC Diabetes_Score (Leakage)',
        'SANS Diabetes_Score (Corrig√©)',
    ],
)

print("\n" + comparison.to_string(index=False))

# ============================================
# FINAL VERDICT
# ============================================
rule = "=" * 80
print("\n" + rule, "üéØ VERDICT FINAL", rule, sep="\n")

# Test-set metrics, printed as percentages in a fixed order.
print("\nüìä PERFORMANCE R√âELLE (sans leakage):")
for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'):
    print(f"   {metric_name}: {metrics_test[metric_name]*100:.2f}%")

# Verdict tiers, checked from the highest accuracy floor down; the first
# floor that the test accuracy strictly exceeds wins.
print("\nüí° INTERPR√âTATION:")
verdict_tiers = (
    (0.90, (
        "   ‚úÖ EXCELLENT! Le mod√®le est toujours tr√®s performant",
        "   Les features cliniques permettent une pr√©diction pr√©cise",
    )),
    (0.80, (
        "   ‚úÖ TR√àS BON! Performance r√©aliste et utilisable",
        "   Le mod√®le est fiable pour aide au diagnostic",
    )),
    (0.70, (
        "   ‚ö†Ô∏è  BON mais perfectible",
        "   Peut n√©cessiter plus de features ou de donn√©es",
    )),
)
# Message used when no tier matches.
verdict_lines = (
    "   ‚ö†Ô∏è  Performance limit√©e",
    "   Besoin d'am√©lioration des features",
)
test_accuracy = metrics_test['Accuracy']
for accuracy_floor, tier_lines in verdict_tiers:
    if test_accuracy > accuracy_floor:
        verdict_lines = tier_lines
        break
for verdict_line in verdict_lines:
    print(verdict_line)

# Up to three leading features from the importance ranking.
print("\nüîù TOP 3 FEATURES R√âELLEMENT IMPORTANTES:")
podium = feature_importance.head(3)
for rank, (feature_name, importance) in enumerate(
    zip(podium['Feature'], podium['Importance']), start=1
):
    print(f"   {rank}. {feature_name}: {importance:.4f}")

# Closing summary of the artefacts written by this notebook.
print("\n".join([
    "\n" + rule,
    "‚úÖ CORRECTION TERMIN√âE!",
    rule,
    "\nüìÅ Nouveaux fichiers:",
    "   - dataset_clinical_ready_fixed.csv (donn√©es sans leakage)",
    "   - models/clinical/xgboost_fixed.pkl (mod√®le corrig√©)",
    "   - clinical_fixed_feature_importance.png/csv",
    "   - clinical_fixed_confusion_matrix.png",
    "\nüí° Utilisez ce mod√®le pour l'interpr√©tation et l'interface!",
    rule,
]))

üîß CORRECTION DATA LEAKAGE - DATASET CLINICAL

üîç PARTIE 1: ANALYSE DU PROBL√àME

üìä Dataset original:
   Shape: (6680, 22)
   Colonnes: ['Age', 'Sex', 'Ethnicity', 'BMI', 'Waist_Circumference', 'Fasting_Blood_Glucose', 'HbA1c', 'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic', 'Cholesterol_Total', 'Cholesterol_HDL', 'Cholesterol_LDL', 'GGT', 'Serum_Urate', 'Physical_Activity_Level', 'Dietary_Intake_Calories', 'Alcohol_Consumption', 'Smoking_Status', 'Family_History_of_Diabetes', 'Previous_Gestational_Diabetes', 'Diabetes_Score', 'Diabetes']

‚ö†Ô∏è  PROBL√àME D√âTECT√â:
   Corr√©lation Diabetes_Score ‚Üî Diabetes: 0.8096
   üö® DATA LEAKAGE CONFIRM√â!
   Le Diabetes_Score est probablement calcul√© depuis la variable cible
   ‚Üí Le mod√®le a 100% accuracy parce qu'il 'triche'

üîß PARTIE 2: SUPPRESSION DIABETES_SCORE

‚úÖ Dataset corrig√©:
   Shape: (6680, 21)
   Colonnes restantes: 21
‚úÖ Sauvegard√©: dataset_clinical_ready_fixed.csv

ü§ñ PARTIE 3: R√âENTRA√éNEMENT DU M