# In [None]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings('ignore')

# ============================================
# 1. LOAD THE DATA
# ============================================
# The path is relative, so it is resolved against the current working directory.
print("Chargement Dataset Symptômes...")
df = pd.read_csv('../data/Dataset_600_Lignes/dataset_ready_for_ml.csv')
print(f"Shape: {df.shape}")
print(f"Colonnes: {df.columns.tolist()}")

# ============================================
# 2. SEPARATE X AND y
# ============================================
# The target column is 'class'; every other column is used as a feature.
X = df.drop(columns='class')
y = df['class']

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Distribution y:\n{y.value_counts()}")

# ============================================
# 3. TRAIN/VAL/TEST SPLIT (70/15/15)
# ============================================
# First cut: 70% train, 30% held back. Stratifying on y keeps the class
# proportions the same in every split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Second cut: halve the held-back 30% into validation (15%) and test (15%).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# NOTE(review): in the code visible here the models are only scored on the
# validation split; X_test / y_test are presumably reserved for a final
# evaluation elsewhere -- confirm.
print(f"\nTrain: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Val: {len(X_val)} samples ({len(X_val)/len(X)*100:.1f}%)")
print(f"Test: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")

# ============================================
# 4. FONCTION D'ÉVALUATION
# ============================================
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Evaluate a fitted binary classifier, print a report, return its metrics.

    Precision, recall and F1 are computed with scikit-learn's default
    (binary) averaging, and the positive-class probability is read from
    column 1 of ``predict_proba``.

    Args:
        model: Fitted estimator exposing ``predict``. ``predict_proba`` is
            used for ROC-AUC when available; otherwise ``decision_function``
            scores are used if the estimator provides them.
        X_test: Features to evaluate on. Despite the name, any split can be
            passed (callers in this file pass the validation split).
        y_test: True labels aligned with ``X_test``.
        model_name: Label used in the printed report and stored under the
            ``'Model'`` key.

    Returns:
        A tuple ``(metrics, y_pred, y_proba)``:
            * ``metrics``: dict with keys ``'Model'``, ``'Accuracy'``,
              ``'Precision'``, ``'Recall'``, ``'F1-Score'`` and ``'ROC-AUC'``
              (``None`` when the model exposes no continuous score).
            * ``y_pred``: labels predicted by ``model.predict``.
            * ``y_proba``: positive-class probabilities, or ``None`` when the
              model has no ``predict_proba``.
    """
    # Predictions
    y_pred = model.predict(X_test)

    # ROC-AUC only needs a ranking score, so decision_function output is a
    # valid substitute for probabilities. y_proba itself stays None in that
    # case so callers never mistake raw margins for probabilities.
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)[:, 1]
        auc_scores = y_proba
    elif hasattr(model, 'decision_function'):
        y_proba = None
        auc_scores = model.decision_function(X_test)
    else:
        y_proba = None
        auc_scores = None

    # Metrics; zero_division=0 yields 0 instead of a warning when a class is
    # never predicted.
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_test, auc_scores) if auc_scores is not None else None
    }

    print(f"\n{'='*60}")
    print(f"RÉSULTATS - {model_name}")
    print(f"{'='*60}")
    for k, v in metrics.items():
        # Skip the model label and any metric that could not be computed.
        if v is not None and k != 'Model':
            print(f"{k}: {v:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"\nMatrice de Confusion:\n{cm}")

    return metrics, y_pred, y_proba

# ============================================
# 5. TRAIN SEVERAL MODELS
# ============================================

# joblib.dump does not create missing directories, so make sure the target
# folder exists before any model is trained.
MODELS_DIR = '../models/symptoms'
os.makedirs(MODELS_DIR, exist_ok=True)

# One metrics dict per model, in training order.
results = []


def _train_evaluate_save(model, banner, model_name, filename):
    """Fit one model, score it on the validation split, record and persist it.

    Uses the module-level ``X_train`` / ``y_train`` for fitting and
    ``X_val`` / ``y_val`` for scoring, appends the metrics to ``results`` and
    dumps the fitted model to ``MODELS_DIR/filename``.

    Args:
        model: Unfitted estimator instance.
        banner: Section title printed before training.
        model_name: Label passed to ``evaluate_model``.
        filename: File name of the pickled model inside ``MODELS_DIR``.

    Returns:
        The metrics dict returned by ``evaluate_model``.
    """
    print("\n" + "="*60)
    print(banner)
    print("="*60)
    model.fit(X_train, y_train)
    metrics, _, _ = evaluate_model(model, X_val, y_val, model_name)
    results.append(metrics)
    joblib.dump(model, f"{MODELS_DIR}/{filename}")
    return metrics


# 5.1 LOGISTIC REGRESSION
lr = LogisticRegression(random_state=42, max_iter=1000)
metrics_lr = _train_evaluate_save(
    lr, "1. LOGISTIC REGRESSION", "Logistic Regression", "logistic_regression.pkl"
)

# 5.2 DECISION TREE
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
metrics_dt = _train_evaluate_save(
    dt, "2. DECISION TREE", "Decision Tree", "decision_tree.pkl"
)

# 5.3 RANDOM FOREST
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
metrics_rf = _train_evaluate_save(
    rf, "3. RANDOM FOREST", "Random Forest", "random_forest.pkl"
)

# 5.4 GRADIENT BOOSTING
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
metrics_gb = _train_evaluate_save(
    gb, "4. GRADIENT BOOSTING", "Gradient Boosting", "gradient_boosting.pkl"
)

# 5.5 XGBOOST
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=6, random_state=42, eval_metric='logloss')
metrics_xgb = _train_evaluate_save(
    xgb_model, "5. XGBOOST", "XGBoost", "xgboost.pkl"
)

# 5.6 SVM
# probability=True is required so that evaluate_model can call predict_proba.
# NOTE(review): RBF-SVM and KNN are distance-based; this assumes the features
# in dataset_ready_for_ml.csv are already scaled -- confirm.
svm = SVC(kernel='rbf', probability=True, random_state=42)
metrics_svm = _train_evaluate_save(svm, "6. SVM", "SVM", "svm.pkl")

# 5.7 KNN
knn = KNeighborsClassifier(n_neighbors=5)
metrics_knn = _train_evaluate_save(
    knn, "7. K-NEAREST NEIGHBORS", "KNN", "knn.pkl"
)

# ============================================
# 6. COMPARE ALL MODELS
# ============================================
print("\n" + "="*80)
print("COMPARAISON FINALE - DATASET SYMPTÔMES")
print("="*80)

# Rank by F1-score, best first, so that row 0 is the winning model.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('F1-Score', ascending=False)
print(results_df.to_string(index=False))

# Save the comparison table. Neither to_csv nor savefig creates missing
# directories, so create them up front.
metrics_csv = '../results/metrics/symptoms_models_comparison.csv'
os.makedirs(os.path.dirname(metrics_csv), exist_ok=True)
results_df.to_csv(metrics_csv, index=False)

# Plot one horizontal bar chart per metric on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
for idx, metric in enumerate(metrics_to_plot):
    ax = axes[idx//2, idx%2]
    ax.barh(results_df['Model'], results_df[metric])
    ax.set_xlabel(metric)
    ax.set_title(f'Comparaison - {metric}')
    # All plotted metrics live in [0, 1]; a fixed axis keeps panels comparable.
    ax.set_xlim([0, 1])

plt.tight_layout()
comparison_png = '../results/visualizations/symptoms_models_comparison.png'
os.makedirs(os.path.dirname(comparison_png), exist_ok=True)
plt.savefig(comparison_png, dpi=300)
# Close the figure to release its memory once it has been written to disk.
plt.close()

print("\n✅ MODÈLES SYMPTÔMES ENTRAÎNÉS ET SAUVEGARDÉS!")
print(f"🏆 Meilleur modèle: {results_df.iloc[0]['Model']} (F1={results_df.iloc[0]['F1-Score']:.4f})")