In [None]:
import pandas as pd
import numpy as np
import os
import pickle
import json
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# ============================================
# CONFIGURATION
# ============================================

# Path to the complete dataset (CSV file)
DATASET_PATH = r"C:\Users\sersi\Desktop\projet_SE_et_IOT\ASL-Sensor-Dataglove-Dataset\asl_dataset_complete.csv"

# Directory where the trained model and its artifacts are saved
MODEL_OUTPUT_DIR = r"C:\Users\sersi\Desktop\projet_SE_et_IOT\ASL-Sensor-Dataglove-Dataset\model"

# Feature (sensor) columns, grouped by sensor type
_FLEX_COLUMNS = ['flex_1', 'flex_2', 'flex_3', 'flex_4', 'flex_5']  # 5 flex sensors
_GYRO_COLUMNS = ['GYRx', 'GYRy', 'GYRz']                            # 3 gyroscope axes
_ACCEL_COLUMNS = ['ACCx', 'ACCy', 'ACCz']                           # 3 accelerometer axes

# Order matters: flex sensors first, then gyroscope, then accelerometer
FEATURE_COLUMNS = _FLEX_COLUMNS + _GYRO_COLUMNS + _ACCEL_COLUMNS

# Model parameters
TEST_SIZE = 0.2        # fraction of the data held out for testing (20%)
RANDOM_STATE = 42      # fixed seed, for reproducibility
N_ESTIMATORS = 100     # number of trees in the forest

# ============================================
# FUNCTION 1: LOAD AND EXPLORE THE DATA
# ============================================

def load_and_explore_data():
    """
    Read the dataset CSV and print an exploratory summary.

    The file at DATASET_PATH is loaded with pandas. The function then prints
    the frame dimensions, the column names, the number and sorted list of
    distinct values in the 'label' column, the row count per label, the
    describe() statistics of FEATURE_COLUMNS, and a missing-value check
    limited to those feature columns.

    Returns:
        The loaded pandas DataFrame, unmodified.
    """
    rule = "="*60
    print(rule)
    print("üìä CHARGEMENT DES DONN√âES")
    print(rule)

    # Load the dataset
    df = pd.read_csv(DATASET_PATH)
    n_rows, n_cols = df.shape

    print(f"\n‚úÖ Dataset charg√© avec succ√®s!")
    print(f"   üìè Dimensions: {n_rows} lignes √ó {n_cols} colonnes")
    print(f"\nüìã Colonnes disponibles:")
    print(f"   {list(df.columns)}")

    # Label overview
    labels = df['label']
    print(f"\nüè∑Ô∏è  LABELS (Lettres/Mots):")
    print(f"   Total de classes: {labels.nunique()}")
    print(f"   Classes: {sorted(labels.unique())}")

    # Number of rows per class, listed in label order
    print(f"\nüìà R√âPARTITION PAR CLASSE:")
    per_class = labels.value_counts().sort_index()
    for label, count in per_class.items():
        print(f"   {label:15s}: {count:6d} √©chantillons")

    # Descriptive statistics of the sensor columns
    features = df[FEATURE_COLUMNS]
    print(f"\nüìä STATISTIQUES DES FEATURES:")
    print(features.describe())

    # Missing values are only checked in the feature columns
    missing = features.isnull().sum()
    if missing.sum() <= 0:
        print(f"\n‚úÖ Aucune valeur manquante dans les features!")
    else:
        print(f"\n‚ö†Ô∏è  VALEURS MANQUANTES:")
        print(missing[missing > 0])

    return df


# ============================================
# FUNCTION 2: DATA PREPROCESSING
# ============================================

def preprocess_data(df):
    """
    Prepare the data for training.

    Steps:
      1. Extract the FEATURE_COLUMNS values (X) and the 'label' column (y).
      2. Encode the labels as integers with a LabelEncoder.
      3. Split into train/test sets (stratified on the encoded labels).
      4. Fit a StandardScaler on the training rows only and apply it to both
         the training and the test rows.

    The scaler is fitted after the split so that no statistic of the test set
    (mean / standard deviation) leaks into the preprocessing.

    Args:
        df: DataFrame containing FEATURE_COLUMNS and a 'label' column.

    Returns:
        Tuple (X_train, X_test, y_train, y_test, scaler, label_encoder) where
        the X arrays are scaled, the y arrays are integer-encoded, and the
        scaler / label_encoder are the fitted transformers.
    """
    print("\n" + "="*60)
    print("üîß PR√âTRAITEMENT DES DONN√âES")
    print("="*60)

    # Extract the features (X) and the labels (y)
    X = df[FEATURE_COLUMNS].values
    y = df['label'].values

    print(f"\nüìä Features (X): {X.shape}")
    print(f"üè∑Ô∏è  Labels (y): {y.shape}")

    # Encode the labels (convert letters/words to integers)
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    print(f"\nüî¢ Encodage des labels:")
    for i, label in enumerate(label_encoder.classes_[:5]):  # show the first 5 only
        print(f"   '{label}' ‚Üí {i}")
    print(f"   ... ({len(label_encoder.classes_)} classes au total)")

    # Split BEFORE scaling: the test rows must not influence the scaler.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y_encoded,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=y_encoded  # keep the same class proportions in both sets
    )

    # Fit the scaler on the training rows only, then reuse it for the test rows
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    print(f"\n‚úÖ Normalisation appliqu√©e (StandardScaler)")
    print(f"   Moyenne avant: {X_train_raw.mean(axis=0)[:3]} ...")
    print(f"   Moyenne apr√®s: {X_train.mean(axis=0)[:3]} ...")

    print(f"\nüìä S√âPARATION TRAIN/TEST:")
    print(f"   üéØ Train: {X_train.shape[0]} √©chantillons ({(1-TEST_SIZE)*100:.0f}%)")
    print(f"   üß™ Test:  {X_test.shape[0]} √©chantillons ({TEST_SIZE*100:.0f}%)")

    return X_train, X_test, y_train, y_test, scaler, label_encoder


# ============================================
# FUNCTION 3: TRAIN THE MODEL
# ============================================

def train_model(X_train, y_train):
    """
    Fit a Random Forest classifier and report a 5-fold cross-validation.

    The forest is built with N_ESTIMATORS trees, seeded with RANDOM_STATE,
    using all CPU cores (n_jobs=-1) and verbose=1. After fitting on the full
    training set, cross_val_score is run on the same training data with
    cv=5 and accuracy scoring, and its scores are printed.

    Args:
        X_train: training feature matrix.
        y_train: training labels.

    Returns:
        The fitted RandomForestClassifier.
    """
    print("\n" + "="*60)
    print("üöÄ ENTRA√éNEMENT DU MOD√àLE")
    print("="*60)

    # Build the model from its hyper-parameters
    forest_params = {
        'n_estimators': N_ESTIMATORS,
        'random_state': RANDOM_STATE,
        'n_jobs': -1,   # use every available CPU
        'verbose': 1,
    }
    model = RandomForestClassifier(**forest_params)

    print(f"\nüå≥ Mod√®le: Random Forest")
    print(f"   Nombre d'arbres: {N_ESTIMATORS}")
    print(f"   √âtat al√©atoire: {RANDOM_STATE}")

    # Fit on the whole training set
    print(f"\n‚è≥ Entra√Ænement en cours...")
    model.fit(X_train, y_train)

    print(f"‚úÖ Entra√Ænement termin√©!")

    # Cross-validation on the training data
    print(f"\nüîÑ Validation crois√©e (5-fold)...")
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    mean_score = cv_scores.mean()
    score_spread = cv_scores.std()

    print(f"   Scores: {cv_scores}")
    print(f"   Moyenne: {mean_score:.4f} (¬±{score_spread:.4f})")

    return model


# ============================================
# FUNCTION 4: EVALUATE THE MODEL
# ============================================

def evaluate_model(model, X_test, y_test, label_encoder):
    """
    Evaluate the model on the test set.

    Prints the accuracy and the classification report, and either the
    confusion matrix (10 classes or fewer) or just its dimensions.

    The full range of encoded labels is passed explicitly to both
    classification_report and confusion_matrix. Without it, a class that is
    absent from both y_test and the predictions makes the number of labels
    differ from the number of target names (classification_report raises),
    and the confusion matrix would no longer have one row/column per class
    known to the label encoder.

    Args:
        model: fitted classifier exposing predict().
        X_test: test feature matrix.
        y_test: integer-encoded true labels of the test set.
        label_encoder: fitted LabelEncoder; classes_ gives the class names.

    Returns:
        Tuple (accuracy, report, cm): the accuracy score, the textual
        classification report and the confusion matrix.
    """
    print("\n" + "="*60)
    print("üìà √âVALUATION DU MOD√àLE")
    print("="*60)

    # Predictions
    y_pred = model.predict(X_test)

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nüéØ ACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")

    # One encoded id and one display name per class known to the encoder.
    # Names are stringified because the report measures their length.
    class_ids = np.arange(len(label_encoder.classes_))
    class_names = [str(name) for name in label_encoder.classes_]

    # Classification report
    print(f"\nüìä RAPPORT DE CLASSIFICATION:")
    print("-"*60)
    report = classification_report(
        y_test, y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0
    )
    print(report)

    # Confusion matrix, always (n_classes x n_classes)
    print(f"\nüî≤ MATRICE DE CONFUSION:")
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    # Only print the matrix itself when it is small enough to read
    if len(label_encoder.classes_) <= 10:
        print(cm)
    else:
        print(f"   (Matrice {cm.shape[0]}√ó{cm.shape[1]} - voir visualisation)")

    return accuracy, report, cm


# ============================================
# FUNCTION 5: VISUALIZE THE RESULTS
# ============================================

def visualize_results(cm, label_encoder, feature_importance, model_dir):
    """
    Render and save the result figures.

    Two PNG files are written into model_dir (which must already exist):
      - confusion_matrix.png: annotated heatmap of cm, with the label
        encoder's classes on both axes;
      - feature_importance.png: bar chart of feature_importance sorted in
        decreasing order, labelled with the FEATURE_COLUMNS names.

    Args:
        cm: confusion matrix to plot.
        label_encoder: fitted LabelEncoder; classes_ provides the tick labels.
        feature_importance: array of importances, one per FEATURE_COLUMNS entry.
        model_dir: output directory for the images.
    """
    print("\n" + "="*60)
    print("üìä CR√âATION DES VISUALISATIONS")
    print("="*60)

    class_names = label_encoder.classes_

    # 1. Confusion matrix heatmap
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_title('Matrice de Confusion')
    ax.set_ylabel('Vraie classe')
    ax.set_xlabel('Classe pr√©dite')
    # Re-style the tick labels the heatmap created
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()

    cm_path = os.path.join(model_dir, 'confusion_matrix.png')
    fig.savefig(cm_path, dpi=150)
    print(f"   ‚úÖ Matrice de confusion: {cm_path}")
    plt.close(fig)

    # 2. Feature importances, most important first
    order = np.argsort(feature_importance)[::-1]
    positions = range(len(feature_importance))
    ordered_names = [FEATURE_COLUMNS[i] for i in order]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(positions, feature_importance[order])
    ax.set_xticks(positions)
    ax.set_xticklabels(ordered_names, rotation=45, ha='right')
    ax.set_title('Importance des Features')
    ax.set_ylabel('Importance')
    ax.set_xlabel('Features')
    fig.tight_layout()

    fi_path = os.path.join(model_dir, 'feature_importance.png')
    fig.savefig(fi_path, dpi=150)
    print(f"   ‚úÖ Importance des features: {fi_path}")
    plt.close(fig)


# ============================================
# FUNCTION 6: SAVE THE MODEL
# ============================================

def save_model(model, scaler, label_encoder, model_dir):
    """
    Persist the model, its transformers and a JSON metadata file.

    Creates model_dir if needed, then writes:
      - asl_model.pkl, scaler.pkl, label_encoder.pkl (pickle files);
      - model_metadata.json with the feature columns, the class list, the
        class and feature counts, the model type and the number of trees.

    Args:
        model: trained classifier to pickle.
        scaler: fitted scaler to pickle.
        label_encoder: fitted LabelEncoder to pickle; classes_ is also
            exported to the metadata.
        model_dir: destination directory.
    """
    print("\n" + "="*60)
    print("üíæ SAUVEGARDE DU MOD√àLE")
    print("="*60)

    # Make sure the destination directory exists
    os.makedirs(model_dir, exist_ok=True)

    def _pickle_to(filename, obj):
        # Pickle obj into model_dir/filename and return the full path.
        target = os.path.join(model_dir, filename)
        with open(target, 'wb') as handle:
            pickle.dump(obj, handle)
        return target

    # Model
    model_path = _pickle_to('asl_model.pkl', model)
    print(f"   ‚úÖ Mod√®le: {model_path}")

    # Scaler
    scaler_path = _pickle_to('scaler.pkl', scaler)
    print(f"   ‚úÖ Scaler: {scaler_path}")

    # Label encoder
    encoder_path = _pickle_to('label_encoder.pkl', label_encoder)
    print(f"   ‚úÖ Label Encoder: {encoder_path}")

    # Metadata as JSON
    class_names = label_encoder.classes_.tolist()
    metadata = {
        'feature_columns': FEATURE_COLUMNS,
        'classes': class_names,
        'n_classes': len(class_names),
        'n_features': len(FEATURE_COLUMNS),
        'model_type': 'RandomForestClassifier',
        'n_estimators': N_ESTIMATORS
    }

    metadata_path = os.path.join(model_dir, 'model_metadata.json')
    with open(metadata_path, 'w') as handle:
        handle.write(json.dumps(metadata, indent=4))
    print(f"   ‚úÖ M√©tadonn√©es: {metadata_path}")


# ============================================
# FUNCTION 7: TEST A PREDICTION
# ============================================

def test_prediction(model, scaler, label_encoder, X_test, y_test):
    """
    Run the model on one randomly chosen test sample and print the outcome.

    Prints the sample index and its first five feature values, the true and
    predicted class names, whether they match, and the three most probable
    classes with their probabilities.

    Args:
        model: fitted classifier exposing predict() and predict_proba().
        scaler: unused here (X_test is passed in as-is); kept so the
            signature stays compatible with existing callers.
        label_encoder: fitted LabelEncoder; classes_ maps encoded ids to names.
        X_test: test feature matrix.
        y_test: integer-encoded true labels of the test set.

    Raises:
        ValueError: if X_test is empty.
    """
    print("\n" + "="*60)
    print("üß™ TEST DE PR√âDICTION")
    print("="*60)

    if len(X_test) == 0:
        # np.random.randint(0, 0) would raise an opaque "low >= high" error
        raise ValueError("X_test is empty: no sample available for a prediction test")

    # Pick a random sample (kept 2-D so it can be fed to the model directly)
    sample_idx = np.random.randint(0, len(X_test))
    sample = X_test[sample_idx:sample_idx+1]
    true_label = label_encoder.classes_[y_test[sample_idx]]

    # Predict
    prediction = model.predict(sample)
    predicted_label = label_encoder.classes_[prediction[0]]

    # Probabilities
    probabilities = model.predict_proba(sample)[0]

    # predict_proba columns are ordered like model.classes_ (the encoded ids
    # seen during fit), which is not necessarily 0..n-1. Fall back to the
    # column index for models that do not expose classes_.
    proba_class_ids = getattr(model, 'classes_', None)
    if proba_class_ids is None:
        proba_class_ids = np.arange(len(probabilities))

    print(f"\nüìä √âchantillon #{sample_idx}:")
    print(f"   Features: {sample[0][:5]}... (premiers 5 valeurs)")
    print(f"\nüéØ Vraie classe: {true_label}")
    print(f"ü§ñ Pr√©diction: {predicted_label}")
    print(f"   {'‚úÖ CORRECT' if true_label == predicted_label else '‚ùå INCORRECT'}")

    print(f"\nüìä Top 3 probabilit√©s:")
    # Columns of the three highest probabilities, best first
    top_columns = np.argsort(probabilities)[-3:][::-1]
    for rank, column in enumerate(top_columns, 1):
        label = label_encoder.classes_[proba_class_ids[column]]
        prob = probabilities[column]
        # str() so that non-string class labels can use the 's' format
        print(f"   {rank}. {str(label):15s}: {prob:.4f} ({prob*100:.2f}%)")


# ============================================
# MAIN FUNCTION
# ============================================
def main():
    """
    Run the complete training pipeline.

    Loads the dataset, preprocesses it, trains and evaluates the model,
    creates MODEL_OUTPUT_DIR, writes the figures and the model artifacts
    there, runs one sample prediction and prints a final summary.
    """
    wide_rule = "="*70

    print("\n" + wide_rule)
    print("ü§ñ ENTRA√éNEMENT DU MOD√àLE DE RECONNAISSANCE ASL")
    print(wide_rule)

    # 1. Load the data
    df = load_and_explore_data()

    # 2. Preprocess
    X_train, X_test, y_train, y_test, scaler, label_encoder = preprocess_data(df)

    # 3. Train
    model = train_model(X_train, y_train)

    # 4. Evaluate (the textual report is already printed by evaluate_model)
    accuracy, _report, cm = evaluate_model(model, X_test, y_test, label_encoder)

    # The output directory must exist BEFORE the figures are written into it
    os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)
    print(f"üìÅ Dossier de sortie cr√©√©: {MODEL_OUTPUT_DIR}")

    # 5. Visualize
    visualize_results(cm, label_encoder, model.feature_importances_, MODEL_OUTPUT_DIR)

    # 6. Save (save_model creates the directory again, which is harmless)
    save_model(model, scaler, label_encoder, MODEL_OUTPUT_DIR)

    # 7. Sample prediction
    test_prediction(model, scaler, label_encoder, X_test, y_test)

    print("\n" + wide_rule)
    print("‚ú® ENTRA√éNEMENT TERMIN√â AVEC SUCC√àS !")
    print(wide_rule)

    summary_lines = (
        f"\nüìä R√©sum√©:",
        f"   üéØ Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)",
        f"   üìÅ Mod√®le sauvegard√© dans: {MODEL_OUTPUT_DIR}",
        f"   üè∑Ô∏è  Classes reconnues: {len(label_encoder.classes_)}",
        f"   üìä Features utilis√©es: {len(FEATURE_COLUMNS)}",
    )
    for line in summary_lines:
        print(line)


# ============================================
# EXECUTION
# ============================================

# Run the full training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()