In [2]:
import pandas as pd
import numpy as np
import os
import pickle
import json
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# ============================================
# MOBILE-OPTIMIZED CONFIGURATION
# ============================================

# Absolute, machine-specific paths: the input CSV read by load_and_explore_data()
# and the folder that receives the plots, pickles and metadata.
DATASET_PATH = r"C:\Users\sersi\Desktop\projet_SE_et_IOT\HandSense_project\asl_dataset_complete.csv"
MODEL_OUTPUT_DIR = r"C:\Users\sersi\Desktop\projet_SE_et_IOT\HandSense_project\model_mobile"

# CSV columns used as model inputs, in the order the model sees them.
# NOTE(review): presumably 5 flex sensors + 3 gyroscope + 3 accelerometer axes,
# judging by the names only - confirm against the data source.
FEATURE_COLUMNS = [
    'flex_1', 'flex_2', 'flex_3', 'flex_4', 'flex_5',
    'GYRx', 'GYRy', 'GYRz',
    'ACCx', 'ACCy', 'ACCz'
]

# Passed to train_test_split as test_size (fraction of rows held out for testing).
TEST_SIZE = 0.2 
# Seed shared by train_test_split and RandomForestClassifier.
RANDOM_STATE = 42

# Hyper-parameters chosen for a mobile-sized model (forwarded to RandomForestClassifier).
N_ESTIMATORS = 30        # Reduced from 100 to 30 (original author's note: ~3x lighter)
MAX_DEPTH = 12           # Caps the depth of each tree
MIN_SAMPLES_SPLIT = 10   # Original author's note: avoids overfitting
MIN_SAMPLES_LEAF = 4     # Original author's note: reduces complexity

# ============================================
# FONCTION 1: CHARGER LES DONNÉES
# ============================================

def load_and_explore_data():
    """Read the dataset CSV and print a short exploratory summary.

    The file at ``DATASET_PATH`` is loaded with ``pandas.read_csv``. The
    function then prints the table dimensions, the distinct values of the
    ``label`` column, the number of rows per label, and the per-column count
    of missing values among ``FEATURE_COLUMNS``.

    Returns:
        The loaded ``pandas.DataFrame``, unmodified.
    """
    banner = "=" * 60
    print(banner)
    print("üìä CHARGEMENT DES DONN√âES")
    print(banner)

    dataset = pd.read_csv(DATASET_PATH)
    n_rows, n_cols = dataset.shape

    print("\n‚úÖ Dataset charg√© avec succ√®s!")
    print(f"   üìè Dimensions: {n_rows} lignes √ó {n_cols} colonnes")

    labels = dataset['label']
    print(f"\nüè∑Ô∏è  Classes: {labels.nunique()}")
    print(f"   {sorted(labels.unique())}")

    # One line per class, ordered by label name.
    print("\nüìà R√âPARTITION PAR CLASSE:")
    per_class = labels.value_counts().sort_index()
    for class_name, n_samples in per_class.items():
        print(f"   {class_name:15s}: {n_samples:6d} √©chantillons")

    # Only the feature columns are checked for NaN; other columns are ignored.
    null_counts = dataset[FEATURE_COLUMNS].isnull().sum()
    if null_counts.sum() <= 0:
        print("\n‚úÖ Aucune valeur manquante!")
    else:
        print("\n‚ö†Ô∏è  VALEURS MANQUANTES:")
        print(null_counts[null_counts > 0])

    return dataset


# ============================================
# FONCTION 2: PRÉTRAITEMENT
# ============================================

def preprocess_data(df):
    """Encode labels, split into train/test, and standardize the features.

    The split is performed BEFORE the scaler is fitted, and the scaler is
    fitted on the training rows only. Fitting it on the full dataset (as the
    previous version did) lets test-set statistics leak into training and
    makes the persisted scaler differ from one a train-only pipeline would
    produce. The split itself is unchanged: it depends only on the encoded
    labels, ``TEST_SIZE`` and ``RANDOM_STATE``.

    Args:
        df: DataFrame containing every column in ``FEATURE_COLUMNS`` plus a
            ``label`` column.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, scaler, label_encoder)``
        where the ``X_*`` arrays are standardized with ``scaler`` (fitted on
        the training rows), the ``y_*`` arrays hold integer-encoded labels,
        and ``label_encoder`` maps those integers back to the original labels.
    """
    print("\n" + "="*60)
    print("üîß PR√âTRAITEMENT DES DONN√âES")
    print("="*60)

    X = df[FEATURE_COLUMNS].values
    y = df['label'].values

    print(f"\nüìä Features (X): {X.shape}")
    print(f"üè∑Ô∏è  Labels (y): {y.shape}")

    # Encode the labels as consecutive integers.
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    print(f"\nüî¢ Encodage des labels:")
    # Only the first five mappings are shown to keep the log short.
    for i, label in enumerate(label_encoder.classes_[:5]):
        print(f"   '{label}' ‚Üí {i}")
    print(f"   ... ({len(label_encoder.classes_)} classes au total)")

    # Stratified split on the RAW features so the scaler never sees test rows.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y_encoded,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=y_encoded
    )

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    print(f"\n‚úÖ Normalisation appliqu√©e (StandardScaler)")

    print(f"\nüìä S√âPARATION TRAIN/TEST:")
    print(f"   üéØ Train: {X_train.shape[0]} √©chantillons")
    print(f"   üß™ Test:  {X_test.shape[0]} √©chantillons")

    return X_train, X_test, y_train, y_test, scaler, label_encoder


# ============================================
# FONCTION 3: ENTRAÎNER LE MODÈLE OPTIMISÉ
# ============================================

def train_model(X_train, y_train):
    """Fit the mobile-sized random forest and report cross-validated accuracy.

    A ``RandomForestClassifier`` is built from the module-level
    hyper-parameters, fitted on the given training data, and then scored
    with 5-fold ``cross_val_score`` on that same training data. The
    configuration and the fold scores are printed.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.

    Returns:
        The classifier fitted on the full ``X_train`` / ``y_train``.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("üöÄ ENTRA√éNEMENT DU MOD√àLE OPTIMIS√â MOBILE")
    print(separator)

    # Hyper-parameters come from the module-level configuration.
    forest_params = {
        'n_estimators': N_ESTIMATORS,
        'max_depth': MAX_DEPTH,
        'min_samples_split': MIN_SAMPLES_SPLIT,
        'min_samples_leaf': MIN_SAMPLES_LEAF,
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
        'verbose': 1,
    }
    model = RandomForestClassifier(**forest_params)

    config_lines = (
        "\nüå≥ Configuration du mod√®le:",
        f"   üì¶ Nombre d'arbres: {N_ESTIMATORS} (au lieu de 100)",
        f"   üìè Profondeur max: {MAX_DEPTH}",
        f"   üîπ Min samples split: {MIN_SAMPLES_SPLIT}",
        f"   üîπ Min samples leaf: {MIN_SAMPLES_LEAF}",
        "\n   ‚úÖ Optimis√© pour r√©duire la taille du mod√®le ONNX",
    )
    for line in config_lines:
        print(line)

    print("\n‚è≥ Entra√Ænement en cours...")
    model.fit(X_train, y_train)
    print("‚úÖ Entra√Ænement termin√©!")

    # Cross-validation is run on the training data only; the returned model
    # is the one fitted above.
    print("\nüîÑ Validation crois√©e (5-fold)...")
    fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

    formatted_scores = [f'{score:.4f}' for score in fold_scores]
    print(f"   Scores: {formatted_scores}")
    print(f"   Moyenne: {fold_scores.mean():.4f} (¬±{fold_scores.std():.4f})")

    return model


# ============================================
# FONCTION 4: ÉVALUER LE MODÈLE
# ============================================

def evaluate_model(model, X_test, y_test, label_encoder):
    """Score the model on the test set and print accuracy plus a class report.

    The full set of encoded class ids (``0 .. len(classes_) - 1``) is passed
    explicitly to ``classification_report`` and ``confusion_matrix``. Without
    it, a class that appears in neither ``y_test`` nor the predictions makes
    the report fail on a ``target_names`` length mismatch, and it shrinks the
    confusion matrix so that it no longer lines up with
    ``label_encoder.classes_`` when used as axis labels.

    Args:
        model: Fitted classifier exposing ``predict``.
        X_test: Test feature matrix, preprocessed the same way as the
            training data.
        y_test: Integer-encoded true labels for ``X_test``.
        label_encoder: Fitted ``LabelEncoder`` whose ``classes_`` provide the
            display name of each encoded label.

    Returns:
        Tuple ``(accuracy, report, cm)``: the accuracy as a float, the text
        classification report, and the confusion matrix with one row and one
        column per entry of ``label_encoder.classes_``.
    """
    print("\n" + "="*60)
    print("üìà √âVALUATION DU MOD√àLE")
    print("="*60)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nüéØ ACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")

    # LabelEncoder assigns ids 0..n-1 in the order of classes_.
    class_ids = np.arange(len(label_encoder.classes_))

    print(f"\nüìä RAPPORT DE CLASSIFICATION:")
    print("-"*60)
    report = classification_report(
        y_test, y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0
    )
    print(report)

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    return accuracy, report, cm


# ============================================
# FONCTION 5: VISUALISATION
# ============================================

def visualize_results(cm, label_encoder, feature_importance, model_dir):
    """Save the confusion-matrix heatmap and the feature-importance bar chart.

    Two PNG files are written into ``model_dir``:
    ``confusion_matrix_mobile.png`` and ``feature_importance_mobile.png``.
    The directory is created if it does not exist, so the function can be
    called on its own (``save_model`` does the same for its outputs).

    Args:
        cm: Square confusion matrix of integer counts (annotated with the
            ``'d'`` format).
        label_encoder: Fitted encoder; ``classes_`` is used for the tick
            labels on both heatmap axes.
        feature_importance: One importance value per entry of
            ``FEATURE_COLUMNS``, in the same order. Any sequence is accepted;
            it is converted to a NumPy array.
        model_dir: Output directory for the two images.
    """
    print("\n" + "="*60)
    print("üìä CR√âATION DES VISUALISATIONS")
    print("="*60)

    # savefig does not create missing directories.
    os.makedirs(model_dir, exist_ok=True)

    # Fancy indexing below requires an ndarray, not a plain list.
    importance = np.asarray(feature_importance)

    # Confusion matrix
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_,
                yticklabels=label_encoder.classes_)
    plt.title('Matrice de Confusion (Mod√®le Mobile)')
    plt.ylabel('Vraie classe')
    plt.xlabel('Classe pr√©dite')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    cm_path = os.path.join(model_dir, 'confusion_matrix_mobile.png')
    plt.savefig(cm_path, dpi=150)
    print(f"   ‚úÖ Matrice de confusion: {cm_path}")
    plt.close()

    # Feature importance, sorted from most to least important
    plt.figure(figsize=(10, 6))
    feature_names = FEATURE_COLUMNS
    indices = np.argsort(importance)[::-1]

    plt.bar(range(len(importance)), importance[indices])
    plt.xticks(range(len(importance)),
               [feature_names[i] for i in indices],
               rotation=45, ha='right')
    plt.title('Importance des Features (Mod√®le Mobile)')
    plt.ylabel('Importance')
    plt.xlabel('Features')
    plt.tight_layout()

    fi_path = os.path.join(model_dir, 'feature_importance_mobile.png')
    plt.savefig(fi_path, dpi=150)
    print(f"   ‚úÖ Importance des features: {fi_path}")
    plt.close()


# ============================================
# FONCTION 6: SAUVEGARDER LE MODÈLE
# ============================================

def save_model(model, scaler, label_encoder, model_dir):
    """Persist the model, scaler, label encoder and a JSON metadata file.

    Files written into ``model_dir`` (created if missing):
    ``asl_model_mobile.pkl``, ``scaler.pkl``, ``label_encoder.pkl`` and
    ``model_metadata.json``. The size of the model pickle is printed in MB.

    Args:
        model: Fitted classifier to pickle.
        scaler: Fitted scaler to pickle.
        label_encoder: Fitted encoder to pickle; its ``classes_`` are also
            written to the metadata file.
        model_dir: Destination directory.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("üíæ SAUVEGARDE DU MOD√àLE MOBILE")
    print(rule)

    os.makedirs(model_dir, exist_ok=True)

    def _pickle_into_dir(filename, obj):
        # Serialize obj to model_dir/filename and return the full path.
        target = os.path.join(model_dir, filename)
        with open(target, 'wb') as handle:
            pickle.dump(obj, handle)
        return target

    # Model, with its on-disk size
    model_path = _pickle_into_dir('asl_model_mobile.pkl', model)
    size_mb = os.path.getsize(model_path) / (1024 * 1024)
    print(f"   ‚úÖ Mod√®le: {model_path}")
    print(f"      üì¶ Taille: {size_mb:.2f} MB")

    # Scaler
    scaler_path = _pickle_into_dir('scaler.pkl', scaler)
    print(f"   ‚úÖ Scaler: {scaler_path}")

    # Label encoder
    encoder_path = _pickle_into_dir('label_encoder.pkl', label_encoder)
    print(f"   ‚úÖ Label Encoder: {encoder_path}")

    # Metadata: values come from the module-level configuration and the encoder.
    class_names = label_encoder.classes_
    metadata = dict(
        feature_columns=FEATURE_COLUMNS,
        classes=class_names.tolist(),
        n_classes=len(class_names),
        n_features=len(FEATURE_COLUMNS),
        model_type='RandomForestClassifier',
        n_estimators=N_ESTIMATORS,
        max_depth=MAX_DEPTH,
        optimized_for='mobile',
        target_format='ONNX',
    )

    metadata_path = os.path.join(model_dir, 'model_metadata.json')
    with open(metadata_path, 'w') as handle:
        json.dump(metadata, handle, indent=4)
    print(f"   ‚úÖ M√©tadonn√©es: {metadata_path}")


# ============================================
# FONCTION 7: TEST DE PRÉDICTION
# ============================================

def test_prediction(model, scaler, label_encoder, X_test, y_test):
    print("\n" + "="*60)
    print("üß™ TEST DE PR√âDICTION")
    print("="*60)
    
    idx = np.random.randint(0, len(X_test))
    sample = X_test[idx:idx+1]
    true_label = label_encoder.classes_[y_test[idx]]
    
    prediction = model.predict(sample)
    predicted_label = label_encoder.classes_[prediction[0]]
    
    probabilities = model.predict_proba(sample)[0]
    
    print(f"\nüìä √âchantillon #{idx}:")
    print(f"   Features: {sample[0][:5]}...")
    print(f"\nüéØ Vraie classe: {true_label}")
    print(f"ü§ñ Pr√©diction: {predicted_label}")
    print(f"   {'‚úÖ CORRECT' if true_label == predicted_label else '‚ùå INCORRECT'}")
    
    print(f"\nüìä Top 3 probabilit√©s:")
    top_3_idx = np.argsort(probabilities)[-3:][::-1]
    for i, idx in enumerate(top_3_idx, 1):
        label = label_encoder.classes_[idx]
        prob = probabilities[idx]
        print(f"   {i}. {label:15s}: {prob:.4f} ({prob*100:.2f}%)")


# ============================================
# FONCTION PRINCIPALE
# ============================================

def main():
    """Run the whole pipeline end to end and print a final summary.

    Steps, in order: load the CSV, preprocess, train, evaluate, make sure the
    output directory exists, write the plots, save the artifacts, run a
    single-sample prediction test, then print the summary.
    """
    wide_rule = "=" * 70
    print("\n" + wide_rule)
    print("ü§ñ ENTRA√éNEMENT DU MOD√àLE ASL OPTIMIS√â POUR MOBILE")
    print(wide_rule)

    # 1. Load
    dataset = load_and_explore_data()

    # 2. Preprocess
    prepared = preprocess_data(dataset)
    X_train, X_test, y_train, y_test, scaler, label_encoder = prepared

    # 3. Train
    model = train_model(X_train, y_train)

    # 4. Evaluate (the text report is already printed by evaluate_model)
    accuracy, _report, cm = evaluate_model(model, X_test, y_test, label_encoder)

    # 5. Output directory must exist before the plots are written
    os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

    # 6. Plots
    visualize_results(cm, label_encoder, model.feature_importances_, MODEL_OUTPUT_DIR)

    # 7. Persist artifacts
    save_model(model, scaler, label_encoder, MODEL_OUTPUT_DIR)

    # 8. Single-sample prediction test
    test_prediction(model, scaler, label_encoder, X_test, y_test)

    print("\n" + wide_rule)
    print("‚ú® ENTRA√éNEMENT TERMIN√â AVEC SUCC√àS !")
    print(wide_rule)

    summary_lines = (
        "\nüìä R√©sum√©:",
        f"   üéØ Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)",
        f"   üìÅ Mod√®le sauvegard√©: {MODEL_OUTPUT_DIR}",
        "   üì¶ Optimis√© pour conversion ONNX",
        f"   üå≥ Arbres: {N_ESTIMATORS} (r√©duit pour mobile)",
        f"   üìè Profondeur max: {MAX_DEPTH}",
    )
    for line in summary_lines:
        print(line)


# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


ü§ñ ENTRA√éNEMENT DU MOD√àLE ASL OPTIMIS√â POUR MOBILE
üìä CHARGEMENT DES DONN√âES

‚úÖ Dataset charg√© avec succ√®s!
   üìè Dimensions: 1500000 lignes √ó 13 colonnes

üè∑Ô∏è  Classes: 40
   ['a', 'b', 'bad', 'c', 'd', 'deaf', 'e', 'f', 'fine', 'g', 'good', 'goodbye', 'h', 'hello', 'hungry', 'i', 'j', 'k', 'l', 'm', 'me', 'n', 'no', 'o', 'p', 'please', 'q', 'r', 's', 'sorry', 't', 'thankyou', 'u', 'v', 'w', 'x', 'y', 'yes', 'you', 'z']

üìà R√âPARTITION PAR CLASSE:
   a              :  37500 √©chantillons
   b              :  37500 √©chantillons
   bad            :  37500 √©chantillons
   c              :  37500 √©chantillons
   d              :  37500 √©chantillons
   deaf           :  37500 √©chantillons
   e              :  37500 √©chantillons
   f              :  37500 √©chantillons
   fine           :  37500 √©chantillons
   g              :  37500 √©chantillons
   good           :  37500 √©chantillons
   goodbye        :  37500 √©chantillons
   h              :  37500 √©cha

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   46.3s finished


‚úÖ Entra√Ænement termin√©!

üîÑ Validation crois√©e (5-fold)...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   34.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   45.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   37.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_j

   Scores: ['0.9115', '0.9162', '0.9142', '0.9126', '0.9095']
   Moyenne: 0.9128 (¬±0.0023)

üìà √âVALUATION DU MOD√àLE


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    2.0s finished



üéØ ACCURACY: 0.9098 (90.98%)

üìä RAPPORT DE CLASSIFICATION:
------------------------------------------------------------
              precision    recall  f1-score   support

           a       0.89      0.98      0.93      7500
           b       0.94      0.99      0.96      7500
         bad       0.83      0.68      0.75      7500
           c       0.97      0.81      0.88      7500
           d       0.76      0.94      0.84      7500
        deaf       0.87      0.84      0.86      7500
           e       0.81      0.91      0.86      7500
           f       1.00      1.00      1.00      7500
        fine       0.88      0.95      0.91      7500
           g       0.94      1.00      0.97      7500
        good       0.92      0.65      0.76      7500
     goodbye       0.88      0.86      0.87      7500
           h       1.00      1.00      1.00      7500
       hello       0.84      0.84      0.84      7500
      hungry       0.99      0.96      0.98      7500
         

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.0s finished
