In [None]:
# ============================================
# IMPORTS
# ============================================

import ast
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    confusion_matrix, classification_report,
    roc_curve, auc, roc_auc_score,
    ConfusionMatrixDisplay
)

import mlflow
from mlflow.tracking import MlflowClient

# Figure defaults shared by every plot in this notebook. The style sheet is
# applied before the palette, in the same order as before.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Confirmation banner for the setup cell (three lines: rule, message, rule).
banner_rule = "=" * 80
print(banner_rule, " ENVIRONNEMENT D'ÉVALUATION CONFIGURÉ", banner_rule, sep="\n")

In [None]:
# ============================================
# DATA LOADING
# ============================================

def _keywords_to_text(value):
    """Convert one `keywords_clean` cell into a space-separated keyword string.

    Parameters
    ----------
    value : str, iterable of str, None or float NaN
        Either the textual representation of a Python list (as stored in the
        CSV), an already-parsed iterable of keywords, or a missing value.

    Returns
    -------
    str
        The keywords joined by single spaces; '' for a missing value.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    if isinstance(value, str):
        # literal_eval only accepts Python literals, so a crafted CSV cell
        # cannot execute code the way eval() would.
        value = ast.literal_eval(value)
    elif value is None or (isinstance(value, float) and np.isnan(value)):
        # pandas reads an empty CSV field back as NaN; treat it as "no keywords"
        # instead of failing inside ' '.join.
        return ''
    return ' '.join(value)


df = pd.read_csv('../data/annotated/podcasts_annotated.csv')

# Rebuild the text column only when the annotated file does not already ship it.
if 'keywords_text' not in df.columns:
    df['keywords_text'] = df['keywords_clean'].apply(_keywords_to_text)

# An empty keyword string saved to CSV comes back as NaN, which TfidfVectorizer
# rejects as an invalid document; normalise it to an empty string.
df['keywords_text'] = df['keywords_text'].fillna('')

X = df['keywords_text']
y = df['is_kid_friendly']

# Stratified 80/20 split with a fixed seed so every run evaluates on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Dataset prêt: Train={len(X_train):,}, Test={len(X_test):,}")

In [None]:
# ============================================
# FINAL MODEL TRAINING
# (use the best parameters found during tuning)
# ============================================

# TODO: replace with the optimal parameters from notebook 03

def _build_tfidf():
    """Return a new, unfitted TF-IDF vectorizer with the settings both pipelines share."""
    return TfidfVectorizer(
        max_features=None,
        min_df=2,
        max_df=0.95,
        ngram_range=(1, 2),
        sublinear_tf=True,
    )


# Each pipeline gets its own vectorizer instance so the two models are fitted independently.
best_lr = Pipeline(steps=[
    ('tfidf', _build_tfidf()),
    ('clf', LogisticRegression(
        C=1.0,
        max_iter=1000,
        random_state=42,
        class_weight='balanced',
        solver='liblinear',
    )),
])

best_svm = Pipeline(steps=[
    ('tfidf', _build_tfidf()),
    ('clf', LinearSVC(
        C=1.0,
        max_iter=2000,
        random_state=42,
        class_weight='balanced',
        dual='auto',
    )),
])

# Display name -> pipeline; the evaluation cells below iterate over this mapping.
models = {'Logistic Regression': best_lr, 'Linear SVM': best_svm}

# Training
for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    print(f"✓ {model_name} entraîné")

In [None]:
# ============================================
# CONFUSION MATRICES
# ============================================

# One panel per entry in `models`, 7 inches wide each (14x5 for the two models
# defined above). Sizing the grid from the mapping keeps this cell working when
# a model is added or removed.
n_models = len(models)
fig, axes = plt.subplots(1, n_models, figsize=(7 * n_models, 5), squeeze=False)
# squeeze=False always returns a 2-D array, even for a single panel; flatten it
# so panels can be paired with models by position.
axes = axes.ravel()

for ax, (name, model) in zip(axes, models.items()):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=['Not Kid-Friendly', 'Kid-Friendly']
    )
    # Integer cell format: the matrix holds raw counts.
    disp.plot(ax=ax, cmap='Blues', values_format='d')
    ax.set_title(f'{name}\nConfusion Matrix', fontsize=13, fontweight='bold')
    # The global darkgrid style would otherwise draw grid lines over the heatmap.
    ax.grid(False)

plt.tight_layout()
plt.savefig('../models/confusion_matrices.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Matrices de confusion sauvegardées")

In [None]:
# ============================================
# CLASSIFICATION REPORTS
# ============================================

heavy_rule = "=" * 80
thin_rule = "─" * 80

print(f"\n{heavy_rule}")
print(" RAPPORTS DE CLASSIFICATION DÉTAILLÉS")
print(heavy_rule)

# Per-model precision / recall / F1 on the held-out test split.
for name, model in models.items():
    y_pred = model.predict(X_test)
    report = classification_report(
        y_test,
        y_pred,
        target_names=['Not Kid-Friendly', 'Kid-Friendly'],
        digits=4,
    )
    # Header, rule, report and closing rule, one per line.
    print(f"\n{name}:", thin_rule, report, thin_rule, sep="\n")

In [None]:
# ============================================
# ROC CURVES
# ============================================

roc_fig, roc_ax = plt.subplots(figsize=(10, 7))

for name, model in models.items():
    # Use class-1 probabilities when the pipeline exposes them, otherwise fall
    # back to the raw decision-function scores.
    has_proba = hasattr(model, 'predict_proba')
    y_score = (
        model.predict_proba(X_test)[:, 1]
        if has_proba
        else model.decision_function(X_test)
    )

    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    roc_ax.plot(fpr, tpr, linewidth=2.5, label=f'{name} (AUC = {roc_auc:.4f})')

# Diagonal reference line.
roc_ax.plot([0, 1], [0, 1], 'k--', linewidth=1.5, label='Random Classifier')

roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
roc_ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
roc_ax.set_title(
    'ROC Curves - Kid-Friendly Podcast Classification',
    fontsize=14, fontweight='bold', pad=20,
)
roc_ax.legend(loc='lower right', fontsize=11, framealpha=0.95)
roc_ax.grid(True, alpha=0.3)

roc_fig.tight_layout()
roc_fig.savefig('../models/roc_curves.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Courbes ROC sauvegardées")

In [None]:
# ============================================
# MLFLOW VISUALISATION (if available)
# ============================================

MLFLOW_TRACKING_URI = "http://localhost:5000"

# Best-effort cell: any failure is reported in one line and the notebook carries on.
try:
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    client = MlflowClient()

    experiment = mlflow.get_experiment_by_name("podcast-classification-kid-friendly")

    if not experiment:
        print(" Expérience MLflow introuvable")
    else:
        # Ten runs, ordered by F1 descending.
        runs = mlflow.search_runs(
            experiment_ids=[experiment.experiment_id],
            order_by=["metrics.f1_score DESC"],
            max_results=10
        )

        if runs.empty:
            print(" Aucun run trouvé dans l'expérience")
        else:
            print("\n" + "="*80)
            print(" TOP 10 MLFLOW RUNS (par F1-Score)")
            print("="*80 + "\n")

            # Only show the columns that were actually logged for these runs.
            display_cols = ['tags.model_type', 'metrics.f1_score',
                            'metrics.accuracy', 'metrics.precision', 'metrics.recall']
            available_cols = [wanted for wanted in display_cols if wanted in runs.columns]

            print(runs[available_cols].head(10).to_string(index=False))
            print("\n" + "="*80)

            # Bar chart, drawn only when both the score and the label column exist.
            required_cols = ['metrics.f1_score', 'tags.model_type']
            if all(needed in runs.columns for needed in required_cols):
                mlflow_fig, mlflow_ax = plt.subplots(figsize=(12, 6))

                plot_data = runs.dropna(subset=required_cols).head(10)
                f1_values = plot_data['metrics.f1_score'].values
                bar_positions = range(len(plot_data))

                mlflow_ax.barh(
                    bar_positions, f1_values,
                    color=plt.cm.viridis(np.linspace(0.3, 0.9, len(plot_data))),
                )
                mlflow_ax.set_yticks(bar_positions)
                mlflow_ax.set_yticklabels(plot_data['tags.model_type'].values)
                mlflow_ax.set_xlabel('F1-Score', fontsize=12, fontweight='bold')
                mlflow_ax.set_title('MLflow Runs Comparison (Top 10)',
                                    fontsize=14, fontweight='bold', pad=20)
                mlflow_ax.grid(axis='x', alpha=0.3)

                # Value label just to the right of each bar.
                for position, score in enumerate(f1_values):
                    mlflow_ax.text(score + 0.005, position, f'{score:.4f}',
                                   va='center', fontweight='bold')

                mlflow_fig.tight_layout()
                mlflow_fig.savefig('../models/mlflow_runs_comparison.png',
                                   dpi=150, bbox_inches='tight')
                plt.show()

                print("✓ Visualisation MLflow sauvegardée")

except Exception as e:
    print(f" MLflow non accessible: {str(e)[:60]}")
    print(" → Passé (optionnel)")

In [None]:
# ============================================
# FEATURE IMPORTANCE (Logistic Regression)
# ============================================

# Number of features shown per panel.
TOP_N_FEATURES = 20


def _plot_top_features(ax, features, color, title):
    """Draw (feature, coefficient) pairs as horizontal bars on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes
        Panel to draw on.
    features : list of (str, float)
        Pairs in bottom-to-top display order: the first pair is drawn at the
        bottom of the panel, the last one at the top.
    color : str
        Bar colour.
    title : str
        Panel title.
    """
    labels = [feature for feature, _weight in features]
    weights = [weight for _feature, weight in features]
    positions = range(len(features))

    ax.barh(positions, weights, color=color)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels, fontsize=10)
    ax.set_xlabel('Coefficient', fontsize=11, fontweight='bold')
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)


lr_model = models['Logistic Regression']
tfidf = lr_model.named_steps['tfidf']
clf = lr_model.named_steps['clf']

feature_names = tfidf.get_feature_names_out()
coefficients = clf.coef_[0]

# Sort once (ascending) and slice both ends instead of sorting twice.
coef_order = np.argsort(coefficients)

# Largest coefficients (Kid-Friendly), ascending: strongest feature last.
top_positive_idx = coef_order[-TOP_N_FEATURES:]
top_positive_features = [(feature_names[i], coefficients[i]) for i in top_positive_idx]

# Smallest coefficients (Not Kid-Friendly), ascending: most negative feature first.
top_negative_idx = coef_order[:TOP_N_FEATURES]
top_negative_features = [(feature_names[i], coefficients[i]) for i in top_negative_idx]

# Visualisation
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

_plot_top_features(
    ax1, top_positive_features, '#2ecc71',
    f'Top {TOP_N_FEATURES} Kid-Friendly Features',
)
# Reversed for display only, so the most negative feature sits at the top of its
# panel, mirroring the positive panel where the strongest feature is also on top.
_plot_top_features(
    ax2, top_negative_features[::-1], '#e74c3c',
    f'Top {TOP_N_FEATURES} Not Kid-Friendly Features',
)

plt.tight_layout()
plt.savefig('../models/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Feature importance sauvegardée")

In [None]:
# ============================================
# ERROR ANALYSIS
# ============================================

def _print_examples(samples, limit=3, width=100):
    """Print up to `limit` texts from `samples`, each cut to `width` characters.

    Prints nothing when `samples` is empty.
    """
    if len(samples) > 0:
        print("\nExemples:")
        for i, text in enumerate(samples.head(limit).values, 1):
            print(f"  {i}. {text[:width]}...")


best_model = models['Logistic Regression']
y_pred = best_model.predict(X_test)

# Misclassified test rows, with their true and predicted labels.
is_error = y_pred != y_test
errors = X_test[is_error]
errors_true = y_test[is_error]
errors_pred = y_pred[is_error]

error_rate_pct = len(errors) / len(y_test) * 100

print("\n" + "="*80)
print(f" ANALYSE D'ERREURS - {len(errors)} erreurs sur {len(y_test)} ({error_rate_pct:.2f}%)")
print("="*80)

# False positives: true label 0, predicted 1.
fp = errors[(errors_true == 0) & (errors_pred == 1)]
print(f"\nFaux Positifs (prédit Kid-Friendly, réel Not): {len(fp)}")
_print_examples(fp)

# False negatives: true label 1, predicted 0.
fn = errors[(errors_true == 1) & (errors_pred == 0)]
print(f"\nFaux Négatifs (prédit Not, réel Kid-Friendly): {len(fn)}")
_print_examples(fn)

print("\n" + "="*80)
print(" Prochaine étape: 05_model_registry.ipynb")
print("="*80)

In [None]:
# ============================================
# DEEPCHECKS: MODEL PERFORMANCE VALIDATION
# ============================================
# Optional cell: runs six Deepchecks checks against the Logistic Regression
# pipeline. Every check is best-effort (its exception is printed, not raised),
# but each failure is recorded so the final banner only declares the model
# validated when nothing went wrong.

class _TextColumnModel:
    """Adapter that feeds one DataFrame column to a text pipeline.

    The pipelines in this notebook are fitted on a 1-D collection of strings
    (`X_train`), whereas the Deepchecks datasets built below store those strings
    in a DataFrame column. Iterating a DataFrame yields its column labels, not
    its rows, so passing it straight to the TF-IDF step would vectorise the
    column name. This wrapper selects the text column before delegating.

    NOTE(review): assumes Deepchecks calls predict/predict_proba with a
    DataFrame of feature columns - confirm against the installed version.
    """

    def __init__(self, pipeline, text_column):
        self._pipeline = pipeline
        self._text_column = text_column
        # Expose the fitted label order the same way the wrapped pipeline does.
        self.classes_ = pipeline.classes_

    def predict(self, X):
        """Return the wrapped pipeline's predictions for the text column of `X`."""
        return self._pipeline.predict(X[self._text_column])

    def predict_proba(self, X):
        """Return the wrapped pipeline's class probabilities for the text column of `X`."""
        return self._pipeline.predict_proba(X[self._text_column])


print("\n" + "="*80)
print(" VALIDATION DEEPCHECKS - PERFORMANCE DES MODÈLES")
print("="*80)

try:
    # Imported here on purpose: Deepchecks is an optional dependency.
    # NOTE(review): `PerformanceReport` may have been renamed to
    # `TrainTestPerformance` in recent Deepchecks releases - confirm; if the name
    # is missing this import fails and the ImportError branch below reports it.
    from deepchecks.tabular import Dataset
    from deepchecks.tabular.checks import (
        ConfusionMatrixReport, RocReport, 
        SimpleModelComparison, PerformanceReport,
        WeakSegmentsPerformance, ModelInferenceTime
    )
    
    # Build the Deepchecks datasets from the existing train/test split.
    train_df_dc = pd.DataFrame({'keywords_text': X_train, 'is_kid_friendly': y_train})
    test_df_dc = pd.DataFrame({'keywords_text': X_test, 'is_kid_friendly': y_test})
    
    train_ds = Dataset(train_df_dc, label='is_kid_friendly', cat_features=[])
    test_ds = Dataset(test_df_dc, label='is_kid_friendly', cat_features=[])
    
    # Use the best model (Logistic Regression), wrapped so it accepts a DataFrame.
    best_model = models['Logistic Regression']
    dc_model = _TextColumnModel(best_model, 'keywords_text')
    
    # Names of checks that raised or whose condition did not pass.
    issues = []
    
    # Checks 1, 2, 5 and 6 are single-dataset checks: they take (dataset, model),
    # not (train, test, model), and are evaluated on the held-out test set.
    print("\n1. Analyse de la matrice de confusion (Deepchecks)...")
    try:
        result_cm = ConfusionMatrixReport().run(test_ds, dc_model)
        print("   ✓ Matrice de confusion analysée")
        result_cm.show()
    except Exception as e:
        issues.append('ConfusionMatrixReport')
        print(f"   ⚠ Erreur: {str(e)[:60]}")
    
    print("\n2. Analyse ROC détaillée...")
    try:
        result_roc = RocReport().run(test_ds, dc_model)
        print("   ✓ Courbe ROC analysée")
        result_roc.show()
    except Exception as e:
        issues.append('RocReport')
        print(f"   ⚠ Erreur: {str(e)[:60]}")
    
    print("\n3. Comparaison avec modèle simple...")
    try:
        # A check without conditions has nothing to pass or fail, so attach the
        # library's default gain condition before asking for a verdict.
        # NOTE(review): default threshold - tune to the project's requirements.
        simple_check = SimpleModelComparison().add_condition_gain_greater_than()
        result_simple = simple_check.run(train_ds, test_ds, dc_model)
        if result_simple.passed_conditions():
            print("   ✓ Modèle surpasse le baseline simple")
        else:
            issues.append('SimpleModelComparison')
            print("   ⚠ Modèle ne surpasse pas significativement le baseline")
    except Exception as e:
        issues.append('SimpleModelComparison')
        print(f"   ⚠ Erreur: {str(e)[:60]}")
    
    print("\n4. Rapport de performance global...")
    try:
        result_perf = PerformanceReport().run(train_ds, test_ds, dc_model)
        print("   ✓ Performance globale analysée")
        result_perf.show()
    except Exception as e:
        issues.append('PerformanceReport')
        print(f"   ⚠ Erreur: {str(e)[:60]}")
    
    print("\n5. Détection des segments faibles...")
    try:
        # NOTE(review): default condition threshold; this check may also need
        # more than one feature column - confirm it applies to this dataset.
        weak_check = (
            WeakSegmentsPerformance()
            .add_condition_segments_relative_performance_greater_than()
        )
        result_weak = weak_check.run(test_ds, dc_model)
        if result_weak.passed_conditions():
            print("   ✓ Performance uniforme sur tous les segments")
        else:
            issues.append('WeakSegmentsPerformance')
            print("   ⚠ Segments faibles détectés")
            print("   → Certains types de podcasts ont des performances plus faibles")
        result_weak.show()
    except Exception as e:
        issues.append('WeakSegmentsPerformance')
        print(f"   ⚠ Erreur: {str(e)[:60]}")
    
    print("\n6. Temps d'inférence du modèle...")
    try:
        # NOTE(review): default per-sample time limit - tune if needed.
        time_check = ModelInferenceTime().add_condition_inference_time_less_than()
        result_time = time_check.run(test_ds, dc_model)
        if result_time.passed_conditions():
            print("   ✓ Temps d'inférence acceptable")
        else:
            issues.append('ModelInferenceTime')
            print("   ⚠ Temps d'inférence élevé")
        print(f"   → Temps moyen: {result_time.value:.4f}s par prédiction")
    except Exception as e:
        issues.append('ModelInferenceTime')
        print(f"   ⚠ Erreur: {str(e)[:60]}")
    
    print("\n" + "="*80)
    print(" RÉSUMÉ VALIDATION MODÈLE")
    print("="*80)
    if issues:
        # At least one check raised or failed its condition: do not claim success.
        print(f"\n ⚠ {len(issues)} vérification(s) en échec ou en erreur: {', '.join(issues)}")
        print(" → Les résultats détaillés sont affichés ci-dessus")
        print("\n" + "="*80)
        print(" ℹ Modèle NON validé - Corriger les points ci-dessus avant le registry")
        print("="*80)
    else:
        print("\n ✓ Validation des performances terminée")
        print(" → Les résultats détaillés sont affichés ci-dessus")
        print("\n" + "="*80)
        print(" ℹ Modèle validé - Prêt pour le registry")
        print("="*80)
    
except ImportError as e:
    print("\n⚠ Deepchecks non installé")
    print("  → Installation: pip install deepchecks")
    # Show which import failed: a renamed check also lands in this branch.
    print(f"  → Détail: {str(e)[:100]}")
    print("  → Suite sans validation deepchecks")
except Exception as e:
    print(f"\n⚠ Erreur deepchecks: {str(e)[:100]}")
    print("  → Suite sans validation deepchecks")