In [None]:
# ============================================
# IMPORTS ET CONFIGURATION
# ============================================

import ast
import time
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import requests

from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, cross_validate
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, 
    f1_score, roc_auc_score
)

import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

# Silence every warning for the rest of the session and seed NumPy's
# legacy global random generator.
warnings.filterwarnings('ignore')
np.random.seed(42)

# Banner confirming that the setup cell ran.
banner = "=" * 80
print(banner, " ENVIRONNEMENT CONFIGURÉ", banner, sep="\n")

In [None]:
# ============================================
# CONFIGURATION MLFLOW
# ============================================

# Address of the MLflow tracking server that the connection check below probes.
MLFLOW_TRACKING_URI = "http://localhost:5000"
# MLflow logging starts disabled; the check further down turns it on when the
# server answers.
MLFLOW_ENABLED = False

def check_mlflow_connection(uri):
    """Tell whether an HTTP server answers at ``uri``.

    Sends a single GET request with a 2-second timeout.

    Args:
        uri: URL to probe (the MLflow tracking server address).

    Returns:
        True if the response status code is exactly 200; False if the status
        differs or if the request fails with a ``requests`` error (connection
        refused, timeout, malformed URL, ...).
    """
    try:
        response = requests.get(uri, timeout=2)
    except requests.RequestException:
        # Only network/HTTP-level failures mean "not reachable". Anything else
        # (KeyboardInterrupt, programming errors) must surface, not be hidden.
        return False
    return response.status_code == 200

print("="*80)
print(" VÉRIFICATION MLFLOW")
print("="*80)

# Start from "disabled" so that re-running this cell after the server went
# away does not keep a stale True from a previous execution.
MLFLOW_ENABLED = False

if check_mlflow_connection(MLFLOW_TRACKING_URI):
    try:
        mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
        mlflow.set_experiment("podcast-classification-kid-friendly")
    except Exception as e:
        # The probe only proves that *something* answered HTTP 200 at the URI.
        # If it is not a working MLflow server, keep the notebook running
        # without tracking instead of aborting the cell.
        print(f" ✗ MLflow: {str(e)[:100]}")
    else:
        MLFLOW_ENABLED = True
        print(f" ✓ MLflow connecté: {MLFLOW_TRACKING_URI}")
else:
    print(" ✗ MLflow non accessible")
    print(" → Pour activer: cd docker && docker-compose up -d")

print(f" MLflow: {'Activé' if MLFLOW_ENABLED else 'Désactivé'}")
print("="*80)

In [None]:
# ============================================
# CHARGEMENT DES DONNÉES
# ============================================

# Annotated podcast dataset; the path is relative to the notebook's directory.
df = pd.read_csv('../data/annotated/podcasts_annotated.csv')


def _keywords_to_text(value):
    """Convert one ``keywords_clean`` cell into a space-separated string.

    Args:
        value: Either the textual repr of a sequence of keywords (what a CSV
            round-trip produces), an already-materialised iterable of strings,
            or a missing value (None / NaN).

    Returns:
        The keywords joined by single spaces; an empty string for a missing cell.

    Raises:
        ValueError, SyntaxError: if a string cell is not a valid Python literal.
    """
    if isinstance(value, str):
        # literal_eval only accepts Python literals, so a crafted CSV cell
        # cannot execute code the way it could with eval().
        value = ast.literal_eval(value)
    elif value is None or (isinstance(value, float) and pd.isna(value)):
        # Empty CSV cells are read as NaN floats, which cannot be joined.
        return ''
    return ' '.join(value)


# Build the text feature only when the CSV does not already provide it.
if 'keywords_text' not in df.columns:
    df['keywords_text'] = df['keywords_clean'].apply(_keywords_to_text)

print(f"Dataset chargé: {len(df):,} exemples")

In [None]:
# ============================================
# SPLIT TRAIN/TEST STRATIFIÉ
# ============================================

# Feature = joined keyword text, target = kid-friendly flag.
X, y = df['keywords_text'], df['is_kid_friendly']

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the size of each partition and its share of the full dataset.
n_total = len(df)
split_banner = "=" * 80
split_report = [
    "\n" + split_banner,
    " SPLIT TRAIN/TEST",
    split_banner,
    f" Train: {len(X_train):,} ({len(X_train)/n_total*100:.1f}%)",
    f" Test:  {len(X_test):,} ({len(X_test)/n_total*100:.1f}%)",
    split_banner,
]
print("\n".join(split_report))

In [None]:
# ============================================
# DEEPCHECKS: TRAIN-TEST VALIDATION
# ============================================

# Optional sanity check of the train/test split with Deepchecks.
# Everything runs inside one try block: a missing package or any runtime error
# is reported on stdout and the notebook continues without this validation.
print("\n" + "="*80)
print(" VALIDATION DEEPCHECKS - TRAIN/TEST SPLIT")
print("="*80)

try:
    # Imported here rather than at the top of the notebook so that an absent
    # deepchecks install lands in the ImportError handler below.
    # NOTE(review): TrainTestFeatureDrift is imported but never used in this cell.
    from deepchecks.tabular import Dataset
    from deepchecks.tabular.checks import (
        TrainTestLabelDrift, NewLabelTrainTest, 
        TrainTestSamplesMix, TrainTestFeatureDrift
    )
    
    # Build the Deepchecks datasets from the split produced earlier
    train_df_dc = pd.DataFrame({'keywords_text': X_train, 'is_kid_friendly': y_train})
    test_df_dc = pd.DataFrame({'keywords_text': X_test, 'is_kid_friendly': y_test})
    
    # An empty cat_features list is passed explicitly — presumably so that the
    # text column is not inferred as categorical; TODO confirm against the
    # Deepchecks Dataset documentation.
    train_ds = Dataset(train_df_dc, label='is_kid_friendly', cat_features=[])
    test_ds = Dataset(test_df_dc, label='is_kid_friendly', cat_features=[])
    
    # NOTE(review): the three checks below are run without any condition added
    # and their outcome is read through `.passed()`. Confirm that the object
    # returned by `.run()` exposes `.passed()` in the installed Deepchecks
    # version and what it reports when no condition is configured; if the
    # attribute is missing, the AttributeError is caught by the generic
    # handler at the bottom and the validation is skipped.
    print("\n1. Vérification du drift des labels...")
    result_label_drift = TrainTestLabelDrift().run(train_ds, test_ds)
    if result_label_drift.passed():
        print("   ✓ Distribution des labels similaire train/test")
    else:
        print("   ⚠ Drift détecté dans la distribution des labels")
        print(f"   → Peut affecter la généralisation du modèle")
    
    print("\n2. Vérification des nouveaux labels...")
    result_new_labels = NewLabelTrainTest().run(train_ds, test_ds)
    if result_new_labels.passed():
        print("   ✓ Pas de nouveaux labels dans test")
    else:
        print("   ⚠ Nouveaux labels trouvés dans test set")
    
    print("\n3. Détection de fuite de données (data leakage)...")
    result_leakage = TrainTestSamplesMix().run(train_ds, test_ds)
    if result_leakage.passed():
        print("   ✓ Pas de fuite de données détectée")
    else:
        print("   ⚠ Échantillons identiques train/test détectés (data leakage!)")
    
    print("\n" + "="*80)
    print(" RÉSUMÉ VALIDATION TRAIN/TEST")
    print("="*80)
    
    # (display name, result) pairs used for the pass/fail tally below
    checks_results = [
        ("Label drift", result_label_drift),
        ("Nouveaux labels", result_new_labels),
        ("Data leakage", result_leakage)
    ]
    
    passed = sum(1 for _, r in checks_results if r.passed())
    total = len(checks_results)
    
    print(f"\n Checks réussis: {passed}/{total}")
    
    # List the failing checks by name, or confirm that the split is validated
    failed_checks = [name for name, r in checks_results if not r.passed()]
    if failed_checks:
        print(f"\n ⚠ Checks échoués:")
        for check in failed_checks:
            print(f"   • {check}")
        print("\n → Recommandation: Vérifier le split train/test")
    else:
        print("\n ✓ Split train/test validé!")
    
    # This closing banner is printed whether or not some checks failed
    print("\n" + "="*80)
    print(" ℹ Train/Test validés - Prêt pour entraînement")
    print("="*80)
    
except ImportError:
    # Deepchecks (or one of the imported names) is unavailable: skip the validation
    print("\n⚠ Deepchecks non installé")
    print("  → Installation: pip install deepchecks")
    print("  → Suite sans validation deepchecks")
except Exception as e:
    # Any other failure is reported (message truncated to 100 characters) and ignored
    print(f"\n⚠ Erreur deepchecks: {str(e)[:100]}")
    print("  → Suite sans validation deepchecks")

In [None]:
# ============================================
# DÉFINITION DES MODÈLES BASELINE
# ============================================

# TF-IDF settings shared by every baseline pipeline.
tfidf_params = dict(
    max_features=None,
    min_df=2,
    max_df=0.95,
    ngram_range=(1, 2),
    sublinear_tf=True,
)


def _tfidf_pipeline(classifier):
    """Wrap ``classifier`` behind its own freshly built TF-IDF vectorizer."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(**tfidf_params)),
        ('clf', classifier),
    ])


# Display name -> unfitted pipeline; insertion order is the training order.
baseline_models = {
    'Dummy (Baseline Naïve)': _tfidf_pipeline(
        DummyClassifier(strategy='most_frequent', random_state=42)
    ),
    'Logistic Regression': _tfidf_pipeline(
        LogisticRegression(
            max_iter=1000,
            random_state=42,
            class_weight='balanced',
            solver='liblinear',
        )
    ),
    'Linear SVM': _tfidf_pipeline(
        LinearSVC(
            max_iter=2000,
            random_state=42,
            class_weight='balanced',
            dual='auto',
        )
    ),
    'Multinomial Naive Bayes': _tfidf_pipeline(MultinomialNB(alpha=1.0)),
}

# Numbered listing of the configured models.
models_banner = "=" * 80
print("\n" + models_banner)
print(" MODÈLES BASELINE CONFIGURÉS")
print(models_banner)
for position, label in enumerate(baseline_models, start=1):
    print(f"   {position}. {label}")
print(models_banner)

In [None]:
# ============================================
# FONCTION D'ENTRAÎNEMENT
# ============================================

def _positive_class_scores(model, X):
    """Return continuous scores for ROC-AUC, or None when unavailable.

    Uses column 1 of ``predict_proba`` when the model has it, otherwise
    ``decision_function``. Any error raised while scoring is treated as
    "no scores available" (best effort).
    """
    try:
        if hasattr(model, 'predict_proba'):
            return model.predict_proba(X)[:, 1]
        if hasattr(model, 'decision_function'):
            return model.decision_function(X)
    except Exception:
        # Scores are optional: without them only ROC-AUC is lost.
        return None
    return None


def train_and_evaluate_model(model, X_train, y_train, X_test, y_test,
                             model_name, log_to_mlflow=None):
    """Fit a model, evaluate it on the test set and optionally log to MLflow.

    Args:
        model: Estimator exposing ``fit`` and ``predict`` (and optionally
            ``predict_proba`` or ``decision_function``). Fitted in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        model_name: Display name, also used as the MLflow run name and
            ``model_type`` tag.
        log_to_mlflow: Whether to log to MLflow. ``None`` (default) falls back
            to the global ``MLFLOW_ENABLED`` flag.

    Returns:
        dict with keys ``model``, ``model_name``, ``accuracy``, ``precision``,
        ``recall``, ``f1_score``, ``roc_auc`` (NaN when no scores are available
        or the metric cannot be computed), ``train_time`` (seconds spent in
        ``fit``), ``y_pred`` and ``y_pred_proba`` (None when unavailable).

    Raises:
        Whatever ``fit``, ``predict`` or the metric functions raise. MLflow
        failures never propagate: they are printed and logging is skipped.
    """
    if log_to_mlflow is None:
        log_to_mlflow = MLFLOW_ENABLED

    print(f"\n{'─'*80}")
    print(f" Entraînement: {model_name}")
    print(f"{'─'*80}")

    # Tracks whether THIS call opened a run, independently of log_to_mlflow.
    # Otherwise a failure right after start_run (e.g. in set_tag) would leave
    # the run open and make every later start_run fail.
    run_started = False
    if log_to_mlflow:
        try:
            mlflow.start_run(run_name=model_name)
            run_started = True
            mlflow.set_tag("model_type", model_name)
            mlflow.set_tag("framework", "scikit-learn")
        except Exception as e:
            print(f"   ⚠ MLflow: {str(e)[:50]}")
            log_to_mlflow = False

    # Assume failure until the evaluation completes, so that a crashed
    # training is not recorded as a finished run.
    run_status = "FAILED"
    try:
        # Training
        start_time = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_time

        # Predictions and, when available, continuous scores
        y_pred = model.predict(X_test)
        y_pred_proba = _positive_class_scores(model, X_test)

        # Metrics
        results = {
            'model': model,
            'model_name': model_name,
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, zero_division=0),
            'recall': recall_score(y_test, y_pred, zero_division=0),
            'f1_score': f1_score(y_test, y_pred, zero_division=0),
            'train_time': train_time,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba
        }

        results['roc_auc'] = np.nan
        if y_pred_proba is not None:
            try:
                results['roc_auc'] = roc_auc_score(y_test, y_pred_proba)
            except Exception:
                # Metric not computable for this input: keep NaN.
                pass

        # MLflow logging (best effort)
        if log_to_mlflow:
            try:
                for metric_name in ('accuracy', 'precision', 'recall',
                                    'f1_score', 'train_time'):
                    mlflow.log_metric(metric_name, results[metric_name])
                if not np.isnan(results['roc_auc']):
                    mlflow.log_metric("roc_auc", results['roc_auc'])
                mlflow.sklearn.log_model(model, "model")
            except Exception as e:
                print(f"   ⚠ Log error: {str(e)[:50]}")

        # Console report
        print(f" Terminé en {train_time:.2f}s")
        print(f"\n Métriques:")
        print(f"   • Accuracy:  {results['accuracy']:.4f}")
        print(f"   • Precision: {results['precision']:.4f}")
        print(f"   • Recall:    {results['recall']:.4f}")
        print(f"   • F1-Score:  {results['f1_score']:.4f}")
        if not np.isnan(results['roc_auc']):
            print(f"   • ROC-AUC:   {results['roc_auc']:.4f}")

        if log_to_mlflow and mlflow.active_run():
            print(f"\n  ✓ MLflow Run: {mlflow.active_run().info.run_id[:8]}")

        run_status = "FINISHED"
        return results

    finally:
        # Close the run whenever this call opened one, even if logging was
        # disabled afterwards or the training raised.
        if run_started:
            try:
                mlflow.end_run(status=run_status)
            except Exception as e:
                print(f"   ⚠ MLflow: {str(e)[:50]}")

In [None]:
# ============================================
# ENTRAÎNEMENT DES MODÈLES BASELINE
# ============================================

phase_banner = "=" * 80
print("\n" + phase_banner)
print(" PHASE 1: ENTRAÎNEMENT BASELINE")
print(phase_banner)

# Train and evaluate each baseline in definition order, keeping the result
# dict of every model under its display name.
baseline_results = {}
for model_name, model in baseline_models.items():
    baseline_results[model_name] = train_and_evaluate_model(
        model=model,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        model_name=model_name,
    )

print("\n" + phase_banner)
print(" PHASE 1 TERMINÉE")
print(phase_banner)

In [None]:
# ============================================
# COMPARAISON DES RÉSULTATS
# ============================================

# Display column -> key inside each model's result dict (order = column order).
metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
    'ROC-AUC': 'roc_auc',
    'Temps (s)': 'train_time',
}

# One summary row per trained model.
summary_rows = []
for name, results in baseline_results.items():
    row = {'Modèle': name}
    for column, key in metric_columns.items():
        row[column] = results[key]
    summary_rows.append(row)

# Best F1 first.
df_results = pd.DataFrame(summary_rows).sort_values('F1-Score', ascending=False)

results_banner = "=" * 80
print("\n" + results_banner)
print(" RÉSULTATS BASELINE")
print(results_banner + "\n")
print(df_results.to_string(index=False))

# The top row after sorting is the best model by F1.
best = df_results.iloc[0]
print(f"\n MEILLEUR: {best['Modèle']}")
print(f"   • F1-Score: {best['F1-Score']:.4f}")
print(results_banner)

In [None]:
# ============================================
# CROSS-VALIDATION (optionnel)
# ============================================

print("\n" + "="*80)
print(" CROSS-VALIDATION STRATIFIÉE (5-fold)")
print("="*80)

# Metrics reported for each model, in display order.
cv_metrics = ['f1', 'accuracy']

# With a fixed integer random_state the splitter yields the same folds on
# every use, so one instance serves all models.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model_info in baseline_results.items():
    # The naive baseline is not worth cross-validating.
    if 'Dummy' in model_name:
        continue

    print(f"\n{model_name}:")

    # One cross_validate call scores every metric on the same fitted folds,
    # instead of refitting the whole pipeline once per metric.
    cv_scores = cross_validate(
        model_info['model'], X_train, y_train,
        cv=skf, scoring=cv_metrics, n_jobs=-1
    )

    for metric in cv_metrics:
        scores = cv_scores[f'test_{metric}']
        print(f"   {metric.upper():10s}: {scores.mean():.4f} (±{scores.std():.4f})")

print("\n" + "="*80)
print(" Prochaine étape: 03_hyperparameter_tuning.ipynb")
print("="*80)