In [None]:
# ============================================
# IMPORTS
# ============================================

import ast
import pickle
import time
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

import mlflow
import mlflow.sklearn

# Seed NumPy's global RNG once, up front, so later cells are repeatable.
np.random.seed(42)

# Emit the three-line start-up banner in a single call.
print(
    "="*80,
    " ENVIRONNEMENT CONFIGURÉ POUR TUNING",
    "="*80,
    sep="\n",
)

In [None]:
# ============================================
# MLFLOW SETUP
# ============================================

# Send every run started in this notebook to a dedicated tuning experiment.
mlflow.set_experiment("podcast-classification-tuning")

# Optional explicit tracking store (the original note says the default is ./mlruns).
# NOTE(review): if this is enabled it presumably has to run before
# set_experiment above — confirm against the MLflow documentation.
# mlflow.set_tracking_uri("file:./mlruns")

# Report the active configuration as one banner.
print(
    "="*80,
    " MLFLOW CONFIGURED",
    f" Experiment: podcast-classification-tuning",
    f" Tracking URI: {mlflow.get_tracking_uri()}",
    "="*80,
    sep="\n",
)

In [None]:
# ============================================
# CHARGEMENT DES DONNÉES
# ============================================

def _keywords_to_text(keywords):
    """Join a keyword collection into one space-separated string.

    Parameters
    ----------
    keywords : str or iterable of str
        Either an already-parsed iterable of keywords, or the string form
        stored in the CSV (presumably a Python list literal — verify
        against the annotation notebook).

    Returns
    -------
    str
        The keywords joined with single spaces.

    Raises
    ------
    ValueError, SyntaxError
        If a string value is not a valid Python literal.
    """
    if isinstance(keywords, str):
        # ast.literal_eval only parses literals (lists, strings, numbers, ...),
        # so a corrupted or malicious CSV cell cannot execute code the way
        # the previous eval() call could.
        keywords = ast.literal_eval(keywords)
    return ' '.join(keywords)


df = pd.read_csv('../data/annotated/podcasts_annotated.csv')

# Build the text feature only when an upstream step has not already done so.
if 'keywords_text' not in df.columns:
    df['keywords_text'] = df['keywords_clean'].apply(_keywords_to_text)

X = df['keywords_text']
y = df['is_kid_friendly']

# Stratified 80/20 split with a fixed seed so the test set is stable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {len(X_train):,} | Test: {len(X_test):,}")

In [None]:
# ============================================
# MODÈLES BASELINE (pour référence)
# ============================================

# Vectorizer settings shared by every baseline pipeline.
tfidf_params = {
    'max_features': None,
    'min_df': 2,
    'max_df': 0.95,
    'ngram_range': (1, 2),
    'sublinear_tf': True
}


def _text_pipeline(classifier):
    """Return a two-step pipeline: a fresh TF-IDF vectorizer, then `classifier`."""
    vectorizer = TfidfVectorizer(**tfidf_params)
    return Pipeline([('tfidf', vectorizer), ('clf', classifier)])


# Untuned reference models, keyed by display name; both use balanced class weights.
baseline_models = {
    'Logistic Regression': _text_pipeline(
        LogisticRegression(
            max_iter=1000,
            random_state=42,
            class_weight='balanced',
            solver='liblinear',
        )
    ),
    'Linear SVM': _text_pipeline(
        LinearSVC(
            max_iter=2000,
            random_state=42,
            class_weight='balanced',
            dual='auto',
        )
    ),
}

In [None]:
# ============================================
# GRILLES DE PARAMÈTRES
# ============================================

def _tfidf_grid():
    """Return a fresh copy of the TF-IDF search space shared by all models."""
    return {
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'tfidf__min_df': [1, 2, 3],
        'tfidf__max_df': [0.9, 0.95, 1.0],
    }


# Search space per model: the common TF-IDF axes plus classifier-specific ones.
param_grids = {
    'Logistic Regression': {
        **_tfidf_grid(),
        'clf__C': [0.1, 1.0, 10.0],
        'clf__penalty': ['l2'],
        'clf__class_weight': ['balanced'],
    },
    'Linear SVM': {
        **_tfidf_grid(),
        'clf__C': [0.1, 1.0, 10.0],
        'clf__class_weight': ['balanced'],
    },
}

print("\n" + "="*80, " GRILLES DE PARAMÈTRES DÉFINIES", "="*80, sep="\n")

# Show how many candidates (and fits) each grid implies.
for model_name, grid in param_grids.items():
    n_comb = len(ParameterGrid(grid))
    print(
        f"\n{model_name}:",
        f"   • Combinaisons: {n_comb}",
        f"   • Total fits (5-fold): {n_comb * 5}",
        sep="\n",
    )

print("="*80)

In [None]:
# ============================================
# FONCTION DE TUNING
# ============================================

def tune_model(model, param_grid, X_train, y_train, model_name, cv=5):
    """
    Grid-search a model's hyperparameters and record the outcome in MLflow.

    Runs a GridSearchCV over `param_grid` (stratified, shuffled K-fold with a
    fixed seed, F1 scoring, all available cores), prints a summary, then
    opens one MLflow run that stores the winning parameters, the search
    configuration, the CV metrics and the refitted best estimator.

    Parameters
    ----------
    model : estimator
        Unfitted estimator or pipeline handed to GridSearchCV.
    param_grid : dict or list of dict
        Search space in GridSearchCV / ParameterGrid format.
    X_train, y_train
        Training features and labels passed to ``fit``.
    model_name : str
        Used in console output, the MLflow run name and the ``model_type`` tag.
    cv : int, default 5
        Number of stratified folds.

    Returns
    -------
    tuple
        ``(best_estimator, best_params, best_cv_score, tuning_time_seconds)``.
    """
    print(f"\n{'─'*80}")
    print(f" Fine-Tuning: {model_name}")
    print(f"{'─'*80}")

    # ParameterGrid implements __len__, so the combinations need not be materialised.
    n_combinations = len(ParameterGrid(param_grid))
    print(f"Configuration: {n_combinations} combinaisons × {cv}-fold = {n_combinations * cv} fits")
    print(f"Métrique: F1-Score")

    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=StratifiedKFold(cv, shuffle=True, random_state=42),
        scoring='f1',
        n_jobs=-1,
        verbose=0,
        return_train_score=True
    )

    # perf_counter is monotonic, so the duration is immune to wall-clock adjustments.
    start_time = time.perf_counter()
    grid_search.fit(X_train, y_train)
    tuning_time = time.perf_counter() - start_time

    # Train scores are computed anyway (return_train_score=True); surface the
    # one for the winning candidate so a large train/validation gap is visible.
    best_train_score = float(
        grid_search.cv_results_['mean_train_score'][grid_search.best_index_]
    )

    print(f"\n Terminé en {tuning_time:.2f}s ({tuning_time/60:.1f} min)")
    print(f" Meilleur F1 CV: {grid_search.best_score_:.4f}")
    print(f" F1 train moyen (CV): {best_train_score:.4f}")
    print(f"\n Meilleurs paramètres:")
    for param, value in grid_search.best_params_.items():
        print(f"   • {param:30s}: {value}")

    # MLflow tracking: one run per tuned model.
    with mlflow.start_run(run_name=f"{model_name}_tuning") as run:
        mlflow.set_tag("model_type", model_name)
        mlflow.set_tag("stage", "hyperparameter_tuning")

        # Winning hyperparameters, logged in one batch.
        mlflow.log_params(grid_search.best_params_)

        # Search configuration.
        mlflow.log_params({
            "cv_folds": cv,
            "n_combinations": n_combinations,
            "scoring": "f1",
        })

        # Cross-validation metrics.
        mlflow.log_metric("best_cv_f1_score", grid_search.best_score_)
        mlflow.log_metric("best_cv_train_f1_score", best_train_score)
        mlflow.log_metric("tuning_time_seconds", tuning_time)

        # Best estimator, already refitted by GridSearchCV.
        mlflow.sklearn.log_model(grid_search.best_estimator_, "model")

        print(f" ✓ Logged to MLflow (Run ID: {run.info.run_id})")

    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_, tuning_time

In [None]:
# ============================================
# EXÉCUTION DU TUNING
# ============================================

print("\n" + "="*80, " PHASE: HYPERPARAMETER TUNING", "="*80, sep="\n")
print("\n⏱ Cette phase peut prendre plusieurs minutes...\n")

# Both dicts are keyed by "<model name> (Tuned)".
tuned_models, tuning_results = {}, {}

for model_name in ('Logistic Regression', 'Linear SVM'):
    tuned_key = f"{model_name} (Tuned)"

    # Search this model's grid starting from its baseline pipeline.
    estimator, params, cv_f1, elapsed = tune_model(
        baseline_models[model_name],
        param_grids[model_name],
        X_train, y_train,
        model_name,
    )

    tuned_models[tuned_key] = estimator
    tuning_results[tuned_key] = {
        'best_params': params,
        'best_cv_score': cv_f1,
        'tuning_time': elapsed,
    }

print("\n" + "="*80, " TUNING TERMINÉ", "="*80, sep="\n")

In [None]:
# ============================================
# ÉVALUATION SUR TEST SET
# ============================================

print("\n" + "="*80, " ÉVALUATION DES MODÈLES OPTIMISÉS", "="*80, sep="\n")

# Held-out test metrics per tuned model, keyed like tuned_models.
tuned_test_results = {}

for model_name, model in tuned_models.items():
    predictions = model.predict(X_test)

    # Accuracy has no zero_division argument; the other three scorers do.
    results = {'accuracy': accuracy_score(y_test, predictions)}
    for key, scorer in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    ):
        results[key] = scorer(y_test, predictions, zero_division=0)

    tuned_test_results[model_name] = results

    print(
        f"\n{model_name}:",
        f"   • Accuracy:  {results['accuracy']:.4f}",
        f"   • Precision: {results['precision']:.4f}",
        f"   • Recall:    {results['recall']:.4f}",
        f"   • F1-Score:  {results['f1_score']:.4f}",
        sep="\n",
    )

    # Record the test-set figures in their own MLflow run.
    with mlflow.start_run(run_name=f"{model_name}_test_evaluation"):
        for tag_name, tag_value in (
            ("model_type", model_name),
            ("stage", "test_evaluation"),
        ):
            mlflow.set_tag(tag_name, tag_value)

        # MLflow metric name -> key in `results`.
        for metric_name, result_key in (
            ("test_accuracy", 'accuracy'),
            ("test_precision", 'precision'),
            ("test_recall", 'recall'),
            ("test_f1_score", 'f1_score'),
        ):
            mlflow.log_metric(metric_name, results[result_key])

        # Dataset sizes, for context when reading the run.
        for param_name, subset in (("test_size", X_test), ("train_size", X_train)):
            mlflow.log_param(param_name, len(subset))

print("\n" + "="*80)

In [None]:
# ============================================
# RÉSUMÉ COMPARATIF
# ============================================

print("\n" + "="*80, " RÉSUMÉ DU TUNING", "="*80, sep="\n")

# CV F1 next to test-set F1 for each tuned model.
for base_name in ('Logistic Regression', 'Linear SVM'):
    tuned_name = f"{base_name} (Tuned)"

    cv_score = tuning_results[tuned_name]['best_cv_score']
    test_score = tuned_test_results[tuned_name]['f1_score']

    print(
        f"\n{base_name}:",
        f"   • Meilleur F1 CV:  {cv_score:.4f}",
        f"   • F1 Test Set:     {test_score:.4f}",
        f"   • Temps tuning:    {tuning_results[tuned_name]['tuning_time']:.1f}s",
        sep="\n",
    )

# Closing banner: where the runs are stored and which notebook comes next.
closing_lines = (
    "\n" + "="*80,
    " MLFLOW TRACKING",
    "="*80,
    f" Tous les runs sont enregistrés dans: {mlflow.get_tracking_uri()}",
    f" Pour visualiser: mlflow ui",
    "="*80,
    "\n Prochaine étape: 06_model_evaluation.ipynb",
    "="*80,
)
print(*closing_lines, sep="\n")