In [None]:
# ============================================
# IMPORTS
# ============================================

import ast
import os
import pickle
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Deepchecks imports
from deepchecks.tabular import Dataset, Suite
from deepchecks.tabular.checks import (
    # Data Integrity
    IsSingleValue, MixedNulls, MixedDataTypes, StringMismatch,
    DataDuplicates, ConflictingLabels, FeatureLabelCorrelation,
    OutlierSampleDetection, SpecialCharacters,
    
    # Train-Test Validation
    TrainTestLabelDrift, NewLabelTrainTest, TrainTestSamplesMix,
    TrainTestFeatureDrift, CategoryMismatchTrainTest,
    
    # Model Performance
    ConfusionMatrixReport, RocReport, SimpleModelComparison,
    PerformanceReport, WeakSegmentsPerformance, ModelInferenceTime,
    CalibrationScore, UnusedFeatures, BoostingOverfit
)

# Session-wide setup: silence every warning and seed NumPy's global RNG so
# that anything drawing from it is repeatable across runs.
warnings.filterwarnings('ignore')
np.random.seed(42)

# Opening banner with the wall-clock time at which the validation started.
banner_rule = "=" * 80
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(
    banner_rule,
    " DEEPCHECKS COMPREHENSIVE VALIDATION",
    banner_rule,
    f" Date: {started_at}",
    banner_rule,
    sep="\n",
)

In [None]:
# ============================================
# DATA LOADING
# ============================================

print("\n" + "="*80)
print(" CHARGEMENT DES DONNÉES")
print("="*80)

df = pd.read_csv('../data/annotated/podcasts_annotated.csv')


def _keywords_to_text(keywords):
    """Turn one ``keywords_clean`` cell into a space-separated string.

    Args:
        keywords: Either the string form of a Python list literal (what a
            CSV round-trip produces), an already materialised iterable of
            strings, or a missing value (``None`` / NaN).

    Returns:
        The keywords joined with single spaces; ``''`` for a missing value.

    Raises:
        ValueError / SyntaxError: if a string cell is not a plain Python
            literal. This is deliberate: the cell content comes from a data
            file, so it is parsed with ``ast.literal_eval`` rather than
            ``eval`` and anything that is not a literal is rejected loudly
            instead of being executed.
    """
    if keywords is None or (isinstance(keywords, float) and np.isnan(keywords)):
        # An empty CSV cell is read back as float NaN; treat it as "no keywords".
        return ''
    if isinstance(keywords, str):
        keywords = ast.literal_eval(keywords)
    return ' '.join(keywords)


# Only derive the text column when the file does not already provide it,
# so re-running this cell is idempotent.
if 'keywords_text' not in df.columns:
    df['keywords_text'] = df['keywords_clean'].apply(_keywords_to_text)

X = df['keywords_text']
y = df['is_kid_friendly']

# Stratified 80/20 split with a fixed seed so the label ratio is the same in
# both parts and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f" Dataset: {len(df):,} exemples")
print(f" Train: {len(X_train):,} ({len(X_train)/len(df)*100:.1f}%)")
print(f" Test:  {len(X_test):,} ({len(X_test)/len(df)*100:.1f}%)")
print("="*80)

In [None]:
# ============================================
# REFERENCE MODEL TRAINING
# ============================================

print("\n" + "="*80)
print(" ENTRAÎNEMENT DU MODÈLE")
print("="*80)


def _select_text_column(X):
    """Return the documents as a 1-D sequence of strings.

    The notebook fits and scores the model on a pandas Series, but the
    Deepchecks tabular suites call ``predict`` / ``predict_proba`` with a
    DataFrame of feature columns. ``TfidfVectorizer`` simply iterates over
    its input, and iterating over a DataFrame yields its column labels, so
    without this step the vectorizer would see a single "document" (the
    column name) instead of one document per row.

    Args:
        X: A Series / list of strings, or a DataFrame that has a
            ``'keywords_text'`` column.

    Returns:
        ``X['keywords_text']`` when ``X`` is a DataFrame, otherwise ``X``
        unchanged.
    """
    if isinstance(X, pd.DataFrame):
        return X['keywords_text']
    return X


# Use the best hyper-parameters (adjust once tuning results are available).
model = Pipeline([
    # Accept both Series and single-text-column DataFrame inputs.
    ('select_text', FunctionTransformer(_select_text_column)),
    ('tfidf', TfidfVectorizer(
        max_features=None, min_df=2, max_df=0.95,
        ngram_range=(1, 2), sublinear_tf=True
    )),
    ('clf', LogisticRegression(
        C=1.0, max_iter=1000, random_state=42,
        class_weight='balanced', solver='liblinear'
    ))
])

model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# accuracy_score / f1_score come from the notebook's top import cell.
# Train and test scores are reported side by side so a large gap
# (overfitting) is visible at a glance.
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
train_f1 = f1_score(y_train, y_pred_train)
test_f1 = f1_score(y_test, y_pred_test)

print(f" ✓ Modèle entraîné: Logistic Regression")
print(f"\n Performances:")
print(f"   • Train Accuracy: {train_acc:.4f}")
print(f"   • Test Accuracy:  {test_acc:.4f}")
print(f"   • Train F1-Score: {train_f1:.4f}")
print(f"   • Test F1-Score:  {test_f1:.4f}")
print("="*80)

In [None]:
# ============================================
# DEEPCHECKS DATASET PREPARATION
# ============================================

section_rule = "=" * 80
print("\n" + section_rule, " PRÉPARATION DATASETS DEEPCHECKS", section_rule, sep="\n")


def _to_deepchecks_dataset(texts, labels):
    """Bundle one split into a DataFrame and its Deepchecks ``Dataset``.

    Args:
        texts: Series holding the keyword text of the split.
        labels: Series holding the matching ``is_kid_friendly`` labels.

    Returns:
        ``(frame, dataset)``: the two-column DataFrame and the ``Dataset``
        built from it with ``is_kid_friendly`` as label and no categorical
        features declared.
    """
    # ``.values`` drops the original index, so both columns are aligned by
    # position and the frame gets a fresh positional index.
    frame = pd.DataFrame({
        'keywords_text': texts.values,
        'is_kid_friendly': labels.values,
    })
    return frame, Dataset(frame, label='is_kid_friendly', cat_features=[])


train_df, train_ds = _to_deepchecks_dataset(X_train, y_train)
test_df, test_ds = _to_deepchecks_dataset(X_test, y_test)

print(" ✓ Datasets Deepchecks créés", section_rule, sep="\n")

In [None]:
# ============================================
# PHASE 1: DATA INTEGRITY SUITE
# ============================================

phase1_rule = "=" * 80
print(
    "\n" + phase1_rule,
    " PHASE 1: VALIDATION DE L'INTÉGRITÉ DES DONNÉES",
    phase1_rule,
    " Vérifie la qualité et cohérence des données brutes",
    phase1_rule,
    sep="\n",
)

# Checks that look only at the raw data (no model involved).
integrity_checks = [
    IsSingleValue(),            # constant features
    MixedNulls(),               # several kinds of missing-value markers
    MixedDataTypes(),           # inconsistent types within a column
    StringMismatch(),           # string-level inconsistencies
    DataDuplicates(),           # exact duplicate rows
    ConflictingLabels(),        # same input, different labels
    FeatureLabelCorrelation(),  # feature/label correlation
    SpecialCharacters(),        # problematic special characters
]
data_integrity_suite = Suite("Data Integrity Validation", *integrity_checks)

data_integrity_result = data_integrity_suite.run(train_ds, test_ds)

phase1_thin_rule = "─" * 80
print("\n" + phase1_thin_rule, " RÉSULTATS INTÉGRITÉ DES DONNÉES", phase1_thin_rule, sep="\n")
data_integrity_result.show()

In [None]:
# ============================================
# PHASE 2: TRAIN-TEST VALIDATION SUITE
# ============================================

phase2_rule = "=" * 80
print(
    "\n" + phase2_rule,
    " PHASE 2: VALIDATION DU SPLIT TRAIN/TEST",
    phase2_rule,
    " Vérifie la qualité et cohérence du split",
    phase2_rule,
    sep="\n",
)

# Checks that compare the train split against the test split.
split_checks = [
    TrainTestLabelDrift(),    # drift in the label distribution
    NewLabelTrainTest(),      # labels that only appear in the test split
    TrainTestSamplesMix(),    # data leakage between the splits
    TrainTestFeatureDrift(),  # drift in the feature distributions
]
train_test_suite = Suite("Train-Test Validation", *split_checks)

train_test_result = train_test_suite.run(train_ds, test_ds)

phase2_thin_rule = "─" * 80
print("\n" + phase2_thin_rule, " RÉSULTATS VALIDATION TRAIN/TEST", phase2_thin_rule, sep="\n")
train_test_result.show()

In [None]:
# ============================================
# PHASE 3: MODEL PERFORMANCE SUITE
# ============================================

phase3_rule = "=" * 80
print(
    "\n" + phase3_rule,
    " PHASE 3: VALIDATION DES PERFORMANCES DU MODÈLE",
    phase3_rule,
    " Analyse approfondie des performances et erreurs",
    phase3_rule,
    sep="\n",
)

# Checks that need the fitted model in addition to the two datasets.
performance_checks = [
    ConfusionMatrixReport(),    # detailed confusion matrix
    RocReport(),                # ROC curve
    SimpleModelComparison(),    # comparison against a simple baseline
    PerformanceReport(),        # full set of metrics
    WeakSegmentsPerformance(),  # weak data segments
    CalibrationScore(),         # probability calibration
]
model_performance_suite = Suite("Model Performance Validation", *performance_checks)

model_performance_result = model_performance_suite.run(train_ds, test_ds, model)

phase3_thin_rule = "─" * 80
print("\n" + phase3_thin_rule, " RÉSULTATS PERFORMANCE MODÈLE", phase3_thin_rule, sep="\n")
model_performance_result.show()

In [None]:
# ============================================
# PHASE 4: ROBUSTNESS & PRODUCTION READINESS
# ============================================

print("\n" + "="*80)
print(" PHASE 4: ROBUSTESSE ET PRODUCTION READINESS")
print("="*80)
print(" Vérifie la stabilité et performance du modèle")
print("="*80)

robustness_checks = [
    ModelInferenceTime(),           # inference time
    UnusedFeatures(),               # unused features
]

# BoostingOverfit only supports boosting estimators (per the Deepchecks docs:
# AdaBoost, GradientBoosting, LGBM, XGB and CatBoost models). On any other
# estimator - such as the LogisticRegression trained in this notebook - the
# check cannot run and is reported as an error rather than as a result, so it
# is only scheduled when the final estimator is a boosting model.
_BOOSTING_PREFIXES = ('AdaBoost', 'GradientBoosting', 'LGBM', 'XGB', 'CatBoost')
_final_estimator = model.steps[-1][1] if isinstance(model, Pipeline) else model
if type(_final_estimator).__name__.startswith(_BOOSTING_PREFIXES):
    robustness_checks.append(BoostingOverfit())  # overfitting detection

robustness_suite = Suite("Robustness & Production Readiness", *robustness_checks)

robustness_result = robustness_suite.run(train_ds, test_ds, model)

print("\n" + "─"*80)
print(" RÉSULTATS ROBUSTESSE")
print("─"*80)
robustness_result.show()

In [None]:
# ============================================
# SUMMARY REPORT
# ============================================

print("\n" + "="*80)
print(" RAPPORT DE SYNTHÈSE - VALIDATION COMPLÈTE")
print("="*80)

all_suites = [
    ("Data Integrity", data_integrity_result),
    ("Train-Test Validation", train_test_result),
    ("Model Performance", model_performance_result),
    ("Robustness", robustness_result)
]


def _check_passed(check_result):
    """Tell whether one entry of ``SuiteResult.results`` counts as passed.

    A suite result holds two kinds of entries: results of checks that ran
    (they expose ``passed_conditions()``) and failure records for checks that
    raised and never produced a result (they do not). A check that could not
    run is counted as not passed.
    """
    passed_conditions = getattr(check_result, 'passed_conditions', None)
    if passed_conditions is None:
        return False
    return bool(passed_conditions())


def _severity(check_result):
    """Map a non-passing entry to ``'HIGH'`` or ``'MEDIUM'``.

    Per the Deepchecks docs, ``priority`` is ascending in severity: 1 means a
    condition failed with the FAIL category, 2 means it only raised a
    warning. A check that could not run at all is reported as HIGH because
    nothing was actually validated.
    """
    if not hasattr(check_result, 'passed_conditions'):
        return 'HIGH'
    return 'HIGH' if getattr(check_result, 'priority', None) == 1 else 'MEDIUM'


def _percentage(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


total_checks = 0
total_passed = 0
all_failed_checks = []

for suite_name, result in all_suites:
    suite_passed = 0
    suite_total = 0

    for check_result in result.results:
        suite_total += 1
        if _check_passed(check_result):
            suite_passed += 1
        else:
            all_failed_checks.append({
                'suite': suite_name,
                'check': check_result.get_header(),
                'severity': _severity(check_result)
            })

    total_checks += suite_total
    total_passed += suite_passed

    status = "✓" if suite_passed == suite_total else "⚠"
    print(f"\n{status} {suite_name}:")
    print(f"   Checks réussis: {suite_passed}/{suite_total} ({_percentage(suite_passed, suite_total):.1f}%)")

print("\n" + "─"*80)
print(f" TOTAL: {total_passed}/{total_checks} checks réussis ({_percentage(total_passed, total_checks):.1f}%)")
print("─"*80)

if all_failed_checks:
    print(f"\n⚠ {len(all_failed_checks)} checks échoués:")
    for failed in all_failed_checks:
        print(f"   [{failed['severity']}] {failed['suite']}: {failed['check']}")
    print("\n→ Recommandations:")
    print("   1. Examiner les résultats détaillés ci-dessus")
    print("   2. Corriger les problèmes HIGH priority")
    print("   3. Évaluer l'impact des problèmes MEDIUM")
else:
    print("\n✓ TOUS LES CHECKS SONT PASSÉS!")
    print("   → Modèle validé et prêt pour la production")

print("\n" + "="*80)
print(f" Validation terminée: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*80)

In [None]:
# ============================================
# HTML REPORT EXPORT
# ============================================

print("\n" + "="*80)
print(" EXPORT DES RAPPORTS HTML")
print("="*80)

# Create the reports folder if it does not exist yet.
reports_dir = '../models/validation_reports'
os.makedirs(reports_dir, exist_ok=True)

# One timestamp shared by all files of this run so they sort together.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# (label shown in the log, file-name stem, suite result to export)
reports_to_export = [
    ("Data Integrity", '01_data_integrity', data_integrity_result),
    ("Train-Test", '02_train_test_validation', train_test_result),
    ("Model Performance", '03_model_performance', model_performance_result),
    ("Robustness", '04_robustness', robustness_result),
]

# Each report is exported independently: a failure on one file is reported
# and the remaining reports are still written (best effort, never raises).
export_errors = 0
for report_label, file_stem, suite_result in reports_to_export:
    file_name = f'{file_stem}_{timestamp}.html'
    try:
        suite_result.save_as_html(f'{reports_dir}/{file_name}')
    except Exception as e:
        export_errors += 1
        print(f" ⚠ Erreur d'export ({report_label}): {str(e)}")
    else:
        print(f" ✓ {report_label}: {file_name}")

# Only claim success when every report was actually written.
if export_errors == 0:
    print(f"\n → Tous les rapports sauvegardés dans: {reports_dir}")

print("="*80)

In [None]:
# ============================================
# FINAL DECISION: PRODUCTION READY?
# ============================================

print("\n" + "="*80)
print(" DÉCISION FINALE: PRODUCTION READINESS")
print("="*80)

# Decision criteria
min_pass_rate = 0.85  # 85% of the checks must pass
# With no check at all nothing was validated, so the rate is 0 rather than
# a ZeroDivisionError.
pass_rate = total_passed / total_checks if total_checks else 0.0

# Critical checks that absolutely have to pass
critical_checks = [
    'TrainTestSamplesMix',      # data leakage
    'ConflictingLabels',        # conflicting labels
    'SimpleModelComparison'     # better than the baseline
]


def _normalize_check_name(name):
    """Lower-case ``name`` and strip all whitespace.

    The failed-check entries store the display header of a check, which is
    the class name split into words (e.g. "Train Test Samples Mix"), while
    ``critical_checks`` lists class names. Both sides are normalised so the
    comparison does not depend on spacing or case.
    """
    return ''.join(str(name).split()).lower()


critical_names = [_normalize_check_name(critical) for critical in critical_checks]

critical_passed = True
for check in all_failed_checks:
    check_name = _normalize_check_name(check['check'])
    if any(critical in check_name for critical in critical_names):
        critical_passed = False
        print(f" ✗ CHECK CRITIQUE ÉCHOUÉ: {check['check']}")

print(f"\nTaux de réussite: {pass_rate*100:.1f}% (minimum requis: {min_pass_rate*100:.0f}%)")
print(f"Checks critiques: {'✓ PASSÉS' if critical_passed else '✗ ÉCHOUÉS'}")

if pass_rate >= min_pass_rate and critical_passed:
    print("\n" + "="*80)
    print(" ✓✓✓ MODÈLE VALIDÉ POUR LA PRODUCTION ✓✓✓")
    print("="*80)
    print("\n Prochaines étapes:")
    print("   1. Exécuter 07_model_registry.ipynb pour enregistrer le modèle")
    print("   2. Configurer le monitoring en production")
    print("   3. Mettre en place le pipeline de réentraînement")
else:
    print("\n" + "="*80)
    print(" ⚠⚠⚠ MODÈLE NON VALIDÉ POUR LA PRODUCTION ⚠⚠⚠")
    print("="*80)
    print("\n Actions requises:")
    print("   1. Corriger les checks échoués (voir détails ci-dessus)")
    print("   2. Réentraîner le modèle si nécessaire")
    print("   3. Relancer cette validation")

print("\n" + "="*80)
print(" FIN DE LA VALIDATION COMPLÈTE")
print("="*80)