In [23]:
# Configuration et imports
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Ajouter le path parent
sys.path.append('../')

# Vérification et installation des dépendances
def check_and_install_dependencies():
    """Vérifie et installe les modules requis"""
    required_packages = {
        'chess': 'python-chess',
        'pandas': 'pandas',
        'numpy': 'numpy',
        'plotly': 'plotly',
        'pyarrow': 'pyarrow',  # Ajouté pour le support Parquet
        'tqdm': 'tqdm',
        'nbformat': 'nbformat',
    }
    
    missing_packages = []
    
    for module, package in required_packages.items():
        try:
            __import__(module)
            print(f"✅ {module} disponible")
        except ImportError:
            missing_packages.append(package)
            print(f"❌ {module} manquant")
    
    if missing_packages:
        print(f"\n📦 Installation des packages manquants...")
        print(f"💡 Exécutez dans votre terminal :")
        print(f"   pip install {' '.join(missing_packages)}")
        
        # Tentative d'installation automatique
        try:
            import subprocess
            for package in missing_packages:
                print(f"🔄 Installation de {package}...")
                subprocess.check_call([sys.executable, "-m", "pip", "install", package])
                print(f"✅ {package} installé")
        except Exception as e:
            print(f"⚠️ Installation automatique échouée: {e}")
            print(f"📋 Veuillez installer manuellement avec: pip install {' '.join(missing_packages)}")
            return False
    
    return True

# Vérifier les dépendances
deps_ok = check_and_install_dependencies()

# Imports essentiels
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

# Import du module chess
try:
    import chess
    import chess.pgn
    print("✅ Module chess disponible")
    CHESS_AVAILABLE = True
except ImportError:
    print("❌ Module chess manquant")
    print("💡 Installez avec: pip install python-chess")
    CHESS_AVAILABLE = False

# Imports modules locaux
try:
    from src.data_processing.pgn_parser import parse_and_analyze
    from src.data_processing.data_cleaner import ChessDataCleaner, quick_clean
    print("✅ Modules locaux importés avec succès")
    LOCAL_MODULES = True
except ImportError as e:
    print(f"⚠️ Modules locaux non trouvés: {e}")
    print("📝 Utilisation de fonctions intégrées")
    LOCAL_MODULES = False

# Configuration
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

print(f"🚀 Setup terminé - {datetime.now().strftime('%H:%M:%S')}")

✅ chess disponible
✅ pandas disponible
✅ numpy disponible
✅ plotly disponible
✅ pyarrow disponible
✅ tqdm disponible
✅ nbformat disponible
✅ Module chess disponible
✅ Modules locaux importés avec succès
🚀 Setup terminé - 16:04:47


In [24]:
def load_data():
    """Charge les données avec fallback automatique"""
    
    # Chemins
    PGN_FILE = 'data/raw/lichess_games.pgn'
    PROCESSED_FILE = 'data/processed/games_raw.parquet'
    
    # 1. Essayer le fichier traité
    if os.path.exists(PROCESSED_FILE):
        print("📖 Chargement fichier traité...")
        return pd.read_parquet(PROCESSED_FILE)
    
    # 2. Essayer le parsing PGN (si chess disponible)
    if os.path.exists(PGN_FILE) and CHESS_AVAILABLE and LOCAL_MODULES:
        print("🔄 Parsing fichier PGN...")
        try:
            df, _ = parse_and_analyze(PGN_FILE, max_games=50000)
            df.to_parquet(PROCESSED_FILE, index=False)
            return df
        except Exception as e:
            print(f"❌ Erreur parsing: {e}")
            print("📝 Passage aux données d'exemple...")
    
    # 3. Parser PGN simple (sans modules locaux)
    elif os.path.exists(PGN_FILE) and CHESS_AVAILABLE:
        print("🔄 Parsing PGN simplifié...")
        return parse_pgn_simple(PGN_FILE)
    
    # 4. Données d'exemple
    print("🎯 Génération de données d'exemple...")
    return create_sample_data()

def parse_pgn_simple(pgn_file, max_games=5000):
    """Parser PGN simplifié sans modules externes"""
    
    games_data = []
    
    try:
        with open(pgn_file, 'r', encoding='utf-8') as f:
            game_count = 0
            
            while game_count < max_games:
                try:
                    game = chess.pgn.read_game(f)
                    if game is None:
                        break
                    
                    headers = game.headers
                    
                    # Extraire les données essentielles
                    white_elo = int(headers.get('WhiteElo', 0))
                    black_elo = int(headers.get('BlackElo', 0))
                    result = headers.get('Result', '*')
                    opening = headers.get('Opening', 'Unknown')
                    time_control = headers.get('TimeControl', 'Unknown')
                    
                    # Compter les coups
                    moves_count = len(list(game.mainline_moves()))
                    
                    # Convertir le résultat
                    if result == '1-0':
                        white_score = 1.0
                    elif result == '0-1':
                        white_score = 0.0
                    elif result == '1/2-1/2':
                        white_score = 0.5
                    else:
                        continue  # Ignorer les parties sans résultat
                    
                    # Nettoyer le contrôle de temps
                    if 'bullet' in time_control.lower() or '180' in time_control:
                        tc_category = 'Bullet'
                    elif 'blitz' in time_control.lower() or '300' in time_control:
                        tc_category = 'Blitz'
                    elif 'rapid' in time_control.lower():
                        tc_category = 'Rapid'
                    else:
                        tc_category = 'Classical'
                    
                    games_data.append({
                        'white_elo': white_elo,
                        'black_elo': black_elo,
                        'white_score': white_score,
                        'moves_count': moves_count,
                        'opening': opening,
                        'time_control': tc_category,
                        'termination': headers.get('Termination', 'Normal')
                    })
                    
                    game_count += 1
                    
                    if game_count % 1000 == 0:
                        print(f"   📊 {game_count:,} parties parsées...")
                
                except Exception as e:
                    continue  # Ignorer les parties problématiques
        
        df = pd.DataFrame(games_data)
        
        # Ajouter les colonnes dérivées
        df['avg_elo'] = (df['white_elo'] + df['black_elo']) / 2
        df['black_score'] = 1 - df['white_score']
        df['draw'] = (df['white_score'] == 0.5).astype(int)
        df['main_opening'] = df['opening'].str.split(':').str[0]  # Première partie avant ':'
        df['variation'] = df['opening'].str.split(':').str[1].fillna('Main Line')
        
        print(f"✅ {len(df):,} parties parsées avec succès")
        
        # Sauvegarder pour la prochaine fois
        os.makedirs('../data/processed', exist_ok=True)
        df.to_parquet('../data/processed/games_raw.parquet', index=False)
        
        return df
        
    except Exception as e:
        print(f"❌ Erreur lors du parsing: {e}")
        return create_sample_data()

def create_sample_data():
    """Génère des données d'exemple réalistes"""
    np.random.seed(42)
    n_games = 5000
    
    df = pd.DataFrame({
        'white_elo': np.random.normal(1500, 300, n_games).astype(int),
        'black_elo': np.random.normal(1500, 300, n_games).astype(int),
        'white_score': np.random.choice([0, 0.5, 1], n_games, p=[0.32, 0.28, 0.40]),
        'moves_count': np.random.gamma(2.5, 12, n_games).astype(int) + 8,
        'opening': np.random.choice([
            'Sicilian Defense', "Queen's Gambit", 'French Defense',
            'English Opening', 'Caro-Kann Defense', 'Italian Game'
        ], n_games),
        'time_control': np.random.choice(['Bullet', 'Blitz', 'Rapid'], n_games, p=[0.5, 0.35, 0.15]),
        'termination': np.random.choice(['Normal', 'Time forfeit', 'Resignation'], n_games)
    })
    
    # Colonnes dérivées
    df['avg_elo'] = (df['white_elo'] + df['black_elo']) / 2
    df['black_score'] = 1 - df['white_score']
    df['draw'] = (df['white_score'] == 0.5).astype(int)
    df['main_opening'] = df['opening']
    df['variation'] = 'Main Line'
    
    # Ajouter des problèmes pour tester le nettoyage
    problems = np.random.choice(len(df), 200, replace=False)
    df.loc[problems[:50], 'white_elo'] = 0
    df.loc[problems[50:100], 'black_elo'] = 5000
    df.loc[problems[100:150], 'moves_count'] = 3
    df.loc[problems[150:], 'moves_count'] = 250
    
    print(f"✅ {len(df):,} parties d'exemple générées")
    return df

# Charger les données
df_raw = load_data()
print(f"📊 {len(df_raw):,} parties chargées")

📖 Chargement fichier traité...
📊 49,919 parties chargées


In [25]:
def quick_inspect(df):
    """Inspection synthétique des données"""
    
    print("🔍 INSPECTION RAPIDE")
    print("=" * 30)
    
    # Infos générales
    print(f"📊 Parties: {len(df):,}")
    print(f"📋 Colonnes: {len(df.columns)}")
    print(f"💾 Taille: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    
    # Colonnes avec problèmes
    missing = df.isnull().sum()
    problems = missing[missing > 0]
    if len(problems) > 0:
        print(f"\n❌ Données manquantes:")
        for col, count in problems.items():
            print(f"   {col}: {count:,} ({count/len(df)*100:.1f}%)")
    
    # ELO
    if 'white_elo' in df.columns:
        elo_issues = 0
        for col in ['white_elo', 'black_elo']:
            elo_issues += (df[col] < 700).sum() + (df[col] > 3500).sum() + (df[col] == 0).sum()
        
        if elo_issues > 0:
            print(f"⚠️ ELO problématiques: {elo_issues:,}")
        
        print(f"🎯 ELO moyen: {df[['white_elo', 'black_elo']].mean().mean():.0f}")
    
    # Coups
    if 'moves_count' in df.columns:
        short = (df['moves_count'] < 5).sum()
        long = (df['moves_count'] > 200).sum()
        if short > 0 or long > 0:
            print(f"⚠️ Parties anormales: {short + long:,}")
        
        print(f"♟️ Coups moyen: {df['moves_count'].mean():.1f}")
    
    # Ouvertures
    if 'main_opening' in df.columns:
        openings = df['main_opening'].nunique()
        print(f"📚 Ouvertures: {openings:,}")
        
        # Top 3
        top3 = df['main_opening'].value_counts().head(3)
        print("🔝 Top 3:")
        for i, (opening, count) in enumerate(top3.items(), 1):
            print(f"   {i}. {opening}: {count:,}")
    
    return df

# Inspection
quick_inspect(df_raw)

🔍 INSPECTION RAPIDE
📊 Parties: 49,919
📋 Colonnes: 22
💾 Taille: 24.7 MB

❌ Données manquantes:
   game_length_category: 106 (0.2%)
🎯 ELO moyen: 1611
⚠️ Parties anormales: 441
♟️ Coups moyen: 68.0
📚 Ouvertures: 252
🔝 Top 3:
   1. Sicilian Defense: 4,991
   2. French Defense: 3,559
   3. Queen's Pawn Game: 2,538


Unnamed: 0,white_elo,black_elo,avg_elo,elo_diff,white_score,black_score,draw,eco,opening,time_control,termination,moves_count,avg_eval_early,eval_volatility,opening_advantage,main_opening,variation,elo_range,white_elo_range,black_elo_range,decisive_game,game_length_category
0,1782,1939,1860.5,157,1.0,0.0,0,B03,Alekhine Defense: Exchange Variation,Bullet,Time,95,0.0,0.0,0.0,Alekhine Defense,Exchange Variation,1800-2000,1600-1800,1800-2000,1,Very Long
1,1519,1558,1538.5,39,1.0,0.0,0,D00,Queen's Pawn Game #3,Bullet,Time,71,0.0,0.0,0.0,Queen's Pawn Game #3,Main Line,1400-1600,1400-1600,1400-1600,1,Long
2,1701,1919,1810.0,218,0.0,1.0,0,B05,"Alekhine Defense: Modern, Flohr Variation",Blitz,Other,58,0.0,0.0,0.0,Alekhine Defense,"Modern, Flohr Variation",1800-2000,1600-1800,1800-2000,1,Medium
3,1991,1656,1823.5,335,1.0,0.0,0,A00,Hungarian Opening: Sicilian Invitation,Bullet,Time,61,0.0,0.0,0.0,Hungarian Opening,Sicilian Invitation,1800-2000,1800-2000,1600-1800,1,Long
4,1144,1390,1267.0,246,1.0,0.0,0,A20,English Opening: King's English Variation,Blitz,Time,81,0.0,0.0,0.0,English Opening,King's English Variation,1200-1400,1000-1200,1200-1400,1,Very Long
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49914,1320,1525,1422.5,205,1.0,0.0,0,A00,Polish Opening: Bugayev Attack,Rapid,Other,7,0.0,0.0,0.0,Polish Opening,Bugayev Attack,1400-1600,1200-1400,1400-1600,1,Very Short
49915,1270,1463,1366.5,193,1.0,0.0,0,D02,Queen's Pawn Game: Chigorin Variation,Classical,Other,123,0.0,0.0,0.0,Queen's Pawn Game,Chigorin Variation,1200-1400,1200-1400,1400-1600,1,Very Long
49916,1526,1220,1373.0,306,1.0,0.0,0,C00,French Defense: Pelikan Variation,Rapid,Other,30,0.0,0.0,0.0,French Defense,Pelikan Variation,1200-1400,1400-1600,1200-1400,1,Short
49917,1892,1601,1746.5,291,1.0,0.0,0,C30,"King's Gambit Declined, Queen's Knight Defense",Blitz,Other,106,0.0,0.0,0.0,"King's Gambit Declined, Queen's Knight Defense",Main Line,1600-1800,1800-2000,1600-1800,1,Very Long


In [26]:
# Configurations prédéfinies
CONFIGS = {
    'conservative': {
        'min_elo': 600,
        'max_elo': 3500,
        'min_games_per_opening': 5,
        'min_moves': 5,
        'max_moves': 200,
        'remove_duplicates': False
    },
    'balanced': {
        'min_elo': 800,
        'max_elo': 3200,
        'min_games_per_opening': 20,
        'min_moves': 10,
        'max_moves': 150,
        'remove_duplicates': False
    },
    'strict': {
        'min_elo': 1000,
        'max_elo': 3000,
        'min_games_per_opening': 50,
        'min_moves': 15,
        'max_moves': 120,
        'remove_duplicates': False
    }
}

# Choisir la stratégie
STRATEGY = 'conservative'  # 👈 Modifier ici
config = CONFIGS[STRATEGY]

print(f"🎯 Stratégie: {STRATEGY.upper()}")
print(f"⚙️ Config: ELO [{config['min_elo']}-{config['max_elo']}], "
      f"Coups [{config['min_moves']}-{config['max_moves']}], "
      f"Min ouvertures: {config['min_games_per_opening']}")

🎯 Stratégie: CONSERVATIVE
⚙️ Config: ELO [600-3500], Coups [5-200], Min ouvertures: 5


In [27]:
def clean_and_analyze(df_raw, config, strategy_name):
    """Nettoyage complet avec analyse"""
    
    print(f"🧹 NETTOYAGE - STRATÉGIE {strategy_name.upper()}")
    print("=" * 50)
    
    # Nettoyage
    cleaner = ChessDataCleaner(config)
    df_clean = cleaner.clean_dataset(df_raw, verbose=True)
    
    # Rapport de qualité
    quality_report = cleaner.get_data_quality_report(df_clean)
    
    # Comparaison avant/après
    print(f"\n📊 AVANT/APRÈS:")
    print(f"📥 Parties: {len(df_raw):,} → {len(df_clean):,}")
    
    if 'main_opening' in df_raw.columns and 'main_opening' in df_clean.columns:
        print(f"📚 Ouvertures: {df_raw['main_opening'].nunique():,} → {df_clean['main_opening'].nunique():,}")
    
    if 'avg_elo' in df_raw.columns and 'avg_elo' in df_clean.columns:
        print(f"🎯 ELO moyen: {df_raw['avg_elo'].mean():.0f} → {df_clean['avg_elo'].mean():.0f}")
    
    if 'white_score' in df_raw.columns and 'white_score' in df_clean.columns:
        print(f"🏆 Winrate blancs: {df_raw['white_score'].mean():.1%} → {df_clean['white_score'].mean():.1%}")
    
    retention = len(df_clean) / len(df_raw) * 100
    print(f"📈 Rétention: {retention:.1f}%")
    
    if retention >= 80:
        print("💚 Excellente rétention!")
    elif retention >= 60:
        print("💛 Rétention acceptable")
    else:
        print("❤️ Attention: forte perte de données")
    
    return df_clean, quality_report

# Effectuer le nettoyage
df_clean, quality_report = clean_and_analyze(df_raw, config, STRATEGY)


INFO:src.data_processing.data_cleaner:🧹 DÉBUT DU NETTOYAGE DES DONNÉES
INFO:src.data_processing.data_cleaner:🔧 Nettoyage des colonnes de base...


🧹 NETTOYAGE - STRATÉGIE CONSERVATIVE


INFO:src.data_processing.data_cleaner:🎯 Validation des ELO...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 0 parties avec ELO invalide
INFO:src.data_processing.data_cleaner:🏆 Validation des résultats...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 0 parties avec résultat invalide
INFO:src.data_processing.data_cleaner:♟️ Validation du nombre de coups...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 441 parties avec nombre de coups invalide
INFO:src.data_processing.data_cleaner:🤖 Suppression des parties de bots...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 0 parties de bots
INFO:src.data_processing.data_cleaner:♜ Suppression des variantes...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 0 parties de variantes
INFO:src.data_processing.data_cleaner:📚 Standardisation des ouvertures...
INFO:src.data_processing.data_cleaner:🔍 Filtrage des ouvertures rares...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 118 parties d'ouvertures rares
INFO:src.data_proc


📊 AVANT/APRÈS:
📥 Parties: 49,919 → 49,360
📚 Ouvertures: 252 → 192
🎯 ELO moyen: 1611 → 1611
🏆 Winrate blancs: 52.3% → 52.3%
📈 Rétention: 98.9%
💚 Excellente rétention!


In [28]:
def setup_plotly_rendering():
    """Configure Plotly pour éviter les erreurs de rendu"""
    try:
        import plotly.io as pio
        # Forcer le renderer par défaut pour éviter les problèmes nbformat
        pio.renderers.default = "browser"  # Ou "notebook" si dans Jupyter
        return True
    except:
        return False

def create_text_comparison(df_before, df_after):
    """Analyse comparative en mode texte (sans graphiques)"""
    
    print("📊 ANALYSE COMPARATIVE AVANT/APRÈS NETTOYAGE")
    print("=" * 50)
    
    # Métriques générales
    retention = len(df_after) / len(df_before) * 100
    print(f"📥 Parties: {len(df_before):,} → {len(df_after):,}")
    print(f"📈 Taux de rétention: {retention:.1f}%")
    
    if retention >= 80:
        print("💚 Excellente rétention!")
    elif retention >= 60:
        print("💛 Rétention acceptable")
    elif retention >= 20:
        print("🟠 Rétention faible")
    else:
        print("❤️ Attention: très forte perte de données")
    
    # Distribution ELO
    if 'avg_elo' in df_before.columns and 'avg_elo' in df_after.columns:
        print(f"\n🎯 ANALYSE ELO:")
        print(f"   Avant: {df_before['avg_elo'].mean():.0f} ± {df_before['avg_elo'].std():.0f}")
        print(f"   Après: {df_after['avg_elo'].mean():.0f} ± {df_after['avg_elo'].std():.0f}")
        print(f"   Plage avant: {df_before['avg_elo'].min():.0f} - {df_before['avg_elo'].max():.0f}")
        print(f"   Plage après: {df_after['avg_elo'].min():.0f} - {df_after['avg_elo'].max():.0f}")
    
    # Distribution des coups
    if 'moves_count' in df_before.columns and 'moves_count' in df_after.columns:
        print(f"\n♟️ ANALYSE COUPS:")
        print(f"   Avant: {df_before['moves_count'].mean():.1f} coups en moyenne")
        print(f"   Après: {df_after['moves_count'].mean():.1f} coups en moyenne")
        print(f"   Plage avant: {df_before['moves_count'].min()} - {df_before['moves_count'].max()}")
        print(f"   Plage après: {df_after['moves_count'].min()} - {df_after['moves_count'].max()}")
    
    # Ouvertures
    if 'main_opening' in df_before.columns and 'main_opening' in df_after.columns:
        print(f"\n📚 ANALYSE OUVERTURES:")
        print(f"   Avant: {df_before['main_opening'].nunique():,} ouvertures uniques")
        print(f"   Après: {df_after['main_opening'].nunique():,} ouvertures uniques")
        
        print(f"\n🔝 Top 10 ouvertures après nettoyage:")
        top_openings = df_after['main_opening'].value_counts().head(10)
        for i, (opening, count) in enumerate(top_openings.items(), 1):
            pct = count / len(df_after) * 100
            opening_short = opening[:40] + "..." if len(opening) > 40 else opening
            print(f"   {i:2d}. {opening_short:<43} {count:>4,} ({pct:4.1f}%)")
    
    # Résultats
    if 'white_score' in df_before.columns and 'white_score' in df_after.columns:
        print(f"\n🏆 ANALYSE RÉSULTATS:")
        
        # Avant
        white_wins_before = (df_before['white_score'] == 1.0).sum()
        black_wins_before = (df_before['white_score'] == 0.0).sum()
        draws_before = (df_before['white_score'] == 0.5).sum()
        total_before = len(df_before)
        
        # Après
        white_wins_after = (df_after['white_score'] == 1.0).sum()
        black_wins_after = (df_after['white_score'] == 0.0).sum()
        draws_after = (df_after['white_score'] == 0.5).sum()
        total_after = len(df_after)
        
        print(f"   AVANT - Blancs: {white_wins_before/total_before*100:.1f}% | "
              f"Noirs: {black_wins_before/total_before*100:.1f}% | "
              f"Nulles: {draws_before/total_before*100:.1f}%")
        print(f"   APRÈS - Blancs: {white_wins_after/total_after*100:.1f}% | "
              f"Noirs: {black_wins_after/total_after*100:.1f}% | "
              f"Nulles: {draws_after/total_after*100:.1f}%")
    
    # Contrôles de temps
    if 'time_control' in df_after.columns:
        print(f"\n⏱️ CONTRÔLES DE TEMPS (après nettoyage):")
        time_dist = df_after['time_control'].value_counts()
        for tc, count in time_dist.items():
            pct = count / len(df_after) * 100
            print(f"   {tc:<12} {count:>5,} ({pct:4.1f}%)")
    
    # Qualité des données
    print(f"\n📋 QUALITÉ DES DONNÉES:")
    
    # Données manquantes après nettoyage
    missing_after = df_after.isnull().sum()
    critical_missing = missing_after[missing_after > 0]
    
    if len(critical_missing) == 0:
        print("   ✅ Aucune donnée manquante")
    else:
        print("   ⚠️ Données manquantes restantes:")
        for col, count in critical_missing.items():
            pct = count / len(df_after) * 100
            print(f"      {col}: {count:,} ({pct:.1f}%)")
    
    # Distribution par catégorie ELO
    if 'elo_category' in df_after.columns:
        print(f"\n🎯 DISTRIBUTION PAR NIVEAU ELO:")
        elo_dist = df_after['elo_category'].value_counts().sort_index()
        for category, count in elo_dist.items():
            pct = count / len(df_after) * 100
            print(f"   {category:<12} {count:>5,} ({pct:4.1f}%)")
    
    return True

def create_comparison_viz(df_before, df_after):
    """Visualisation synthétique avant/après"""
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Distribution ELO', 'Distribution Coups', 
                       'Top Ouvertures', 'Métriques Clés'),
        specs=[[{}, {}], [{"type": "bar"}, {"type": "bar"}]]
    )
    
    # 1. Distribution ELO
    if 'avg_elo' in df_before.columns and 'avg_elo' in df_after.columns:
        fig.add_trace(go.Histogram(x=df_before['avg_elo'], name='Avant', opacity=0.6,
                                  nbinsx=30, marker_color='red'), row=1, col=1)
        fig.add_trace(go.Histogram(x=df_after['avg_elo'], name='Après', opacity=0.6,
                                  nbinsx=30, marker_color='blue'), row=1, col=1)
    
    # 2. Distribution Coups
    if 'moves_count' in df_before.columns and 'moves_count' in df_after.columns:
        fig.add_trace(go.Histogram(x=df_before['moves_count'], name='Avant (coups)', opacity=0.6,
                                  nbinsx=25, marker_color='red', showlegend=False), row=1, col=2)
        fig.add_trace(go.Histogram(x=df_after['moves_count'], name='Après (coups)', opacity=0.6,
                                  nbinsx=25, marker_color='blue', showlegend=False), row=1, col=2)
    
    # 3. Top Ouvertures
    if 'main_opening' in df_after.columns:
        top_openings = df_after['main_opening'].value_counts().head(8)
        fig.add_trace(go.Bar(x=top_openings.values, y=top_openings.index,
                            orientation='h', name='Popularité', marker_color='green',
                            showlegend=False), row=2, col=1)
    
    # 4. Métriques clés
    metrics = ['Parties', 'Ouvertures', 'Qualité']
    values = [
        len(df_after) / len(df_before) * 100,
        df_after['main_opening'].nunique() / df_before['main_opening'].nunique() * 100 if 'main_opening' in df_before.columns else 100,
        100  # Score qualité fixe
    ]
    
    colors = ['green' if v >= 80 else 'orange' if v >= 60 else 'red' for v in values]
    
    fig.add_trace(go.Bar(x=metrics, y=values, name='Rétention (%)',
                        marker_color=colors, showlegend=False), row=2, col=2)
    fig.add_hline(y=100, line_dash="dash", line_color="gray", row=2, col=2)
    
    # Mise en forme
    fig.update_layout(
        title_text="📊 Analyse Comparative Avant/Après Nettoyage",
        height=600,
        showlegend=True
    )
    
    return fig

# Configuration et analyse
if len(df_clean) > 0:
    print("📊 Génération de l'analyse comparative...")
    
    # TOUJOURS utiliser le mode texte d'abord (plus fiable)
    create_text_comparison(df_raw, df_clean)
    
    # Essayer les graphiques ensuite (optionnel)
    print(f"\n" + "="*50)
    print("🎨 TENTATIVE D'AFFICHAGE GRAPHIQUE...")
    
    try:
        # Configuration Plotly
        setup_plotly_rendering()
        
        # Créer et sauvegarder en HTML d'abord
        comparison_fig = create_comparison_viz(df_raw, df_clean)
        
        os.makedirs('../data/exported', exist_ok=True)
        html_file = f"../data/exported/comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
        comparison_fig.write_html(html_file)
        print(f"✅ Graphique sauvegardé: {html_file}")
        print("🌐 Ouvrez ce fichier dans votre navigateur pour voir les graphiques")
        
        # Essayer l'affichage (peut échouer)
        try:
            comparison_fig.show()
            print("✅ Graphiques également affichés dans le notebook!")
        except Exception as show_error:
            print(f"⚠️ Affichage notebook échoué: {show_error}")
            print("💡 Utilisez le fichier HTML ci-dessus pour voir les graphiques")
            
    except Exception as e:
        print(f"❌ Erreur création graphique: {e}")
        print("📊 L'analyse textuelle ci-dessus contient toutes les informations nécessaires")

📊 Génération de l'analyse comparative...
📊 ANALYSE COMPARATIVE AVANT/APRÈS NETTOYAGE
📥 Parties: 49,919 → 49,360
📈 Taux de rétention: 98.9%
💚 Excellente rétention!

🎯 ANALYSE ELO:
   Avant: 1611 ± 193
   Après: 1611 ± 193
   Plage avant: 902 - 2351
   Plage après: 902 - 2351

♟️ ANALYSE COUPS:
   Avant: 68.0 coups en moyenne
   Après: 68.3 coups en moyenne
   Plage avant: 0 - 281
   Plage après: 5 - 200

📚 ANALYSE OUVERTURES:
   Avant: 252 ouvertures uniques
   Après: 192 ouvertures uniques

🔝 Top 10 ouvertures après nettoyage:
    1. Sicilian Defense                            4,969 (10.1%)
    2. French Defense                              3,538 ( 7.2%)
    3. Queen's Pawn Game                           2,516 ( 5.1%)
    4. King's Pawn Game                            2,250 ( 4.6%)
    5. Scandinavian Defense                        2,195 ( 4.4%)
    6. Italian Game                                1,550 ( 3.1%)
    7. English Opening                             1,545 ( 3.1%)
    8. Bisho

In [29]:
def save_results(df_clean, config, strategy):
    """Sauvegarde synthétique"""
    
    # Créer les dossiers
    os.makedirs('../data/processed', exist_ok=True)
    os.makedirs('../data/exported', exist_ok=True)
    
    # Timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Fichiers de sortie
    files_created = []
    
    try:
        # 1. Données nettoyées (Parquet si possible, sinon CSV)
        try:
            parquet_file = f'../data/processed/games_clean_{strategy}_{timestamp}.parquet'
            df_clean.to_parquet(parquet_file, index=False)
            files_created.append(parquet_file)
            print(f"✅ Parquet: {parquet_file}")
        except Exception as e:
            print(f"⚠️ Parquet échoué ({e}), utilisation CSV...")
            csv_fallback = f'../data/processed/games_clean_{strategy}_{timestamp}.csv'
            df_clean.to_csv(csv_fallback, index=False)
            files_created.append(csv_fallback)
            print(f"✅ CSV: {csv_fallback}")
        
        # 2. Export CSV standard
        csv_file = f'../data/exported/games_clean_{strategy}_{timestamp}.csv'
        df_clean.to_csv(csv_file, index=False)
        files_created.append(csv_file)
        
        # 3. Configuration
        config_file = f'../data/exported/config_{strategy}_{timestamp}.txt'
        with open(config_file, 'w') as f:
            f.write(f"Configuration {strategy} - {timestamp}\\n")
            f.write("=" * 40 + "\\n")
            for key, value in config.items():
                f.write(f"{key}: {value}\\n")
            f.write(f"\\nRésultats:\\n")
            f.write(f"Parties nettoyées: {len(df_clean):,}\\n")
        files_created.append(config_file)
        
        print("💾 SAUVEGARDE RÉUSSIE")
        print("=" * 25)
        for file in files_created:
            size_mb = os.path.getsize(file) / 1024**2
            print(f"✅ {file} ({size_mb:.1f} MB)")
        
        return files_created
        
    except Exception as e:
        print(f"❌ Erreur sauvegarde: {e}")
        return []

# Sauvegarder les résultats
if len(df_clean) > 0:
    saved_files = save_results(df_clean, config, STRATEGY)
    
    # Résumé final
    print(f"\\n🎉 NETTOYAGE TERMINÉ!")
    print(f"📊 {len(df_clean):,} parties prêtes pour l'analyse")
    print(f"📁 {len(saved_files)} fichiers sauvegardés")
    print(f"🚀 Prêt pour les analyses d'ouvertures!")
else:
    print("❌ Aucune donnée à sauvegarder")

✅ Parquet: ../data/processed/games_clean_conservative_20250607_160448.parquet
💾 SAUVEGARDE RÉUSSIE
✅ ../data/processed/games_clean_conservative_20250607_160448.parquet (0.9 MB)
✅ ../data/exported/games_clean_conservative_20250607_160448.csv (9.7 MB)
✅ ../data/exported/config_conservative_20250607_160448.txt (0.0 MB)
\n🎉 NETTOYAGE TERMINÉ!
📊 49,360 parties prêtes pour l'analyse
📁 3 fichiers sauvegardés
🚀 Prêt pour les analyses d'ouvertures!


In [30]:
print("📋 RÉSUMÉ FINAL")
print("=" * 20)

if len(df_clean) > 0:
    print(f"✅ Dataset nettoyé disponible: df_clean")
    print(f"📊 Parties: {len(df_clean):,}")
    
    if 'main_opening' in df_clean.columns:
        print(f"📚 Ouvertures: {df_clean['main_opening'].nunique():,}")
        print(f"🔝 Plus populaire: {df_clean['main_opening'].mode()[0]}")
    
    if 'avg_elo' in df_clean.columns:
        print(f"🎯 ELO moyen: {df_clean['avg_elo'].mean():.0f}")
        print(f"📈 Plage ELO: {df_clean['avg_elo'].min():.0f} - {df_clean['avg_elo'].max():.0f}")
    
    if 'time_control' in df_clean.columns:
        dominant_tc = df_clean['time_control'].mode()[0]
        print(f"⏱️ Contrôle dominant: {dominant_tc}")
    
    print(f"\\n🚀 PROCHAINES ÉTAPES:")
    print(f"1. Analyser les ouvertures par niveau ELO")
    print(f"2. Calculer les winrates par ouverture")
    print(f"3. Identifier les tendances temporelles")
    print(f"4. Créer des visualisations avancées")
    
    # Variables disponibles
    print(f"\\n📦 Variables disponibles:")
    print(f"   df_raw   - Données brutes")
    print(f"   df_clean - Données nettoyées")
    print(f"   config   - Configuration utilisée")
    
else:
    print(f"❌ Échec du nettoyage")
    print(f"💡 Essayez une configuration moins stricte:")
    print(f"   STRATEGY = 'conservative'")

print(f"\\n⏰ Temps total: quelques secondes")
print(f"🎯 Notebook synthétique terminé!")

📋 RÉSUMÉ FINAL
✅ Dataset nettoyé disponible: df_clean
📊 Parties: 49,360
📚 Ouvertures: 192
🔝 Plus populaire: Sicilian Defense
🎯 ELO moyen: 1611
📈 Plage ELO: 902 - 2351
⏱️ Contrôle dominant: Blitz
\n🚀 PROCHAINES ÉTAPES:
1. Analyser les ouvertures par niveau ELO
2. Calculer les winrates par ouverture
3. Identifier les tendances temporelles
4. Créer des visualisations avancées
\n📦 Variables disponibles:
   df_raw   - Données brutes
   df_clean - Données nettoyées
   config   - Configuration utilisée
\n⏰ Temps total: quelques secondes
🎯 Notebook synthétique terminé!
