In [23]:
# Configuration et imports
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Ajouter le path parent
sys.path.append('../')

# V√©rification et installation des d√©pendances
def check_and_install_dependencies(required_packages=None, auto_install=True):
    """Check that the required modules import, installing missing ones via pip.

    Args:
        required_packages: Optional mapping of importable module name to the
            pip distribution that provides it. Defaults to the notebook's
            standard dependency set.
        auto_install: When True (default), try to ``pip install`` every
            missing distribution with the current interpreter. When False,
            only report what is missing.

    Returns:
        True when every module is importable or was installed successfully;
        False when something is missing and was not (or could not be)
        installed.
    """
    import importlib

    if required_packages is None:
        required_packages = {
            'chess': 'python-chess',
            'pandas': 'pandas',
            'numpy': 'numpy',
            'plotly': 'plotly',
            'pyarrow': 'pyarrow',  # needed for Parquet support
            'tqdm': 'tqdm',
            'nbformat': 'nbformat',
        }

    missing_packages = []

    for module, package in required_packages.items():
        try:
            importlib.import_module(module)
            print(f"‚úÖ {module} disponible")
        except ImportError:
            missing_packages.append(package)
            print(f"‚ùå {module} manquant")

    if not missing_packages:
        return True

    print(f"\nüì¶ Installation des packages manquants...")
    print(f"üí° Ex√©cutez dans votre terminal :")
    print(f"   pip install {' '.join(missing_packages)}")

    if not auto_install:
        # Report-only mode: the manual command has been shown above.
        return False

    # Best-effort automatic installation with the running interpreter's pip.
    try:
        import subprocess
        for package in missing_packages:
            print(f"üîÑ Installation de {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            print(f"‚úÖ {package} install√©")
    except Exception as e:
        print(f"‚ö†Ô∏è Installation automatique √©chou√©e: {e}")
        print(f"üìã Veuillez installer manuellement avec: pip install {' '.join(missing_packages)}")
        return False

    # Packages installed while the interpreter is running are only guaranteed
    # to be found by later imports once the finder caches are invalidated.
    importlib.invalidate_caches()
    return True

# Check the required packages (a pip install is attempted for anything missing).
# NOTE(review): deps_ok is not read again in the visible cells, so a False
# result does not prevent the imports below from running (and failing).
deps_ok = check_and_install_dependencies()

# Core third-party imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

# python-chess is optional: CHESS_AVAILABLE records whether it could be imported
try:
    import chess
    import chess.pgn
    print("‚úÖ Module chess disponible")
    CHESS_AVAILABLE = True
except ImportError:
    print("‚ùå Module chess manquant")
    print("üí° Installez avec: pip install python-chess")
    CHESS_AVAILABLE = False

# Project-local modules: LOCAL_MODULES records whether they could be imported.
# NOTE(review): clean_and_analyze uses ChessDataCleaner without checking this flag.
try:
    from src.data_processing.pgn_parser import parse_and_analyze
    from src.data_processing.data_cleaner import ChessDataCleaner, quick_clean
    print("‚úÖ Modules locaux import√©s avec succ√®s")
    LOCAL_MODULES = True
except ImportError as e:
    print(f"‚ö†Ô∏è Modules locaux non trouv√©s: {e}")
    print("üìù Utilisation de fonctions int√©gr√©es")
    LOCAL_MODULES = False

# Pandas display options: show every column, cap printed rows at 20
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

print(f"üöÄ Setup termin√© - {datetime.now().strftime('%H:%M:%S')}")

✅ chess disponible
✅ pandas disponible
✅ numpy disponible
✅ plotly disponible
✅ pyarrow disponible
✅ tqdm disponible
✅ nbformat disponible
✅ Module chess disponible
✅ Modules locaux importés avec succès
🚀 Setup terminé - 16:04:47


In [24]:
def load_data():
    """Load the games DataFrame, falling back through several sources.

    Sources are tried in this order:
      1. the cached Parquet file;
      2. the project PGN parser (requires python-chess and the local modules);
         if it raises, sample data is used instead;
      3. the simplified built-in PGN parser (python-chess only);
      4. generated sample data.

    Returns:
        pandas.DataFrame with one row per game.
    """
    PGN_FILE = 'data/raw/lichess_games.pgn'
    PROCESSED_FILE = 'data/processed/games_raw.parquet'

    # 1. Cached, already-processed file
    if os.path.exists(PROCESSED_FILE):
        print("üìñ Chargement fichier trait√©...")
        return pd.read_parquet(PROCESSED_FILE)

    # 2. Full PGN parsing through the project parser
    if os.path.exists(PGN_FILE) and CHESS_AVAILABLE and LOCAL_MODULES:
        print("üîÑ Parsing fichier PGN...")
        try:
            df, _ = parse_and_analyze(PGN_FILE, max_games=50000)
        except Exception as e:
            print(f"‚ùå Erreur parsing: {e}")
            print("üìù Passage aux donn√©es d'exemple...")
        else:
            # Caching is an optimisation only: a failed write (missing
            # directory, no Parquet engine, ...) must not throw away the
            # data that was just parsed.
            try:
                os.makedirs(os.path.dirname(PROCESSED_FILE), exist_ok=True)
                df.to_parquet(PROCESSED_FILE, index=False)
            except Exception as e:
                print(f"‚ö†Ô∏è Mise en cache impossible ({e})")
            return df

    # 3. Simplified PGN parser (no local modules)
    elif os.path.exists(PGN_FILE) and CHESS_AVAILABLE:
        print("üîÑ Parsing PGN simplifi√©...")
        return parse_pgn_simple(PGN_FILE)

    # 4. Sample data
    print("üéØ G√©n√©ration de donn√©es d'exemple...")
    return create_sample_data()

def _categorize_time_control(time_control):
    """Map a PGN TimeControl header to 'Bullet', 'Blitz', 'Rapid' or 'Classical'."""
    text = str(time_control).strip().lower()

    # Numeric "base+increment" headers (seconds) are classified with the
    # Lichess estimated-duration rule: base + 40 * increment.
    base, _, increment = text.partition('+')
    if base.isdigit() and (not increment or increment.isdigit()):
        estimated_seconds = int(base) + 40 * int(increment or 0)
        if estimated_seconds < 180:
            return 'Bullet'  # UltraBullet is folded into Bullet
        if estimated_seconds < 480:
            return 'Blitz'
        if estimated_seconds < 1500:
            return 'Rapid'
        return 'Classical'

    # Textual headers: keep the keyword matching, 'Classical' as the default.
    for keyword, category in (('bullet', 'Bullet'), ('blitz', 'Blitz'), ('rapid', 'Rapid')):
        if keyword in text:
            return category
    return 'Classical'


def parse_pgn_simple(pgn_file, max_games=5000):
    """Parse a PGN file with python-chess only (no project modules).

    Games whose result is not '1-0', '0-1' or '1/2-1/2' are skipped, as are
    games that raise while being read or converted (for example a
    non-numeric Elo header).

    Args:
        pgn_file: Path of the PGN file to read (opened as UTF-8).
        max_games: Maximum number of accepted games to collect.

    Returns:
        pandas.DataFrame with the columns white_elo, black_elo, white_score,
        moves_count, opening, time_control, termination, avg_elo,
        black_score, draw, main_opening and variation. If anything fails
        (unreadable file, no usable game, ...) the result of
        ``create_sample_data()`` is returned instead.
    """
    result_scores = {'1-0': 1.0, '0-1': 0.0, '1/2-1/2': 0.5}
    games_data = []

    try:
        with open(pgn_file, 'r', encoding='utf-8') as f:
            game_count = 0

            while game_count < max_games:
                try:
                    game = chess.pgn.read_game(f)
                    if game is None:
                        break  # end of file

                    headers = game.headers

                    white_elo = int(headers.get('WhiteElo', 0))
                    black_elo = int(headers.get('BlackElo', 0))
                    result = headers.get('Result', '*')
                    opening = headers.get('Opening', 'Unknown')
                    time_control = headers.get('TimeControl', 'Unknown')

                    moves_count = len(list(game.mainline_moves()))

                    if result not in result_scores:
                        continue  # unfinished or unknown result

                    games_data.append({
                        'white_elo': white_elo,
                        'black_elo': black_elo,
                        'white_score': result_scores[result],
                        'moves_count': moves_count,
                        'opening': opening,
                        'time_control': _categorize_time_control(time_control),
                        'termination': headers.get('Termination', 'Normal')
                    })

                    game_count += 1

                    if game_count % 1000 == 0:
                        print(f"   üìä {game_count:,} parties pars√©es...")

                except Exception:
                    continue  # deliberately skip malformed games

        if not games_data:
            # Without rows the derived columns cannot be built; fail with an
            # explicit message instead of an opaque KeyError.
            raise ValueError("no usable game found in the PGN file")

        df = pd.DataFrame(games_data)

        # Derived columns
        df['avg_elo'] = (df['white_elo'] + df['black_elo']) / 2
        df['black_score'] = 1 - df['white_score']
        df['draw'] = (df['white_score'] == 0.5).astype(int)
        opening_parts = df['opening'].str.split(':')
        df['main_opening'] = opening_parts.str[0].str.strip()
        # Strip the blank that follows ':' so "A: B" yields "B", not " B".
        df['variation'] = opening_parts.str[1].str.strip().fillna('Main Line')

        print(f"‚úÖ {len(df):,} parties pars√©es avec succ√®s")

        # Cache for next time. A failed write must not discard parsed data.
        # NOTE(review): this writes under '../data/processed' while load_data
        # looks for 'data/processed/games_raw.parquet' - confirm which
        # working directory is intended.
        try:
            os.makedirs('../data/processed', exist_ok=True)
            df.to_parquet('../data/processed/games_raw.parquet', index=False)
        except Exception as e:
            print(f"‚ö†Ô∏è Mise en cache impossible ({e})")

        return df

    except Exception as e:
        print(f"‚ùå Erreur lors du parsing: {e}")
        return create_sample_data()

def create_sample_data():
    """Build a reproducible synthetic games table for demos and cleaning tests.

    The global NumPy generator is seeded with 42, 5000 games are drawn, the
    derived columns are added, and then 200 distinct rows are overwritten
    with implausible Elo values or move counts.

    Returns:
        pandas.DataFrame with 5000 rows.
    """
    np.random.seed(42)
    sample_size = 5000
    opening_names = [
        'Sicilian Defense', "Queen's Gambit", 'French Defense',
        'English Opening', 'Caro-Kann Defense', 'Italian Game'
    ]

    # Columns are drawn one after another from the seeded global stream, so
    # the order of these statements determines the generated values.
    columns = {}
    columns['white_elo'] = np.random.normal(1500, 300, sample_size).astype(int)
    columns['black_elo'] = np.random.normal(1500, 300, sample_size).astype(int)
    columns['white_score'] = np.random.choice([0, 0.5, 1], sample_size, p=[0.32, 0.28, 0.40])
    columns['moves_count'] = np.random.gamma(2.5, 12, sample_size).astype(int) + 8
    columns['opening'] = np.random.choice(opening_names, sample_size)
    columns['time_control'] = np.random.choice(
        ['Bullet', 'Blitz', 'Rapid'], sample_size, p=[0.5, 0.35, 0.15]
    )
    columns['termination'] = np.random.choice(
        ['Normal', 'Time forfeit', 'Resignation'], sample_size
    )
    games = pd.DataFrame(columns)

    # Derived columns (computed before the bad values are injected below)
    games['avg_elo'] = games['white_elo'].add(games['black_elo']).div(2)
    games['black_score'] = games['white_score'].rsub(1)
    games['draw'] = games['white_score'].eq(0.5).astype(int)
    games['main_opening'] = games['opening']
    games['variation'] = 'Main Line'

    # Inject bad values into 200 distinct rows so the cleaning has work to do
    corrupted_rows = np.random.choice(len(games), 200, replace=False)
    corruptions = (
        (slice(0, 50), 'white_elo', 0),
        (slice(50, 100), 'black_elo', 5000),
        (slice(100, 150), 'moves_count', 3),
        (slice(150, None), 'moves_count', 250),
    )
    for chunk, column, bad_value in corruptions:
        games.loc[corrupted_rows[chunk], column] = bad_value

    print(f"‚úÖ {len(games):,} parties d'exemple g√©n√©r√©es")
    return games

# Load the raw games table; load_data() decides which source is used
df_raw = load_data()
print(f"üìä {len(df_raw):,} parties charg√©es")

📖 Chargement fichier traité...
📊 49,919 parties chargées


In [25]:
def quick_inspect(df):
    """Print a short data-quality summary of a games DataFrame.

    Reports the size, missing values, Elo values outside 700-3500, games
    with fewer than 5 or more than 200 moves, and the most frequent
    openings. Sections whose columns are absent are skipped.

    Args:
        df: pandas.DataFrame of games.

    Returns:
        The same DataFrame, unchanged (as a cell's last expression it is
        displayed by the notebook).
    """
    print("üîç INSPECTION RAPIDE")
    print("=" * 30)

    # General information
    print(f"üìä Parties: {len(df):,}")
    print(f"üìã Colonnes: {len(df.columns)}")
    print(f"üíæ Taille: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    # Columns with missing values
    missing = df.isnull().sum()
    problems = missing[missing > 0]
    if len(problems) > 0:
        print(f"\n‚ùå Donn√©es manquantes:")
        for col, count in problems.items():
            print(f"   {col}: {count:,} ({count/len(df)*100:.1f}%)")

    # Elo: only inspect the columns that actually exist
    elo_columns = [col for col in ('white_elo', 'black_elo') if col in df.columns]
    if elo_columns:
        # One combined mask per column: a value of 0 is already below 700
        # and must be counted once, not twice.
        elo_issues = sum(
            int(((df[col] < 700) | (df[col] > 3500)).sum()) for col in elo_columns
        )

        if elo_issues > 0:
            print(f"‚ö†Ô∏è ELO probl√©matiques: {elo_issues:,}")

        print(f"üéØ ELO moyen: {df[elo_columns].mean().mean():.0f}")

    # Move counts
    if 'moves_count' in df.columns:
        short = (df['moves_count'] < 5).sum()
        long = (df['moves_count'] > 200).sum()
        if short > 0 or long > 0:
            print(f"‚ö†Ô∏è Parties anormales: {short + long:,}")

        print(f"‚ôüÔ∏è Coups moyen: {df['moves_count'].mean():.1f}")

    # Openings
    if 'main_opening' in df.columns:
        openings = df['main_opening'].nunique()
        print(f"üìö Ouvertures: {openings:,}")

        top3 = df['main_opening'].value_counts().head(3)
        print("üîù Top 3:")
        for i, (opening, count) in enumerate(top3.items(), 1):
            print(f"   {i}. {opening}: {count:,}")

    return df

# Print the quick summary; quick_inspect returns the DataFrame it was given,
# so as the cell's last expression the table itself is displayed as well
quick_inspect(df_raw)

🔍 INSPECTION RAPIDE
📊 Parties: 49,919
📋 Colonnes: 22
💾 Taille: 24.7 MB

❌ Données manquantes:
   game_length_category: 106 (0.2%)
🎯 ELO moyen: 1611
⚠️ Parties anormales: 441
♟️ Coups moyen: 68.0
📚 Ouvertures: 252
🔝 Top 3:
   1. Sicilian Defense: 4,991
   2. French Defense: 3,559
   3. Queen's Pawn Game: 2,538


Unnamed: 0,white_elo,black_elo,avg_elo,elo_diff,white_score,black_score,draw,eco,opening,time_control,termination,moves_count,avg_eval_early,eval_volatility,opening_advantage,main_opening,variation,elo_range,white_elo_range,black_elo_range,decisive_game,game_length_category
0,1782,1939,1860.5,157,1.0,0.0,0,B03,Alekhine Defense: Exchange Variation,Bullet,Time,95,0.0,0.0,0.0,Alekhine Defense,Exchange Variation,1800-2000,1600-1800,1800-2000,1,Very Long
1,1519,1558,1538.5,39,1.0,0.0,0,D00,Queen's Pawn Game #3,Bullet,Time,71,0.0,0.0,0.0,Queen's Pawn Game #3,Main Line,1400-1600,1400-1600,1400-1600,1,Long
2,1701,1919,1810.0,218,0.0,1.0,0,B05,"Alekhine Defense: Modern, Flohr Variation",Blitz,Other,58,0.0,0.0,0.0,Alekhine Defense,"Modern, Flohr Variation",1800-2000,1600-1800,1800-2000,1,Medium
3,1991,1656,1823.5,335,1.0,0.0,0,A00,Hungarian Opening: Sicilian Invitation,Bullet,Time,61,0.0,0.0,0.0,Hungarian Opening,Sicilian Invitation,1800-2000,1800-2000,1600-1800,1,Long
4,1144,1390,1267.0,246,1.0,0.0,0,A20,English Opening: King's English Variation,Blitz,Time,81,0.0,0.0,0.0,English Opening,King's English Variation,1200-1400,1000-1200,1200-1400,1,Very Long
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49914,1320,1525,1422.5,205,1.0,0.0,0,A00,Polish Opening: Bugayev Attack,Rapid,Other,7,0.0,0.0,0.0,Polish Opening,Bugayev Attack,1400-1600,1200-1400,1400-1600,1,Very Short
49915,1270,1463,1366.5,193,1.0,0.0,0,D02,Queen's Pawn Game: Chigorin Variation,Classical,Other,123,0.0,0.0,0.0,Queen's Pawn Game,Chigorin Variation,1200-1400,1200-1400,1400-1600,1,Very Long
49916,1526,1220,1373.0,306,1.0,0.0,0,C00,French Defense: Pelikan Variation,Rapid,Other,30,0.0,0.0,0.0,French Defense,Pelikan Variation,1200-1400,1400-1600,1200-1400,1,Short
49917,1892,1601,1746.5,291,1.0,0.0,0,C30,"King's Gambit Declined, Queen's Knight Defense",Blitz,Other,106,0.0,0.0,0.0,"King's Gambit Declined, Queen's Knight Defense",Main Line,1600-1800,1800-2000,1600-1800,1,Very Long


In [26]:
# Predefined cleaning presets, from the most permissive to the most restrictive
CONFIGS = {
    'conservative': dict(
        min_elo=600, max_elo=3500, min_games_per_opening=5,
        min_moves=5, max_moves=200, remove_duplicates=False,
    ),
    'balanced': dict(
        min_elo=800, max_elo=3200, min_games_per_opening=20,
        min_moves=10, max_moves=150, remove_duplicates=False,
    ),
    'strict': dict(
        min_elo=1000, max_elo=3000, min_games_per_opening=50,
        min_moves=15, max_moves=120, remove_duplicates=False,
    ),
}

# Preset used by the rest of the notebook - change the name here
STRATEGY = 'conservative'
config = CONFIGS[STRATEGY]

print(f"üéØ Strat√©gie: {STRATEGY.upper()}")
print(
    f"‚öôÔ∏è Config: ELO [{config['min_elo']}-{config['max_elo']}], "
    f"Coups [{config['min_moves']}-{config['max_moves']}], "
    f"Min ouvertures: {config['min_games_per_opening']}"
)

🎯 Stratégie: CONSERVATIVE
⚙️ Config: ELO [600-3500], Coups [5-200], Min ouvertures: 5


In [27]:
def clean_and_analyze(df_raw, config, strategy_name):
    """Clean the raw games with ChessDataCleaner and print a before/after recap.

    Args:
        df_raw: DataFrame of raw games.
        config: Configuration passed to ``ChessDataCleaner``.
        strategy_name: Label shown (upper-cased) in the banner.

    Returns:
        Tuple ``(df_clean, quality_report)``: the cleaner's output and the
        report it produces for that output.
    """
    banner = f"üßπ NETTOYAGE - STRAT√âGIE {strategy_name.upper()}"
    print(banner)
    print("=" * 50)

    # Cleaning, then the quality report on the cleaned frame
    data_cleaner = ChessDataCleaner(config)
    cleaned = data_cleaner.clean_dataset(df_raw, verbose=True)
    report = data_cleaner.get_data_quality_report(cleaned)

    def in_both(column):
        # A metric is compared only when both frames carry the column.
        return column in df_raw.columns and column in cleaned.columns

    n_raw = len(df_raw)
    n_clean = len(cleaned)

    print(f"\nüìä AVANT/APR√àS:")
    print(f"üì• Parties: {n_raw:,} ‚Üí {n_clean:,}")

    if in_both('main_opening'):
        openings_raw = df_raw['main_opening'].nunique()
        openings_clean = cleaned['main_opening'].nunique()
        print(f"üìö Ouvertures: {openings_raw:,} ‚Üí {openings_clean:,}")

    if in_both('avg_elo'):
        elo_raw = df_raw['avg_elo'].mean()
        elo_clean = cleaned['avg_elo'].mean()
        print(f"üéØ ELO moyen: {elo_raw:.0f} ‚Üí {elo_clean:.0f}")

    if in_both('white_score'):
        winrate_raw = df_raw['white_score'].mean()
        winrate_clean = cleaned['white_score'].mean()
        print(f"üèÜ Winrate blancs: {winrate_raw:.1%} ‚Üí {winrate_clean:.1%}")

    retention = n_clean / n_raw * 100
    print(f"üìà R√©tention: {retention:.1f}%")

    # Verdict tiers: at least 80 %, at least 60 %, anything lower
    if retention < 60:
        verdict = "‚ù§Ô∏è Attention: forte perte de donn√©es"
    elif retention < 80:
        verdict = "üíõ R√©tention acceptable"
    else:
        verdict = "üíö Excellente r√©tention!"
    print(verdict)

    return cleaned, report

# Run the cleaning with the selected strategy; keep the cleaned frame and its quality report
df_clean, quality_report = clean_and_analyze(df_raw, config, STRATEGY)


INFO:src.data_processing.data_cleaner:🧹 DÉBUT DU NETTOYAGE DES DONNÉES
INFO:src.data_processing.data_cleaner:🔧 Nettoyage des colonnes de base...


🧹 NETTOYAGE - STRATÉGIE CONSERVATIVE


INFO:src.data_processing.data_cleaner:🎯 Validation des ELO...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 0 parties avec ELO invalide
INFO:src.data_processing.data_cleaner:🏆 Validation des résultats...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 0 parties avec résultat invalide
INFO:src.data_processing.data_cleaner:♟️ Validation du nombre de coups...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 441 parties avec nombre de coups invalide
INFO:src.data_processing.data_cleaner:🤖 Suppression des parties de bots...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 0 parties de bots
INFO:src.data_processing.data_cleaner:♜ Suppression des variantes...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 0 parties de variantes
INFO:src.data_processing.data_cleaner:📚 Standardisation des ouvertures...
INFO:src.data_processing.data_cleaner:🔍 Filtrage des ouvertures rares...
INFO:src.data_processing.data_cleaner:   ❌ Supprimé 118 part


📊 AVANT/APRÈS:
📥 Parties: 49,919 → 49,360
📚 Ouvertures: 252 → 192
🎯 ELO moyen: 1611 → 1611
🏆 Winrate blancs: 52.3% → 52.3%
📈 Rétention: 98.9%
💚 Excellente rétention!


In [28]:
def setup_plotly_rendering(renderer="browser"):
    """Select the default Plotly renderer.

    Args:
        renderer: Name assigned to ``plotly.io.renderers.default``. The
            default, "browser", matches the previous hard-coded value;
            "notebook" is the alternative when running inside Jupyter.

    Returns:
        True when the renderer was set, False when Plotly could not be
        imported or rejected the renderer name.
    """
    try:
        import plotly.io as pio
        pio.renderers.default = renderer
    except Exception:
        # Best effort: rendering is optional. Unlike a bare ``except``, this
        # lets KeyboardInterrupt and SystemExit propagate.
        return False
    return True

def create_text_comparison(df_before, df_after):
    """Print a text-only before/after comparison of the cleaning.

    Sections (retention, Elo, move counts, openings, results, time controls,
    missing values, Elo categories) are printed only when the columns they
    need are present.

    Args:
        df_before: DataFrame before cleaning.
        df_after: DataFrame after cleaning.

    Returns:
        True, always.
    """
    n_before = len(df_before)
    n_after = len(df_after)

    def shared(column):
        # True when both frames carry the column.
        return column in df_before.columns and column in df_after.columns

    def outcome_shares(frame):
        # Percentages of white wins, black wins and draws, in that order.
        games = len(frame)
        scores = frame['white_score']
        return tuple((scores == value).sum() / games * 100 for value in (1.0, 0.0, 0.5))

    print("üìä ANALYSE COMPARATIVE AVANT/APR√àS NETTOYAGE")
    print("=" * 50)

    # Overall retention
    retention = n_after / n_before * 100
    print(f"üì• Parties: {n_before:,} ‚Üí {n_after:,}")
    print(f"üìà Taux de r√©tention: {retention:.1f}%")

    tiers = (
        (80, "üíö Excellente r√©tention!"),
        (60, "üíõ R√©tention acceptable"),
        (20, "üü† R√©tention faible"),
    )
    verdict = next(
        (message for floor, message in tiers if retention >= floor),
        "‚ù§Ô∏è Attention: tr√®s forte perte de donn√©es",
    )
    print(verdict)

    # Elo distribution
    if shared('avg_elo'):
        elo_before = df_before['avg_elo']
        elo_after = df_after['avg_elo']
        print(f"\nüéØ ANALYSE ELO:")
        print(f"   Avant: {elo_before.mean():.0f} ¬± {elo_before.std():.0f}")
        print(f"   Apr√®s: {elo_after.mean():.0f} ¬± {elo_after.std():.0f}")
        print(f"   Plage avant: {elo_before.min():.0f} - {elo_before.max():.0f}")
        print(f"   Plage apr√®s: {elo_after.min():.0f} - {elo_after.max():.0f}")

    # Move-count distribution
    if shared('moves_count'):
        moves_before = df_before['moves_count']
        moves_after = df_after['moves_count']
        print(f"\n‚ôüÔ∏è ANALYSE COUPS:")
        print(f"   Avant: {moves_before.mean():.1f} coups en moyenne")
        print(f"   Apr√®s: {moves_after.mean():.1f} coups en moyenne")
        print(f"   Plage avant: {moves_before.min()} - {moves_before.max()}")
        print(f"   Plage apr√®s: {moves_after.min()} - {moves_after.max()}")

    # Openings
    if shared('main_opening'):
        print(f"\nüìö ANALYSE OUVERTURES:")
        print(f"   Avant: {df_before['main_opening'].nunique():,} ouvertures uniques")
        print(f"   Apr√®s: {df_after['main_opening'].nunique():,} ouvertures uniques")

        print(f"\nüîù Top 10 ouvertures apr√®s nettoyage:")
        ranking = df_after['main_opening'].value_counts().head(10)
        for rank, (opening, count) in enumerate(ranking.items(), 1):
            pct = count / n_after * 100
            # Names longer than 40 characters are truncated with an ellipsis
            label = opening if len(opening) <= 40 else opening[:40] + "..."
            print(f"   {rank:2d}. {label:<43} {count:>4,} ({pct:4.1f}%)")

    # Results
    if shared('white_score'):
        print(f"\nüèÜ ANALYSE R√âSULTATS:")
        white_b, black_b, draw_b = outcome_shares(df_before)
        white_a, black_a, draw_a = outcome_shares(df_after)

        print(f"   AVANT - Blancs: {white_b:.1f}% | "
              f"Noirs: {black_b:.1f}% | "
              f"Nulles: {draw_b:.1f}%")
        print(f"   APR√àS - Blancs: {white_a:.1f}% | "
              f"Noirs: {black_a:.1f}% | "
              f"Nulles: {draw_a:.1f}%")

    # Time controls (cleaned data only)
    if 'time_control' in df_after.columns:
        print(f"\n‚è±Ô∏è CONTR√îLES DE TEMPS (apr√®s nettoyage):")
        for tc, count in df_after['time_control'].value_counts().items():
            pct = count / n_after * 100
            print(f"   {tc:<12} {count:>5,} ({pct:4.1f}%)")

    # Data quality: missing values left after cleaning
    print(f"\nüìã QUALIT√â DES DONN√âES:")
    null_counts = df_after.isnull().sum()
    remaining = null_counts[null_counts > 0]

    if len(remaining) != 0:
        print("   ‚ö†Ô∏è Donn√©es manquantes restantes:")
        for col, count in remaining.items():
            pct = count / n_after * 100
            print(f"      {col}: {count:,} ({pct:.1f}%)")
    else:
        print("   ‚úÖ Aucune donn√©e manquante")

    # Distribution by Elo category
    if 'elo_category' in df_after.columns:
        print(f"\nüéØ DISTRIBUTION PAR NIVEAU ELO:")
        by_category = df_after['elo_category'].value_counts().sort_index()
        for category, count in by_category.items():
            pct = count / n_after * 100
            print(f"   {category:<12} {count:>5,} ({pct:4.1f}%)")

    return True

def create_comparison_viz(df_before, df_after):
    """Build a 2x2 Plotly figure comparing the data before and after cleaning.

    Panels: average-Elo histograms, move-count histograms, the eight most
    frequent openings after cleaning, and retention percentages.

    Args:
        df_before: DataFrame before cleaning.
        df_after: DataFrame after cleaning.

    Returns:
        The Plotly figure (not shown and not saved by this function).
    """
    def shared(column):
        # True when both frames carry the column.
        return column in df_before.columns and column in df_after.columns

    def retention_pct(kept, total):
        # Share kept, in percent. A zero total means nothing could be lost,
        # so it is reported as 100 instead of raising ZeroDivisionError.
        return kept / total * 100 if total else 100

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Distribution ELO', 'Distribution Coups',
                       'Top Ouvertures', 'M√©triques Cl√©s'),
        specs=[[{}, {}], [{"type": "bar"}, {"type": "bar"}]]
    )

    # 1. Elo distribution
    if shared('avg_elo'):
        fig.add_trace(go.Histogram(x=df_before['avg_elo'], name='Avant', opacity=0.6,
                                  nbinsx=30, marker_color='red'), row=1, col=1)
        fig.add_trace(go.Histogram(x=df_after['avg_elo'], name='Apr√®s', opacity=0.6,
                                  nbinsx=30, marker_color='blue'), row=1, col=1)

    # 2. Move-count distribution
    if shared('moves_count'):
        fig.add_trace(go.Histogram(x=df_before['moves_count'], name='Avant (coups)', opacity=0.6,
                                  nbinsx=25, marker_color='red', showlegend=False), row=1, col=2)
        fig.add_trace(go.Histogram(x=df_after['moves_count'], name='Apr√®s (coups)', opacity=0.6,
                                  nbinsx=25, marker_color='blue', showlegend=False), row=1, col=2)

    # 3. Most frequent openings after cleaning
    if 'main_opening' in df_after.columns:
        top_openings = df_after['main_opening'].value_counts().head(8)
        fig.add_trace(go.Bar(x=top_openings.values, y=top_openings.index,
                            orientation='h', name='Popularit√©', marker_color='green',
                            showlegend=False), row=2, col=1)

    # 4. Key metrics. The opening ratio needs the column in BOTH frames;
    # checking only df_before raised KeyError when df_after lacked it.
    if shared('main_opening'):
        opening_retention = retention_pct(
            df_after['main_opening'].nunique(),
            df_before['main_opening'].nunique(),
        )
    else:
        opening_retention = 100

    metrics = ['Parties', 'Ouvertures', 'Qualit√©']
    values = [
        retention_pct(len(df_after), len(df_before)),
        opening_retention,
        100  # fixed quality score
    ]

    colors = ['green' if v >= 80 else 'orange' if v >= 60 else 'red' for v in values]

    fig.add_trace(go.Bar(x=metrics, y=values, name='R√©tention (%)',
                        marker_color=colors, showlegend=False), row=2, col=2)
    fig.add_hline(y=100, line_dash="dash", line_color="gray", row=2, col=2)

    # Layout
    fig.update_layout(
        title_text="üìä Analyse Comparative Avant/Apr√®s Nettoyage",
        height=600,
        showlegend=True
    )

    return fig

# Run the before/after comparison (skipped entirely when cleaning left no rows)
if len(df_clean) > 0:
    print("üìä G√©n√©ration de l'analyse comparative...")

    # Always print the text report first: it does not depend on Plotly
    create_text_comparison(df_raw, df_clean)

    # Then attempt the charts (optional)
    print(f"\n" + "="*50)
    print("üé® TENTATIVE D'AFFICHAGE GRAPHIQUE...")

    try:
        # Select the Plotly renderer (return value is not checked)
        setup_plotly_rendering()

        # Build the figure and write it to a timestamped HTML file first,
        # so a copy exists even if displaying it fails below
        comparison_fig = create_comparison_viz(df_raw, df_clean)

        os.makedirs('../data/exported', exist_ok=True)
        html_file = f"../data/exported/comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
        comparison_fig.write_html(html_file)
        print(f"‚úÖ Graphique sauvegard√©: {html_file}")
        print("üåê Ouvrez ce fichier dans votre navigateur pour voir les graphiques")

        # Try to display the figure (may fail); a failure here is reported
        # separately and does not count as a chart-creation error
        try:
            comparison_fig.show()
            print("‚úÖ Graphiques √©galement affich√©s dans le notebook!")
        except Exception as show_error:
            print(f"‚ö†Ô∏è Affichage notebook √©chou√©: {show_error}")
            print("üí° Utilisez le fichier HTML ci-dessus pour voir les graphiques")

    except Exception as e:
        # Any failure while building or saving the figure: fall back to the text report
        print(f"‚ùå Erreur cr√©ation graphique: {e}")
        print("üìä L'analyse textuelle ci-dessus contient toutes les informations n√©cessaires")

📊 Génération de l'analyse comparative...
📊 ANALYSE COMPARATIVE AVANT/APRÈS NETTOYAGE
📥 Parties: 49,919 → 49,360
📈 Taux de rétention: 98.9%
💚 Excellente rétention!

🎯 ANALYSE ELO:
   Avant: 1611 ± 193
   Après: 1611 ± 193
   Plage avant: 902 - 2351
   Plage après: 902 - 2351

♟️ ANALYSE COUPS:
   Avant: 68.0 coups en moyenne
   Après: 68.3 coups en moyenne
   Plage avant: 0 - 281
   Plage après: 5 - 200

📚 ANALYSE OUVERTURES:
   Avant: 252 ouvertures uniques
   Après: 192 ouvertures uniques

🔝 Top 10 ouvertures après nettoyage:
    1. Sicilian Defense                            4,969 (10.1%)
    2. French Defense                              3,538 ( 7.2%)
    3. Queen's Pawn Game                           2,516 ( 5.1%)
    4. King's Pawn Game                            2,250 ( 4.6%)
    5. Scandinavian Defense                        2,195 ( 4.4%)
    6. Italian Game                                1,550 ( 3.1%)
    7. English Opening            

In [29]:
def save_results(df_clean, config, strategy):
    """Write the cleaned games and the configuration used to disk.

    Files written (names carry the strategy and a timestamp):
      * ``../data/processed/games_clean_*.parquet`` (or ``.csv`` if the
        Parquet write fails);
      * ``../data/exported/games_clean_*.csv``;
      * ``../data/exported/config_*.txt`` - one ``key: value`` line per
        configuration entry plus the number of cleaned games.

    Args:
        df_clean: Cleaned DataFrame to save.
        config: Mapping of configuration keys to values.
        strategy: Strategy name, embedded in the file names.

    Returns:
        List of the paths written, or an empty list if saving failed.
    """
    os.makedirs('../data/processed', exist_ok=True)
    os.makedirs('../data/exported', exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    files_created = []

    try:
        # 1. Cleaned data (Parquet when possible, CSV otherwise)
        try:
            parquet_file = f'../data/processed/games_clean_{strategy}_{timestamp}.parquet'
            df_clean.to_parquet(parquet_file, index=False)
            files_created.append(parquet_file)
            print(f"‚úÖ Parquet: {parquet_file}")
        except Exception as e:
            print(f"‚ö†Ô∏è Parquet √©chou√© ({e}), utilisation CSV...")
            csv_fallback = f'../data/processed/games_clean_{strategy}_{timestamp}.csv'
            df_clean.to_csv(csv_fallback, index=False)
            files_created.append(csv_fallback)
            print(f"‚úÖ CSV: {csv_fallback}")

        # 2. Standard CSV export
        csv_file = f'../data/exported/games_clean_{strategy}_{timestamp}.csv'
        df_clean.to_csv(csv_file, index=False)
        files_created.append(csv_file)

        # 3. Configuration report. Real newlines are used: the previous
        # "\\n" wrote a literal backslash-n and collapsed the report onto a
        # single line.
        config_file = f'../data/exported/config_{strategy}_{timestamp}.txt'
        report_lines = [
            f"Configuration {strategy} - {timestamp}\n",
            "=" * 40 + "\n",
        ]
        report_lines.extend(f"{key}: {value}\n" for key, value in config.items())
        report_lines.append(f"\nR√©sultats:\n")
        report_lines.append(f"Parties nettoy√©es: {len(df_clean):,}\n")
        # Explicit encoding: the report contains non-ASCII text, and the
        # platform default encoding is not guaranteed to handle it.
        with open(config_file, 'w', encoding='utf-8') as f:
            f.writelines(report_lines)
        files_created.append(config_file)

        print("üíæ SAUVEGARDE R√âUSSIE")
        print("=" * 25)
        for file in files_created:
            size_mb = os.path.getsize(file) / 1024**2
            print(f"‚úÖ {file} ({size_mb:.1f} MB)")

        return files_created

    except Exception as e:
        print(f"‚ùå Erreur sauvegarde: {e}")
        return []

# Save the results (only when cleaning left at least one game)
if len(df_clean) > 0:
    saved_files = save_results(df_clean, config, STRATEGY)

    # Final recap. "\n" is a real newline here; the previous "\\n" printed a
    # literal backslash-n in front of the message.
    print(f"\nüéâ NETTOYAGE TERMIN√â!")
    print(f"üìä {len(df_clean):,} parties pr√™tes pour l'analyse")
    print(f"üìÅ {len(saved_files)} fichiers sauvegard√©s")
    print(f"üöÄ Pr√™t pour les analyses d'ouvertures!")
else:
    print("‚ùå Aucune donn√©e √† sauvegarder")

✅ Parquet: ../data/processed/games_clean_conservative_20250607_160448.parquet
💾 SAUVEGARDE RÉUSSIE
✅ ../data/processed/games_clean_conservative_20250607_160448.parquet (0.9 MB)
✅ ../data/exported/games_clean_conservative_20250607_160448.csv (9.7 MB)
✅ ../data/exported/config_conservative_20250607_160448.txt (0.0 MB)
\n🎉 NETTOYAGE TERMINÉ!
📊 49,360 parties prêtes pour l'analyse
📁 3 fichiers sauvegardés
🚀 Prêt pour les analyses d'ouvertures!


In [30]:
# Final summary of the notebook run.
# The "\n" prefixes below are real newlines; the previous "\\n" printed a
# literal backslash-n at the start of those lines.
print("üìã R√âSUM√â FINAL")
print("=" * 20)

if len(df_clean) > 0:
    print(f"‚úÖ Dataset nettoy√© disponible: df_clean")
    print(f"üìä Parties: {len(df_clean):,}")

    if 'main_opening' in df_clean.columns:
        print(f"üìö Ouvertures: {df_clean['main_opening'].nunique():,}")
        print(f"üîù Plus populaire: {df_clean['main_opening'].mode()[0]}")

    if 'avg_elo' in df_clean.columns:
        print(f"üéØ ELO moyen: {df_clean['avg_elo'].mean():.0f}")
        print(f"üìà Plage ELO: {df_clean['avg_elo'].min():.0f} - {df_clean['avg_elo'].max():.0f}")

    if 'time_control' in df_clean.columns:
        dominant_tc = df_clean['time_control'].mode()[0]
        print(f"‚è±Ô∏è Contr√¥le dominant: {dominant_tc}")

    print(f"\nüöÄ PROCHAINES √âTAPES:")
    print(f"1. Analyser les ouvertures par niveau ELO")
    print(f"2. Calculer les winrates par ouverture")
    print(f"3. Identifier les tendances temporelles")
    print(f"4. Cr√©er des visualisations avanc√©es")

    # Variables left available for the following cells
    print(f"\nüì¶ Variables disponibles:")
    print(f"   df_raw   - Donn√©es brutes")
    print(f"   df_clean - Donn√©es nettoy√©es")
    print(f"   config   - Configuration utilis√©e")

else:
    print(f"‚ùå √âchec du nettoyage")
    print(f"üí° Essayez une configuration moins stricte:")
    print(f"   STRATEGY = 'conservative'")

# NOTE(review): the duration below is a fixed string, not a measurement.
print(f"\n‚è∞ Temps total: quelques secondes")
print(f"üéØ Notebook synth√©tique termin√©!")

📋 RÉSUMÉ FINAL
✅ Dataset nettoyé disponible: df_clean
📊 Parties: 49,360
📚 Ouvertures: 192
🔝 Plus populaire: Sicilian Defense
🎯 ELO moyen: 1611
📈 Plage ELO: 902 - 2351
⏱️ Contrôle dominant: Blitz
\n🚀 PROCHAINES ÉTAPES:
1. Analyser les ouvertures par niveau ELO
2. Calculer les winrates par ouverture
3. Identifier les tendances temporelles
4. Créer des visualisations avancées
\n📦 Variables disponibles:
   df_raw   - Données brutes
   df_clean - Données nettoyées
   config   - Configuration utilisée
\n⏰ Temps total: quelques secondes
🎯 Notebook synthétique terminé!
