In [6]:
# ====================================
# OPTA API - EXTRACTOR MA2, MA3 Y MA12 (CON SELECCIÓN DE TEMPORADA)
# Modificado para incluir selección de stage/temporada
# ====================================

import json
import numpy as np
import pandas as pd
import hashlib
import requests
import time
import os
from datetime import datetime

In [7]:
# ====================================
# DICCIONARIO DE EVENT TYPES
# ====================================
# Lookup table from numeric event type ID to a human-readable event name.
# The ID sequence has gaps (for example 9, 26, 31, 33, 35, 46, 48, 62, 66, 77 and
# 78 are absent), so prefer EVENT_TYPE_MAPPING.get(type_id, <fallback>) over
# direct indexing when the ID comes from external data.
# NOTE(review): presumably used to fill an 'Event Name' column from an event's
# typeId — confirm at the usage site.
EVENT_TYPE_MAPPING = {
    1: "Pass",
    2: "Offside Pass", 
    3: "Take On",
    4: "Foul",
    5: "Out",
    6: "Corner Awarded",
    7: "Tackle",
    8: "Interception",
    10: "Save",
    11: "Claim",
    12: "Clearance",
    13: "Miss",
    14: "Post",
    15: "Attempt Saved",
    16: "Goal",
    17: "Card",
    18: "Player off",
    19: "Player on",
    20: "Player retired",
    21: "Player returns",
    22: "Player becomes goalkeeper",
    23: "Goalkeeper becomes player",
    24: "Condition change",
    25: "Official change",
    27: "Start delay",
    28: "End delay",
    29: "Temporary stop",
    30: "End",
    32: "Start",
    34: "Team set up",
    36: "Player changed Jersey number",
    37: "Collection End",
    38: "Temp Goal",
    39: "Temp Attempt",
    40: "Formation change",
    41: "Punch",
    42: "Good skill",
    43: "Deleted event",
    44: "Aerial",
    45: "Challenge",
    47: "Rescinded card",
    49: "Ball recovery",
    50: "Dispossessed",
    51: "Error",
    52: "Keeper pick-up",
    53: "Cross not claimed",
    54: "Smother",
    55: "Offside provoked",
    56: "Shield ball opp",
    57: "Foul throw-in",
    58: "Penalty faced",
    59: "Keeper Sweeper",
    60: "Chance missed",
    61: "Ball touch",
    63: "Temp Save",
    64: "Resume",
    65: "Contentious referee decision",
    67: "50/50",
    68: "Referee Drop Ball",
    69: "Failed to Block",
    70: "Injury Time Announcement",
    71: "Coach Setup",
    72: "Caught Offside",
    73: "Other Ball Contact",
    74: "Blocked Pass",
    75: "Delayed start",
    76: "Early end",
    79: "Coverage interruption",
    80: "Drop of Ball",
    81: "Obstacle",
    82: "Control",
    83: "Attempted tackle",
    84: "Deleted After Review"
}

In [8]:
# ====================================
# CONFIGURACIÓN
# ====================================

# API credentials. They can be supplied through the environment variables
# OPTA_OUTLET_API_KEY / OPTA_SECRET_KEY so the real values do not have to live in
# the notebook; the literals are kept only as a fallback so existing runs keep
# working unchanged.
# NOTE(review): secrets stored in a notebook should be rotated, and the fallback
# literals removed once the environment variables are in place.
outletApiKey = os.environ.get('OPTA_OUTLET_API_KEY', '10lthl3y5chwn1m0fa4mfg3bqy')
secretKey = os.environ.get('OPTA_SECRET_KEY', '1u3x3eovxa0vh1lwmutbygq8xn')

# Delay in seconds. NOTE(review): presumably the pause between consecutive API
# calls — confirm where it is consumed.
delay_seconds = 30

In [9]:
# ====================================
# FUNCIONES CORE
# ====================================

def requestHeaders(timeout=30):
    """Request an OAuth access token and return the headers for feed calls.

    The token endpoint is called with the SHA-512 hex digest of
    ``outletApiKey + timestamp + secretKey`` in the ``Authorization`` header and
    the same millisecond timestamp in the ``Timestamp`` header.

    Args:
        timeout: Seconds to wait for the OAuth server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        dict: ``{'Authorization': 'Bearer <access_token>'}``.

    Raises:
        requests.HTTPError: The token endpoint answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
        KeyError: The JSON answer does not contain ``access_token``.
    """
    timestamp = int(round(time.time() * 1000))

    post_url = f'https://oauth.performgroup.com/oauth/token/{outletApiKey}?_fmt=json&_rt=b'

    # The hash must be built from the exact timestamp sent in the header.
    key = str.encode(outletApiKey + str(timestamp) + secretKey)
    unique_hash = hashlib.sha512(key).hexdigest()

    oauthHeaders = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Authorization': f'Basic {unique_hash}',
        'Timestamp': str(timestamp)
    }

    BODY = {
        'grant_type': 'client_credentials',
        'scope': 'b2b-feeds-auth'
    }

    response = requests.post(post_url, data=BODY, headers=oauthHeaders, timeout=timeout)
    # Surface authentication/server failures as an HTTP error instead of an
    # unrelated KeyError when 'access_token' is missing from an error payload.
    response.raise_for_status()
    access_token = response.json()['access_token']
    return {'Authorization': f'Bearer {access_token}'}

def get_available_stages_method1():
    """Method 1: list stages from the unfiltered Tournament Calendar feed.

    Requests the first page (100 entries) of the tournament calendar and collects
    one entry per distinct stage ID; callers filter by competition afterwards.

    Returns:
        dict: stage ID -> {'name', 'competition', 'competition_id', 'start_date',
        'end_date'}. An empty dict when the request or parsing fails (the error
        is printed, not raised).
    """
    endpoint = f'https://api.performfeeds.com/soccerdata/tournamentcalendar/{outletApiKey}/'
    query = dict(_fmt="json", _pgSz="100", _pgNm="1", _rt="b")

    try:
        print("   üîÑ Intentando con Tournament Calendar b√°sico...")
        reply = requests.get(endpoint, headers=requestHeaders(), params=query)
        reply.raise_for_status()
        payload = reply.json()

        found = {}
        for calendar in payload.get('tournamentCalendar', []):
            stage = calendar.get('stage', {})
            competition = calendar.get('competition', {})
            stage_id = stage.get('id')
            entry = {
                'name': stage.get('name', 'N/A'),
                'competition': competition.get('name', 'N/A'),
                'competition_id': competition.get('id', 'N/A'),
                'start_date': stage.get('startDate', 'N/A'),
                'end_date': stage.get('endDate', 'N/A')
            }
            # Keep only the first occurrence of each stage ID.
            if stage_id:
                found.setdefault(stage_id, entry)

        return found
    except Exception as e:
        print(f"   ‚ùå Error con Tournament Calendar: {e}")
        return {}

def get_available_stages_method2():
    """Method 2: derive stages from the unfiltered MA1 (match) feed.

    Requests the first page (100 matches) and collects one entry per distinct
    stage ID found in each match's ``matchInfo``.

    Returns:
        dict: stage ID -> {'name', 'competition', 'competition_id', 'start_date',
        'end_date'}. An empty dict when the request or parsing fails (the error
        is printed, not raised).
    """
    endpoint = f'https://api.performfeeds.com/soccerdata/match/{outletApiKey}/'
    query = {
        "_fmt": "json",
        "_pgSz": "100",
        "_pgNm": "1",
        "_rt": "b",
    }

    try:
        print("   üîÑ Intentando con MA1 b√°sico...")
        reply = requests.get(endpoint, headers=requestHeaders(), params=query)
        reply.raise_for_status()
        fixtures = reply.json().get('match', [])

        by_stage = {}
        for fixture in fixtures:
            info = fixture.get('matchInfo', {})
            competition = info.get('competition', {})
            stage = info.get('stage', {})
            stage_id = stage.get('id')
            stage_name = stage.get('name', 'N/A')
            comp_name = competition.get('name', 'N/A')
            comp_id = competition.get('id', 'N/A')

            # Skip matches without a stage ID and stages already recorded.
            if not stage_id or stage_id in by_stage:
                continue

            by_stage[stage_id] = dict(
                name=stage_name,
                competition=comp_name,
                competition_id=comp_id,
                start_date=stage.get('startDate', 'N/A'),
                end_date=stage.get('endDate', 'N/A'),
            )

        return by_stage
    except Exception as e:
        print(f"   ‚ùå Error con MA1 b√°sico: {e}")
        return {}

def get_known_stages():
    """Method 3: hard-coded stages used as a fallback when the API lookups fail.

    Returns:
        dict: stage ID -> {'name', 'competition', 'start_date', 'end_date'}.
        Unlike the API-based methods, entries carry no 'competition_id' key.
    """
    known_stages = {
        # La Liga 2024-25
        '4xu8dwf3cotp5qu0ddi50wkyc': {
            'name': 'La Liga 2024-25',
            'competition': 'La Liga',
            'start_date': '2024-08-01',
            'end_date': '2025-05-31'
        },
        # Premier League 2024-25 (example)
        '2kwbbcootiqqgmrzs6o5inle5': {
            'name': 'Premier League 2024-25',
            'competition': 'Premier League',
            'start_date': '2024-08-01',
            'end_date': '2025-05-31'
        }
    }
    # The original built the dict but never returned it, so callers received None.
    return known_stages

def get_existing_match_ids():
    """Collect the Match IDs already stored in the local parquet outputs.

    Looks for a fixed list of parquet files inside ``datos_opta_parquet`` and
    unions the distinct values of their 'Match ID' column. Missing files, empty
    files, files without that column and unreadable files are reported on stdout
    and skipped.

    Returns:
        set: Match IDs found across all readable files (empty when none exist).
    """
    folder = "datos_opta_parquet"
    candidate_files = (
        'player_stats.parquet',
        'team_stats.parquet',
        'player_xg_stats.parquet',
        'xg_events.parquet',
        'abp_events.parquet',
        'team_officials.parquet',
    )
    seen_ids = set()

    print("üîç Revisando archivos existentes...")

    for filename in candidate_files:
        filepath = f"{folder}/{filename}"
        if not os.path.exists(filepath):
            print(f"   üìÑ {filename}: no existe")
            continue

        try:
            frame = pd.read_parquet(filepath)
            if frame.empty or 'Match ID' not in frame.columns:
                print(f"   üìÑ {filename}: archivo vac√≠o o sin columna Match ID")
            else:
                ids_in_file = set(frame['Match ID'].unique())
                seen_ids.update(ids_in_file)
                print(f"   üìÑ {filename}: {len(ids_in_file)} partidos √∫nicos")
        except Exception as e:
            # A corrupt/unreadable file must not abort the scan of the others.
            print(f"   ‚ùå Error leyendo {filename}: {e}")

    if seen_ids:
        print(f"   ‚úÖ Total Match IDs existentes: {len(seen_ids)}")
    else:
        print("   üìÅ No se encontraron datos previos - descarga completa")

    return seen_ids

def filter_new_matches(matches_df, existing_match_ids):
    """Keep only the matches whose 'Match ID' has not been processed yet.

    Args:
        matches_df: DataFrame of candidate matches with a 'Match ID' column.
        existing_match_ids: Collection of Match IDs already processed.

    Returns:
        DataFrame: ``matches_df`` itself when it is empty or when there are no
        existing IDs; otherwise a copy containing only the unseen matches. A
        summary of the counts is printed.
    """
    if matches_df.empty:
        return matches_df

    if not existing_match_ids:
        print(f"   üÜï Todos los partidos son nuevos: {len(matches_df)}")
        return matches_df

    already_done = matches_df['Match ID'].isin(existing_match_ids)
    pending = matches_df[~already_done].copy()

    n_total = len(matches_df)
    n_pending = len(pending)

    print(f"   üìä Total partidos encontrados: {n_total}")
    print(f"   ‚úÖ Ya procesados: {n_total - n_pending}")
    print(f"   üÜï Nuevos por procesar: {n_pending}")

    if n_pending == 0:
        print("   üéâ ¬°Todos los partidos ya est√°n procesados!")

    return pending

def debug_api_calls():
    """Debug helper: fire a few sample API calls and print what comes back.

    Tries the MA1 (match) and Tournament Calendar endpoints, each with and
    without a ``comp`` filter, printing the HTTP status, the number of items and
    the competition/stage of the first item. Exceptions are printed, not raised.
    The final prompt is only printed; no input is read.
    """
    print("üîß MODO DEBUG - PROBANDO LLAMADAS A LA API")

    match_url = f'https://api.performfeeds.com/soccerdata/match/{outletApiKey}/'
    calendar_url = f'https://api.performfeeds.com/soccerdata/tournamentcalendar/{outletApiKey}/'

    tests = [
        {
            'name': 'MA1 B√°sico (sin filtros)',
            'url': match_url,
            'params': {"_fmt": "json", "_pgSz": "5", "_rt": "b"},
        },
        {
            'name': 'MA1 con Competition Filter',
            'url': match_url,
            'params': {"_fmt": "json", "_pgSz": "5", "_rt": "b", "comp": "15"},
        },
        {
            'name': 'Tournament Calendar (OT2)',
            'url': calendar_url,
            'params': {"_fmt": "json", "_pgSz": "5", "_rt": "b"},
        },
        {
            'name': 'OT2 con Competition Filter',
            'url': calendar_url,
            'params': {"_fmt": "json", "_pgSz": "5", "_rt": "b", "comp": "15"},
        },
    ]

    def _show_example(holder):
        # Print competition and stage names of the object that carries them.
        stage = holder.get('stage', {})
        comp = holder.get('competition', {})
        print(f"   üèÜ Ejemplo - Comp: {comp.get('name', 'N/A')}, Stage: {stage.get('name', 'N/A')}")

    for test in tests:
        print(f"\nüß™ Probando: {test['name']}")
        try:
            response = requests.get(test['url'], headers=requestHeaders(), params=test['params'])
            print(f"   Status: {response.status_code}")

            if response.status_code != 200:
                print(f"   ‚ùå Error {response.status_code}: {response.text[:200]}...")
                continue

            data = response.json()
            print("   ‚úÖ √âxito - Datos obtenidos")

            # Show the basic structure of whichever feed answered.
            if 'match' in data:
                matches = data.get('match', [])
                print(f"   üìä Partidos encontrados: {len(matches)}")
                if matches:
                    _show_example(matches[0].get('matchInfo', {}))
            elif 'tournamentCalendar' in data:
                tournaments = data.get('tournamentCalendar', [])
                print(f"   üìÖ Tournament Calendars encontrados: {len(tournaments)}")
                if tournaments:
                    _show_example(tournaments[0])

        except Exception as e:
            print(f"   üí• Excepci√≥n: {str(e)[:200]}...")

    print("\nüîß ¬øQuieres probar alguna llamada espec√≠fica? (s/n): ", end="")

def process_matchTeamStats_data(match_id):
    """MA2 team stats for one match: one row per team, one column per stat type.

    Calls the ``matchstats`` feed with ``detailed=yes`` for ``match_id``, reads
    every entry of ``liveData.lineUp`` and pivots its ``stat`` list so each stat
    type becomes a float column. Home/away flags, team names and half-time /
    full-time scores come from ``get_home_away_info``.

    Args:
        match_id: Fixture ID sent as the ``fx`` request parameter.

    Returns:
        DataFrame: pivoted team stats with missing values filled with 0, or an
        empty DataFrame when the API answers with a non-200 status or the match
        has no team stats.
    """
    requestParameters = {
        "_fmt": "json",
        "detailed": "yes",
        "fx": match_id,
        "_rt": "b"
    }

    sdapi_get_url = f'https://api.performfeeds.com/soccerdata/matchstats/{outletApiKey}/'
    response = requests.get(
        sdapi_get_url,
        headers=requestHeaders(),
        params=requestParameters
    )

    if response.status_code != 200:
        print(f"Error: API request failed with status code {response.status_code}")
        print(response.text)
        return pd.DataFrame()
    data = response.json()

    match_info = data.get('matchInfo', {})
    competition_info = match_info.get('competition', {})
    stage_info = match_info.get('stage', {})
    live_data = data.get('liveData', {})
    line_ups = live_data.get('lineUp', [])

    home_away_info = get_home_away_info(match_info, live_data)

    # Columns that identify the row; everything else after the pivot is a stat.
    metadata_cols = ['Team ID', 'Team Name', 'Team Position', 'Is Home', 'Is Away',
                     'HT Home Score', 'HT Away Score', 'FT Home Score', 'FT Away Score',
                     'Match ID', 'Competition ID', 'Competition Name', 'Week', 'Stage ID', 'Stage Name']

    team_stats_data = []

    for team_stats in line_ups:
        # .get keeps a partial line-up (no contestantId / no stat list) from
        # aborting the whole match with a KeyError, matching the player extractor.
        team_id = team_stats.get('contestantId')
        team_entry = home_away_info['team_mapping'].get(team_id, {})
        team_name = team_entry.get('name', 'N/A')
        team_position = team_entry.get('position', 'N/A')

        is_home = team_id == home_away_info['home_team_id']
        is_away = team_id == home_away_info['away_team_id']

        for stat in team_stats.get('stat', []):
            team_stats_data.append({
                'Match ID': match_info.get('id', 'N/A'),
                'Competition ID': competition_info.get('id', 'N/A'),
                'Competition Name': competition_info.get('name', 'N/A'),
                'Week': match_info.get('week', 'N/A'),
                'Stage ID': stage_info.get('id', 'N/A'),
                'Stage Name': stage_info.get('name', 'N/A'),
                'Team ID': team_id,
                'Team Name': team_name,
                'Team Position': team_position,
                'Is Home': is_home,
                'Is Away': is_away,
                'HT Home Score': home_away_info['ht_home'],
                'HT Away Score': home_away_info['ht_away'],
                'FT Home Score': home_away_info['ft_home'],
                'FT Away Score': home_away_info['ft_away'],
                'Stat Type': stat.get('type', 'N/A'),
                'Total': stat.get('value', 0)
            })

    df_team_stats = pd.DataFrame(team_stats_data)
    if not df_team_stats.empty:
        # Long -> wide: one column per stat type, one row per team.
        df_team_stats = df_team_stats.pivot(
            index=metadata_cols,
            columns='Stat Type',
            values='Total'
        ).reset_index()
        df_team_stats = df_team_stats.fillna(0)

        # Stat values may arrive as strings; coerce them to float (invalid -> 0).
        stat_cols = [col for col in df_team_stats.columns if col not in metadata_cols]
        for col in stat_cols:
            df_team_stats[col] = pd.to_numeric(df_team_stats[col], errors='coerce').fillna(0).astype(float)

    return df_team_stats
    

# Función auxiliar para testing rápido
def test_stage_access():
    """Quick smoke test: can the MA1 feed be queried by a known stage ID?

    Requests up to five matches of a hard-coded stage, prints details of the
    first one, then repeats the request restricted to week 1.

    Returns:
        bool: True when the first request answers 200 with at least one match;
        False otherwise (non-200 status, no matches, or any exception, which is
        printed rather than raised).
    """
    print("üîç TEST R√ÅPIDO DE ACCESO A STAGES")

    # Stage ID known to work.
    known_stage = "4xu8dwf3cotp5qu0ddi50wkyc"

    print(f"\nüß™ Probando MA1 con stage conocido: {known_stage[:12]}...")

    base_params = {
        "_fmt": "json",
        "_pgSz": "5",
        "stg": known_stage,
        "live": "yes",
        "_rt": "b"
    }
    endpoint = f'https://api.performfeeds.com/soccerdata/match/{outletApiKey}/'

    try:
        response = requests.get(endpoint, headers=requestHeaders(), params=base_params)
        print(f"Status: {response.status_code}")

        if response.status_code != 200:
            print(f"‚ùå Error {response.status_code}: {response.text[:200]}")
            return False

        matches = response.json().get('match', [])
        print(f"‚úÖ √âxito! Partidos encontrados: {len(matches)}")

        if not matches:
            return False

        # Describe the first match returned.
        first_match = matches[0]
        match_info = first_match.get('matchInfo', {})
        stage = match_info.get('stage', {})
        comp = match_info.get('competition', {})
        week = match_info.get('week', 'N/A')

        match_details = first_match.get('liveData', {}).get('matchDetails', {})
        match_status = match_details.get('matchStatus', 'N/A')

        print("üìä Ejemplo partido:")
        print(f"   üèÜ Competici√≥n: {comp.get('name', 'N/A')}")
        print(f"   üóìÔ∏è Stage: {stage.get('name', 'N/A')}")
        print(f"   üìÖ Jornada: {week}")
        print(f"   ‚öΩ Status: {match_status}")

        # Second probe: same stage, restricted to a specific week.
        print("\nüß™ Probando MA1 con stage + jornada espec√≠fica...")
        week_params = {**base_params, "week": "1"}

        week_response = requests.get(endpoint, headers=requestHeaders(), params=week_params)
        if week_response.status_code == 200:
            week_matches = week_response.json().get('match', [])
            print(f"   ‚úÖ Jornada 1: {len(week_matches)} partidos encontrados")
        else:
            print(f"   ‚ùå Error con jornada espec√≠fica: {week_response.status_code}")

        return True

    except Exception as e:
        print(f"üí• Error: {e}")

    return False

def _filter_stages_by_competition(stages, target_names):
    """Return the stages whose competition name matches one of ``target_names``.

    Matching is case-insensitive substring containment in either direction.
    Stages with a missing/empty competition name never match: an empty string is
    a substring of every target and would otherwise pass every filter.
    """
    targets = [name.lower() for name in target_names]
    matched = {}
    for stage_id, stage_info in stages.items():
        comp_name = str(stage_info.get('competition') or '').lower()
        if not comp_name:
            continue
        if any(target in comp_name or comp_name in target for target in targets):
            matched[stage_id] = stage_info
    return matched


def get_available_stages(competition_id):
    """Get the available stages (seasons), trying three sources in order.

    1. Tournament Calendar feed (``get_available_stages_method1``).
    2. MA1 match feed (``get_available_stages_method2``).
    3. Hard-coded stages (``get_known_stages``).

    The first source that yields anything is used. Its stages are filtered by the
    competition names associated with ``competition_id``; when the ID is unknown
    or the filter matches nothing, all stages from that source are returned.

    Args:
        competition_id: Competition key ('13', '15', '16', '17', '18', '19').
            Integers are accepted and converted to their string form.

    Returns:
        dict: stage ID -> stage info dict (possibly empty).
    """
    print("üîÑ Obteniendo temporadas disponibles...")

    # Single source of truth for the id -> acceptable competition names mapping
    # (the same table used to be repeated for each method).
    competition_names = {
        '13': ['Premier League'],
        '15': ['La Liga', 'Primera Divisi√≥n'],  # several possible names
        '16': ['Serie A'],
        '17': ['Bundesliga'],
        '18': ['Ligue 1'],
        '19': ['Champions League'],
    }
    target_comps = competition_names.get(str(competition_id), [])

    # Method 1: basic Tournament Calendar
    stages = get_available_stages_method1()
    if stages:
        print(f"   ‚úÖ Encontradas {len(stages)} temporadas con Tournament Calendar")
        filtered_stages = _filter_stages_by_competition(stages, target_comps)
        if filtered_stages:
            print(f"   üéØ Filtradas {len(filtered_stages)} temporadas para la competici√≥n")
            return filtered_stages
        # Could not filter: return everything.
        return stages

    # Method 2: basic MA1
    stages = get_available_stages_method2()
    if stages:
        print("   üîç DEBUG - Todas las temporadas encontradas:")
        for stage_id, stage_info in stages.items():
            print(f"      {stage_id[:12]}... - {stage_info.get('name', 'N/A')} - {stage_info.get('competition', 'N/A')}")

        filtered_stages = _filter_stages_by_competition(stages, target_comps)
        if filtered_stages:
            print(f"   üéØ Filtradas {len(filtered_stages)} temporadas para la competici√≥n")
            return filtered_stages
        return stages

    # Method 3: known stages
    print("   üîÑ Usando stages conocidos como fallback...")
    # Guard against a None result so the fallback can never raise on .items()
    # or hand None back to the caller.
    known_stages = get_known_stages() or {}

    filtered_stages = _filter_stages_by_competition(known_stages, target_comps)
    if filtered_stages:
        print(f"   ‚úÖ Usando {len(filtered_stages)} temporadas conocidas")
        return filtered_stages

    return known_stages

def get_match_ids_advanced(competition_levels="13,15", max_matches=50, specific_week=None, stage_id=None):
    """Fetch finished matches from the MA1 feed as a DataFrame.

    Only the first page is requested, with ``max_matches`` as the page size. The
    ``status`` and ``cvlv`` request filters are deliberately not sent (they were
    removed because they produced 400/403 answers), so ``competition_levels`` is
    currently unused and finished matches are selected client-side from
    ``liveData.matchDetails.matchStatus``.

    Args:
        competition_levels: Unused; kept for interface compatibility.
        max_matches: Page size of the request.
        specific_week: Optional week sent as the ``week`` parameter.
        stage_id: Optional stage sent as the ``stg`` parameter.

    Returns:
        DataFrame: one row per finished match (Match ID, Competition,
        Competition ID, Stage ID, Stage Name, Date, Week, Match Status, Teams);
        empty when nothing qualifies or when any error occurs (printed).
    """
    query = {
        "_fmt": "json",
        "_pgSz": str(max_matches),
        "_pgNm": "1",
        "live": "yes",
        "_rt": "b"
    }
    if specific_week:
        query["week"] = str(specific_week)
    if stage_id:
        # 'stg' is the stage filter of the MA1 feed.
        query["stg"] = str(stage_id)

    endpoint = f'https://api.performfeeds.com/soccerdata/match/{outletApiKey}/'
    finished_statuses = ('played', 'finished', 'ft', 'final')

    try:
        reply = requests.get(endpoint, headers=requestHeaders(), params=query)
        reply.raise_for_status()

        rows = []
        for match in reply.json().get('match', []):
            match_info = match.get('matchInfo', {})
            match_id = match_info.get('id')
            if not match_id:
                continue

            competition = match_info.get('competition', {})
            stage = match_info.get('stage', {})
            teams = [
                {'id': c.get('id'), 'name': c.get('name'), 'code': c.get('code')}
                for c in match_info.get('contestant', [])
            ]

            match_status = match.get('liveData', {}).get('matchDetails', {}).get('matchStatus', '')

            # Keep finished matches only.
            if match_status.lower() not in finished_statuses:
                continue

            rows.append({
                'Match ID': match_id,
                'Competition': competition.get('name', 'N/A'),
                'Competition ID': competition.get('id', 'N/A'),
                'Stage ID': stage.get('id', 'N/A'),
                'Stage Name': stage.get('name', 'N/A'),
                'Date': match_info.get('date', 'N/A'),
                'Week': match_info.get('week', 'N/A'),
                'Match Status': match_status,
                'Teams': teams
            })

        return pd.DataFrame(rows)
    except Exception as e:
        print(f"Error obteniendo partidos: {e}")
        return pd.DataFrame()

# ====================================
# EXTRACTORES - COPIADOS DE LA NOTEBOOK QUE FUNCIONA
# ====================================

def process_matchPlayerStats_data(match_id):
    """MA2 player stats and team officials for one match.

    Calls the ``matchstats`` feed with ``detailed=yes`` for ``match_id`` and walks
    ``liveData.lineUp``: every ``player`` becomes a row with its identity fields
    plus one column per stat type, and every ``teamOfficial`` becomes a row in a
    second frame. Home/away flags, team names and half-time / full-time scores
    come from ``get_home_away_info``.

    Args:
        match_id: Fixture ID sent as the ``fx`` request parameter.

    Returns:
        tuple: ``(df_player_stats, df_team_officials)``. Player stat columns are
        floats with missing values as 0; missing official fields are 'N/A'. Two
        empty DataFrames when the API answers with a non-200 status.
    """
    requestParameters = {
        "_fmt": "json",
        "detailed": "yes",
        "fx": match_id,
        "_rt": "b"
    }

    sdapi_get_url = f'https://api.performfeeds.com/soccerdata/matchstats/{outletApiKey}/'
    response = requests.get(
        sdapi_get_url,
        headers=requestHeaders(),
        params=requestParameters
    )

    if response.status_code != 200:
        print(f"Error: API request failed with status code {response.status_code}")
        print(response.text)
        return pd.DataFrame(), pd.DataFrame()
    data = response.json()

    match_info = data.get('matchInfo', {})
    competition_info = match_info.get('competition', {})
    stage_info = match_info.get('stage', {})
    live_data = data.get('liveData', {})
    line_ups = live_data.get('lineUp', [])

    home_away_info = get_home_away_info(match_info, live_data)

    # Match-level fields shared by every player and official row.
    match_fields = {
        'Match ID': match_info.get('id', 'N/A'),
        'Competition ID': competition_info.get('id', 'N/A'),
        'Competition Name': competition_info.get('name', 'N/A'),
        'Week': match_info.get('week', 'N/A'),
        'Stage ID': stage_info.get('id', 'N/A'),
        'Stage Name': stage_info.get('name', 'N/A'),
    }

    player_stats_data = []
    team_officials_data = []

    for line_up in line_ups:
        team_id = line_up.get('contestantId')
        team_entry = home_away_info['team_mapping'].get(team_id, {})
        team_name = team_entry.get('name', 'N/A')

        players = line_up.get('player', [])
        if players:
            team_position = team_entry.get('position', 'N/A')
            is_home = team_id == home_away_info['home_team_id']
            is_away = team_id == home_away_info['away_team_id']

        for player in players:
            player_entry = {
                **match_fields,
                'Team ID': team_id,
                'Team Name': team_name,
                'Team Position': team_position,
                'Is Home': is_home,
                'Is Away': is_away,
                'HT Home Score': home_away_info['ht_home'],
                'HT Away Score': home_away_info['ht_away'],
                'FT Home Score': home_away_info['ft_home'],
                'FT Away Score': home_away_info['ft_away'],
                'Player ID': player.get('playerId', 'N/A'),
                'First Name': player.get('firstName', 'N/A'),
                'Last Name': player.get('lastName', 'N/A'),
                'Match Name': player.get('matchName', 'N/A'),
                'Shirt Number': player.get('shirtNumber', 'N/A'),
                'Position': player.get('position', 'N/A'),
                'Position Side': player.get('positionSide', 'N/A'),
                'Formation Place': player.get('formationPlace', 'N/A'),
            }

            # Each stat type becomes its own column.
            for stat in player.get('stat', []):
                player_entry[stat.get('type', '')] = stat.get('value', 0)

            player_stats_data.append(player_entry)

        for official in line_up.get('teamOfficial', []):
            team_officials_data.append({
                **match_fields,
                'Team ID': team_id,
                'Team Name': team_name,
                'Official ID': official.get('id', 'N/A'),
                'First Name': official.get('firstName', 'N/A'),
                'Last Name': official.get('lastName', 'N/A'),
                'Short First Name': official.get('shortFirstName', 'N/A'),
                # The lower-case 'shortlastName' key looks like a typo next to
                # 'shortFirstName'; read the camelCase key first and keep the old
                # spelling as a fallback so no payload shape loses data.
                'Short Last Name': official.get('shortLastName', official.get('shortlastName', 'N/A')),
                'Known Name': official.get('knownName', 'N/A'),
                'Type': official.get('type', 'N/A')
            })

    # Players lacking a given stat get 0; officials lacking a field get 'N/A'.
    df_player_stats = pd.DataFrame(player_stats_data).fillna(0)
    df_team_officials = pd.DataFrame(team_officials_data).fillna('N/A')

    non_stat_cols = ['Match ID', 'Competition ID', 'Competition Name', 'Week', 'Stage ID', 'Stage Name',
                     'Team ID', 'Team Name', 'Team Position', 'Is Home', 'Is Away',
                     'HT Home Score', 'HT Away Score', 'FT Home Score', 'FT Away Score',
                     'Player ID', 'First Name', 'Last Name', 'Match Name',
                     'Shirt Number', 'Position', 'Position Side', 'Formation Place']

    if not df_player_stats.empty:
        stat_cols = [col for col in df_player_stats.columns if col not in non_stat_cols]

        # Stat values may arrive as strings; coerce to numbers (invalid -> 0) and
        # store every stat column as float.
        for col in stat_cols:
            df_player_stats[col] = pd.to_numeric(df_player_stats[col], errors='coerce').fillna(0)
        df_player_stats[stat_cols] = df_player_stats[stat_cols].astype(float)

    return df_player_stats, df_team_officials

def process_match_events_data(match_id):
    """Fetch the MA3 match-events feed for one match and flatten it to a DataFrame.

    Every event becomes one row holding match/competition/stage metadata,
    the team and score context returned by ``get_home_away_info``, the
    event's own fields, and one ``qualifier <id>`` column per qualifier ID
    seen anywhere in the match (``0`` when the event lacks that qualifier).

    Args:
        match_id: Fixture identifier, sent to the API as the ``fx`` parameter.

    Returns:
        pandas.DataFrame: One row per event. An empty DataFrame is returned
        when the API does not answer with HTTP 200 (status and body are
        printed in that case).
    """
    # API parameters
    requestParameters = {
        "_fmt": "json",
        "fx": match_id,
        "_rt": "b"
    }

    # GET request against the match-event endpoint
    sdapi_get_url = f'https://api.performfeeds.com/soccerdata/matchevent/{outletApiKey}/'
    response = requests.get(
        sdapi_get_url,
        headers=requestHeaders(),
        params=requestParameters
    )

    # Anything other than 200 is reported and yields an empty result
    if response.status_code != 200:
        print(f"Error: API request failed with status code {response.status_code}")
        print(response.text)
        return pd.DataFrame()

    data = response.json()

    # Extract the sections of the payload used below
    match_info = data.get('matchInfo', {})
    competition_info = match_info.get('competition', {})
    stage_info = match_info.get('stage', {})
    live_data = data.get('liveData', {})
    events = live_data.get('event', [])

    # Home/away team mapping and half-time/full-time scores
    home_away_info = get_home_away_info(match_info, live_data)
    team_mapping = home_away_info['team_mapping']

    def qualifier_key(qualifier):
        # One normalisation for both discovery and assignment, so a qualifier
        # without "qualifierId" maps to the same (empty) ID in both passes
        # instead of raising KeyError in the second one.
        return str(qualifier.get('qualifierId', ''))

    def qualifier_sort_key(qid):
        # Numeric IDs in ascending numeric order first, anything else after.
        numeric = qid.isdecimal()
        return (not numeric, int(qid) if numeric else 0, qid)

    # Collect every qualifier ID used in the match. Sorting makes the column
    # order reproducible; iterating the raw set would follow string hashing,
    # which changes from one interpreter run to the next.
    qualifier_ids = sorted(
        {qualifier_key(q) for event in events for q in event.get('qualifier', [])},
        key=qualifier_sort_key,
    )

    # Final column layout of the DataFrame
    columns = [
        'Match ID', 'Competition ID', 'Competition Name', 'Week', 'Stage ID', 'Stage Name',
        'EventId', 'timeStamp', 'contestantId', 'Team ID', 'Team Name', 'Team Position',
        'Is Home', 'Is Away', 'HT Home Score', 'HT Away Score', 'FT Home Score', 'FT Away Score',
        'periodId', 'timeMin', 'timeSec', 'playerId', 'playerName', 'typeId', 'Event Name',
        'outcome', 'x', 'y'
    ] + [f'qualifier {qid}' for qid in qualifier_ids]

    events_data = []

    for event in events:
        contestant_id = event.get('contestantId', None)
        team_entry = team_mapping.get(contestant_id, {}) if contestant_id else {}
        team_name = team_entry.get('name', 'N/A')
        team_position = team_entry.get('position', 'N/A')

        # An event without a contestant belongs to neither side, even when the
        # home/away IDs themselves are unknown (None == None must not count).
        has_team = contestant_id is not None
        is_home = has_team and contestant_id == home_away_info['home_team_id']
        is_away = has_team and contestant_id == home_away_info['away_team_id']

        type_id = event.get('typeId', None)
        event_name = EVENT_TYPE_MAPPING.get(type_id, 'Unknown Event') if type_id else 'Unknown Event'

        event_info = {
            'Match ID': match_info.get('id', 'N/A'),
            'Competition ID': competition_info.get('id', 'N/A'),
            'Competition Name': competition_info.get('name', 'N/A'),
            'Week': match_info.get('week', 'N/A'),
            'Stage ID': stage_info.get('id', 'N/A'),
            'Stage Name': stage_info.get('name', 'N/A'),
            'EventId': event.get('eventId', None),
            'typeId': type_id,
            'Event Name': event_name,
            'periodId': event.get('periodId', None),
            'timeMin': event.get('timeMin', None),
            'timeSec': event.get('timeSec', None),
            'contestantId': contestant_id,
            'Team ID': contestant_id,
            'Team Name': team_name,
            'Team Position': team_position,
            'Is Home': is_home,
            'Is Away': is_away,
            'HT Home Score': home_away_info['ht_home'],
            'HT Away Score': home_away_info['ht_away'],
            'FT Home Score': home_away_info['ft_home'],
            'FT Away Score': home_away_info['ft_away'],
            'playerId': event.get('playerId', None),
            'playerName': event.get('playerName', None),
            'outcome': event.get('outcome', None),
            'x': event.get('x', None),
            'y': event.get('y', None),
            'timeStamp': event.get('timeStamp', None),
        }

        # Every qualifier column defaults to 0 ("not present on this event")
        for qid in qualifier_ids:
            event_info[f'qualifier {qid}'] = 0

        # Overwrite with the values of the qualifiers this event carries.
        # NOTE(review): a qualifier without a "value" is stored as None here;
        # confirm downstream code can still tell it apart from an absent one.
        for q in event.get('qualifier', []):
            event_info[f'qualifier {qualifier_key(q)}'] = q.get('value', None)

        events_data.append(event_info)

    events_df = pd.DataFrame(events_data, columns=columns)
    return events_df

def process_xG_matchPlayerStats_data(match_id):
    """Fetch the MA12 expected-goals feed and tabulate player stats and team officials.

    Args:
        match_id: Fixture identifier, sent to the API as the ``fx`` parameter.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: ``(player_stats, officials)``.
        ``player_stats`` has one row per line-up player with the metadata
        columns followed by one float column per stat type found in the feed
        (missing or non-numeric values become 0). ``officials`` has one row
        per ``teamOfficial`` entry with missing values shown as ``'N/A'``.
        Two empty DataFrames are returned when the API does not answer with
        HTTP 200 (status and body are printed).
    """
    feed_url = f'https://api.performfeeds.com/soccerdata/matchexpectedgoals/{outletApiKey}/'
    response = requests.get(
        feed_url,
        headers=requestHeaders(),
        params={"_fmt": "json", "fx": match_id, "_rt": "b"}
    )

    # Guard clause: report the failure and hand back empty tables
    if response.status_code != 200:
        print(f"Error: API request failed with status code {response.status_code}")
        print(response.text)
        return pd.DataFrame(), pd.DataFrame()

    payload = response.json()
    match_info = payload.get('matchInfo', {})
    competition = match_info.get('competition', {})
    stage = match_info.get('stage', {})
    live_data = payload.get('liveData', {})
    line_ups = live_data.get('lineUp', [])

    # Home/away lookup; only the team-name mapping is used here
    home_away = get_home_away_info(match_info, live_data)

    def row_prefix(team_id, team_name):
        # Leading columns shared by player rows and official rows
        return {
            'Match ID': match_info.get('id', 'N/A'),
            'Competition ID': competition.get('id', 'N/A'),
            'Competition Name': competition.get('name', 'N/A'),
            'Week': match_info.get('week', 'N/A'),
            'Stage ID': stage.get('id', 'N/A'),
            'Stage Name': stage.get('name', 'N/A'),
            'Team ID': team_id,
            'Team Name': team_name,
        }

    player_rows = []
    official_rows = []

    for line_up in line_ups:
        team_id = line_up.get('contestantId')
        team_name = home_away['team_mapping'].get(team_id, {}).get('name', 'N/A')

        # One row per player: identity fields first, then every stat by type
        for player in line_up.get('player', []):
            player_row = row_prefix(team_id, team_name)
            player_row.update({
                'Player ID': player.get('playerId', 'N/A'),
                'First Name': player.get('firstName', 'N/A'),
                'Last Name': player.get('lastName', 'N/A'),
                'Match Name': player.get('matchName', 'N/A'),
                'Shirt Number': player.get('shirtNumber', 'N/A'),
                'Position': player.get('position', 'N/A'),
                'Position Side': player.get('positionSide', 'N/A'),
                'Formation Place': player.get('formationPlace', 'N/A'),
            })
            player_row.update({
                stat.get('type', ''): stat.get('value', 0)
                for stat in player.get('stat', [])
            })
            player_rows.append(player_row)

        # One row per team official of this line-up
        for official in line_up.get('teamOfficial', []):
            official_row = row_prefix(team_id, team_name)
            official_row.update({
                'Official ID': official.get('id', 'N/A'),
                'First Name': official.get('firstName', 'N/A'),
                'Last Name': official.get('lastName', 'N/A'),
                'Short First Name': official.get('shortFirstName', 'N/A'),
                'Short Last Name': official.get('shortlastName', 'N/A'),
                'Known Name': official.get('knownName', 'N/A'),
                'Type': official.get('type', 'N/A')
            })
            official_rows.append(official_row)

    # Stats absent for a player surface as NaN in the frame; store them as 0
    df_player_xGstats = pd.DataFrame(player_rows).fillna(0)
    df_team_officials = pd.DataFrame(official_rows).fillna('N/A')

    # Everything outside this metadata set is treated as a stat column
    metadata_columns = frozenset((
        'Match ID', 'Competition ID', 'Competition Name', 'Week', 'Stage ID', 'Stage Name',
        'Team ID', 'Team Name', 'Player ID', 'First Name', 'Last Name', 'Match Name',
        'Shirt Number', 'Position', 'Position Side', 'Formation Place',
    ))

    if not df_player_xGstats.empty:
        stat_columns = [name for name in df_player_xGstats.columns if name not in metadata_columns]

        # Unparseable values are coerced to NaN and then replaced by 0
        for name in stat_columns:
            numeric_values = pd.to_numeric(df_player_xGstats[name], errors='coerce')
            df_player_xGstats[name] = numeric_values.fillna(0)

        # All stat columns end up with a float dtype
        df_player_xGstats[stat_columns] = df_player_xGstats[stat_columns].astype(float)

    return df_player_xGstats, df_team_officials

def process_xG_match_events(match_id):
    """Fetch the MA12 expected-goals feed and flatten its events to a DataFrame.

    Each event becomes one row with match/competition/stage metadata, team
    name, the event's own fields and the two tracked qualifier columns
    ``qualifier 321`` and ``qualifier 322`` (``0`` when the event does not
    carry that qualifier). Other qualifiers in the feed are not exported.

    Args:
        match_id: Fixture identifier, sent to the API as the ``fx`` parameter.

    Returns:
        pandas.DataFrame: One row per event. An empty DataFrame is returned
        when the API does not answer with HTTP 200 (status and body are
        printed in that case).
    """
    # API parameters
    requestParameters = {
        "_rt": "b",
        "_fmt": "json",
        "fx": match_id
    }

    # GET request against the expected-goals endpoint
    sdapi_get_url = f'https://api.performfeeds.com/soccerdata/matchexpectedgoals/{outletApiKey}/'

    response = requests.get(
        sdapi_get_url,
        headers=requestHeaders(),
        params=requestParameters
    )

    # Anything other than 200 is reported and yields an empty result
    if response.status_code != 200:
        print(f"Error: API request failed with status code {response.status_code}")
        print(response.text)
        return pd.DataFrame()

    data = response.json()

    # Extract the sections of the payload used below
    match_info = data.get('matchInfo', {})
    competition_info = match_info.get('competition', {})
    stage_info = match_info.get('stage', {})
    live_data = data.get('liveData', {})
    xG_events = live_data.get('event', [])

    # Home/away lookup; only the team-name mapping is used here
    home_away_info = get_home_away_info(match_info, live_data)

    # Tracked qualifier IDs. A tuple (not a set) keeps the two columns in a
    # fixed order; set iteration order over strings varies between runs.
    qualifier_ids = ('321', '322')

    # Final column layout of the DataFrame
    columns = [
        'Match ID', 'Competition ID', 'Competition Name', 'Week', 'Stage ID', 'Stage Name', 'EventId', 'timeStamp',
        'contestantId', 'Team ID', 'Team Name', 'periodId', 'timeMin', 'timeSec',
        'playerId', 'playerName', 'typeId', 'Event Name', 'outcome', 'x', 'y'] + [f'qualifier {qid}' for qid in qualifier_ids]

    xG_events_data = []

    for event in xG_events:
        contestant_id = event.get('contestantId', None)
        team_name = home_away_info['team_mapping'].get(contestant_id, {}).get('name', 'N/A') if contestant_id else 'N/A'
        type_id = event.get('typeId', None)
        event_name = EVENT_TYPE_MAPPING.get(type_id, 'Unknown Event') if type_id else 'Unknown Event'

        xG_event_info = {
            'Match ID': match_info.get('id', 'N/A'),
            'Competition ID': competition_info.get('id', 'N/A'),
            'Competition Name': competition_info.get('name', 'N/A'),
            'Week': match_info.get('week', 'N/A'),
            'Stage ID': stage_info.get('id', 'N/A'),
            'Stage Name': stage_info.get('name', 'N/A'),
            'EventId': event.get('eventId', None),
            'typeId': type_id,
            'Event Name': event_name,
            'periodId': event.get('periodId', None),
            'timeMin': event.get('timeMin', None),
            'timeSec': event.get('timeSec', None),
            'contestantId': contestant_id,
            'Team ID': contestant_id,
            'Team Name': team_name,
            'playerId': event.get('playerId', None),
            'playerName': event.get('playerName', None),
            'outcome': event.get('outcome', None),
            'x': event.get('x', None),
            'y': event.get('y', None),
            'timeStamp': event.get('timeStamp', None),
        }

        # Tracked qualifiers default to 0 ("not present on this event")
        for qid in qualifier_ids:
            xG_event_info[f'qualifier {qid}'] = 0

        # Copy the values of tracked qualifiers. Untracked ones are skipped
        # (they were never part of `columns`), and `.get` avoids a KeyError
        # on a qualifier entry that has no "qualifierId".
        for q in event.get('qualifier', []):
            qid = str(q.get('qualifierId', ''))
            if qid in qualifier_ids:
                xG_event_info[f'qualifier {qid}'] = q.get('value', None)

        xG_events_data.append(xG_event_info)

    xG_events_df = pd.DataFrame(xG_events_data, columns=columns)
    return xG_events_df

# ====================================
# FUNCIONES PRINCIPALES - MODIFICADAS
# ====================================
def get_current_season_stage():
    """Return the stage ID of the La Liga season that is not already hard-coded.

    Walks the stages returned by ``get_available_stages_method2`` and picks
    the first one whose competition name contains "primera" or "la liga"
    (case-insensitive) and whose ID is not one of the two known stage IDs
    (the 2023/24 and 2024/25 seasons).

    Returns:
        The matching stage ID, or ``None`` when no such stage is found.
    """
    mapped_stage_ids = ('49d7kwlzobzuyja3x2bzwe3o4', '4xu8dwf3cotp5qu0ddi50wkyc')
    league_markers = ('primera', 'la liga')

    for candidate_id, details in get_available_stages_method2().items():
        competition_label = details.get('competition', '').lower()
        is_la_liga = any(marker in competition_label for marker in league_markers)
        if is_la_liga and candidate_id not in mapped_stage_ids:
            return candidate_id

    return None
    
def interactive_competition_selection():
    """Ask the user for a La Liga season and the last matchweek to download.

    Lists three seasons (2023/24 and 2024/25 with fixed stage IDs, 2025/26
    with the stage ID returned by ``get_current_season_stage``), marks each
    one as available or not, and reads the season choice and the matchweek
    limit from standard input.

    Returns:
        dict | None: ``{'competition_id', 'stage_id', 'stage_name',
        'max_week'}`` for a valid selection. ``None`` when the season choice
        is not listed, the chosen season has no stage ID, or the matchweek is
        not an integer greater than or equal to 1.
    """
    print("\nüá™üá∏ LA LIGA - SELECCIONA TEMPORADA:")

    current_stage = get_current_season_stage()

    seasons = {
        '1': {
            'name': '2023/24',
            'stage_id': '49d7kwlzobzuyja3x2bzwe3o4'
        },
        '2': {
            'name': '2024/25',
            'stage_id': '4xu8dwf3cotp5qu0ddi50wkyc'
        },
        '3': {
            'name': '2025/26',
            'stage_id': current_stage
        }
    }

    for key, season in seasons.items():
        status = "‚úÖ" if season['stage_id'] else "‚ùå"
        print(f"  {key}: Temporada {season['name']} {status}")

    choice = input("\nüóìÔ∏è Selecciona temporada (1-3): ").strip()

    if choice not in seasons:
        print("‚ùå Selecci√≥n no v√°lida")
        return None

    selected = seasons[choice]

    # A season listed as unavailable has no stage ID; returning it would only
    # fail later, after the download work has already been done.
    if not selected['stage_id']:
        print(f"‚ùå La temporada {selected['name']} no tiene Stage ID disponible")
        return None

    raw_week = input("\nüìÖ ¬øHasta qu√© jornada descargar? (ej: 2): ").strip()
    try:
        max_week = int(raw_week)
    except ValueError:
        # Non-numeric answer: report it instead of crashing with a traceback
        print("‚ùå Introduce un entero mayor o igual a 1 para la jornada")
        return None

    if max_week < 1:
        print("‚ùå Introduce un entero mayor o igual a 1 para la jornada")
        return None

    return {
        'competition_id': '15',
        'stage_id': selected['stage_id'],
        'stage_name': f"La Liga {selected['name']}",
        'max_week': max_week
    }

def get_matches_by_weeks(competition_id, stage_id, max_week):
    """Collect the matches of matchweeks 1..``max_week`` for one stage.

    Calls ``get_match_ids_advanced`` once per matchweek (up to 50 matches,
    filtered by ``stage_id``), pausing two seconds after every request.

    Args:
        competition_id: Accepted for call compatibility; not used by the lookup.
        stage_id: Stage identifier forwarded to ``get_match_ids_advanced``.
        max_week: Last matchweek (inclusive) to query.

    Returns:
        pandas.DataFrame: All weekly results concatenated with a fresh index,
        or an empty DataFrame when no matchweek returned any match.
    """
    weekly_frames = []

    for jornada in range(1, max_week + 1):
        print(f"üìÖ Jornada {jornada}...")
        week_matches = get_match_ids_advanced(
            max_matches=50,
            specific_week=str(jornada),
            stage_id=stage_id
        )

        if week_matches.empty:
            print(f"   ‚ö†Ô∏è No se encontraron partidos en jornada {jornada}")
        else:
            print(f"   ‚úÖ Encontrados {len(week_matches)} partidos en jornada {jornada}")
            weekly_frames.append(week_matches)

        # Pause between requests
        time.sleep(2)

    if not weekly_frames:
        return pd.DataFrame()

    combined = pd.concat(weekly_frames, ignore_index=True)
    print(f"\nüìä Total partidos encontrados: {len(combined)}")
    return combined
        
def _normalize_frame_for_parquet(df):
    """Return a copy of ``df`` with column dtypes coerced for parquet output.

    Qualifier columns and the ``Is Home``/``Is Away`` flags are stored as
    strings, the event time/coordinate/ID fields as numbers, and every
    remaining object column as strings with ``'N/A'`` for missing values.
    """
    df_copy = df.copy()

    print(f"   üìù Limpiando tipos de datos...")

    # Qualifier columns: always strings, '0' for anything missing or blank
    qualifier_cols = [col for col in df_copy.columns if col.startswith('qualifier')]
    for col in qualifier_cols:
        try:
            df_copy[col] = df_copy[col].fillna('0').astype(str)
            df_copy[col] = df_copy[col].replace(['nan', 'None', 'null', ''], '0')
        except Exception as e:
            print(f"   ‚ö†Ô∏è Error en columna {col}: {e}")
            df_copy[col] = '0'  # fall back to the default value

    # Boolean flags are written as the strings 'True' / 'False'
    bool_cols = [col for col in df_copy.columns if col in ['Is Home', 'Is Away']]
    for col in bool_cols:
        try:
            df_copy[col] = df_copy[col].astype(str)
        except Exception:
            df_copy[col] = 'False'

    # Event fields that must be numeric; unparseable values become NaN
    numeric_cols = [col for col in df_copy.columns if col in ['timeMin', 'timeSec', 'x', 'y', 'eventId', 'EventId', 'periodId']]
    for col in numeric_cols:
        try:
            df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
        except Exception:
            df_copy[col] = None

    # Every remaining object column becomes a string column
    for col in df_copy.columns:
        if df_copy[col].dtype == 'object':
            try:
                df_copy[col] = df_copy[col].fillna('N/A').astype(str)
                df_copy[col] = df_copy[col].replace(['nan', 'None', 'null'], 'N/A')
            except Exception:
                df_copy[col] = 'N/A'

    return df_copy


def save_to_parquet(data_dict, filename_prefix=None):
    """Append each DataFrame in ``data_dict`` to its fixed parquet file.

    For every known data type the frame is type-normalised, merged with the
    existing file (if any), de-duplicated on that type's key columns keeping
    the newest row, and written back to ``datos_opta_parquet/``. If anything
    in that process fails, the new rows are written as CSV next to the
    parquet file; if that fails too, a small metadata text file is written.

    Args:
        data_dict: Mapping of data-type name to DataFrame. Empty frames and
            unknown data types are skipped with a message.
        filename_prefix: Accepted for call compatibility; not used, because
            the output file names are fixed.

    Returns:
        None
    """
    folder = "datos_opta_parquet"
    os.makedirs(folder, exist_ok=True)

    # Fixed file name and de-duplication key columns for each data type
    file_config = {
        'player_stats': {
            'filename': 'player_stats.parquet',
            'duplicate_keys': ['Match ID', 'Player ID']
        },
        'team_stats': {
            'filename': 'team_stats.parquet',
            'duplicate_keys': ['Match ID', 'Team ID']
        },
        'player_xg_stats': {
            'filename': 'player_xg_stats.parquet',
            'duplicate_keys': ['Match ID', 'Player ID']
        },
        'xg_events': {
            'filename': 'xg_events.parquet',
            'duplicate_keys': ['Match ID', 'EventId']
        },
        'match_events': {
            'filename': 'abp_events.parquet',
            'duplicate_keys': ['Match ID', 'EventId']
        },
        'team_officials': {
            'filename': 'team_officials.parquet',
            'duplicate_keys': ['Match ID', 'Team ID', 'Official ID']
        }
    }

    for data_type, df in data_dict.items():
        if df.empty:
            print(f"üìÑ {data_type}: DataFrame vac√≠o - omitido")
            continue
        if data_type not in file_config:
            print(f"‚ö†Ô∏è Tipo de datos desconocido: {data_type} - omitido")
            continue

        config = file_config[data_type]
        filename = f"{folder}/{config['filename']}"
        duplicate_keys = config['duplicate_keys']

        print(f"üîÑ Procesando {data_type}...")

        # Bound before the try so the fallbacks below always refer to THIS
        # data type's rows, never to a frame left over from an earlier loop
        # iteration (or to an unbound name).
        df_copy = df

        try:
            # Step 1: work on a cleaned copy, leaving the caller's frame intact
            df_copy = _normalize_frame_for_parquet(df)

            # Step 2: merge with the existing file, or create it
            if os.path.exists(filename):
                existing_df = pd.read_parquet(filename)

                # Only de-duplicate on keys present on both sides
                available_keys = [key for key in duplicate_keys if key in df_copy.columns and key in existing_df.columns]

                combined_df = pd.concat([existing_df, df_copy], ignore_index=True)

                # A qualifier column present on only one side is NaN for the
                # other side's rows after concat; use the same '0' default as
                # everywhere else so the column stays uniformly string-typed.
                merged_qualifier_cols = [col for col in combined_df.columns if col.startswith('qualifier')]
                if merged_qualifier_cols:
                    combined_df[merged_qualifier_cols] = combined_df[merged_qualifier_cols].fillna('0')

                if available_keys:
                    combined_df = combined_df.drop_duplicates(subset=available_keys, keep='last')
                    print(f"üíæ Actualizado: {filename} ({len(existing_df)} ‚Üí {len(combined_df)} filas)")
                else:
                    print(f"üíæ Actualizado (sin deduplicaci√≥n): {filename} ({len(existing_df)} ‚Üí {len(combined_df)} filas)")
                combined_df.to_parquet(filename, index=False)
            else:
                print(f"üíæ Creado: {filename} ({len(df_copy)} filas)")
                df_copy.to_parquet(filename, index=False)

            print(f"   ‚úÖ {data_type} guardado exitosamente")

        except Exception as e:
            print(f"‚ùå Error con parquet en {data_type}: {e}")
            print(f"   üîÑ Intentando guardar como CSV...")

            # Fallback: write the new rows as CSV
            try:
                csv_filename = filename.replace('.parquet', '.csv')
                df_copy.to_csv(csv_filename, index=False)
                print(f"üíæ Guardado como CSV: {csv_filename}")
            except Exception as e2:
                print(f"‚ùå Error tambi√©n con CSV: {e2}")

                # Last resort: record only a description of what was lost
                try:
                    metadata_filename = filename.replace('.parquet', '_metadata.txt')
                    with open(metadata_filename, 'w', encoding='utf-8') as f:
                        f.write(f"Data type: {data_type}\n")
                        f.write(f"Rows: {len(df_copy)}\n")
                        f.write(f"Columns: {list(df_copy.columns)}\n")
                        f.write(f"Error: {str(e)}\n")
                    print(f"üìù Metadatos guardados: {metadata_filename}")
                except Exception:
                    print(f"üí• Error total con {data_type} - omitiendo...")

def _load_saved_tables(row_label, announce_missing):
    """Read every output parquet file from ``datos_opta_parquet/``.

    Args:
        row_label: Text printed after the row count of each file found.
        announce_missing: When true, also print a line for each missing file.

    Returns:
        dict: Data-type name mapped to its DataFrame (empty when the file
        does not exist).
    """
    folder = "datos_opta_parquet"
    parquet_files = {
        'player_stats': 'player_stats.parquet',
        'team_stats': 'team_stats.parquet',
        'player_xg_stats': 'player_xg_stats.parquet',
        'xg_events': 'xg_events.parquet',
        'match_events': 'abp_events.parquet',
        'team_officials': 'team_officials.parquet'
    }

    tables = {}
    for data_type, filename in parquet_files.items():
        filepath = f"{folder}/{filename}"
        if os.path.exists(filepath):
            df = pd.read_parquet(filepath)
            tables[data_type] = df
            print(f"   ‚úÖ {data_type}: {len(df)} {row_label}")
        else:
            tables[data_type] = pd.DataFrame()
            if announce_missing:
                print(f"   üìÑ {data_type}: archivo no existe")
    return tables


def main_interactive():
    """Run the interactive, incremental download of the MA2, MA3 and MA12 feeds.

    Asks for season and matchweek range, looks up the matches, skips those
    already present in the saved files, downloads every feed for the new
    matches, saves them through ``save_to_parquet`` and reloads the combined
    files.

    Returns:
        dict | None: Data-type name mapped to the combined DataFrame read
        back from disk, or ``None`` when the selection is invalid or no
        matches are found.
    """
    print("üéØ EXTRACCI√ìN OPTA - MA2, MA3 Y MA12 CON SELECCI√ìN DE TEMPORADA")

    config = interactive_competition_selection()
    if not config:
        return None

    print(f"\n‚úÖ Configuraci√≥n:")
    print(f"   üèÜ Competici√≥n: {config['competition_id']}")
    print(f"   üóìÔ∏è Temporada: {config['stage_name']}")
    print(f"   üìÖ Jornadas: 1-{config['max_week']}")
    print(f"   üìä Feeds: MA2 (Match Stats), MA3 (Events), MA12 (xG)")

    # Match IDs already stored by earlier runs
    existing_match_ids = get_existing_match_ids()

    print("\nüîÑ Obteniendo partidos...")
    all_matches_df = get_matches_by_weeks(config['competition_id'], config['stage_id'], config['max_week'])

    if all_matches_df.empty:
        print("‚ùå No se encontraron partidos")
        print("\nüîß POSIBLES SOLUCIONES:")
        print("   1. Verificar que el Stage ID sea correcto")
        print("   2. Probar con una jornada espec√≠fica que sepas que tiene partidos")
        print("   3. Verificar permisos de API")
        return None

    # Keep only matches that have not been processed yet
    print("\nüîç Filtrando partidos nuevos...")
    new_matches_df = filter_new_matches(all_matches_df, existing_match_ids)

    if new_matches_df.empty:
        print("üéâ ¬°No hay partidos nuevos que procesar!")
        print("üíæ Los archivos existentes ya contienen todos los datos solicitados.")

        # Nothing to download: return what is already on disk
        print("\nüìä RESUMEN DE DATOS EXISTENTES:")
        return _load_saved_tables("filas", announce_missing=True)

    print(f"\nüìä Procesando {len(new_matches_df)} partidos nuevos...")
    match_ids = new_matches_df['Match ID'].tolist()

    # Per-feed accumulators for the new matches only
    all_data = {
        'player_stats': [],
        'team_stats': [],
        'player_xg_stats': [],
        'xg_events': [],
        'match_events': [],
        'team_officials': []
    }

    for i, match_id in enumerate(match_ids):
        print(f"‚öΩ Partido {i+1}/{len(match_ids)}: {match_id}")

        # MA2 - player stats + team officials
        player_stats_df, team_officials_df = process_matchPlayerStats_data(match_id)
        if not player_stats_df.empty:
            all_data['player_stats'].append(player_stats_df)
        if not team_officials_df.empty:
            all_data['team_officials'].append(team_officials_df)

        # MA2 - team stats
        team_stats_df = process_matchTeamStats_data(match_id)
        if not team_stats_df.empty:
            all_data['team_stats'].append(team_stats_df)

        # MA3 - match events
        match_events_df = process_match_events_data(match_id)
        if not match_events_df.empty:
            all_data['match_events'].append(match_events_df)

        # MA12 - player xG stats + team officials
        player_xg_df, team_officials_xg_df = process_xG_matchPlayerStats_data(match_id)
        if not player_xg_df.empty:
            all_data['player_xg_stats'].append(player_xg_df)
        if not team_officials_xg_df.empty:
            all_data['team_officials'].append(team_officials_xg_df)

        # MA12 - xG events
        xg_events_df = process_xG_match_events(match_id)
        if not xg_events_df.empty:
            all_data['xg_events'].append(xg_events_df)

        # Pause between matches, but not after the last one
        if i < len(match_ids) - 1:
            time.sleep(delay_seconds)

    # One DataFrame per feed for the newly downloaded matches
    new_data = {}
    for data_type, df_list in all_data.items():
        if df_list:
            new_data[data_type] = pd.concat(df_list, ignore_index=True)
            print(f"‚úÖ {data_type} (nuevos): {len(new_data[data_type])} filas")
        else:
            new_data[data_type] = pd.DataFrame()

    # save_to_parquet merges the new rows with whatever is already on disk
    print(f"\nüíæ Guardando datos nuevos y combinando con existentes...")
    save_to_parquet(new_data)

    # Built only after saving and tolerant of a missing stage ID, so that
    # formatting this label can never discard the downloaded data.
    stage_label = str(config['stage_id'] or '')[:8]
    filename_prefix = f"OPTA_Comp{config['competition_id']}_Stage{stage_label}_Week{config['max_week']}"

    # Reload the combined files so the caller gets the full data set
    print(f"\nüìä CARGANDO DATOS FINALES COMBINADOS:")
    final_result = _load_saved_tables("filas totales", announce_missing=False)

    print(f"\n‚úÖ Completado! Archivos actualizados con prefijo: {filename_prefix}")
    print(f"üéâ Descarga incremental: {len(new_matches_df)} partidos nuevos procesados")

    return final_result

def quick_analysis(data):
    """Print a short summary for every non-empty table in ``data``.

    Every non-empty table gets a row-count line. In addition:
    ``xg_events`` reports how many rows have a ``qualifier 321`` value that
    is neither missing nor ``'0'``; ``player_stats`` reports unique players,
    matches and stages; ``team_stats`` reports unique teams and, when the
    ``Is Home`` column exists, unique home/away team names and the number of
    rows with a full-time home score.

    Args:
        data: Mapping of data-type name to pandas DataFrame.

    Returns:
        None
    """
    print("\nüìä AN√ÅLISIS R√ÅPIDO:")

    for data_type, df in data.items():
        if df.empty:
            continue

        print(f"\n{data_type}: {len(df)} filas")

        if data_type == 'xg_events':
            # Rows whose qualifier 321 is present and different from '0'
            xg_values = df['qualifier 321']
            has_xg = xg_values.notna() & (xg_values != '0')
            xg_events = df[has_xg]
            if not xg_events.empty:
                print(f"  üéØ Eventos con xG: {len(xg_events)}")
            continue

        if data_type == 'player_stats':
            unique_counts = (
                ("  üë• Jugadores √∫nicos", 'Player ID'),
                ("  üìä Partidos √∫nicos", 'Match ID'),
                ("  üóìÔ∏è Stages √∫nicos", 'Stage ID'),
            )
            for label, column in unique_counts:
                print(f"{label}: {df[column].nunique()}")
            continue

        if data_type != 'team_stats':
            continue

        print(f"  üèÜ Equipos √∫nicos: {df['Team ID'].nunique()}")
        if 'Is Home' not in df.columns:
            continue

        # The flags are compared against the string 'True'
        home_teams = df[df['Is Home'] == 'True']['Team Name'].nunique()
        away_teams = df[df['Is Away'] == 'True']['Team Name'].nunique()
        print(f"  üè† Equipos locales: {home_teams}")
        print(f"  üöå Equipos visitantes: {away_teams}")

        # Final scores, when the column is available
        if 'FT Home Score' in df.columns:
            matches_with_scores = df[df['FT Home Score'].notna()]
            if not matches_with_scores.empty:
                print(f"  ‚öΩ Partidos con marcador final: {len(matches_with_scores)}")

def get_home_away_info(match_info, live_data):
    """Summarise which team is home/away and the half-time/full-time scores.

    Args:
        match_info: ``matchInfo`` section of a feed; its ``contestant`` list
            supplies each team's ``id``, ``name`` and ``position``.
        live_data: ``liveData`` section of a feed; scores are read from
            ``matchDetails.scores.ht`` and ``matchDetails.scores.ft``.

    Returns:
        dict: ``home_team_id`` and ``away_team_id`` (``None`` when no
        contestant has that position, compared case-insensitively),
        ``team_mapping`` (team ID -> ``{'name', 'position'}``), and
        ``ht_home``, ``ht_away``, ``ft_home``, ``ft_away`` (``None`` when
        the score is missing).
    """
    contestants = match_info.get('contestant', [])
    scores = live_data.get('matchDetails', {}).get('scores', {})

    # Filled in as contestants with a matching position are encountered
    side_to_team = {'home': None, 'away': None}
    team_mapping = {}

    for contestant in contestants:
        team_id = contestant.get('id')
        side = contestant.get('position', '')
        team_mapping[team_id] = {
            'name': contestant.get('name', 'N/A'),
            'position': side
        }

        normalized_side = side.lower()
        if normalized_side in side_to_team:
            side_to_team[normalized_side] = team_id

    half_time = scores.get('ht', {})
    full_time = scores.get('ft', {})

    return {
        'home_team_id': side_to_team['home'],
        'away_team_id': side_to_team['away'],
        'team_mapping': team_mapping,
        'ht_home': half_time.get('home', None),
        'ht_away': half_time.get('away', None),
        'ft_home': full_time.get('home', None),
        'ft_away': full_time.get('away', None)
    }

def discover_stage_ids_for_competition(competition_id):
    """Look up and print the stage IDs available for one competition.

    Delegates the lookup to ``get_available_stages(competition_id)``, prints
    one line per stage, and returns the lookup result unchanged.

    Args:
        competition_id: Identifier passed straight to ``get_available_stages``.

    Returns:
        Whatever ``get_available_stages`` returned (iterated here via
        ``.items()``, so a mapping of stage id -> stage info).
    """
    print(f"üîç Descubriendo Stage IDs para competici√≥n {competition_id}...")

    available = get_available_stages(competition_id)

    print(f"üìä Stages encontrados:")
    for sid, details in available.items():
        # Stage info may be a dict carrying a 'name', or any other value that
        # is printed as-is.
        label = details.get('name', 'N/A') if isinstance(details, dict) else details
        print(f"  '{sid}': '{label}'")

    return available

In [10]:
# ====================================
# EXECUTION
# ====================================

if __name__ == "__main__":
    # Startup banner followed by a hint with a known stage ID; emitted with a
    # single print so the lines appear exactly as four separate prints would.
    print("\n".join((
        "üéØ OPTA API - EXTRACTOR CON SELECCI√ìN DE TEMPORADA",
        "üìä Feeds: MA2 (Stats), MA3 (Events), MA12 (xG)",
        "\nüí° STAGE ID CONOCIDO:",
        "   üá™üá∏ La Liga 2024-25: 4xu8dwf3cotp5qu0ddi50wkyc",
    )))

    # Run the interactive extraction; only analyse when it returned data.
    data = main_interactive()
    if data:
        quick_analysis(data)

üéØ OPTA API - EXTRACTOR CON SELECCI√ìN DE TEMPORADA
üìä Feeds: MA2 (Stats), MA3 (Events), MA12 (xG)

üí° STAGE ID CONOCIDO:
   üá™üá∏ La Liga 2024-25: 4xu8dwf3cotp5qu0ddi50wkyc
üéØ EXTRACCI√ìN OPTA - MA2, MA3 Y MA12 CON SELECCI√ìN DE TEMPORADA

üá™üá∏ LA LIGA - SELECCIONA TEMPORADA:
   üîÑ Intentando con MA1 b√°sico...
  1: Temporada 2023/24 ‚úÖ
  2: Temporada 2024/25 ‚úÖ
  3: Temporada 2025/26 ‚úÖ

‚úÖ Configuraci√≥n:
   üèÜ Competici√≥n: 15
   üóìÔ∏è Temporada: La Liga 2024/25
   üìÖ Jornadas: 1-10
   üìä Feeds: MA2 (Match Stats), MA3 (Events), MA12 (xG)
üîç Revisando archivos existentes...
   üìÑ player_stats.parquet: no existe
   üìÑ team_stats.parquet: no existe
   üìÑ player_xg_stats.parquet: no existe
   üìÑ xg_events.parquet: no existe
   üìÑ abp_events.parquet: no existe
   üìÑ team_officials.parquet: no existe
   üìÅ No se encontraron datos previos - descarga completa

üîÑ Obteniendo partidos...
üìÖ Jornada 1...
   ‚úÖ Encontrados 10 partidos en jornad