In [3]:
import pandas as pd
import numpy as np
import os

# ==============================================================================
# 1. CONFIGURACIÓN
# ==============================================================================
INPUT_FILE  = '../services/ingestion/data_drop/fbref_big5_players_24_25_raw.csv'
OUTPUT_FILE = '../services/ingestion/data_drop/fbref_big5_players_24_25_clean.csv'

print(f">>> INICIANDO ETL V5: CLEAN, MERGE & STANDARDIZE <<<")

# ==============================================================================
# 2. CARGA
# ==============================================================================
try:
    df = pd.read_csv(INPUT_FILE, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(INPUT_FILE, encoding='latin1')
except FileNotFoundError:
    print(f"!!! ERROR: No se encuentra {INPUT_FILE}")
    raise

# Limpiar espacios en nombres de columnas del CSV original
df.columns = df.columns.str.strip()
print(f"[1/5] Carga inicial: {df.shape}")

# ==============================================================================
# 3. LIMPIEZA ESTRUCTURAL
# ==============================================================================
# A. Filas repetidas
if 'Rk' in df.columns:
    df = df[df['Rk'] != 'Rk'].copy()

# B. Columnas basura
cols_to_drop = [c for c in df.columns if '_stats_' in c]
df.drop(columns=cols_to_drop, inplace=True)

# C. Strings
if 'Nation' in df.columns:
    df['Nation'] = df['Nation'].astype(str).str.split(' ').str[-1]
if 'Comp' in df.columns:
    df['Comp'] = df['Comp'].astype(str).str.split(' ', n=1).str[1]
if 'Pos' in df.columns:
    df['Pos_Primary'] = df['Pos'].astype(str).str.split(',').str[0]

# ==============================================================================
# 4. CONVERSIÓN DE TIPOS
# ==============================================================================
non_numeric = ['Rk', 'Player', 'Nation', 'Pos', 'Pos_Primary', 'Squad', 'Comp', 'Age', 'Born']
cols_numeric = [c for c in df.columns if c not in non_numeric]

for col in cols_numeric:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Age y Born numéricos pero tratados como meta
for c in ['Age', 'Born']:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors='coerce')

# Rellenar Nulos en métricas con 0
metrics_to_fill = [
    'Min', 'Gls', 'Ast', 'xG', 'npxG', 'xAG', # Ataque
    'Tkl', 'TklW', 'Int', 'Blocks', 'Clr', 'Recov', # Defensa
    'PrgP', 'PrgC', 'KP', 'PPA', # Creación
    'Saves', 'GA', 'SoTA', 'CS', 'Save%' # Portero
]
for col in metrics_to_fill:
    if col in df.columns: df[col] = df[col].fillna(0)

# ==============================================================================
# 5. LOGICA DE TRANSFERENCIAS (TOTALES)
# ==============================================================================
print("[3/5] Calculando Totales por Jugador...")

df['Team_Order'] = df.groupby('Player').cumcount() + 1
df['Registro_Tipo'] = 'Parcial'

player_counts = df['Player'].value_counts()
multi_players = player_counts[player_counts > 1].index
df_multi = df[df['Player'].isin(multi_players)].copy()

if not df_multi.empty:
    agg_dict = {col: 'sum' for col in cols_numeric}
    meta_cols_agg = [c for c in non_numeric if c in df.columns and c != 'Player']
    agg_dict.update({col: 'first' for col in meta_cols_agg})
    if 'Age' in df.columns: agg_dict['Age'] = 'max'

    df_totals = df_multi.groupby('Player').agg(agg_dict).reset_index()

    # Sanitizar Ratios
    ratio_keywords = ['%', '/90', 'Per 90', 'Av', 'Mn/']
    ratio_cols = [c for c in df_totals.columns if any(k in c for k in ratio_keywords)]
    for col in ratio_cols: df_totals[col] = np.nan

    df_totals['Squad'] = 'COMBINED'
    df_totals['Comp'] = 'Multiple'
    df_totals['Registro_Tipo'] = 'Total'
    df_totals['Team_Order'] = 0
    
    df_final = pd.concat([df, df_totals], ignore_index=True)
else:
    df_final = df.copy()

# ==============================================================================
# 6. ESTANDARIZACIÓN DE NOMBRES (ELITE MAPPING)
# ==============================================================================
print("[4/5] Renombrando columnas al estándar de Base de Datos...")

# MAPEO OFICIAL: CSV RAW -> POSTGRESQL
# Aquí es donde ocurre la magia de armonización
db_mapping = {
    # --- IDENTIFICACIÓN ---
    'Player': 'player_name', 'Nation': 'nation', 'Pos_Primary': 'main_position_group', 
    'Squad': 'squad', 'Comp': 'league_id', 'Age': 'age', 'Born': 'born',
    'Registro_Tipo': 'record_type', 'Team_Order': 'team_order',
    
    # --- TIEMPO ---
    'MP': 'matches_played', 'Starts': 'starts', 'Min': 'minutes_played', '90s': 'nineties',
    
    # --- ATAQUE ---
    'Gls': 'goals', 'Ast': 'assists', 'G+A': 'goals_assists',
    'PK': 'pk_goals', 'PKatt': 'pk_attempts',
    'xG': 'xg', 'npxG': 'npxg', 'xAG': 'xa',
    'Sh': 'shots_total', 'SoT': 'shots_on_target', 'SoT%': 'shots_on_target_pct',
    'G/Sh': 'goals_per_shot', 'G/SoT': 'goals_per_sot', 'Dist': 'avg_shot_distance',
    'FK': 'free_kick_shots',
    
    # --- CREACIÓN Y PASES ---
    'PrgP': 'progressive_passes', 'PrgC': 'progressive_carries', 
    'KP': 'key_passes', 'PPA': 'passes_penalty_area', 'CrsPA': 'crosses_penalty_area',
    'Cmp': 'passes_completed', 'Att': 'passes_attempted', 'Cmp%': 'pass_completion_pct',
    'TotDist': 'pass_total_distance', 'PrgDist': 'pass_progressive_distance',
    'xA': 'xa', 'A-xAG': 'assists_minus_xa',
    
    # --- DEFENSA ---
    'Tkl': 'tackles_total', 'TklW': 'tackles_won', 
    'Int': 'interceptions', 'Blocks': 'blocks', 'Clr': 'clearances', 
    'Err': 'errors_leading_to_goal', 'Recov': 'ball_recoveries',
    'Tkl+Int': 'tackles_interceptions', 'Def 3rd': 'tackles_def_3rd',
    'Mid 3rd': 'tackles_mid_3rd', 'Att 3rd': 'tackles_att_3rd',
    
    # --- POSESIÓN ---
    'Touches': 'touches', 'Dis': 'dispossessed', 'Mis': 'miscontrols',
    'Carries': 'carries', 'PrgR': 'progressive_runs',
    'Succ': 'dribbles_completed', 'Succ%': 'dribble_success_pct',
    
    # --- PORTERO (GK) - Las "Raras" ---
    'GA': 'goals_against', 'GA90': 'goals_against_p90',
    'SoTA': 'shots_on_target_against', 
    'Saves': 'saves', 'Save%': 'save_pct', 
    'CS': 'clean_sheets', 'CS%': 'clean_sheet_pct',
    'W': 'wins', 'D': 'draws', 'L': 'losses',
    'PKA': 'pk_allowed', 'PKsv': 'pk_saved', 'PKm': 'pk_missed',
    'PSxG': 'psxg', 'PSxG/SoT': 'psxg_per_sot', 'PSxG+/-': 'psxg_plus_minus',
    'Launch%': 'launch_pct', 'AvgLen': 'avg_pass_length', # <-- AQUÍ ESTABAN
    'Stp': 'crosses_stopped', 'Stp%': 'crosses_stopped_pct', # <-- AQUÍ ESTABAN
    '#OPA': 'def_actions_outside_box', # <-- AQUÍ ESTABA
    '#OPA/90': 'def_actions_outside_box_p90',
    'AvgDist': 'avg_keeper_sweeper_dist',
    
    # --- DISCIPLINA ---
    'CrdY': 'yellow_cards', 'CrdR': 'red_cards', '2CrdY': 'second_yellow_card',
    'Fls': 'fouls_committed', 'Fld': 'fouls_drawn', 'Off': 'offsides',
    'PKwon': 'pk_won', 'PKcon': 'pk_conceded', 'OG': 'own_goals'
}

# Renombrar solo las que existen
cols_to_rename = {k: v for k, v in db_mapping.items() if k in df_final.columns}
df_final.rename(columns=cols_to_rename, inplace=True)

# Añadir Metadata si falta
if 'season_id' not in df_final.columns:
    df_final['season_id'] = 2024-2025

# ==============================================================================
# 7. EXPORTACIÓN
# ==============================================================================
# Ordenamos columnas para que las más importantes salgan primero
priority_cols = [
    'player_name', 'nation', 'main_position_group', 'squad', 'league_id', 'age', 'record_type',
    'matches_played', 'starts', 'minutes_played',
    'goals', 'assists', 'xg', 'xa',
    'tackles_total', 'interceptions', 'progressive_passes', 'saves'
]
existing_priority = [c for c in priority_cols if c in df_final.columns]
other_cols = [c for c in df_final.columns if c not in existing_priority]

df_final = df_final[existing_priority + other_cols]

df_final.to_csv(OUTPUT_FILE, index=False)

print("="*60)
print(f"✅ ÉXITO: CSV normalizado generado con nombres de DB.")
print(f"   Ejemplo de columnas: {list(df_final.columns[:5])}")
print("="*60)

>>> INICIANDO ETL V5: CLEAN, MERGE & STANDARDIZE <<<
[1/5] Carga inicial: (2854, 267)
[3/5] Calculando Totales por Jugador...
[4/5] Renombrando columnas al estándar de Base de Datos...
✅ ÉXITO: CSV normalizado generado con nombres de DB.
   Ejemplo de columnas: ['player_name', 'nation', 'main_position_group', 'squad', 'league_id']


  df['Team_Order'] = df.groupby('Player').cumcount() + 1
  df['Registro_Tipo'] = 'Parcial'
  df_final['season_id'] = 2024-2025
