In [1]:
import pandas as pd
import numpy as np
import os

# ------------------------------------------------------------------------------
# 1. PATH CONFIGURATION
# ------------------------------------------------------------------------------
# Both paths are relative to the 'notebooks' folder and point into
# 'services/ingestion/data_drop'.
INPUT_FILE = '../services/ingestion/data_drop/fbref_big5_players_23_24_raw.csv'
OUTPUT_FILE = '../services/ingestion/data_drop/fbref_big5_players_23_24_clean.csv'

# Startup banner: announce the run and echo the input path.
print(
    ">>> INICIANDO PROTOCOLO ETL: CLEAN & MERGE V3 <<<",
    f"    Input: {INPUT_FILE}",
    sep="\n",
)

# ------------------------------------------------------------------------------
# 2. DATA LOADING
# ------------------------------------------------------------------------------
# Read the raw CSV as UTF-8 first; if the bytes cannot be decoded that way,
# retry as Latin-1. A missing file is reported on stdout and the original
# exception is re-raised so the run stops.
try:
    df = pd.read_csv(INPUT_FILE, encoding='utf-8')
except FileNotFoundError:
    print(f"!!! ERROR CRÍTICO: No se encuentra el archivo en {INPUT_FILE}")
    raise
except UnicodeDecodeError:
    df = pd.read_csv(INPUT_FILE, encoding='latin1')

raw_shape = df.shape
print(f"[1/5] Archivo cargado. Dimensiones crudas: {raw_shape}")

# ==============================================================================
# 3. STRUCTURAL CLEANING
# ==============================================================================
# A. Drop header rows repeated inside the data, i.e. rows whose 'Rk' cell is
#    the literal string 'Rk'. Without an 'Rk' column the frame is just copied.
if 'Rk' in df.columns:
    df_clean = df[df['Rk'] != 'Rk'].copy()
else:
    df_clean = df.copy()

# B. Drop redundant columns: every column whose name contains '_stats_'.
cols_to_drop = [c for c in df_clean.columns if '_stats_' in c]
df_clean = df_clean.drop(columns=cols_to_drop)
print(f"[2/5] Limpieza Estructural: Se eliminaron {len(cols_to_drop)} columnas redundantes.")

# C. String standardisation. Each step is skipped when its column is absent.
#    astype(str) turns missing cells into the literal string 'nan'; for Nation
#    and Pos those cells are masked back to NaN afterwards so a missing value
#    stays missing instead of becoming a fake label.
if 'Nation' in df_clean.columns:
    nation_raw = df_clean['Nation']
    # Keep only the last space-separated token of each value.
    df_clean['Nation'] = (
        nation_raw.astype(str).str.split(' ').str[-1].where(nation_raw.notna())
    )
if 'Comp' in df_clean.columns:
    # Drop the first space-separated token and keep the remainder. Values with
    # no space (including missing cells) have no remainder and become NaN.
    df_clean['Comp'] = df_clean['Comp'].astype(str).str.split(' ', n=1).str[1]
if 'Pos' in df_clean.columns:
    pos_raw = df_clean['Pos']
    # Primary position = the part before the first comma.
    df_clean['Pos_Primary'] = (
        pos_raw.astype(str).str.split(',').str[0].where(pos_raw.notna())
    )

# ==============================================================================
# 4. TYPE CONVERSION
# ==============================================================================
# Columns treated as metadata rather than metrics; everything else is
# converted to numeric (and is what gets summed later for transferred players).
non_numeric_cols = ['Rk', 'Player', 'Nation', 'Pos', 'Pos_Primary', 'Squad', 'Comp', 'Age', 'Born']

# Every remaining column is considered a numeric metric.
cols_numeric = [c for c in df_clean.columns if c not in non_numeric_cols]

# Convert metric columns; unparseable cells become NaN.
for col in cols_numeric:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# Age and Born are numeric too, but remain metadata (they are never summed).
if 'Age' in df_clean.columns:
    # Only the part before the first '-' is parsed, so a 'years-days' value
    # such as '23-145' yields 23 instead of being coerced to NaN. Plain
    # numeric ages are unaffected.
    # NOTE(review): presumably the raw export can use the 'years-days' form;
    # confirm against the raw CSV.
    age_years = df_clean['Age'].astype(str).str.split('-').str[0]
    df_clean['Age'] = pd.to_numeric(age_years, errors='coerce')
if 'Born' in df_clean.columns:
    df_clean['Born'] = pd.to_numeric(df_clean['Born'], errors='coerce')

# Goalkeeper fix: in the goalkeeper-specific columns, NaN is replaced by 0.
gk_cols_fix = ['Saves', 'GA', 'SoTA', 'CS', 'PKA', 'PKsv', 'Save%']
for col in gk_cols_fix:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].fillna(0)

# Re-consolidate the frame. The column-by-column dtype changes above likely
# leave it split into many internal blocks, which appears to be what makes
# pandas warn when new columns are inserted later; copy() is the remedy the
# pandas warning itself suggests.
df_clean = df_clean.copy()

# ==============================================================================
# 5. TRANSFER LOGIC
# ==============================================================================
print("[3/5] Generando filas 'Total' para jugadores transferidos...")

# Tag every existing row: Team_Order numbers a player's rows 1, 2, ... in their
# current order, and Registro_Tipo marks them as partial (per-club) records.
df_clean['Team_Order'] = df_clean.groupby('Player').cumcount() + 1
df_clean['Registro_Tipo'] = 'Parcial'

# Players that appear on more than one row.
# NOTE(review): rows are matched on the 'Player' name only, so two different
# players who share a name would be merged into one 'Total' row — confirm
# whether the data can contain homonyms.
player_counts = df_clean['Player'].value_counts()
multi_players = player_counts[player_counts > 1].index
df_multi = df_clean[df_clean['Player'].isin(multi_players)].copy()

if not df_multi.empty:
    grouped = df_multi.groupby('Player')

    # 1. Sum the metrics. min_count=1 keeps a metric NaN when it is missing on
    #    every one of the player's rows; a plain sum would report a false 0.
    df_totals = grouped[cols_numeric].sum(min_count=1)

    # 2. Metadata: take the first value per player ...
    meta_agg = {
        c: 'first'
        for c in non_numeric_cols
        if c in df_clean.columns and c != 'Player'
    }
    # 3. ... except Age, where the maximum across the player's rows is used.
    if 'Age' in meta_agg:
        meta_agg['Age'] = 'max'
    if meta_agg:
        df_totals = df_totals.join(grouped.agg(meta_agg))

    df_totals = df_totals.reset_index()

    # Blank out ratio-like columns (selected by name): summing them is
    # meaningless, so they are left as NaN on the 'Total' rows.
    ratio_keywords = ['%', '/90', 'Per 90', 'Av', 'Mn/']
    ratio_cols = [c for c in df_totals.columns if any(k in c for k in ratio_keywords)]
    for col in ratio_cols:
        df_totals[col] = np.nan

    # Labels that identify the aggregated rows.
    df_totals['Squad'] = 'COMBINED'
    df_totals['Comp'] = 'Multiple'
    df_totals['Registro_Tipo'] = 'Total'
    df_totals['Team_Order'] = 0

    # Append the totals below the per-club rows.
    df_final = pd.concat([df_clean, df_totals], ignore_index=True)
else:
    df_final = df_clean.copy()

# ------------------------------------------------------------------------------
# 6. EXPORT
# ------------------------------------------------------------------------------
print("[4/5] Ordenando y guardando...")

# Order rows by player and then by Team_Order, which places a player's
# 'Total' row (Team_Order 0) ahead of the partial rows (1, 2, ...).
df_final = df_final.sort_values(by=['Player', 'Team_Order'])

# Write the clean file so Docker can consume it.
df_final.to_csv(OUTPUT_FILE, index=False)

banner = "=" * 60
print(banner)
print(f"✅ ÉXITO: Archivo limpio generado en:\n   {OUTPUT_FILE}")
print(banner)

>>> INICIANDO PROTOCOLO ETL: CLEAN & MERGE V3 <<<
    Input: ../services/ingestion/data_drop/fbref_big5_players_23_24_raw.csv
[1/5] Archivo cargado. Dimensiones crudas: (2854, 267)
[2/5] Limpieza Estructural: Se eliminaron 118 columnas redundantes.
[3/5] Generando filas 'Total' para jugadores transferidos...
[4/5] Ordenando y guardando...


  df_clean['Team_Order'] = df_clean.groupby('Player').cumcount() + 1
  df_clean['Registro_Tipo'] = 'Parcial'


✅ ÉXITO: Archivo limpio generado en:
   ../services/ingestion/data_drop/fbref_big5_players_23_24_clean.csv
