In [None]:
import os
from pathlib import Path

import joblib  # Used to persist the trained model
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

# 1. Connect to the database (expected to be running already via Docker).
# Make sure port 5432 is exposed in docker-compose.yml.
# The connection URL can be overridden with the MERITOCR_AI_DB_URL environment
# variable so real credentials do not have to live in source control; when the
# variable is unset, the local-development default below is used.
connection_string = os.environ.get(
    "MERITOCR_AI_DB_URL",
    "postgresql://admin:adminpass@localhost:5432/meritocr_ai_db",
)
engine = create_engine(connection_string)

# 2. Load the overlap-period rows, i.e. the rows of player_performance_raw
# where both xG and xA are present.
# NOTE: the in-query remark suggests also filtering by season
# (e.g. 2013-2015); that filter is not applied yet.
sql_query = """
SELECT goals, assists, matches_played, minutes_played, xg, xa 
FROM player_performance_raw
WHERE xg IS NOT NULL AND xa IS NOT NULL
-- (Deberías añadir la temporada para filtrar, ej. 2013-2015)
"""
df = pd.read_sql(sql=sql_query, con=engine)

# 3. Prepare the data for the imputation model (target = xG).
# Rows with a missing value in ANY feature or in the target are dropped.
# 'minutes_played' is part of the subset because it is a model feature; leaving
# it out would let NaNs reach the estimator.
feature_columns = ['goals', 'assists', 'minutes_played']  # proxy features
target_column = 'xg'
df_clean = df.dropna(subset=feature_columns + [target_column])
X = df_clean[feature_columns]
y = df_clean[target_column]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Train the imputation model: a 100-tree random forest with a fixed seed.
# fit() returns the fitted estimator, so construction and training are chained.
imputation_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# 5. Validate performance on the held-out test split.
preds = imputation_model.predict(X_test)
r2 = r2_score(y_test, preds)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises a TypeError. The square root works on every version.
rmse = mean_squared_error(y_test, preds) ** 0.5

print("Validación del Modelo de Imputación (Pre-xG):")
print(f"R^2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

# 6. Persist (serialize) the trained model.
# It is meant to be reused in Phase 2 to impute the 2000-2012 values.
model_path = Path('../models/xg_imputation_model.pkl')
# joblib.dump does not create missing directories, so make sure the target
# folder exists instead of relying on it having been created by hand.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(imputation_model, model_path)