# Segundo modelo de riesgo tras recuperaciones
Entrenaremos un segundo modelo de clasificación para estimar la probabilidad de que, tras una recuperación de balón, la jugada termine en tiro.

## Imports y rutas

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.utils.class_weight import compute_class_weight
import regex as re
import joblib

# Anchor every path at the directory the notebook is executed from.
BASE_DIR = Path(".").resolve()
# Input table, expected under <BASE_DIR>/source.
DATA_PATH = BASE_DIR.joinpath("source", "train_recoveries_K8_T15s.csv")
print("Directorio base:", BASE_DIR)
print("Ruta del CSV de entrada:", DATA_PATH)


## Cargar Datos

In [None]:
# Read the whole input CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print("Forma del dataset:", df.shape)
# Trailing expression so the notebook renders the first five rows.
df.head(n=5)

## Selección de features y definición de objetivo

In [None]:
def _present(candidates):
    """Return the names in *candidates* that are columns of ``df``, in the given order."""
    return [name for name in candidates if name in df.columns]


# Candidate columns per feature family; only the ones found in the CSV are kept.
num = _present(
    ["ax", "ay", "zx", "zy", "a_minute", "period", "pass_length", "pass_angle"]
)
cat = _present(
    ["a_type", "play_pattern", "pass_height", "pass_type", "pass_outcome",
     "dribble_outcome", "duel_type", "duel_outcome", "zone_id", "team_id"]
)
binf = _present(
    ["under_pressure", "counterpress", "dribble_overrun", "dribble_no_touch",
     "pass_cross", "pass_cut_back", "pass_switch", "pass_through_ball",
     "pass_straight"]
)

# Target: the "y_shot" column cast to an integer array.
y = df["y_shot"].astype(int).to_numpy()

# Groups: one per match when "match_id" exists, otherwise one per row.
groups = df["match_id"].to_numpy() if "match_id" in df.columns else np.arange(len(df))

print("Tamaño de y:", y.shape)
print("Distribución de clases (valor, conteo):", np.unique(y, return_counts=True))


## Preprocesamiento

In [None]:
# Numeric columns: missing values are replaced by the column median.
numeric_imputer = SimpleImputer(strategy="median")

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" lets transform accept categories that were
# not present when the encoder was fitted.
categorical_steps = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Flag columns: a missing value is replaced by 0.
flag_imputer = SimpleImputer(strategy="constant", fill_value=0)

pre = ColumnTransformer(
    transformers=[
        ("num", numeric_imputer, num),
        ("cat", categorical_steps, cat),
        ("bin", flag_imputer, binf),
    ],
    remainder="drop",  # columns outside the three lists are discarded
)
# Trailing expression so the notebook renders the transformer.
pre


## Pesos de clase y clasificador

In [None]:
# Balanced class weights computed over the full label vector. They are printed
# for reference; the names are kept so any code relying on them still works.
classes = np.array([0, 1])
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y)
class_weight = {0: float(cw[0]), 1: float(cw[1])}
print("Pesos de clase:", class_weight)

# Preprocessing + random forest in a single pipeline.
# The forest is given class_weight="balanced" rather than the dict above:
# a fixed dict built from all labels would also be applied when the pipeline
# is fitted on a cross-validation training fold, so the held-out labels would
# influence the weights. With "balanced", every call to fit derives the
# weights from the labels it is actually given; when fitting on the full
# dataset the resulting weights match the printed ones.
clf = Pipeline(steps=[
    ("prep", pre),
    ("mdl", RandomForestClassifier(
        n_estimators=500,
        max_depth=None,       # trees grow until the leaf-size limit stops them
        min_samples_leaf=2,
        n_jobs=-1,            # use all available cores
        class_weight="balanced",
        random_state=7        # fixed seed for reproducible forests
    ))
])
# Trailing expression so the notebook renders the pipeline.
clf

## Cross validation

In [None]:
# Grouped 5-fold cross-validation: splits are driven by `groups`, so rows that
# share a group value stay on the same side of each split.
gkf = GroupKFold(n_splits=5)
X_all = df[num + cat + binf]
aucs, aps = [], []

for fold, (train_idx, test_idx) in enumerate(gkf.split(X_all, y, groups), 1):
    print(f"Fold {fold}...")
    # Refit the whole pipeline (preprocessing included) on this fold's training rows.
    clf.fit(X_all.iloc[train_idx], y[train_idx])
    # Second probability column, i.e. the positive class when labels are 0/1.
    scores = clf.predict_proba(X_all.iloc[test_idx])[:, 1]
    y_true = y[test_idx]
    auc = roc_auc_score(y_true, scores)
    ap = average_precision_score(y_true, scores)
    aucs.append(auc)
    aps.append(ap)
    print(f"  AUC = {auc:.3f} | AP = {ap:.3f}")

# Mean and (population) standard deviation of each metric across folds.
print("\nResultados")
print(f"AUC: {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")
print(f"AP : {np.mean(aps):.3f} ± {np.std(aps):.3f}")


## Entrenamiento y guardado del modelo

In [None]:
# Fit the final pipeline on every row, using the selected feature columns.
X = df[num + cat + binf]
print("Entrenando modelo final...")
clf.fit(X, y)
print("Entrenamiento completado.")

# Persist the fitted pipeline under <BASE_DIR>/joblib, creating the folder
# (and any missing parents) if it does not exist yet.
model_name = "recovery_risk_model_rf.joblib"
OUTPUT_DIR = BASE_DIR.joinpath("joblib")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
model_path = OUTPUT_DIR.joinpath(model_name)
joblib.dump(clf, model_path)
print("Modelo guardado en:", model_path)
# Trailing expression so the notebook renders the saved path.
model_path
