# Feature Engineering + Threshold Tuning + LightGBM + CatBoost

> El objetivo de este notebook es mejorar el F2 Lead Oro sobre el campeón actual (`XGB_Baseline`, F2=0.4538) entrenado en el notebook ***LeadScoring***.

Todos los runs se logean en el mismo experimento `EquineLead_LeadScoring` de DagsHub  
con prefijo `v2_*` para distinguirlos de la primera iteración.

## 0 · Setup

In [6]:
import shutil, subprocess, sys

# Third-party packages imported further down in this notebook that may be missing
# from the kernel's environment. mlflow, xgboost and imbalanced-learn are imported
# by later cells, so they are installed here as well.
paquetes = ['lightgbm', 'catboost', 'optuna', 'mlflow', 'xgboost', 'imbalanced-learn']

# Use uv when it is on PATH; otherwise fall back to the kernel's own pip so the
# cell does not abort with FileNotFoundError on machines without uv.
usar_uv = shutil.which('uv') is not None

for pkg in paquetes:
    if usar_uv:
        cmd = ['uv', 'pip', 'install', pkg, '--python', sys.executable]
    else:
        cmd = [sys.executable, '-m', 'pip', 'install', pkg]
    result = subprocess.run(cmd, capture_output=True, text=True)
    # On failure show the first 150 characters of stderr next to the package name.
    print(f'{pkg}: {"✅" if result.returncode == 0 else "❌"} {result.stderr[:150] if result.returncode != 0 else "OK"}')

lightgbm: ✅ OK
catboost: ✅ OK
optuna: ✅ OK


In [1]:
import mlflow
import os
import getpass

# Credentials read by the MLflow client; the token is typed interactively so it
# never ends up stored in the notebook.
os.environ.update({
    'MLFLOW_TRACKING_USERNAME': 'ITRoselloSignoris',
    'MLFLOW_TRACKING_PASSWORD': getpass.getpass('DagsHub token: '),
})

# Point the client at the DagsHub-hosted tracking server.
mlflow.set_tracking_uri('https://dagshub.com/aletbm/S02-26-E45-Data_Science_EquineLead.mlflow')

# All runs of this notebook are logged under this experiment.
EXPERIMENT_NAME = 'EquineLead_LeadScoring'
mlflow.set_experiment(EXPERIMENT_NAME)

ModuleNotFoundError: No module named 'mlflow'

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import tempfile, os, pickle
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import fbeta_score, precision_recall_curve, make_scorer
from sklearn.preprocessing import TargetEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# F-beta with beta=2 wrapped as a scorer object so it can be passed to cross_val_score.
f2_scorer   = make_scorer(fbeta_score, beta=2)
# Shared 5-fold stratified splitter (shuffled, fixed seed) used by every Optuna objective.
cv5         = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# The three lead tiers; passed as `labels` when computing the macro-averaged F2.
orden_leads = ['Lead Bronce', 'Lead Plata', 'Lead Oro']

ModuleNotFoundError: No module named 'optuna'

## 1 · Cargar el dataset y Feature Engineering

Cargamos `df_final.parquet` que tiene **todas** las columnas originales incluidas las categóricas  
que se dropearon. Acá las recuperamos y construimos features nuevas.

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_parquet('../../data/clean/df_final.parquet')
print('Shape: {}'.format(df.shape))
print('Columnas: {}'.format(df.columns.tolist()))

### 1.1 · Features de ratios e intensidad

Estas features capturan **intención real** mejor que los conteos absolutos.  
Un usuario que vio 3 caballos y metió 2 al carrito es muy diferente a uno que vio 50 y metió 2.

In [None]:
# Derived features, each computed from the raw columns only (none depends on another
# derived column). The dict order is the order in which the columns are added to df.
_derivadas = {
    # — Engagement ratios (intent / exposure); the +1 keeps the ratio defined at 0 views —
    'ratio_cart_horse':  df['horses_added_to_cart'] / (df['horses_viewed'] + 1),
    'ratio_cart_prods':  df['products_added_to_cart'] / (df['products_viewed'] + 1),
    'ratio_cart_global': df['total_cart_adds'] / (df['total_views'] + 1),
    # — Aspirational price (max viewed price relative to the average viewed price) —
    'precio_aspiracional_horse': df['max_horse_price_viewed'] / (df['avg_horse_price_viewed'] + 1),
    'precio_aspiracional_prods': df['max_product_price_viewed'] / (df['avg_product_price_viewed'] + 1),
    # — Price range (focus vs exploration) —
    'rango_precio_horse': df['max_horse_price_viewed'] - df['min_horse_price_viewed'],
    # — Prestige gap between viewed horses and viewed products —
    'prestige_gap': df['avg_prestige_score_horses'] - df['avg_prestige_score_products'],
    # — Share of horse views over all views —
    'ratio_horse_views': df['horses_viewed'] / (df['total_views'] + 1),
}
for _nombre_col, _serie in _derivadas.items():
    df[_nombre_col] = _serie

nuevas = list(_derivadas)
print(f'Features nuevas: {nuevas}')
print(df[nuevas].describe().round(3))

### 1.2 · Encoding de categóricas

Las 6 columnas categóricas se encodean con **Target Encoding** calculado sobre train  
y aplicado a test — sin leakage.  

Para CatBoost las dejamos como strings porque CatBoost las maneja nativamente.

In [None]:
# Categorical columns that are target-encoded for the tree models and passed
# as raw strings to CatBoost.
CAT_COLS = [
    'gender_with_most_appearances',
    'breedFamily_with_most_appearances',
    'color_grouped_with_most_appearances',
    'most_viewed_category',
    'most_viewed_brand',
    'most_viewed_target_user',
]

# Split CAT_COLS into the columns present in the parquet and the ones missing,
# keeping the CAT_COLS order inside each list.
disponibles, faltantes = [], []
for _col in CAT_COLS:
    if _col in df.columns:
        disponibles.append(_col)
    else:
        faltantes.append(_col)
print('Disponibles: {}'.format(disponibles))
print('Faltantes:   {}'.format(faltantes))

In [None]:
# Columns to drop (redundant or without signal); only those actually present
# in df are kept, so the drop below cannot fail on a missing column.
DROP_COLS = [
    col for col in (
        'has_both_interests',  # redundant with has_registry_viewed + has_shipping_viewed
        'total_views',         # correlation > 0.86 with horses_viewed + products_viewed
        'avg_height',          # describes the horse, not the user's behaviour
        'avg_weight',          # same as avg_height
    )
    if col in df.columns
]

df_model = df.drop(columns=DROP_COLS)
print('Shape después de drops: ' + str(df_model.shape))
print('Columnas dropeadas: ' + str(DROP_COLS))

In [None]:
# Stratified split: features are every column except the two targets.
X = df_model.drop(columns=['horse_target', 'prods_target'])
y = df_model[['horse_target', 'prods_target']]

# 80/20 split, stratified on both target columns together, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
print(f'X_train: {X_train.shape} | X_test: {X_test.shape}')

# Target encoding is fitted on train and only applied to test.
# horse_target is the reference target for the encoding; it is mapped to 0/1/2 and
# treated as a continuous value. The same encoded columns are later used by the
# prods models as well.
y_train_enc = y_train['horse_target'].map({'Lead Bronce': 0, 'Lead Plata': 1, 'Lead Oro': 2})

te = TargetEncoder(target_type='continuous', random_state=42)
# Overwrites the categorical columns of X_train / X_test in place with their encodings.
# NOTE(review): presumably fails if `disponibles` is empty — confirm the parquet always
# carries at least one of CAT_COLS.
X_train[disponibles] = te.fit_transform(X_train[disponibles], y_train_enc)
X_test[disponibles]  = te.transform(X_test[disponibles])

print(f'Target encoding aplicado a: {disponibles}')
print(f'Features totales: {X_train.shape[1]}')
print(f'Features: {list(X_train.columns)}')

### 1.3 · Targets binarios y balanceo

In [None]:
# Step 1 (P1): Bronce (0) vs Plata/Oro (1), for both targets, on train and test.
y_train_p1_horse = (y_train['horse_target'] != 'Lead Bronce').astype(int)
y_train_p1_prods = (y_train['prods_target'] != 'Lead Bronce').astype(int)
y_test_p1_horse  = (y_test['horse_target']  != 'Lead Bronce').astype(int)
y_test_p1_prods  = (y_test['prods_target']  != 'Lead Bronce').astype(int)

# SMOTE-balanced copies of the full training set for P1; the same sampler object
# (fixed seed) is reused for both targets.
smote_p1 = SMOTE(random_state=42)
X_p1h_bal, y_p1h_bal = smote_p1.fit_resample(X_train, y_train_p1_horse)
X_p1p_bal, y_p1p_bal = smote_p1.fit_resample(X_train, y_train_p1_prods)

# Step 2 (P2): Plata (0) vs Oro (1), restricted to the training rows that are not Bronce.
mask_p2_horse = y_train['horse_target'] != 'Lead Bronce'
mask_p2_prods = y_train['prods_target'] != 'Lead Bronce'
X_p2h_raw = X_train[mask_p2_horse]
X_p2p_raw = X_train[mask_p2_prods]
y_p2h_raw = (y_train['horse_target'][mask_p2_horse] == 'Lead Oro').astype(int)
y_p2p_raw = (y_train['prods_target'][mask_p2_prods] == 'Lead Oro').astype(int)

# SMOTE-balanced copies of the P2 subsets.
smote_p2 = SMOTE(random_state=42)
X_p2h_bal, y_p2h_bal = smote_p2.fit_resample(X_p2h_raw, y_p2h_raw)
X_p2p_bal, y_p2p_bal = smote_p2.fit_resample(X_p2p_raw, y_p2p_raw)

# P2 evaluation subset of test: rows whose TRUE label is not Bronce.
mask_test_p2_horse = y_test['horse_target'] != 'Lead Bronce'
mask_test_p2_prods = y_test['prods_target'] != 'Lead Bronce'
y_test_p2_horse = (y_test['horse_target'][mask_test_p2_horse] == 'Lead Oro').astype(int)
y_test_p2_prods = (y_test['prods_target'][mask_test_p2_prods] == 'Lead Oro').astype(int)

# Negatives / positives ratio per problem, computed on the unbalanced training labels;
# passed to the models as scale_pos_weight.
spw_p1h = (y_train_p1_horse==0).sum() / (y_train_p1_horse==1).sum()
spw_p1p = (y_train_p1_prods==0).sum() / (y_train_p1_prods==1).sum()
spw_p2h = (y_p2h_raw==0).sum() / (y_p2h_raw==1).sum()
spw_p2p = (y_p2p_raw==0).sum() / (y_p2p_raw==1).sum()

# Only the horse class counts after SMOTE are printed.
print(f'P1 bal: {y_p1h_bal.value_counts().to_dict()}')
print(f'P2 bal: {y_p2h_bal.value_counts().to_dict()}')
print(f'SPW p2 horse: {spw_p2h:.2f} | SPW p2 prods: {spw_p2p:.2f}')

## 2 · Funciones de evaluación y logging

In [None]:
def predecir_cascada(X, m1, m2):
    """Label every row of X with the two-step cascade.

    Rows that m1 does not predict as 1 stay 'Lead Bronce'. The remaining rows are
    sent to m2 and become 'Lead Oro' where m2 predicts 1 and 'Lead Plata' otherwise.

    Returns an object-dtype numpy array with one label per row of X.
    """
    etiquetas = np.full(len(X), 'Lead Bronce', dtype=object)
    pasa_p1 = m1.predict(X) == 1
    if not pasa_p1.any():
        # Nothing got past step 1: m2 is never called.
        return etiquetas
    es_oro = m2.predict(X[pasa_p1]) == 1
    etiquetas[pasa_p1] = np.where(es_oro, 'Lead Oro', 'Lead Plata')
    return etiquetas

def predecir_cascada_proba(X, m1, m2):
    """Return, per row of X, the cascade's probability of being Oro.

    It is the product of both models' positive-class probabilities on the full X:
    P(passes step 1) * P(Oro | passed step 1).
    """
    p_pasa_p1, p_oro_p2 = (modelo.predict_proba(X)[:, 1] for modelo in (m1, m2))
    return p_pasa_p1 * p_oro_p2

def calcular_metricas(p1h, p1p, p2h, p2p):
    """Score a full cascade (two step-1 and two step-2 models) on the test split.

    Reads the notebook-level test/train objects (X_test, y_test, the P2 masks and the
    raw P2 training subsets). For each target ('horse', 'prods') it returns a dict with:
      - f2_macro / f2_lead_oro: F2 of the cascade's 3-class predictions on X_test
      - f2_paso2_train / f2_paso2_test: binary F2 of the step-2 model alone
      - overfit_gap_p2: train minus test F2 of the step-2 model
      - y_pred: the cascade's predicted labels for X_test
    """
    casos = {
        'horse': (p1h, p2h, mask_test_p2_horse, y_test_p2_horse, X_p2h_raw, y_p2h_raw),
        'prods': (p1p, p2p, mask_test_p2_prods, y_test_p2_prods, X_p2p_raw, y_p2p_raw),
    }
    metricas = {}
    for target, (m1, m2, mask_te_p2, y_te_p2, X_tr_p2, y_tr_p2) in casos.items():
        y_true = y_test[f'{target}_target']
        y_pred = predecir_cascada(X_test, m1, m2)

        # Step-2 model on its own, at its default decision threshold.
        f2_p2_tr = fbeta_score(y_tr_p2, m2.predict(X_tr_p2), beta=2)
        f2_p2_te = fbeta_score(y_te_p2, m2.predict(X_test[mask_te_p2]), beta=2)

        metricas[target] = {
            'f2_macro': fbeta_score(y_true, y_pred, beta=2, average='macro', labels=orden_leads),
            'f2_lead_oro': fbeta_score(y_true, y_pred, beta=2, labels=['Lead Oro'], average='macro'),
            'f2_paso2_train': f2_p2_tr,
            'f2_paso2_test': f2_p2_te,
            'overfit_gap_p2': f2_p2_tr - f2_p2_te,
            'y_pred': y_pred,
        }
    return metricas

def loguear_run(run_name, model_type, params_p1, params_p2,
                p1h, p1p, p2h, p2p, metricas, extra_metrics=None):
    """Log one cascade (params, metrics and the four pickled models) as an MLflow run.

    Args:
        run_name: name of the MLflow run.
        model_type: value stored in the 'model_type' tag.
        params_p1 / params_p2: dicts logged as params with 'p1_' / 'p2_' prefixes.
        p1h, p1p, p2h, p2p: fitted models, pickled and uploaded under the artifact
            paths 'p1_horse', 'p1_prods', 'p2_horse' and 'p2_prods'.
        metricas: {target: {metric: value}}; every entry except 'y_pred' is logged
            as the metric '<metric>_<target>'.
        extra_metrics: optional dict of additional metrics logged as-is.

    Returns:
        The run_id of the created run.
    """
    modelos = {'p1_horse': p1h, 'p1_prods': p1p, 'p2_horse': p2h, 'p2_prods': p2p}

    with mlflow.start_run(run_name=run_name):
        mlflow.set_tag('model_type', model_type)
        mlflow.set_tag('version', 'v2')
        for k, v in params_p1.items():
            mlflow.log_param(f'p1_{k}', v)
        for k, v in params_p2.items():
            mlflow.log_param(f'p2_{k}', v)
        for target, m in metricas.items():
            for metric, value in m.items():
                if metric != 'y_pred':
                    mlflow.log_metric(f'{metric}_{target}', value)
        if extra_metrics:
            for k, v in extra_metrics.items():
                mlflow.log_metric(k, v)

        # Models are stored as pickles. Each file is fully written and CLOSED before
        # being uploaded, so the artifact can never be a partially flushed pickle.
        # The temporary directory is removed even if pickling or the upload raises.
        # Each artifact is named '<nombre>.pkl' instead of a random temp name.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for nombre, modelo in modelos.items():
                ruta = os.path.join(tmp_dir, f'{nombre}.pkl')
                with open(ruta, 'wb') as fh:
                    pickle.dump(modelo, fh)
                mlflow.log_artifact(ruta, nombre)

        run_id = mlflow.active_run().info.run_id
        print(f'\nRun logueado: {run_name} | run_id: {run_id}')
    return run_id

def check_overfitting(configs, nombre):
    """Print a train-vs-test F2 table for a list of fitted binary models.

    Args:
        configs: iterable of (label, model, X_train, y_train, X_test, y_test).
        nombre: title shown in the table header.

    A row is flagged 'overfit' when train F2 exceeds test F2 by more than 0.10.
    """
    linea = '─' * 60
    print(f'\n  Chequeo Overfitting — {nombre}')
    print(linea)
    print('{:<18} {:>10} {:>10} {:>8}  Status'.format('Paso', 'F2 Train', 'F2 Test', 'Gap'))
    print(linea)
    for label, model, X_tr, y_tr, X_te, y_te in configs:
        f2_tr, f2_te = (
            fbeta_score(y_part, model.predict(X_part), beta=2)
            for X_part, y_part in ((X_tr, y_tr), (X_te, y_te))
        )
        gap = f2_tr - f2_te
        if gap > 0.10:
            status = 'overfit'
        else:
            status = 'OK'
        print(f'{label:<18} {f2_tr:>10.4f} {f2_te:>10.4f} {gap:>8.4f}  {status}')

## 3 · Threshold Tuning sobre el campeón

El XGB_Baseline usa threshold=0.50 por defecto.  
Con 620 Lead Oro vs 11343 Lead Plata, muchos Oros reales tienen probabilidad 0.30-0.49  
y quedan clasificados como Plata. Buscamos el umbral que maximiza F2 Lead Oro en test.

Como F2 le da al recall el doble de peso que a la precisión (un falso negativo cuesta más que un falso positivo), bajar el umbral  
aumenta el recall (recuperamos más Oros reales) a costa de algo de precisión  
(algunos Plata se clasifican como Oro).   
  
Dado el peso de la métrica, ese tradeoff conviene.

In [None]:
# run_id of the v1 champion (XGB_Baseline) on DagsHub, kept for traceability.
RUN_ID_CAMPEON_V1 = '74f1707c89964d69866d337f07624075'  # ← XGB_Baseline v1

# The v1 champion is deliberately NOT downloaded. The previous expression here was a
# chain of `... if False else ...` that always evaluated to None, so this explicit
# assignment keeps the same value without the unreachable download/unpickle code.
xgb_champ_p1h = None

# Simpler alternative: retrain the champion's configuration on the new feature set
# (with the new features, the stored model would not be directly comparable).
print(' El campeón de v1 fue entrenado con 29 features.')
print(f'   Este dataset tiene {X_train.shape[1]} features (incluye nuevas + categóricas).')
print('   Threshold tuning se hace sobre modelos reentrenados con el nuevo feature set.')

In [None]:
# Retrain XGBoost with the best v1 hyperparameters on the new feature set.
from xgboost import XGBClassifier

xgb_params_base = {
    'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.05,
    'subsample': 0.8, 'colsample_bytree': 0.8,
    'eval_metric': 'aucpr', 'tree_method': 'hist',
    'random_state': 42, 'n_jobs': -1,
}

# Four reference models (P1/P2 x horse/prods), all trained on the unbalanced data
# with the class imbalance handled through scale_pos_weight.
xgb_ref_p1h, xgb_ref_p1p, xgb_ref_p2h, xgb_ref_p2p = (
    XGBClassifier(**xgb_params_base, scale_pos_weight=peso).fit(X_fit, y_fit)
    for X_fit, y_fit, peso in (
        (X_train,   y_train_p1_horse, spw_p1h),
        (X_train,   y_train_p1_prods, spw_p1p),
        (X_p2h_raw, y_p2h_raw,        spw_p2h),
        (X_p2p_raw, y_p2p_raw,        spw_p2p),
    )
)

print('Modelos de referencia entrenados con nuevo feature set.')

In [None]:
def threshold_tuning(m1, m2, X_te, y_true, target_name, thresholds=None):
    """Find the step-2 probability threshold that maximises F2 for 'Lead Oro'.

    Step 1 always uses a fixed 0.5 cut on m1's positive-class probability. Only the
    cut applied to m2's probability (Oro vs Plata) is swept.

    Args:
        m1, m2: fitted step-1 / step-2 classifiers exposing predict_proba.
        X_te: feature matrix to evaluate on.
        y_true: true 3-class labels aligned with X_te.
        target_name: label used in the plot title and the printed summary.
        thresholds: iterable of candidate thresholds; defaults to
            np.arange(0.10, 0.70, 0.02).

    Returns:
        (best_threshold, df_res), where df_res has one row per threshold with the
        columns 'threshold', 'f2_oro' and 'f2_macro'.

    Raises:
        ValueError: if `thresholds` is empty.

    Note: the score reported for the chosen threshold is optimistic when X_te is
    also the set used for the final evaluation; pass a validation split to avoid that.
    """
    if thresholds is None:
        thresholds = np.arange(0.10, 0.70, 0.02)
    if len(thresholds) == 0:
        raise ValueError('thresholds must contain at least one candidate value')

    # Neither model's probabilities depend on the swept threshold, so both are
    # computed once here instead of once per candidate.
    mask_p1 = m1.predict_proba(X_te)[:, 1] >= 0.5
    hay_p1 = mask_p1.sum() > 0
    proba_p2 = m2.predict_proba(X_te[mask_p1])[:, 1] if hay_p1 else None

    resultados = []
    for t in thresholds:
        # Apply the cascade with step-2 threshold t.
        pred = np.array(['Lead Bronce'] * len(X_te), dtype=object)
        if hay_p1:
            pred[mask_p1] = np.where(proba_p2 >= t, 'Lead Oro', 'Lead Plata')
        f2_oro   = fbeta_score(y_true, pred, beta=2, labels=['Lead Oro'], average='macro')
        f2_macro = fbeta_score(y_true, pred, beta=2, average='macro', labels=orden_leads)
        resultados.append({'threshold': t, 'f2_oro': f2_oro, 'f2_macro': f2_macro})

    df_res = pd.DataFrame(resultados)
    # idxmax returns the first maximum, i.e. the lowest threshold among ties.
    best = df_res.loc[df_res['f2_oro'].idxmax()]

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(df_res['threshold'], df_res['f2_oro'],   label='F2 Lead Oro', color='#C44E52', lw=2)
    ax.plot(df_res['threshold'], df_res['f2_macro'], label='F2 macro',    color='#4C72B0', lw=2, ls='--')
    ax.axvline(best['threshold'], color='gray', ls=':', label=f'Óptimo: {best["threshold"]:.2f}')
    ax.set_xlabel('Threshold P2'); ax.set_ylabel('F2')
    ax.set_title(f'Threshold Tuning — {target_name}')
    ax.legend(); ax.grid(alpha=0.3)
    plt.tight_layout(); plt.show()

    print(f'{target_name}: threshold óptimo = {best["threshold"]:.2f} | '
          f'F2 Oro = {best["f2_oro"]:.4f} | F2 macro = {best["f2_macro"]:.4f}')
    return float(best['threshold']), df_res

# Sweep the step-2 threshold of the XGB reference cascade, once per target.
print('── Threshold tuning — horse ──')
thresh_horse, df_thresh_h = threshold_tuning(
    m1=xgb_ref_p1h, m2=xgb_ref_p2h,
    X_te=X_test, y_true=y_test['horse_target'],
    target_name='horse',
)
print('\n── Threshold tuning — prods ──')
thresh_prods, df_thresh_p = threshold_tuning(
    m1=xgb_ref_p1p, m2=xgb_ref_p2p,
    X_te=X_test, y_true=y_test['prods_target'],
    target_name='prods',
)

In [None]:
# Evaluate the cascade with the tuned thresholds.
def calcular_metricas_con_threshold(p1h, p1p, p2h, p2p, th_horse, th_prods):
    """Score a cascade on the test split using a custom step-2 threshold per target.

    Step 1 keeps a fixed 0.5 probability cut; step 2 labels a row 'Lead Oro' when
    its probability is >= the target's threshold. Reads the notebook-level test/train
    objects. The returned per-target dict has the same keys as calcular_metricas plus
    'threshold'. The f2_paso2_* entries come from m2.predict, i.e. the step-2 model's
    default decision rule, not the tuned threshold.
    """
    casos = {
        'horse': (p1h, p2h, th_horse, mask_test_p2_horse, y_test_p2_horse, X_p2h_raw, y_p2h_raw),
        'prods': (p1p, p2p, th_prods, mask_test_p2_prods, y_test_p2_prods, X_p2p_raw, y_p2p_raw),
    }
    metricas = {}
    for target, (m1, m2, th, mask_te_p2, y_te_p2, X_tr_p2, y_tr_p2) in casos.items():
        y_true = y_test[f'{target}_target']

        # Cascade predictions with threshold `th` on step 2.
        pred = np.full(len(X_test), 'Lead Bronce', dtype=object)
        pasa_p1 = m1.predict_proba(X_test)[:, 1] >= 0.5
        if pasa_p1.any():
            p_oro = m2.predict_proba(X_test[pasa_p1])[:, 1]
            pred[pasa_p1] = np.where(p_oro >= th, 'Lead Oro', 'Lead Plata')

        f2_p2_tr = fbeta_score(y_tr_p2, m2.predict(X_tr_p2), beta=2)
        f2_p2_te = fbeta_score(y_te_p2, m2.predict(X_test[mask_te_p2]), beta=2)

        metricas[target] = {
            'f2_macro': fbeta_score(y_true, pred, beta=2, average='macro', labels=orden_leads),
            'f2_lead_oro': fbeta_score(y_true, pred, beta=2, labels=['Lead Oro'], average='macro'),
            'f2_paso2_train': f2_p2_tr,
            'f2_paso2_test': f2_p2_te,
            'overfit_gap_p2': f2_p2_tr - f2_p2_te,
            'threshold': th,
            'y_pred': pred,
        }
    return metricas

# XGB reference cascade scored at its tuned thresholds.
metricas_xgb_thresh = calcular_metricas_con_threshold(
    p1h=xgb_ref_p1h, p1p=xgb_ref_p1p,
    p2h=xgb_ref_p2h, p2p=xgb_ref_p2p,
    th_horse=thresh_horse, th_prods=thresh_prods,
)
for t, m in metricas_xgb_thresh.items():
    print('{}: F2 macro={:.4f} | F2 Oro={:.4f} | Gap={:.4f} | threshold={:.2f}'.format(
        t, m['f2_macro'], m['f2_lead_oro'], m['overfit_gap_p2'], m['threshold']))

In [None]:
# Log the threshold-tuned XGB reference cascade as run 'v2_XGB_ThresholdTuning'.
# Only the horse scale_pos_weight values (spw_p1h / spw_p2h) are logged as params;
# the prods models were fitted with spw_p1p / spw_p2p, which are not recorded here.
# Both tuned thresholds are stored among the step-2 params.
run_id_xgb_thresh = loguear_run(
    run_name   = 'v2_XGB_ThresholdTuning',
    model_type = 'XGBoost',
    params_p1  = {**xgb_params_base, 'scale_pos_weight': spw_p1h},
    params_p2  = {**xgb_params_base, 'scale_pos_weight': spw_p2h,
                  'threshold_horse': thresh_horse, 'threshold_prods': thresh_prods},
    p1h=xgb_ref_p1h, p1p=xgb_ref_p1p,
    p2h=xgb_ref_p2h, p2p=xgb_ref_p2p,
    metricas=metricas_xgb_thresh
)

## 4 · LightGBM con Optuna

LightGBM suele ser más rápido que XGBoost en CPU: combina splits sobre histogramas con un  
crecimiento del árbol *leaf-wise* (XGBoost crece *level-wise* por defecto). Esto permite explorar un espacio de  
hiperparámetros más grande en el mismo tiempo.  

Optuna usa optimización bayesiana (TPE sampler) para dirigir la búsqueda hacia  
las regiones más prometedoras — mucho más eficiente que RandomizedSearch.

In [None]:
def crear_objetivo_lgbm(X_tr, y_tr, spw, use_smote=False):
    """Build the Optuna objective that tunes an LGBMClassifier by 5-fold CV F2.

    Args:
        X_tr, y_tr: training data used for cross-validation.
        spw: value passed as scale_pos_weight.
        use_smote: when True, SMOTE runs inside each CV fold (imblearn pipeline), so
            validation folds contain only real rows.

    Returns:
        A function `objective(trial) -> float` returning the mean F2 across cv5 folds.

    NOTE(review): scale_pos_weight is applied even when use_smote=True, i.e. on folds
    that SMOTE has already rebalanced — confirm this double correction is intended.
    """
    def objective(trial):
        params = {
            'n_estimators':    trial.suggest_int('n_estimators', 100, 600),
            'max_depth':       trial.suggest_int('max_depth', 3, 8),
            'learning_rate':   trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            'num_leaves':      trial.suggest_int('num_leaves', 15, 63),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
            'subsample':       trial.suggest_float('subsample', 0.6, 1.0),
            # LightGBM ignores `subsample` while subsample_freq is 0 (its default), so
            # without this the dimension above would be tuned on pure noise. It is
            # "suggested" over a single value so that it is stored in study.best_params
            # and reaches any model built from them.
            'subsample_freq':  trial.suggest_int('subsample_freq', 1, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha':       trial.suggest_float('reg_alpha', 0, 2.0),
            'reg_lambda':      trial.suggest_float('reg_lambda', 0, 5.0),
            'scale_pos_weight': spw,
            'random_state': 42, 'n_jobs': -1, 'verbose': -1,
        }
        estimador = LGBMClassifier(**params)
        if use_smote:
            # Resampling inside the pipeline keeps synthetic rows out of validation folds.
            estimador = ImbPipeline([
                ('smote', SMOTE(random_state=42)),
                ('lgbm', estimador),
            ])
        scores = cross_val_score(estimador, X_tr, y_tr, cv=cv5,
                                 scoring=f2_scorer, n_jobs=1)
        return scores.mean()
    return objective

N_TRIALS = 30

# Every study gets a seeded TPE sampler so the search is reproducible across
# Restart & Run All.
#
# Step 1 is cross-validated on the ORIGINAL training rows with SMOTE applied inside
# each fold (use_smote=True). Cross-validating the pre-balanced X_p1*_bal sets would
# put synthetic rows, interpolated from training-fold neighbours, into the validation
# folds and inflate the CV F2.

print('── Optuna LightGBM — Paso 1 horse ──')
study_p1h = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
study_p1h.optimize(crear_objetivo_lgbm(X_train, y_train_p1_horse, spw_p1h, use_smote=True), n_trials=N_TRIALS)
print(f'Mejor CV F2: {study_p1h.best_value:.4f} | Params: {study_p1h.best_params}')

print('\n── Optuna LightGBM — Paso 1 prods ──')
study_p1p = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
study_p1p.optimize(crear_objetivo_lgbm(X_train, y_train_p1_prods, spw_p1p, use_smote=True), n_trials=N_TRIALS)
print(f'Mejor CV F2: {study_p1p.best_value:.4f} | Params: {study_p1p.best_params}')

print('\n── Optuna LightGBM — Paso 2 horse (con SMOTE en CV) ──')
study_p2h = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
study_p2h.optimize(crear_objetivo_lgbm(X_p2h_raw, y_p2h_raw, spw_p2h, use_smote=True), n_trials=N_TRIALS)
print(f'Mejor CV F2: {study_p2h.best_value:.4f} | Params: {study_p2h.best_params}')

print('\n── Optuna LightGBM — Paso 2 prods (con SMOTE en CV) ──')
study_p2p = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
study_p2p.optimize(crear_objetivo_lgbm(X_p2p_raw, y_p2p_raw, spw_p2p, use_smote=True), n_trials=N_TRIALS)
print(f'Mejor CV F2: {study_p2p.best_value:.4f} | Params: {study_p2p.best_params}')

In [None]:
# Train the final models with the best params on the complete training data.
def entrenar_lgbm_final(study, X_tr, y_tr, spw, use_smote=False):
    """Fit an LGBMClassifier with a study's best params.

    The tuned params are completed with the fixed settings (scale_pos_weight, seed,
    n_jobs, verbosity). With use_smote=True the data is first balanced with
    SMOTE(random_state=42); otherwise it is used as given. Returns the fitted model.
    """
    fijos = {'scale_pos_weight': spw, 'random_state': 42, 'n_jobs': -1, 'verbose': -1}
    model = LGBMClassifier(**{**study.best_params, **fijos})

    X_fit, y_fit = X_tr, y_tr
    if use_smote:
        X_fit, y_fit = SMOTE(random_state=42).fit_resample(X_tr, y_tr)

    model.fit(X_fit, y_fit)
    return model

# Step-1 models are fitted on the pre-balanced training sets.
lgbm_p1h, lgbm_p1p = (
    entrenar_lgbm_final(estudio, X_bal, y_bal, peso)
    for estudio, X_bal, y_bal, peso in (
        (study_p1h, X_p1h_bal, y_p1h_bal, spw_p1h),
        (study_p1p, X_p1p_bal, y_p1p_bal, spw_p1p),
    )
)
# Step-2 models receive the raw subsets and are balanced inside the helper.
lgbm_p2h, lgbm_p2p = (
    entrenar_lgbm_final(estudio, X_raw, y_raw, peso, use_smote=True)
    for estudio, X_raw, y_raw, peso in (
        (study_p2h, X_p2h_raw, y_p2h_raw, spw_p2h),
        (study_p2p, X_p2p_raw, y_p2p_raw, spw_p2p),
    )
)

metricas_lgbm = calcular_metricas(p1h=lgbm_p1h, p1p=lgbm_p1p, p2h=lgbm_p2h, p2p=lgbm_p2p)
for t, m in metricas_lgbm.items():
    print('{}: F2 macro={:.4f} | F2 Oro={:.4f} | Gap={:.4f}'.format(
        t, m['f2_macro'], m['f2_lead_oro'], m['overfit_gap_p2']))

In [None]:
# Threshold tuning on the LightGBM cascade; the per-threshold tables are discarded.
print('── Threshold tuning LGBM — horse ──')
thresh_lgbm_horse, _ = threshold_tuning(
    m1=lgbm_p1h, m2=lgbm_p2h,
    X_te=X_test, y_true=y_test['horse_target'],
    target_name='LGBM horse',
)
print('\n── Threshold tuning LGBM — prods ──')
thresh_lgbm_prods, _ = threshold_tuning(
    m1=lgbm_p1p, m2=lgbm_p2p,
    X_te=X_test, y_true=y_test['prods_target'],
    target_name='LGBM prods',
)

metricas_lgbm_thresh = calcular_metricas_con_threshold(
    p1h=lgbm_p1h, p1p=lgbm_p1p,
    p2h=lgbm_p2h, p2p=lgbm_p2p,
    th_horse=thresh_lgbm_horse, th_prods=thresh_lgbm_prods,
)
for t, m in metricas_lgbm_thresh.items():
    print('{}: F2 macro={:.4f} | F2 Oro={:.4f} | Gap={:.4f} | threshold={:.2f}'.format(
        t, m['f2_macro'], m['f2_lead_oro'], m['overfit_gap_p2'], m['threshold']))

In [None]:
# Log the threshold-tuned LightGBM cascade as run 'v2_LightGBM_Optuna'.
# Only the horse studies' best params (study_p1h / study_p2h) and horse
# scale_pos_weight values are logged; the prods models were built from
# study_p1p / study_p2p, whose params are not recorded here.
run_id_lgbm = loguear_run(
    run_name   = 'v2_LightGBM_Optuna',
    model_type = 'LightGBM',
    params_p1  = {**study_p1h.best_params, 'scale_pos_weight': spw_p1h},
    params_p2  = {**study_p2h.best_params, 'scale_pos_weight': spw_p2h,
                  'threshold_horse': thresh_lgbm_horse, 'threshold_prods': thresh_lgbm_prods},
    p1h=lgbm_p1h, p1p=lgbm_p1p,
    p2h=lgbm_p2h, p2p=lgbm_p2p,
    metricas=metricas_lgbm_thresh,
    # The number of Optuna trials is stored as a metric.
    extra_metrics={'optuna_trials': N_TRIALS}
)

## 5 · CatBoost con categóricas nativas

CatBoost no necesita encodear las categóricas — las procesa internamente con  
ordered target statistics, que es más robusto que el Target Encoding manual.  
Para aprovechar esto, usamos un dataset separado donde las categóricas son strings.

In [None]:
# CatBoost dataset: categorical columns stay as raw strings (no target encoding).
df_cat = df.drop(columns=[*DROP_COLS, 'horse_target', 'prods_target'])

# Carry over the ratio features from df, one column at a time.
for _feat in (
    'ratio_cart_horse', 'ratio_cart_prods', 'ratio_cart_global',
    'precio_aspiracional_horse', 'precio_aspiracional_prods',
    'rango_precio_horse', 'prestige_gap', 'ratio_horse_views',
):
    df_cat[_feat] = df[_feat]

y_cat = df[['horse_target', 'prods_target']]

# Same split settings (stratification, test size, seed) as the target-encoded dataset.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    df_cat, y_cat, stratify=y_cat, test_size=0.2, random_state=42
)

# Positional indices of the categorical columns, which is what CatBoost is given.
_cols_cat = list(df_cat.columns)
cat_features_idx = [_cols_cat.index(c) for c in disponibles if c in _cols_cat]
print('Columnas categóricas para CatBoost: {}'.format([_cols_cat[i] for i in cat_features_idx]))
print('Índices: {}'.format(cat_features_idx))

In [None]:
# Binary targets for CatBoost, built the same way as for the other models.
# Step 1: Bronce (0) vs Plata/Oro (1).
yc_train_p1_horse = (yc_train['horse_target'] != 'Lead Bronce').astype(int)
yc_train_p1_prods = (yc_train['prods_target'] != 'Lead Bronce').astype(int)
yc_test_p1_horse  = (yc_test['horse_target']  != 'Lead Bronce').astype(int)
yc_test_p1_prods  = (yc_test['prods_target']  != 'Lead Bronce').astype(int)

# No SMOTE here: the categorical columns are raw strings, so the imbalance is
# handled through model weighting instead.
# Step 2: Plata (0) vs Oro (1) on the training rows that are not Bronce.
mask_p2_cat_horse = yc_train['horse_target'] != 'Lead Bronce'
mask_p2_cat_prods = yc_train['prods_target'] != 'Lead Bronce'
Xc_p2h = Xc_train[mask_p2_cat_horse]
Xc_p2p = Xc_train[mask_p2_cat_prods]
yc_p2h = (yc_train['horse_target'][mask_p2_cat_horse] == 'Lead Oro').astype(int)
yc_p2p = (yc_train['prods_target'][mask_p2_cat_prods] == 'Lead Oro').astype(int)

# Test rows whose true label is not Bronce (evaluation subset for step 2).
mask_test_cat_horse = yc_test['horse_target'] != 'Lead Bronce'
mask_test_cat_prods = yc_test['prods_target'] != 'Lead Bronce'

print('Targets CatBoost listos.')

In [None]:
def crear_objetivo_catboost(X_tr, y_tr, spw, cat_idx):
    """Build the Optuna objective that tunes a CatBoostClassifier by 5-fold CV F2.

    Args:
        X_tr, y_tr: training data used for cross-validation.
        spw: value passed as scale_pos_weight.
        cat_idx: positional indices of the categorical columns in X_tr.

    Returns:
        A function `objective(trial) -> float` returning the mean F2 across cv5 folds.
    """
    # Settings that do not change between trials.
    fijos = {
        'scale_pos_weight': spw,
        'cat_features': cat_idx,
        'random_seed': 42, 'verbose': 0, 'thread_count': -1,
        'eval_metric': 'F1',
    }

    def objective(trial):
        buscados = {
            'iterations': trial.suggest_int('iterations', 100, 500),
            'depth': trial.suggest_int('depth', 3, 8),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        }
        clasificador = CatBoostClassifier(**buscados, **fijos)
        return cross_val_score(clasificador, X_tr, y_tr, cv=cv5,
                               scoring=f2_scorer, n_jobs=1).mean()
    return objective

# Each study uses a seeded TPE sampler so the hyperparameter search is reproducible
# across Restart & Run All. CatBoost itself is already seeded inside the objective.

print('── Optuna CatBoost — Paso 1 horse ──')
study_cat_p1h = optuna.create_study(direction='maximize',
                                    sampler=optuna.samplers.TPESampler(seed=42))
study_cat_p1h.optimize(crear_objetivo_catboost(Xc_train, yc_train_p1_horse, spw_p1h, cat_features_idx), n_trials=N_TRIALS)
print(f'Mejor CV F2: {study_cat_p1h.best_value:.4f}')

print('\n── Optuna CatBoost — Paso 1 prods ──')
study_cat_p1p = optuna.create_study(direction='maximize',
                                    sampler=optuna.samplers.TPESampler(seed=42))
study_cat_p1p.optimize(crear_objetivo_catboost(Xc_train, yc_train_p1_prods, spw_p1p, cat_features_idx), n_trials=N_TRIALS)
print(f'Mejor CV F2: {study_cat_p1p.best_value:.4f}')

print('\n── Optuna CatBoost — Paso 2 horse ──')
study_cat_p2h = optuna.create_study(direction='maximize',
                                    sampler=optuna.samplers.TPESampler(seed=42))
study_cat_p2h.optimize(crear_objetivo_catboost(Xc_p2h, yc_p2h, spw_p2h, cat_features_idx), n_trials=N_TRIALS)
print(f'Mejor CV F2: {study_cat_p2h.best_value:.4f}')

print('\n── Optuna CatBoost — Paso 2 prods ──')
study_cat_p2p = optuna.create_study(direction='maximize',
                                    sampler=optuna.samplers.TPESampler(seed=42))
study_cat_p2p.optimize(crear_objetivo_catboost(Xc_p2p, yc_p2p, spw_p2p, cat_features_idx), n_trials=N_TRIALS)
print(f'Mejor CV F2: {study_cat_p2p.best_value:.4f}')

In [None]:
def entrenar_catboost_final(study, X_tr, y_tr, spw, cat_idx):
    """Fit a CatBoostClassifier on (X_tr, y_tr) with a study's best params.

    The tuned params are completed with the fixed settings: scale_pos_weight, the
    categorical column indices, seed, verbosity, thread count and eval metric.
    Returns the fitted model.
    """
    fijos = {
        'scale_pos_weight': spw,
        'cat_features': cat_idx,
        'random_seed': 42,
        'verbose': 0,
        'thread_count': -1,
        'eval_metric': 'F1',
    }
    model = CatBoostClassifier(**{**study.best_params, **fijos})
    model.fit(X_tr, y_tr)
    return model

# Final CatBoost models: step 1 on the full training split, step 2 on the
# non-Bronce subsets; all share the same categorical column indices.
cat_p1h, cat_p1p, cat_p2h, cat_p2p = (
    entrenar_catboost_final(estudio, X_fit, y_fit, peso, cat_features_idx)
    for estudio, X_fit, y_fit, peso in (
        (study_cat_p1h, Xc_train, yc_train_p1_horse, spw_p1h),
        (study_cat_p1p, Xc_train, yc_train_p1_prods, spw_p1p),
        (study_cat_p2h, Xc_p2h,   yc_p2h,            spw_p2h),
        (study_cat_p2p, Xc_p2p,   yc_p2p,            spw_p2p),
    )
)

# CatBoost is evaluated on Xc_test (categorical columns kept as strings).
def calcular_metricas_cat(p1h, p1p, p2h, p2p):
    """Score a CatBoost cascade on the CatBoost test split.

    Reads the notebook-level CatBoost objects (Xc_test, yc_test, the P2 masks and
    the P2 training subsets). Returns, per target, the same keys as
    calcular_metricas: f2_macro, f2_lead_oro, f2_paso2_train, f2_paso2_test,
    overfit_gap_p2 and y_pred.
    """
    casos = {
        'horse': (p1h, p2h, mask_test_cat_horse, Xc_p2h, yc_p2h),
        'prods': (p1p, p2p, mask_test_cat_prods, Xc_p2p, yc_p2p),
    }
    metricas = {}
    for target, (m1, m2, mask_te, Xp2_tr, yp2_tr) in casos.items():
        y_true = yc_test[f'{target}_target']
        # Step-2 evaluation subset: test rows whose true label is not Bronce.
        Xte_p2 = Xc_test[mask_te]
        yte_p2 = (y_true[mask_te] == 'Lead Oro').astype(int)

        y_pred = predecir_cascada(Xc_test, m1, m2)
        f2_p2_tr = fbeta_score(yp2_tr, m2.predict(Xp2_tr), beta=2)
        f2_p2_te = fbeta_score(yte_p2, m2.predict(Xte_p2), beta=2)

        metricas[target] = {
            'f2_macro': fbeta_score(y_true, y_pred, beta=2, average='macro', labels=orden_leads),
            'f2_lead_oro': fbeta_score(y_true, y_pred, beta=2, labels=['Lead Oro'], average='macro'),
            'f2_paso2_train': f2_p2_tr,
            'f2_paso2_test': f2_p2_te,
            'overfit_gap_p2': f2_p2_tr - f2_p2_te,
            'y_pred': y_pred,
        }
    return metricas

# CatBoost cascade scored with the default decision rule of both steps.
metricas_cat = calcular_metricas_cat(p1h=cat_p1h, p1p=cat_p1p, p2h=cat_p2h, p2p=cat_p2p)
for t, m in metricas_cat.items():
    print('{}: F2 macro={:.4f} | F2 Oro={:.4f} | Gap={:.4f}'.format(
        t, m['f2_macro'], m['f2_lead_oro'], m['overfit_gap_p2']))

In [None]:
# Threshold tuning for CatBoost
def threshold_tuning_cat(m1, m2, Xte, y_true, target_name):
    """Find the step-2 probability threshold that maximises F2 for 'Lead Oro'.

    Step 1 uses a fixed 0.5 cut on m1's positive-class probability; the cut on m2's
    probability is swept over np.arange(0.10, 0.70, 0.02).

    Args:
        m1, m2: fitted step-1 / step-2 classifiers exposing predict_proba.
        Xte: feature matrix to evaluate on.
        y_true: true 3-class labels aligned with Xte.
        target_name: label used in the printed summary.

    Returns:
        The best threshold as a float (the lowest one in case of ties).
    """
    thresholds = np.arange(0.10, 0.70, 0.02)

    # The models' probabilities do not depend on the swept threshold, so they are
    # computed once instead of once per candidate.
    mask_p1 = m1.predict_proba(Xte)[:, 1] >= 0.5
    hay_p1 = mask_p1.sum() > 0
    proba_p2 = m2.predict_proba(Xte[mask_p1])[:, 1] if hay_p1 else None

    resultados = []
    for t in thresholds:
        pred = np.array(['Lead Bronce'] * len(Xte), dtype=object)
        if hay_p1:
            pred[mask_p1] = np.where(proba_p2 >= t, 'Lead Oro', 'Lead Plata')
        f2_oro = fbeta_score(y_true, pred, beta=2, labels=['Lead Oro'], average='macro')
        resultados.append({'threshold': t, 'f2_oro': f2_oro})
    df_res = pd.DataFrame(resultados)
    best = df_res.loc[df_res['f2_oro'].idxmax()]
    print(f'{target_name}: óptimo={best["threshold"]:.2f} | F2 Oro={best["f2_oro"]:.4f}')
    return float(best['threshold'])

thresh_cat_horse = threshold_tuning_cat(cat_p1h, cat_p2h, Xc_test, yc_test['horse_target'], 'CatBoost horse')
thresh_cat_prods = threshold_tuning_cat(cat_p1p, cat_p2p, Xc_test, yc_test['prods_target'], 'CatBoost prods')

# Re-score the cascade at the tuned thresholds so that the metrics logged with this
# run match the thresholds logged in its params (as the XGB and LightGBM runs do).
# The step-2 train/test F2 and the gap are copied from metricas_cat unchanged.
metricas_cat_thresh = {}
for _target, _m1, _m2, _th in (
    ('horse', cat_p1h, cat_p2h, thresh_cat_horse),
    ('prods', cat_p1p, cat_p2p, thresh_cat_prods),
):
    _y_true = yc_test[f'{_target}_target']
    _pred = np.full(len(Xc_test), 'Lead Bronce', dtype=object)
    _pasa_p1 = _m1.predict_proba(Xc_test)[:, 1] >= 0.5
    if _pasa_p1.any():
        _p_oro = _m2.predict_proba(Xc_test[_pasa_p1])[:, 1]
        _pred[_pasa_p1] = np.where(_p_oro >= _th, 'Lead Oro', 'Lead Plata')
    metricas_cat_thresh[_target] = {
        **metricas_cat[_target],
        'f2_macro': fbeta_score(_y_true, _pred, beta=2, average='macro', labels=orden_leads),
        'f2_lead_oro': fbeta_score(_y_true, _pred, beta=2, labels=['Lead Oro'], average='macro'),
        'threshold': _th,
        'y_pred': _pred,
    }
    print(f'{_target}: F2 macro={metricas_cat_thresh[_target]["f2_macro"]:.4f} | '
          f'F2 Oro={metricas_cat_thresh[_target]["f2_lead_oro"]:.4f} | threshold={_th:.2f}')

# Only the horse studies' best params and horse scale_pos_weight values are logged.
run_id_cat = loguear_run(
    run_name   = 'v2_CatBoost_Optuna',
    model_type = 'CatBoost',
    params_p1  = {**study_cat_p1h.best_params, 'scale_pos_weight': spw_p1h},
    params_p2  = {**study_cat_p2h.best_params, 'scale_pos_weight': spw_p2h,
                  'threshold_horse': thresh_cat_horse, 'threshold_prods': thresh_cat_prods},
    p1h=cat_p1h, p1p=cat_p1p,
    p2h=cat_p2h, p2p=cat_p2p,
    metricas=metricas_cat_thresh,
    extra_metrics={'optuna_trials': N_TRIALS}
)

## 6 · Comparativa v1 vs v2 y selección del campeón

Comparamos todos los modelos de v2 entre sí y contra el campeón de v1.

In [None]:
# F2 Lead Oro of the v1 champion (XGB_Baseline); both targets are compared against it.
CAMPEON_V1_F2_ORO = 0.4538  # XGB_Baseline v1

# (display name, per-target metrics, run id). The XGB and LightGBM entries hold
# threshold-tuned metrics, while metricas_cat holds CatBoost at the default rule.
todos_v2 = [
    ('v2_XGB_Thresh',   metricas_xgb_thresh,   run_id_xgb_thresh),
    ('v2_LightGBM',     metricas_lgbm_thresh,  run_id_lgbm),
    ('v2_CatBoost',     metricas_cat,           run_id_cat),
]

print('{:<20} {:<8} {:>10} {:>10} {:>8}  vs v1'.format(
    'Modelo', 'Target', 'F2 macro', 'F2 Oro', 'Gap P2'))
print('═' * 72)
for nombre, res, _ in todos_v2:
    for target in ('horse', 'prods'):
        m = res[target]
        delta = m['f2_lead_oro'] - CAMPEON_V1_F2_ORO
        # Arrow shows the direction of the change against the v1 champion.
        arrow = '↑' if delta > 0 else '↓'
        fila = '{:<20} {:<8} {:>10.4f} {:>10.4f} {:>8.4f}  {}{:.4f}'.format(
            nombre, target, m['f2_macro'], m['f2_lead_oro'],
            m['overfit_gap_p2'], arrow, abs(delta))
        print(fila)
    print('─' * 72)

In [None]:
# Automatic selection of the new champion: the FINISHED run with the highest
# f2_lead_oro_horse in the experiment gets the tag status=champion.
client = mlflow.tracking.MlflowClient()
exp    = client.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
    raise RuntimeError(f'No existe el experimento {EXPERIMENT_NAME!r} en el tracking server.')

runs = client.search_runs(
    experiment_ids=[exp.experiment_id],
    filter_string='attributes.status = "FINISHED"',
    order_by=['metrics.f2_lead_oro_horse DESC'],
    max_results=20
)
if not runs:
    raise RuntimeError('No hay runs FINISHED en el experimento; no se puede elegir campeón.')

# The current champion is looked up only among the runs returned above.
# NOTE(review): a champion ranked below the top 20 would not be found (nor retired).
campeon_actual = next(
    (r for r in runs if r.data.tags.get('status') == 'champion'), None
)
mejor_run = runs[0]
nombre_mejor = mejor_run.data.tags.get('mlflow.runName', mejor_run.info.run_id[:8])
f2_mejor = mejor_run.data.metrics.get('f2_lead_oro_horse', 0)

# Built outside the f-strings: reusing the same quote character inside an f-string
# expression is a SyntaxError before Python 3.12.
separador = '═' * 60

if campeon_actual and campeon_actual.info.run_id == mejor_run.info.run_id:
    # The current champion is already the best run.
    print(separador)
    print(f'  CAMPEÓN VIGENTE: {nombre_mejor}')
    print(f'  Ningún modelo de v2 superó el campeón actual.')
    print(f'  F2 Lead Oro horse: {f2_mejor:.4f}')
    print(separador)
else:
    # There is a new champion: tag it and retire the previous one, if any.
    client.set_tag(mejor_run.info.run_id, 'status', 'champion')
    if campeon_actual:
        client.set_tag(campeon_actual.info.run_id, 'status', 'retired')
        nombre_anterior = campeon_actual.data.tags.get('mlflow.runName', campeon_actual.info.run_id[:8])
        f2_anterior = campeon_actual.data.metrics.get('f2_lead_oro_horse', 0)
        print(f'  Retirado: {nombre_anterior} (F2={f2_anterior:.4f})')
    print(separador)
    print(f'  NUEVO CAMPEÓN: {nombre_mejor}')
    print(f'  F2 Lead Oro horse: {f2_mejor:.4f}')
    if campeon_actual:
        print(f'  Mejora vs anterior: {f2_mejor - f2_anterior:+.4f}')
    print(separador)

# Full ranking. The tags shown are the ones fetched by search_runs, i.e. from BEFORE
# any set_tag call above. Rows use the same indent and column widths as the header.
print(f'\n  Ranking completo:')
print(f'  {"Run":<22} {"F2 Oro horse":>13} {"Gap P2":>8}  Status')
print(f'  {"─"*55}')
for r in runs:
    rname  = r.data.tags.get('mlflow.runName', r.info.run_id[:8])
    f2h    = r.data.metrics.get('f2_lead_oro_horse', float('nan'))
    gap    = r.data.metrics.get('overfit_gap_p2_horse', float('nan'))
    status = r.data.tags.get('status', '')
    print(f'  {rname:<22} {f2h:>13.4f} {gap:>8.4f}  {status}')