# Model Training Notebook
Actualizado para cumplir con los requisitos del proyecto — predicción de **retorno a 30 días** en criptos *low‑cap* (AI, Gaming, RWA, Meme).

Note: you may need to restart the kernel to use updated packages.
✅ Paquetes listos — reinicia el kernel si alguno se actualizó


## 1 · Carga de datos

In [3]:
import pandas as pd, pathlib, warnings
warnings.filterwarnings('ignore')

DATA_PATH = pathlib.Path('../../data/ml_dataset.csv')
assert DATA_PATH.exists(), f'No se encontró {DATA_PATH}'

# 1⃣  cargamos sin parse_dates para inspeccionar primero
df = pd.read_csv(DATA_PATH)
print("Columnas:", list(df.columns))
print("Shape   :", df.shape)

# 2⃣  si existe alguna columna temporal la convertimos:
for cand in ["date", "fecha", "timestamp"]:
    if cand in df.columns:
        df[cand] = pd.to_datetime(df[cand], errors="coerce")
        df = df.dropna(subset=[cand])          # opcional: descartar filas sin fecha
        break                                  # salimos tras la primera coincidencia

df.head()


Columnas: ['narrative', 'market_cap', 'volume', 'ret_7d', 'ret_30d', 'sma_7', 'sma_30', 'bb_width', 'vol_30d', 'cluster_id', 'future_ret_30d']
Shape   : (107, 11)


Unnamed: 0,narrative,market_cap,volume,ret_7d,ret_30d,sma_7,sma_30,bb_width,vol_30d,cluster_id,future_ret_30d
0,ai,6390.7,1.14,0.143164,0.318811,7e-06,6e-06,0.052517,0.06495,0,0.220198
1,ai,12580.77,4.97,-0.179978,0.154281,3.1e-05,2.9e-05,0.095881,0.063185,0,0.609897
2,ai,23651.0,6.53,-0.060415,0.247716,2.6e-05,2.1e-05,0.182911,0.086268,0,0.05154
3,ai,1443249.0,6.16,0.417425,0.701156,0.043002,0.03527,0.109074,0.079329,0,-0.445229
4,meme,107137.0,107.15,-0.154055,-0.388971,2.1e-05,2.8e-05,0.26715,0.099369,3,0.711866


## 2 · Features + target

In [4]:
target = 'future_ret_30d'
cat_cols = ['narrative','cluster_id']
num_cols = [c for c in df.columns if c not in cat_cols + ['id','date',target]]

print('Num:', len(num_cols), num_cols[:10])

Num: 8 ['market_cap', 'volume', 'ret_7d', 'ret_30d', 'sma_7', 'sma_30', 'bb_width', 'vol_30d']


## 3 · Split (train/val/test 60‑20‑20)

In [5]:
from sklearn.model_selection import train_test_split
X = df[cat_cols + num_cols]
y = df[target]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test , y_val, y_test  = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print(X_train.shape, X_val.shape, X_test.shape)

(64, 10) (21, 10) (22, 10)


## 4 · Pipeline de pre‑procesamiento

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

pre = ColumnTransformer([
    ('num', Pipeline([
        ('imp', SimpleImputer(strategy='median')),
        ('sc', StandardScaler())
    ]), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
])

## 5 · Baselines: Ridge & ElasticNet

In [7]:
from sklearn.linear_model import RidgeCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score

ridge_pipe = Pipeline([('prep',pre),('reg',RidgeCV(alphas=[0.1,1,10]))])
ridge_pipe.fit(X_train,y_train)
pred = ridge_pipe.predict(X_val)
print('Ridge  RMSE',mean_squared_error(y_val,pred, squared=False),'R²',r2_score(y_val,pred))

enet_pipe = Pipeline([('prep',pre),('reg',ElasticNetCV(alphas=[0.01,0.1,1],l1_ratio=[.1,.5,.9],max_iter=5000))])
enet_pipe.fit(X_train,y_train)
pred2 = enet_pipe.predict(X_val)
print('Elastic RMSE',mean_squared_error(y_val,pred2, squared=False),'R²',r2_score(y_val,pred2))

Ridge  RMSE 1.1565082859676061 R² -16.536970412285974
Elastic RMSE 0.35081911370014246 R² -0.6137028497569688


In [8]:
!pip install xgboost



## 6 · Optuna + XGBoost (GPU)

In [9]:
# %% [markdown]
# ## Optuna + XGBoost 2.x (con categorías nativas y early-stopping)

# %%
import optuna, xgboost as xgb, numpy as np, tqdm
from sklearn.metrics import mean_squared_error
import pandas as pd

# ------------ 1 ▸ preparar DMatrix con categorías -----------------------
cat_cols = ["narrative", "cluster_id"]        # ajusta si cambian
def to_dmatrix(X, y):
    X_ok = X.copy()
    for c in cat_cols:
        if c in X_ok.columns:
            X_ok[c] = X_ok[c].astype("category")   # -> pandas category
    return xgb.DMatrix(
        data=X_ok,
        label=y.values.astype(np.float32),
        enable_categorical=True,
        missing=np.nan,
    )

dtrain = to_dmatrix(X_train, y_train)
dval   = to_dmatrix(X_val,   y_val)

# ------------ 2 ▸ función objetivo Optuna ------------------------------
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "tree_method": "hist",   # GPU => hist + device=cuda
        "device": "cuda",
        "max_depth":        trial.suggest_int("max_depth", 4, 10),
        "learning_rate":    trial.suggest_float("lr", 1e-3, 1e-1, log=True),
        "subsample":        trial.suggest_float("subsample", .5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample", .5, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10, log=True),
        "lambda":           trial.suggest_float("lambda", 1e-3, 10, log=True),
        "alpha":            trial.suggest_float("alpha",  1e-3, 10, log=True),
        "seed": 42,
    }

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dval, "val")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )
    pred = bst.predict(dval, iteration_range=(0, bst.best_iteration + 1))
    return mean_squared_error(y_val, pred, squared=False)

# ------------ 3 ▸ lanzar la optimización -------------------------------
n_trials = 30
print(f"🔍  Optimizando {n_trials} trials …")
study = optuna.create_study(direction="minimize")
for _ in tqdm.tqdm(range(n_trials)):
    study.optimize(objective, n_trials=1, catch=(Exception,))

print(f"✅  Mejor RMSE: {study.best_value:.4f}")
print("Mejores hiperparámetros:", study.best_params)

# ------------ 4 ▸ entrenar modelo final --------------------------------
best_params = {
    **study.best_params,
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "device": "cuda",
    "seed": 42,
}

best_bst = xgb.train(
    best_params,
    dtrain,
    num_boost_round=2000,
    evals=[(dval, "val")],
    early_stopping_rounds=50,
    verbose_eval=False,
)

# métricas finales
y_pred = best_bst.predict(dval, iteration_range=(0, best_bst.best_iteration + 1))
print("Final RMSE:", mean_squared_error(y_val, y_pred, squared=False))

# ------------ 5 ▸ guardar artefacto ------------------------------------
MODEL_PATH = "../../models/xgb_optuna_best.model"
best_bst.save_model(MODEL_PATH)
print("💾  Modelo guardado en", MODEL_PATH)


[I 2025-07-08 10:34:08,044] A new study created in memory with name: no-name-53ab6f4b-8795-4077-975e-d9fbfcb27000


🔍  Optimizando 30 trials …


  0%|          | 0/30 [00:00<?, ?it/s][I 2025-07-08 10:34:08,251] Trial 0 finished with value: 0.3506454784759786 and parameters: {'max_depth': 9, 'lr': 0.001221455372764859, 'subsample': 0.7495542404683255, 'colsample': 0.6023162125645007, 'min_child_weight': 0.0012753031145706219, 'lambda': 0.008365148609245683, 'alpha': 1.4361265376170007}. Best is trial 0 with value: 0.3506454784759786.
  3%|▎         | 1/30 [00:00<00:05,  4.86it/s][I 2025-07-08 10:34:08,313] Trial 1 finished with value: 0.36154549759193394 and parameters: {'max_depth': 4, 'lr': 0.025359739714568896, 'subsample': 0.8188330420976231, 'colsample': 0.648713775738792, 'min_child_weight': 0.10130467695599386, 'lambda': 0.0010976027888099915, 'alpha': 0.06191047855864595}. Best is trial 0 with value: 0.3506454784759786.
[I 2025-07-08 10:34:08,407] Trial 2 finished with value: 0.35107016462187424 and parameters: {'max_depth': 8, 'lr': 0.002931455050532695, 'subsample': 0.5192718416195483, 'colsample': 0.9210877561929778, 

✅  Mejor RMSE: 0.3372
Mejores hiperparámetros: {'max_depth': 10, 'lr': 0.020193658869072484, 'subsample': 0.7036606206389683, 'colsample': 0.998189851647681, 'min_child_weight': 8.423058881723613, 'lambda': 0.32196295015742266, 'alpha': 0.0023886307937583577}
Final RMSE: 0.339760918760371
💾  Modelo guardado en ../../models/xgb_optuna_best.model





In [10]:
# ▸ Guardado limpio de artefactos
import pathlib, json, joblib, os
from datetime import datetime

# ──────────────────────────────────────────────────────
# 1. Carpetas
# ──────────────────────────────────────────────────────
MODELS_DIR = pathlib.Path("../../models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)      # crea recursivamente

timestamp   = datetime.now().strftime("%Y%m%d_%H%M%S")
model_path  = MODELS_DIR / f"xgb_optuna_best_{timestamp}.json"     # formato JSON → ligero
params_path = MODELS_DIR / f"xgb_optuna_best_params_{timestamp}.json"
prep_path   = MODELS_DIR / f"preprocessor_{timestamp}.joblib"

# ──────────────────────────────────────────────────────
# 2. Guardar Booster (árboles) y parámetros Optuna
# ──────────────────────────────────────────────────────
best_bst.save_model(model_path.as_posix())
with params_path.open("w", encoding="utf-8") as f:
    json.dump(best_params, f, indent=2)

# ──────────────────────────────────────────────────────
# 3. (Opcional) Guardar el pre-procesador completo
# ──────────────────────────────────────────────────────
# Si usaste 'pre_pipe' (ColumnTransformer + OneHotEncoder + etc.):
try:
    joblib.dump(pre_pipe, prep_path)
    print("💾  Preprocesador guardado en", prep_path)
except NameError:
    print("⚠️  pre_pipe no está definido — se omitió su guardado.")

print("✅  Modelo XGBoost guardado en", model_path)
print("✅  Hiperparámetros guardados en", params_path)


⚠️  pre_pipe no está definido — se omitió su guardado.
✅  Modelo XGBoost guardado en ../../models/xgb_optuna_best_20250708_103411.json
✅  Hiperparámetros guardados en ../../models/xgb_optuna_best_params_20250708_103411.json


## 7 · Evaluación en test

In [11]:
from sklearn.metrics import mean_absolute_error
pred_test = best_xgb.predict(X_test)
rmse = mean_squared_error(y_test, pred_test, squared=False)
mae  = mean_absolute_error(y_test, pred_test)
print('Test RMSE',rmse,'MAE',mae)

NameError: name 'best_xgb' is not defined

## 8 · Guardar modelo

In [None]:
import joblib, pathlib, datetime, json
OUT_DIR = pathlib.Path('../models'); OUT_DIR.mkdir(exist_ok=True, parents=True)
stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')
joblib.dump(best_xgb, OUT_DIR/f'xgb_best_{stamp}.pkl')
json.dump(best_params, open(OUT_DIR/f'xgb_params_{stamp}.json','w'))
print('✅ Modelo y params guardados')