# Hull Tactical Market Prediction — Robust ensemble with inference


In [None]:
import os, gc, numpy as np, pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error

pd.set_option("display.max_columns", 200)

In [None]:
# Column roles used throughout the notebook.
TARGET = "forward_returns"
DATE   = "date_id"
LEAK   = {"forward_returns", "risk_free_rate", "market_forward_excess_returns"}

# Resolve the data directory: the competition dataset folder when it exists,
# otherwise the generic Kaggle input root, otherwise a local /mnt/data mount.
KAGGLE_INPUT = "/kaggle/input/hull-tactical-market-prediction"
if os.path.exists("/kaggle/input"):
    LOCAL_INPUT = "/kaggle/input"
else:
    LOCAL_INPUT = "/mnt/data"

if os.path.exists(KAGGLE_INPUT):
    data_dir = KAGGLE_INPUT
else:
    data_dir = LOCAL_INPUT

train_path = os.path.join(data_dir, "train.csv")
test_path  = os.path.join(data_dir, "test.csv")

# train.csv is required; test.csv is optional and becomes None when absent.
train = pd.read_csv(train_path)
if os.path.exists(test_path):
    test = pd.read_csv(test_path)
else:
    test = None

print("Loaded:", {"train": train.shape, "test": test.shape if test is not None else None})

In [None]:
# Select safe numeric features: never the date column or a leak column, and,
# when a test file was loaded, only columns that the test file also contains.
excluded = LEAK | {DATE}
if test is None:
    feat = [c for c in train.columns if c not in excluded]
else:
    commons = set(train.columns) & set(test.columns)
    feat = [c for c in train.columns if c in commons and c not in excluded]

num_cols = train[feat].select_dtypes(include=[np.number]).columns.tolist()
assert num_cols, "No numeric features found."

# Order the training rows by date before building the design matrix and target.
train_sorted = train.sort_values(DATE).reset_index(drop=True)
X = train_sorted[num_cols].copy()
y = train_sorted[TARGET].astype(float).copy()

# Numeric preprocessing: median imputation followed by scaling without centering.
numeric_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),
])
pre = ColumnTransformer([("num", numeric_steps, num_cols)], remainder="drop")

In [None]:
# Candidate regressors; each one is wrapped with the shared preprocessor `pre`.
models = {
    "ridge": Ridge(alpha=0.5, random_state=42),
    "lasso": Lasso(alpha=1e-4, random_state=42, max_iter=20000),
    "gbr":   GradientBoostingRegressor(
        n_estimators=400, learning_rate=0.05, max_depth=3, subsample=0.8, random_state=42
    )
}

# Time-ordered cross-validation over the date-sorted training rows.
tscv = TimeSeriesSplit(n_splits=5)

# cv_summary: one dict per model with mean fold R2 / RMSE.
# oof_dict:   model name -> out-of-fold prediction vector aligned with train_sorted.
cv_summary, oof_dict = [], {}
for name, mdl in models.items():
    pipe = Pipeline([("prep", pre), ("model", mdl)])
    # Rows that never fall into a validation fold keep the initial 0.0.
    oof_pred  = np.zeros(len(train_sorted))
    fold_mets = []
    for tr_idx, va_idx in tscv.split(X):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
        pipe.fit(X_tr, y_tr)
        p = pipe.predict(X_va)
        r2  = r2_score(y_va, p)
        # RMSE computed as sqrt(MSE): the `squared=False` keyword is deprecated
        # in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
        rmse = float(np.sqrt(mean_squared_error(y_va, p)))
        oof_pred[va_idx] = p
        fold_mets.append((r2, rmse))
    r2m   = float(np.mean([m[0] for m in fold_mets]))
    rmsem = float(np.mean([m[1] for m in fold_mets]))
    cv_summary.append({"model": name, "cv_r2": r2m, "cv_rmse": rmsem})
    oof_dict[name] = oof_pred

# Notebook display: models ranked by mean cross-validated R2.
cv_df = pd.DataFrame(cv_summary).sort_values("cv_r2", ascending=False).reset_index(drop=True)
cv_df

In [None]:
# Ensemble weights proportional to 1 / cv_rmse, normalised to sum to one.
w = {}
den = 0.0
for entry in cv_summary:
    inv_rmse = 1.0 / (entry["cv_rmse"] + 1e-12)  # epsilon guards a zero RMSE
    w[entry["model"]] = inv_rmse
    den += inv_rmse
w = {model_name: raw / den for model_name, raw in w.items()}

# Blend the per-model out-of-fold predictions with those weights.
yhat_oof = np.zeros(len(train_sorted))
for model_name, model_oof in oof_dict.items():
    yhat_oof = yhat_oof + w[model_name] * model_oof

def sharpe_like(y_true, y_pred, k):
    """Return mean / sample-std of the returns earned by a clipped position.

    The position is ``k * y_pred`` clipped to ``[0.0, 2.0]`` and the per-row
    return is ``y_true * position``.

    Args:
        y_true: Realised returns.
        y_pred: Predicted returns, aligned with ``y_true``.
        k: Multiplier that turns a prediction into a position.

    Returns:
        The mean of the position-weighted returns divided by their sample
        standard deviation (``ddof=1``) plus ``1e-12``.
    """
    exposure = np.clip(k * y_pred, 0.0, 2.0)
    strategy_ret = y_true * exposure
    # The small constant keeps the ratio defined when the returns have no spread.
    spread = float(np.std(strategy_ret, ddof=1) + 1e-12)
    return float(np.mean(strategy_ret)) / spread

# Tune the position multiplier on the most recent `hold` rows (all rows when
# fewer are available), scoring the blended out-of-fold predictions.
hold = 180
n_obs = len(y)
idx_val = np.arange(n_obs)[-hold:] if n_obs > hold else np.arange(n_obs)
y_hold = y.iloc[idx_val].values
pred_hold = yhat_oof[idx_val]

# Grid of 21 evenly spaced multipliers from 5.0 to 15.0; keep the best scorer.
best_k, best_s = None, -1e9
for k_try in np.linspace(5.0, 15.0, 21):
    s = sharpe_like(y_hold, pred_hold, k_try)
    if not s > best_s:
        continue
    best_s, best_k = s, float(k_try)

# Fall back to 10.0 when no candidate beat the initial score.
k_opt = 10.0 if best_k is None else float(best_k)
print({"k_opt": k_opt, "val_sharpe_like": best_s})

In [None]:
# Refit every model on the full training history, then score the test file.
# NOTE(review): every pipeline wraps the same `pre` object — confirm that
# sharing one preprocessor instance across models is intended.
X_full  = X
fitted = {}
for name, mdl in models.items():
    pipe = Pipeline(steps=[("prep", pre), ("model", mdl)])
    fitted[name] = pipe.fit(X_full, y)

if test is not None:
    X_test  = test[num_cols].copy()
    # Weighted blend of the per-model test predictions.
    yhat_test = np.zeros(len(X_test))
    for name, pipe in fitted.items():
        yhat_test = yhat_test + w[name] * pipe.predict(X_test)
    # Position = k_opt * prediction, clipped to [0.0, 2.0].
    position = np.clip(k_opt * yhat_test, 0.0, 2.0).astype("float64")
    sub = pd.DataFrame({
        "date_id": test["date_id"].astype("int64").values,
        "position": position,
    })
    # Write under /kaggle/working when it exists, else the current directory.
    if os.path.exists("/kaggle/working"):
        out = "/kaggle/working/submission.parquet"
    else:
        out = "submission.parquet"
    sub.to_parquet(out, index=False)
    print("WROTE:", out, sub.shape)

# Diagnostics: true target next to the blended out-of-fold prediction per date.
diag = pd.DataFrame({
    "date_id": train_sorted[DATE].astype("int64").values,
    "y_true": y.values.astype("float64"),
    "y_pred_oof": yhat_oof.astype("float64"),
})
if os.path.exists("/kaggle/working"):
    out = "/kaggle/working/preds_analysis.parquet"
else:
    out = "preds_analysis.parquet"
diag.to_parquet(out, index=False)
print("WROTE:", out, diag.shape)

In [None]:
# ===== INFERENCE SERVER (self-contained and robust) =====
import os, gc, numpy as np, pandas as pd
from collections.abc import Mapping

# Rebuild the fitted state when this cell runs without the earlier cells
# (out-of-order evaluation). A bare name lookup can only fail with NameError,
# so that is the only exception treated as "state is missing".
try:
    num_cols; fitted; k_opt
except NameError:
    KAGGLE_INPUT = "/kaggle/input/hull-tactical-market-prediction"
    data_dir = KAGGLE_INPUT if os.path.exists(KAGGLE_INPUT) else "/kaggle/input"
    train_path = os.path.join(data_dir, "train.csv")
    test_path  = os.path.join(data_dir, "test.csv")
    train = pd.read_csv(train_path)
    TARGET = "forward_returns"; DATE = "date_id"
    LEAK = {"forward_returns","risk_free_rate","market_forward_excess_returns"}
    # Only the header of test.csv is needed; without a test file every
    # training column is accepted.
    test_cols = set(pd.read_csv(test_path, nrows=1).columns) if os.path.exists(test_path) else set(train.columns)
    feat_all = [c for c in train.columns if c not in LEAK and c != DATE]
    feat = [c for c in feat_all if c in test_cols]
    num_cols = train[feat].select_dtypes(include=[np.number]).columns.tolist()

    # Imported here so this cell also works when it is run on its own.
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import Ridge, Lasso
    from sklearn.ensemble import GradientBoostingRegressor

    pre = ColumnTransformer([
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler(with_mean=False))
        ]), num_cols)
    ], remainder="drop")

    models = {
        "ridge": Ridge(alpha=0.5, random_state=42),
        "lasso": Lasso(alpha=1e-4, random_state=42, max_iter=20000),
        "gbr":   GradientBoostingRegressor(n_estimators=400, learning_rate=0.05,
                                           max_depth=3, subsample=0.8, random_state=42)
    }
    train_sorted = train.sort_values(DATE).reset_index(drop=True)
    X_full = train_sorted[num_cols].copy()
    y_full = train_sorted[TARGET].astype(float).copy()
    fitted = {}
    for name, mdl in models.items():
        pipe = Pipeline([("prep", pre), ("model", mdl)])
        pipe.fit(X_full, y_full)
        fitted[name] = pipe
    # No cross-validation ran on this path, so use the default multiplier
    # and equal ensemble weights.
    k_opt = 10.0
    w = {n: 1.0/len(fitted) for n in fitted}

# The fitted models may exist without ensemble weights; default to equal weights.
if 'w' not in globals():
    w = {name: 1.0/len(fitted) for name in fitted.keys()}

def _row_to_frame(row, cols):
    """Coerce one incoming observation into a single-row DataFrame over ``cols``.

    Args:
        row: The observation. Supported forms are a mapping, a pandas Series,
            a pandas DataFrame (only its first row is used), any other object
            exposing ``to_pandas()``, or a plain sequence of values given in
            the same order as ``cols``.
        cols: Column names, in the order the models expect.

    Returns:
        A pandas DataFrame whose columns are exactly ``cols``. For mapping,
        Series and DataFrame inputs, columns missing from ``row`` are filled
        with ``0.0`` (not NaN).
    """
    # Frames from other libraries (presumably the polars frames sent by the
    # Kaggle gateway — TODO confirm) are converted to pandas first; without
    # this they would fall through to the plain-sequence branch below.
    is_native = isinstance(row, (Mapping, pd.Series, pd.DataFrame))
    if not is_native and hasattr(row, "to_pandas"):
        row = row.to_pandas()

    if isinstance(row, Mapping):
        return pd.DataFrame([{c: row.get(c, 0.0) for c in cols}])[cols]
    if isinstance(row, pd.Series):
        return row.to_frame().T.reindex(columns=cols, fill_value=0.0)
    if isinstance(row, pd.DataFrame):
        return row.iloc[:1].reindex(columns=cols, fill_value=0.0)
    # Anything else is treated as a sequence of values aligned with cols.
    return pd.DataFrame([row], columns=cols)[cols]

def _ensemble_predict(X_df):
    """Return the weighted sum of every fitted pipeline's predictions for ``X_df``.

    Weights come from the module-level ``w``; a model without an entry there
    gets the equal share ``1 / len(fitted)``.
    """
    blended = np.zeros(len(X_df))
    for model_name in fitted:
        weight = w.get(model_name, 1.0/len(fitted))
        blended += weight * fitted[model_name].predict(X_df)
    return blended

def predict(row):
    """Return the position for one observation as a float in ``[0.0, 2.0]``.

    The observation is aligned to ``num_cols``, scored by the ensemble,
    multiplied by ``k_opt`` and clipped. A non-finite result becomes ``0.0``.
    """
    features = _row_to_frame(row, num_cols)
    raw_signal = float(_ensemble_predict(features)[0])
    position = float(np.clip(float(k_opt) * raw_signal, 0.0, 2.0))
    # A NaN prediction passes through the clip unchanged; fall back to no position.
    return position if np.isfinite(position) else 0.0

# Start the Kaggle inference server around `predict`, depending on environment:
#   - KAGGLE_IS_COMPETITION_RERUN present -> serve()
#   - RUN_LOCAL_GATEWAY == "1"            -> run_local_gateway on the competition input folder
#   - otherwise                           -> skip and print a hint
try:
    # Imported inside the try so a missing package is reported as a warning below.
    from kaggle_evaluation.default_inference_server import DefaultInferenceServer
    if "KAGGLE_IS_COMPETITION_RERUN" in os.environ:
        DefaultInferenceServer(predict).serve()
    elif os.getenv("RUN_LOCAL_GATEWAY") == "1":
        DefaultInferenceServer(predict).run_local_gateway(("/kaggle/input/hull-tactical-market-prediction",))
    else:
        print("Inference server: SKIPPED (exporta RUN_LOCAL_GATEWAY=1 para probar local).")
except Exception as e:
    # Best effort: any failure while importing or starting the server is
    # printed instead of raised, so the rest of the notebook is not interrupted.
    print("WARNING: inference server not started:", repr(e))
finally:
    # Runs on both the success and the failure path.
    gc.collect()
