# In [None]:
# Legitimate, ready-to-run notebook for Hull Tactical Market Prediction
# - Does NOT use test labels
# - Handles common runtime errors (file paths, NaNs, polars->pandas, server registration)
# - Uses time-aware CV and a robust mapping from predicted return to position [0,2]

import os
import numpy as np
import pandas as pd
import polars as pl
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from typing import List, Tuple
import joblib
import warnings
warnings.filterwarnings("ignore")

# Kaggle evaluation server (use in Kaggle environment)
import kaggle_evaluation.default_inference_server

# ----------------------------
# Configuration / constants
# ----------------------------
# Path of the training CSV read by load_and_prepare_train().
TRAIN_PATH = "/kaggle/input/hull-tactical-market-prediction/train.csv"
# If running locally outside Kaggle, change TRAIN_PATH to the actual file path.
# Seed handed to every HistGradientBoostingRegressor so runs are repeatable.
RANDOM_STATE = 42
N_SPLITS = 4            # number of TimeSeriesSplit folds used for the quick CV
# File names of the joblib artifacts written after training and reloaded
# before inference. The fitted scaler is persisted separately under the
# literal name "scaler.joblib" further down in this file.
MODEL_FILE = "gbm_model.joblib"
MEDIANS_FILE = "feature_medians.joblib"
PRED_QUANT_FILE = "pred_quantiles.joblib"

# ----------------------------
# Utility: robust feature selection + numeric coercion
# ----------------------------
def load_and_prepare_train(path: str) -> Tuple[pd.DataFrame, pd.Series, List[str]]:
    """
    Load the training CSV and split it into a numeric feature matrix and target.

    The CSV is read with ``date_id`` as the index. Every column except the
    known non-feature columns (``forward_returns``, ``risk_free_rate``,
    ``market_forward_excess_returns``) is treated as a feature candidate and
    coerced to numeric; values that cannot be parsed become NaN. Columns that
    are entirely NaN after coercion are dropped.

    Args:
        path: Location of the training CSV.

    Returns:
        A tuple ``(X, y, feature_cols)`` where ``X`` holds the retained
        feature columns, ``y`` is the ``forward_returns`` column and
        ``feature_cols`` lists the retained column names in order. ``X`` and
        ``y`` have the same rows in the same order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If the CSV has no ``forward_returns`` column.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Train CSV not found at {path!r}. Update TRAIN_PATH.")
    df = pd.read_csv(path, index_col="date_id")
    if "forward_returns" not in df.columns:
        raise KeyError("Expected 'forward_returns' column in train.csv")
    y = df["forward_returns"].copy()
    # Known non-feature columns (the target and label-derived columns).
    drop_cols = {"forward_returns", "risk_free_rate", "market_forward_excess_returns"}
    feature_cols = [c for c in df.columns if c not in drop_cols]
    # Coerce feature columns to numeric; unparsable strings become NaN.
    df_features = df[feature_cols].apply(pd.to_numeric, errors="coerce")
    # Keep only columns that carry at least one non-NaN value.
    valid_cols = [c for c in df_features.columns if not df_features[c].isna().all()]
    X = df_features[valid_cols].copy()
    # X and y come from the same frame, so they are already row-aligned.
    # Re-selecting y by label (y.loc[X.index]) would repeat rows whenever the
    # index holds duplicate date_id values and break that alignment.
    return X, y, valid_cols

# ----------------------------
# Simple preprocessing: compute medians for numeric imputation
# ----------------------------
def compute_medians(X: pd.DataFrame) -> pd.Series:
    """Return the median of each numeric column of ``X``, indexed by column name."""
    return X.median(numeric_only=True)

# ----------------------------
# Model training (time-aware)
# ----------------------------
def _fit_scaled_hgb(X_fit: pd.DataFrame, y_fit: pd.Series, random_state: int):
    """Median-impute, standardize and fit one HistGradientBoostingRegressor.

    Returns ``(model, scaler, medians)``, all fitted on ``X_fit`` only.
    Keeping this in one place guarantees the fold models and the final model
    share the same preprocessing and hyper-parameters.
    """
    medians = X_fit.median(numeric_only=True)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_fit.fillna(medians))
    model = HistGradientBoostingRegressor(random_state=random_state, max_iter=500, learning_rate=0.05)
    model.fit(X_scaled, y_fit)
    return model, scaler, medians


def train_time_aware_model(X: pd.DataFrame, y: pd.Series, n_splits=4, random_state=42):
    """
    Cross-validate with time-ordered splits, then fit a final model on all rows.

    For every ``TimeSeriesSplit`` fold the imputation medians and the scaler
    are fitted on the training part only and then applied to the validation
    part, so no validation statistics leak into training.

    Args:
        X: Feature matrix, rows in chronological order.
        y: Target aligned row-for-row with ``X``.
        n_splits: Number of time-series folds.
        random_state: Seed passed to every regressor.

    Returns:
        ``(final_model, scaler_final, final_med, oof_preds, mean_mse)`` where
        the first three are fitted on the full data, ``oof_preds`` holds the
        validation predictions (rows that never fall into a validation fold
        keep the initial value 0.0) and ``mean_mse`` is the mean of the
        per-fold validation MSEs.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    val_scores = []
    oof_preds = np.zeros(len(X))
    for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        # Preprocessing statistics come from the training part only.
        model_fold, scaler, med = _fit_scaled_hgb(X_tr, y_tr, random_state)
        y_val_pred = model_fold.predict(scaler.transform(X_val.fillna(med)))
        oof_preds[val_idx] = y_val_pred
        mse = mean_squared_error(y_val, y_val_pred)
        val_scores.append(mse)
        print(f"[CV fold {fold}] val MSE = {mse:.6f}")
    # Final model: same recipe, fitted on every available row.
    final_model, scaler_final, final_med = _fit_scaled_hgb(X, y, random_state)
    return final_model, scaler_final, final_med, oof_preds, np.mean(val_scores)

# ----------------------------
# Map predicted returns -> position in [0,2]
# ----------------------------
def fit_return_to_position_map(preds_train: np.ndarray, low_q=2.0, high_q=98.0) -> Tuple[float, float]:
    """
    Derive the (low, high) anchors of the linear return -> position mapping.

    The anchors are the ``low_q`` and ``high_q`` percentiles of
    ``preds_train``. If those do not form a strictly increasing pair, the
    1st and 99th percentiles are tried instead; if that pair is degenerate
    too, it is widened by 1e-6 on each side so the range is never empty.
    """
    lower = upper = None
    for lo_pct, hi_pct in ((low_q, high_q), (1.0, 99.0)):
        lower = np.percentile(preds_train, lo_pct)
        upper = np.percentile(preds_train, hi_pct)
        if not (upper <= lower):
            # Usable range found; no further fallback needed.
            return float(lower), float(upper)
    # Both candidate ranges collapsed: pad the last one symmetrically.
    return float(lower - 1e-6), float(upper + 1e-6)

def predicted_return_to_position(pred: float, q_low: float, q_high: float) -> float:
    """
    Convert a single predicted return into a position in [0, 2].

    The prediction is scaled linearly so that ``q_low`` maps to 0 and
    ``q_high`` maps to 2, then clipped to [0, 2].

    If the mapping is undefined (``q_high == q_low``, or the result is NaN
    because ``pred`` or a bound is NaN) the midpoint 1.0 is returned, so the
    caller always receives a finite position inside the allowed range
    instead of a ZeroDivisionError or NaN.

    Args:
        pred: Predicted return.
        q_low: Prediction value mapped to position 0.
        q_high: Prediction value mapped to position 2.

    Returns:
        The position as a Python float in [0, 2].
    """
    neutral_position = 1.0  # midpoint of the allowed [0, 2] range
    width = q_high - q_low
    if width == 0:
        # Degenerate range: a linear map cannot be formed.
        return neutral_position
    pos = (pred - q_low) / width * 2.0
    if np.isnan(pos):
        return neutral_position
    return float(np.clip(pos, 0.0, 2.0))

# ----------------------------
# Main training flow
# ----------------------------
# Step 1: read the CSV and build the feature matrix, the target and the
# ordered feature list. FEATURE_COLS is reused by predict() to select and
# order the columns of each test row.
print("Loading training data...")
X_train, y_train, FEATURE_COLS = load_and_prepare_train(TRAIN_PATH)
print(f"Train shape (features): {X_train.shape}, feature count: {len(FEATURE_COLS)}")

# Step 2: time-series cross-validation followed by a final fit on all rows.
# oof_preds is returned for inspection only; it is not used below.
print("Computing medians and training model (time-aware)...")
model, scaler_obj, medians, oof_preds, cv_mse = train_time_aware_model(X_train, y_train, n_splits=N_SPLITS, random_state=RANDOM_STATE)
print(f"CV mean MSE: {cv_mse:.6f}")

# Step 3: persist the model, the imputation medians and the scaler so that
# inference can reload them from disk.
joblib.dump(model, MODEL_FILE)
joblib.dump(medians, MEDIANS_FILE)
joblib.dump(scaler_obj, "scaler.joblib")

# Step 4: predict on the training rows with the final model (same impute +
# scale steps as in training) and take the 3rd / 97th percentiles of those
# predictions as the anchors of the return -> position mapping.
# NOTE(review): these are in-sample predictions of the final model, so their
# spread may differ from what the model produces on unseen rows - confirm
# this is the intended calibration set.
X_train_f = X_train.fillna(medians)
X_train_s = scaler_obj.transform(X_train_f)
preds_train = model.predict(X_train_s)
q_low, q_high = fit_return_to_position_map(preds_train, low_q=3.0, high_q=97.0)
joblib.dump((q_low, q_high), PRED_QUANT_FILE)
print(f"Return->position mapping quantiles: q_low={q_low:.6e}, q_high={q_high:.6e}")

# ----------------------------
# Inference: define predict(test: pl.DataFrame) -> float
# ----------------------------
# Load artifacts from the joblib files written by the training flow, so that
# predict() reads its state from disk rather than from the training variables.
# The underscore-prefixed names are the module-level state predict() uses:
# fitted regressor, per-feature imputation medians, fitted scaler and the
# (low, high) anchors of the return -> position mapping.
_model = joblib.load(MODEL_FILE)
_medians = joblib.load(MEDIANS_FILE)
_scaler = joblib.load("scaler.joblib")
_q_low, _q_high = joblib.load(PRED_QUANT_FILE)

def predict(test: pl.DataFrame) -> float:
    """
    Turn one test row into a position in [0, 2].

    This function is registered with the Kaggle inference server. It applies
    the same preprocessing as training (numeric coercion, median imputation,
    standardization), predicts the return with the loaded model and maps it
    to a position with the stored quantile anchors. Only the first row of
    ``test`` is scored.

    Args:
        test: The test row. A polars DataFrame is expected; a pandas
            DataFrame or anything ``pd.DataFrame`` can wrap is also accepted.

    Returns:
        The position as a Python float in [0, 2].
    """
    # Explicit dispatch instead of a blanket try/except, so a genuine
    # polars -> pandas conversion failure surfaces instead of being masked.
    if isinstance(test, pd.DataFrame):
        row = test.copy()
    elif hasattr(test, "to_pandas"):
        row = test.to_pandas()
    else:
        row = pd.DataFrame(test)
    # Test-only bookkeeping / lagged columns are never used as features.
    non_feature_cols = [
        "date_id",
        "is_scored",
        "lagged_forward_returns",
        "lagged_risk_free_rate",
        "lagged_market_forward_excess_returns",
    ]
    row = row.drop(columns=non_feature_cols, errors="ignore")
    # Training features absent from this row; they are imputed below.
    missing = [c for c in FEATURE_COLS if c not in row.columns]
    # Same column order as training; absent columns are created as NaN.
    X_row = row.reindex(columns=FEATURE_COLS)
    # Coerce exactly like training does: unparsable values become NaN rather
    # than raising, then get imputed.
    X_row = X_row.apply(pd.to_numeric, errors="coerce").astype(float)
    X_row = X_row.fillna(_medians)
    if missing:
        # An absent column without a stored median falls back to 0.0.
        X_row[missing] = X_row[missing].fillna(0.0)
    X_row_s = _scaler.transform(X_row)
    pred_ret = _model.predict(X_row_s)[0]  # model returns one value per row
    pos = predicted_return_to_position(pred_ret, _q_low, _q_high)
    # Debug print (safe for local runs)
    print(f"Predicted return: {pred_ret:.6e} -> Position: {pos:.6f}")
    return float(pos)

# ----------------------------
# Hook into evaluation server (Kaggle)
# ----------------------------
# Wrap predict() in the Kaggle inference server object.
print("Registering inference server with `predict` endpoint...")
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# The environment variable KAGGLE_IS_COMPETITION_RERUN selects the mode:
# when it is set the server is started with serve(); otherwise the local
# gateway is run against the competition input directory.
if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    # In the real competition rerun path
    inference_server.serve()
else:
    # Local simulation (writes submission.parquet locally)
    inference_server.run_local_gateway(("/kaggle/input/hull-tactical-market-prediction/",))

print("Done.")
