In [None]:
# Import standard libraries
import numpy as np
import pandas as pd

# Import modeling libraries
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Import Optuna for hyperparameter optimization
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

# Configure NumPy array printing for the whole notebook: suppress=True prints
# floats in fixed-point instead of scientific notation, and linewidth=120 lets
# a printed row use up to 120 characters before wrapping.
np.set_printoptions(suppress=True, linewidth=120)

In [None]:
# Define function to load the datasets
def load_data(train_path, test_path):
    """Load the training and test tables from CSV.

    Parameters
    ----------
    train_path : str or path-like
        Location of the training CSV, passed straight to ``pandas.read_csv``.
    test_path : str or path-like
        Location of the test CSV, passed straight to ``pandas.read_csv``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The training frame followed by the test frame.
    """
    # Read in the same order as the arguments: train first, then test.
    train_frame, test_frame = (pd.read_csv(path) for path in (train_path, test_path))
    return train_frame, test_frame

In [None]:
# Define function to identify target and feature columns
def get_columns(train_df, target_pref="market_forward_excess_returns"):
    """Pick the target column, the feature columns and the id columns.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame whose column names are inspected.
    target_pref : str, default "market_forward_excess_returns"
        Preferred target column. When it is absent, ``"forward_returns"`` is
        used instead if that column exists.

    Returns
    -------
    tuple of (str, list of str, list of str)
        ``(target, feats, id_cols)``. ``id_cols`` is ``["date_id"]`` when that
        column exists and ``[]`` otherwise.

    Raises
    ------
    ValueError
        If neither target candidate is present, or if no feature column can be
        found by prefix or by the numeric-dtype fallback.
    """
    available = train_df.columns

    # Resolve the target: preferred name first, then the fallback column.
    if target_pref in available:
        target = target_pref
    elif "forward_returns" in available:
        target = "forward_returns"
    else:
        raise ValueError("No suitable target found in train.csv")

    # date_id is an identifier, never a feature.
    id_cols = ["date_id"] if "date_id" in available else []

    # Columns that must never be used as model inputs.
    blocked = {target, "risk_free_rate", "market_forward_excess_returns", "forward_returns"}
    blocked.update(id_cols)

    # Primary selection: remaining columns whose name starts with a known prefix.
    family_prefixes = ("M", "E", "I", "P", "V", "S", "MOM", "D")
    feats = [
        name
        for name in available
        if name not in blocked and name.startswith(family_prefixes)
    ]

    # Fallback selection: every remaining numeric column.
    if not feats:
        numeric_rest = train_df.drop(columns=list(blocked), errors="ignore").select_dtypes(
            include=[np.number]
        )
        feats = numeric_rest.columns.tolist()

    if not feats:
        raise ValueError("No usable features found in training data. Check your CSV header.")

    return target, feats, id_cols

In [None]:
# Define function to create contiguous time-based CV folds
def make_time_folds(df, n_splits=10):
    """Build expanding-window, chronologically ordered CV folds.

    The rows are ordered by ``date_id`` and cut into ``n_splits`` contiguous
    blocks of near-equal size. Fold ``i`` (for ``i`` in ``1..n_splits-1``)
    trains on every block before block ``i`` and validates on block ``i``, so
    the earliest block is only ever used for training.

    The returned indices are *positions in ``df`` as passed in* (usable with
    ``.iloc`` on ``df`` or with positional indexing on an array of
    ``len(df)``). ``df`` therefore does not have to be pre-sorted; for input
    that is already sorted by ``date_id`` the positions are simply
    ``0..n-1`` in order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date_id`` column.
    n_splits : int, default 10
        Number of contiguous time blocks; produces ``n_splits - 1`` folds.

    Returns
    -------
    folds : list of (numpy.ndarray, numpy.ndarray)
        ``(train_idx, val_idx)`` positional index pairs, in time order.
    order : numpy.ndarray
        Positions of ``df``'s rows in chronological order.

    Raises
    ------
    ValueError
        If ``date_id`` is missing or ``n_splits`` is smaller than 2.
    """
    if "date_id" not in df.columns:
        raise ValueError("date_id column required for time-based CV.")
    if n_splits < 2:
        raise ValueError("n_splits must be at least 2 to build train/validation folds.")

    # Chronological ordering expressed as positions in the caller's frame.
    # A stable sort keeps rows that share a date_id in their original order.
    order = np.argsort(df["date_id"].to_numpy(), kind="stable")

    # Spread the remainder over the first blocks so sizes differ by at most 1.
    n = len(order)
    base, extra = divmod(n, n_splits)
    fold_sizes = [base + (1 if i < extra else 0) for i in range(n_splits)]
    bounds = np.cumsum([0] + fold_sizes)

    # Expanding train window, next block as validation.
    folds = []
    for i in range(1, n_splits):
        start, end = bounds[i], bounds[i + 1]
        folds.append((order[:start], order[start:end]))

    return folds, order

In [None]:
# Define function to evaluate parameters via time-aware CV
def cv_rmse_for_params(train_df, features, target, params, n_splits=10, random_state=42, early_stopping=200):
    """Score one LightGBM parameter set with expanding-window time CV.

    A fresh ``LGBMRegressor`` is fitted per fold with early stopping on that
    fold's validation block. The returned RMSE is computed only over rows that
    were actually predicted as validation data; the earliest time block is
    never validated and is therefore excluded from the score instead of being
    counted with a placeholder prediction of 0.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain ``date_id``, ``target`` and ``features``.
    features : list of str
        Feature column names.
    target : str
        Target column name.
    params : dict
        Keyword arguments for ``lgb.LGBMRegressor``. Must not contain
        ``random_state`` or ``verbose``, which are passed explicitly.
    n_splits : int, default 10
        Number of time blocks passed to ``make_time_folds``.
    random_state : int, default 42
        Seed forwarded to every model.
    early_stopping : int, default 200
        ``stopping_rounds`` for the LightGBM early-stopping callback.

    Returns
    -------
    float
        RMSE over all validated rows.

    Raises
    ------
    ValueError
        If no row ended up in any validation block.
    """
    # Work on copies so dtype casts never touch the caller's frame.
    X = train_df[features].copy()
    y = train_df[target].copy()

    # LightGBM needs pandas "category" dtype to treat columns as categorical.
    cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
    for c in cat_cols:
        X[c] = X[c].astype("category")

    folds, _ = make_time_folds(train_df, n_splits=n_splits)

    # Out-of-fold predictions plus a mask of the rows that really received one.
    oof_pred = np.zeros(len(train_df), dtype=float)
    scored = np.zeros(len(train_df), dtype=bool)

    for tr_idx, va_idx in folds:
        X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

        model = lgb.LGBMRegressor(**params, random_state=random_state, verbose=-1)
        model.fit(
            X_tr,
            y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="rmse",
            categorical_feature=cat_cols if len(cat_cols) else "auto",
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping, verbose=False)],
        )

        # Predict with the iteration selected by early stopping.
        oof_pred[va_idx] = model.predict(X_va, num_iteration=model.best_iteration_)
        scored[va_idx] = True

    if not scored.any():
        raise ValueError("No validation rows were scored; check n_splits against the data size.")

    # Score only rows that were validated; unscored rows still hold the 0.0 placeholder.
    rmse = np.sqrt(mean_squared_error(y.to_numpy()[scored], oof_pred[scored]))

    return rmse

In [None]:
# Define function to run Optuna optimization
def optimize_lgb_params(train_df, features, target, n_splits=10, n_trials=50, random_state=42):
    """Search LightGBM hyperparameters with Optuna (TPE sampler).

    Each trial samples a parameter set, scores it with ``cv_rmse_for_params``
    and returns the CV RMSE, which the study minimises.

    Row subsampling is enabled explicitly with ``subsample_freq=1``: LightGBM
    only performs bagging when the bagging frequency is positive, and the
    scikit-learn wrapper defaults it to 0, which would make the tuned
    ``subsample`` value a no-op.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data forwarded to ``cv_rmse_for_params``.
    features : list of str
        Feature column names.
    target : str
        Target column name.
    n_splits : int, default 10
        Number of time blocks used by the CV.
    n_trials : int, default 50
        Number of Optuna trials.
    random_state : int, default 42
        Seed for the sampler and for every model.

    Returns
    -------
    best_params : dict
        Best sampled parameters plus the fixed entries ``objective``,
        ``metric`` and ``subsample_freq``.
    best_value : float
        CV RMSE of the best trial.
    """
    sampler = TPESampler(seed=random_state)

    # NOTE(review): the pruner is attached to the study, but the objective below
    # reports a single value after the full CV run and never calls
    # trial.should_prune(), so no trial is actually pruned.
    pruner = MedianPruner(n_startup_trials=10, n_warmup_steps=0)

    # Fixed (non-tuned) parameters shared by every trial and by the final result.
    fixed_params = {
        "objective": "regression",
        "metric": "rmse",
        # Bag on every iteration so that `subsample` takes effect.
        "subsample_freq": 1,
    }

    def objective(trial):
        """Sample one parameter set and return its time-CV RMSE."""
        params = {
            **fixed_params,
            "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 500, 8000, step=250),
            "num_leaves": trial.suggest_int("num_leaves", 16, 512, log=True),
            "max_depth": trial.suggest_int("max_depth", -1, 16),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 20.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 30.0),
            "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
            "min_child_weight": trial.suggest_float("min_child_weight", 1e-4, 1.0, log=True),
        }

        rmse = cv_rmse_for_params(
            train_df=train_df,
            features=features,
            target=target,
            params=params,
            n_splits=n_splits,
            random_state=random_state,
            early_stopping=200,
        )

        # Record the final score as the trial's only intermediate value.
        trial.report(rmse, step=0)

        return rmse

    study = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    # study.best_params holds only the sampled values; add the fixed ones so the
    # final models are trained with exactly the configuration that was scored.
    best_params = dict(study.best_params)
    best_params.update(fixed_params)

    return best_params, study.best_value

In [None]:
# Define function to train LightGBM with time-aware CV using chosen parameters
def train_with_cv(train_df, features, target, params, n_splits=10, random_state=42, early_stopping=200):
    """Train one LightGBM model per time fold and collect out-of-fold predictions.

    Prints the RMSE of every fold and an overall out-of-fold RMSE. The overall
    figure is computed only over rows that were part of some validation block;
    the earliest time block is never validated and is left out of the score.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain ``date_id``, ``target`` and ``features``.
    features : list of str
        Feature column names.
    target : str
        Target column name.
    params : dict
        Keyword arguments for ``lgb.LGBMRegressor``. Must not contain
        ``random_state`` or ``verbose``, which are passed explicitly.
    n_splits : int, default 10
        Number of time blocks passed to ``make_time_folds``.
    random_state : int, default 42
        Seed forwarded to every model.
    early_stopping : int, default 200
        ``stopping_rounds`` for the LightGBM early-stopping callback.

    Returns
    -------
    models : list
        Fitted ``LGBMRegressor`` objects, one per fold, in time order.
    oof_pred : numpy.ndarray
        Array of ``len(train_df)`` predictions aligned positionally with
        ``train_df``. Rows that were never validated keep the value 0.0.
    """
    # Work on copies so dtype casts never touch the caller's frame.
    X = train_df[features].copy()
    y = train_df[target].copy()

    # LightGBM needs pandas "category" dtype to treat columns as categorical.
    cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
    for c in cat_cols:
        X[c] = X[c].astype("category")

    folds, _ = make_time_folds(train_df, n_splits=n_splits)

    # Out-of-fold predictions plus a mask of the rows that really received one.
    oof_pred = np.zeros(len(train_df), dtype=float)
    scored = np.zeros(len(train_df), dtype=bool)

    models = []

    for i, (tr_idx, va_idx) in enumerate(folds, 1):
        X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

        model = lgb.LGBMRegressor(**params, random_state=random_state, verbose=-1)
        model.fit(
            X_tr,
            y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="rmse",
            categorical_feature=cat_cols if len(cat_cols) else "auto",
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping, verbose=False)],
        )

        # Predict with the iteration selected by early stopping.
        pred_va = model.predict(X_va, num_iteration=model.best_iteration_)
        oof_pred[va_idx] = pred_va
        scored[va_idx] = True

        models.append(model)

        rmse = np.sqrt(mean_squared_error(y_va, pred_va))
        print(f"Fold {i} RMSE: {rmse:.6f}")

    # Score only validated rows; unscored rows still hold the 0.0 placeholder.
    if scored.any():
        oof_rmse = np.sqrt(mean_squared_error(y.to_numpy()[scored], oof_pred[scored]))
    else:
        oof_rmse = float("nan")

    print(f"OOF RMSE: {oof_rmse:.6f}")

    return models, oof_pred

In [None]:
# Define function to ensemble predictions across models
def predict_ensemble(models, X):
    """Average the predictions of several fitted models.

    Parameters
    ----------
    models : sequence
        Objects exposing ``predict(X, num_iteration=...)`` and a
        ``best_iteration_`` attribute.
    X : array-like
        Feature matrix; only ``len(X)`` and the models' own ``predict`` are
        used on it here.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(X)`` holding the mean prediction. With an
        empty ``models`` sequence the result is all zeros.
    """
    averaged = np.zeros(len(X), dtype=float)

    # Accumulate in place, one ensemble member at a time.
    for member in models:
        member_pred = member.predict(X, num_iteration=member.best_iteration_)
        np.add(averaged, member_pred, out=averaged)

    # Divide by the member count, falling back to 1 for an empty ensemble.
    divisor = len(models) or 1
    np.divide(averaged, divisor, out=averaged)

    return averaged

In [None]:
# Define function to compute simple market and strategy stats
def strategy_eval(train_df, target_col, pred_col, vol_cap_ratio=1.2):
    """Print and return a simple signal-times-return strategy diagnostic.

    The signal is the prediction minus its median. Strategy returns are
    ``alpha * signal * realized``, where ``alpha`` (at most 1.0) scales the
    strategy so its standard deviation does not exceed
    ``vol_cap_ratio`` times the market standard deviation. The market series is
    ``forward_returns`` when that column exists, otherwise the target itself.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame holding ``target_col`` and ``pred_col``.
    target_col : str
        Column with the realized values.
    pred_col : str
        Column with the model predictions.
    vol_cap_ratio : float, default 1.2
        Allowed ratio of strategy volatility to market volatility.

    Returns
    -------
    tuple of (numpy.ndarray, float)
        The per-row strategy returns and the scaling factor ``alpha``.
    """
    realized = train_df[target_col].values

    # Market proxy: forward_returns when available, else the realized target.
    has_market_col = "forward_returns" in train_df.columns
    market = train_df["forward_returns"].values if has_market_col else realized
    market_vol = np.std(market)

    # Median-centred prediction is the trading signal.
    predictions = train_df[pred_col].values
    sig = predictions - np.median(predictions)

    # Default to no exposure; only scale up when the signal actually varies.
    alpha = 0.0
    if not (np.std(sig) < 1e-12):
        strat_vol_per_unit = np.std(sig * realized)
        target_vol = vol_cap_ratio * market_vol
        if strat_vol_per_unit == 0:
            alpha = 0.0
        else:
            alpha = min(1.0, target_vol / strat_vol_per_unit)

    strat = alpha * sig * realized

    strat_mean = np.mean(strat)
    strat_vol = np.std(strat)
    market_mean = np.mean(market)

    # Mean-over-volatility ratios, guarded against a zero denominator.
    if strat_vol == 0:
        strat_sharpe = 0.0
    else:
        strat_sharpe = strat_mean / strat_vol
    if market_vol == 0:
        market_sharpe = 0.0
    else:
        market_sharpe = market_mean / market_vol

    report_lines = (
        f"Strategy mean: {strat_mean:.6f}",
        f"Strategy vol:  {strat_vol:.6f}",
        f"Market vol:    {market_vol:.6f}",
        f"Alpha scale:   {alpha:.6f}",
        f"Sharpe (strat): {strat_sharpe:.4f}",
        f"Sharpe (mkt):   {market_sharpe:.4f}",
    )
    for line in report_lines:
        print(line)

    return strat, alpha

In [None]:
# Define function to prepare test features safely
def build_test_matrix(test_df, features):
    """Build the model input matrix for the test set.

    Selects ``features`` from ``test_df`` into a new frame and casts
    object/category columns to pandas ``category`` dtype. Before doing so it
    verifies that none of the requested features is a known leak-prone column
    and that every feature exists in ``test_df``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Raw test data.
    features : list of str
        Feature column names, in the order used for training.

    Returns
    -------
    pandas.DataFrame
        Copy of ``test_df[features]``; ``test_df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``features`` contains a leak-prone column.
    KeyError
        If a requested feature is absent from ``test_df``.
    """
    # Target-like columns that must never be fed to the model as inputs.
    leak_like = (
        "forward_returns",
        "risk_free_rate",
        "market_forward_excess_returns",
    )
    leaked = [c for c in features if c in leak_like]
    if leaked:
        raise ValueError(f"Leak-prone columns must not be used as features: {leaked}")

    # Fail with an explicit list instead of pandas' generic indexing error.
    missing = [c for c in features if c not in test_df.columns]
    if missing:
        raise KeyError(f"Test data is missing feature columns: {missing}")

    X_test = test_df[features].copy()

    # Cast string-like columns to category dtype.
    cat_cols = X_test.select_dtypes(include=["object", "category"]).columns.tolist()
    for c in cat_cols:
        X_test[c] = X_test[c].astype("category")

    return X_test

In [None]:
# Define the main function
def main():
    """Run the full pipeline: load, tune, train, diagnose, predict, save.

    Steps, in order:

    1. Load the train/test CSVs and resolve target and feature columns.
    2. Tune LightGBM hyperparameters with Optuna under time-based CV.
    3. Train one model per fold with the best parameters.
    4. Print a simple strategy diagnostic on the out-of-fold predictions,
       restricted to rows that were actually validated (the earliest time
       block never receives a prediction and would otherwise enter the
       statistics with a placeholder value of 0).
    5. Average the fold models on the test set and write ``submission.csv``.
    """
    # Input locations.
    train_path = "/kaggle/input/hull-tactical-market-prediction/train.csv"
    test_path = "/kaggle/input/hull-tactical-market-prediction/test.csv"

    # Run settings shared by the search and the final training.
    n_splits = 10
    n_trials = 768
    random_state = 42

    train_df, test_df = load_data(train_path, test_path)

    # The id-column list is not needed in this pipeline.
    target, features, _ = get_columns(train_df, target_pref="market_forward_excess_returns")

    print("Target column:", target)
    print("Number of features:", len(features))
    print("Some features:", features[:10])
    print("Train shape:", train_df.shape)
    print("Columns:", train_df.columns.tolist()[:20])

    best_params, best_cv_rmse = optimize_lgb_params(
        train_df=train_df,
        features=features,
        target=target,
        n_splits=n_splits,
        n_trials=n_trials,
        random_state=random_state,
    )

    print("Best CV RMSE:", f"{best_cv_rmse:.6f}")
    print("Best params:", best_params)

    models, oof_pred = train_with_cv(
        train_df=train_df,
        features=features,
        target=target,
        params=best_params,
        n_splits=n_splits,
        random_state=random_state,
    )

    # Rebuild the same folds to find which rows received a real OOF prediction,
    # then evaluate the strategy on those rows only. A separate frame is used
    # so the loaded training data is not mutated.
    folds, _ = make_time_folds(train_df, n_splits=n_splits)
    scored_idx = np.concatenate([va_idx for _, va_idx in folds])
    eval_df = train_df.iloc[scored_idx].copy()
    eval_df["oof_pred"] = oof_pred[scored_idx]

    _strat, _alpha = strategy_eval(
        train_df=eval_df,
        target_col=target,
        pred_col="oof_pred",
        vol_cap_ratio=1.2,
    )

    X_test = build_test_matrix(test_df, features)
    test_pred = predict_ensemble(models, X_test)

    # Submission: date_id first when the test set provides it, then the prediction.
    sub = pd.DataFrame({"prediction": test_pred})
    if "date_id" in test_df.columns:
        sub.insert(0, "date_id", test_df["date_id"].values)

    out_path = "submission.csv"
    sub.to_csv(out_path, index=False)

    print(f"Saved submission to {out_path}")
    print(f"Columns: {list(sub.columns)}")
    print(sub.head().to_string(index=False))

In [None]:
# Entry point: run the whole pipeline when this code executes as the top-level
# module (__name__ is "__main__"), and skip it when the code is imported.
if __name__ == "__main__":
    main()