In [285]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")  # nuke all warnings



In [286]:
data = pd.read_csv('train.csv')
X = data.drop(columns = ['forward_returns', 'market_forward_excess_returns', 'risk_free_rate'])
y = data['market_forward_excess_returns']


In [287]:
X

Unnamed: 0,date_id,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,V12,V13,V2,V3,V4,V5,V6,V7,V8,V9
0,0,0,0,0,1,1,0,0,0,1,...,,,,,,,,,,
1,1,0,0,0,1,1,0,0,0,1,...,,,,,,,,,,
2,2,0,0,0,1,0,0,0,0,1,...,,,,,,,,,,
3,3,0,0,0,1,0,0,0,0,0,...,,,,,,,,,,
4,4,0,0,0,1,0,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8985,8985,0,0,0,0,0,0,0,0,0,...,0.533730,-0.432282,0.785053,0.469577,0.837963,1.226772,0.822751,-0.707361,0.142857,-0.649616
8986,8986,0,0,0,0,0,0,0,0,0,...,0.526455,-0.429506,0.767857,0.671958,0.837963,0.785877,0.805556,-0.715692,0.196098,-0.668289
8987,8987,0,0,1,0,0,0,0,0,0,...,0.433532,-0.425462,0.734127,0.481481,0.787698,0.834898,0.823413,-0.723949,0.133929,-0.670946
8988,8988,0,0,0,0,0,0,0,0,0,...,0.394180,-0.385170,0.695106,0.655423,0.783730,0.994026,0.851852,-0.684937,0.101852,-0.646265


In [288]:
features = X.columns.tolist()


In [289]:
vol_features = [x for x in features if x.startswith('V')]
mkt_features = [x for x in features if x.startswith('M') and not x.startswith('MOM')]
econ_features = [x for x in features if x.startswith('E')]
interest_features = [x for x in features if x.startswith('I')]
price_features = [x for x in features if x.startswith('P')]
sentiment_features = [x for x in features if x.startswith('S')]
binary_features = [x for x in features if x.startswith('D')]


In [290]:
feature_types = {
    'volatility': vol_features,
    'market': mkt_features,
    'economic': econ_features,
    'interest': interest_features,
    'price': price_features,
    'sentiment': sentiment_features,
    'binary': binary_features}

In [291]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle = False)

In [292]:
X_features_train = X_train[-252:]
y_features_train = y_train[-252:]
X_train = X_train[:-252]
y_train = y_train[:-252]

In [293]:
X_train_cleaned = X_train.ffill().bfill()
X_features_train_cleaned = X_features_train.ffill().bfill()
X_val_cleaned = X_val.ffill().bfill()


In [294]:
cols_with_na_train = X_train_cleaned.columns[X_train_cleaned.isna().any()]
cols_with_na_val = X_val_cleaned.columns[X_val_cleaned.isna().any()]
cols_with_na_features_train = X_features_train_cleaned.columns[X_features_train_cleaned.isna().any()]
all_cols_with_na = set(cols_with_na_train).union(set(cols_with_na_val)).union(set(cols_with_na_features_train))
drop_cols = list(all_cols_with_na)
X_train_cleaned = X_train_cleaned.drop(columns=drop_cols)
X_val_cleaned = X_val_cleaned.drop(columns=drop_cols)
X_features_train_cleaned = X_features_train_cleaned.drop(columns=drop_cols)

In [295]:
def feature_selection(X_tr, y_tr, X_va, y_va, *, k=30, top_corr=60,
                     use_extratrees=True, val_last=252, n_repeats=5, seed=42, w_corr=0.6, w_perm=0.4):
    # 1) keep a small validation slice
    if val_last is not None and len(X_va) > val_last:
        Xv = X_va.iloc[-val_last:].astype(np.float32).copy()
        yv = y_va.iloc[-val_last:].to_numpy()
    else:
        Xv = X_va.astype(np.float32).copy()
        yv = y_va.to_numpy()

    # 2) univariate Pearson on TRAIN; take top N
    corr_abs = X_tr.apply(lambda c: np.corrcoef(c, y_tr)[0,1], axis=0).abs().fillna(0.0)
    cand = corr_abs.sort_values(ascending=False).head(min(top_corr, X_tr.shape[1])).index.tolist()

    # 3) small, fast tree on TRAIN
    Tree = ExtraTreesRegressor if use_extratrees else RandomForestRegressor
    tree = Tree(
        n_estimators=100, max_depth=8, min_samples_leaf=0.01, max_features=0.7,
        n_jobs=-1, random_state=seed
    ).fit(X_tr[cand].astype(np.float32), y_tr.to_numpy())

    # 4) permutation importance on VALID (few repeats, parallel)
    pi = permutation_importance(tree, Xv[cand], yv, n_repeats=n_repeats,
                                random_state=seed, scoring="neg_mean_squared_error", n_jobs=-1)
    perm = pd.Series(np.clip(pi.importances_mean, 0, None), index=cand)

    # 5) combine by rank (robust to scaling)
    r_corr = corr_abs.loc[cand].rank(ascending=False)
    r_perm = perm.rank(ascending=False)
    score = (w_corr * r_corr + w_perm * r_perm).sort_values(ascending=False)

    selected = score.index[:min(k, len(score))].tolist()
    summary = pd.DataFrame({"corr_abs": corr_abs.loc[cand], "perm": perm, "score_rank": score}).loc[score.index]
    return selected, summary

In [296]:
def hit_rate(y_true, y_pred, *, dropna: bool = True, margin: float = 0.0, count_ties: bool = False) -> float:
    """
    Sign accuracy (hit rate): fraction of times sign(y_pred) == sign(y_true).

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length sequences of numbers.
    dropna : bool, default True
        If True, drop any pair with NaN in either array.
        If False and NaNs are present, returns np.nan.
    margin : float, default 0.0
        Treat predictions with |y_pred| <= margin as 0 (neutral band).
    count_ties : bool, default False
        If False, exclude any pair where sign is 0 on either side.
        If True, include pairs with sign==0 and count them as correct
        only when both are 0.

    Returns
    -------
    hit : float
        Proportion in [0,1], or np.nan if no eligible pairs.
    """
    a = np.asarray(y_true, dtype=float)
    b = np.asarray(y_pred, dtype=float)

    if a.shape != b.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    mask = np.isfinite(a) & np.isfinite(b)
    if not dropna and not mask.all():
        return np.nan
    a = a[mask]
    b = b[mask]

    # Apply neutral band to predictions
    if margin > 0:
        b = b.copy()
        b[np.abs(b) <= margin] = 0.0

    s_true = np.sign(a)
    s_pred = np.sign(b)

    if count_ties:
        eligible = np.ones_like(s_true, dtype=bool)
    else:
        eligible = (s_true != 0) & (s_pred != 0)

    if not np.any(eligible):
        return np.nan

    hits = (s_true[eligible] == s_pred[eligible]).mean()
    return float(hits)

In [297]:
def pearson_corr(y_true, y_pred):
    """
    Pearson correlation coefficient between y_true and y_pred.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length sequences of numbers.

    Returns
    -------
    corr : float
        Pearson correlation coefficient in [-1,1], or np.nan if undefined.
    """
    a = np.asarray(y_true, dtype=float)
    b = np.asarray(y_pred, dtype=float)

    if a.shape != b.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    mask = np.isfinite(a) & np.isfinite(b)
    a = a[mask]
    b = b[mask]

    if len(a) == 0:
        return np.nan

    corr, _ = pearsonr(a, b)
    return float(corr)

In [298]:
def ridge_train(X_train, y_train, features = None, alpha=1.0):
    """
    Train a Ridge regression model and evaluate on validation set.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features.
    y_train : pd.Series
        Training target.
   
    alpha : float, default 1.0
        Regularization strength for Ridge regression.

    Returns
    -------
    model : Ridge
        Trained Ridge regression model.
    
    """
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('ridge', Ridge(alpha=alpha))
    ])

    if features:
        model.fit(X_train[features], y_train)
    else:
        model.fit(X_train, y_train)

    return model

In [299]:
def logistic_train(X_train, y_train, features = None, C=1.0):
    """
    Train a Logistic regression model and evaluate on validation set.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features.
    y_train : pd.Series
        Training target.
   
    C : float, default 1.0
        Inverse of regularization strength for Logistic regression.

    Returns
    -------
    model : LogisticRegression
        Trained Logistic regression model.
    
    """
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('logistic', LogisticRegression(C=C, max_iter=1000))
    ])

    if features:
        model.fit(X_train[features], y_train)
    else:
        model.fit(X_train, y_train)

    return model

In [300]:
def trees_train(X_train, y_train, features = None, type='RandomForest'):
    """
    Train a Tree Regressor and evaluate on validation set.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features.
    y_train : pd.Series
        Training target.
   
    n_estimators : int, default 100
        Number of trees in the forest.
    
    max_depth : int or None, default None
        Maximum depth of the tree.

    random_state : int, default 42
        Random seed for reproducibility.

    Returns
    -------
    model : Regressor
    
    """
    
    if type == 'RandomForest':
        model = RandomForestRegressor(n_estimators=100,
            max_depth=8,
            min_samples_leaf=0.01,     # 1% of samples per leaf (robust)
            min_samples_split=0.02,
            max_features=0.7,
            bootstrap=True,
            n_jobs=-1, random_state=42)
        
    elif type == 'ExtraTrees':
        model = ExtraTreesRegressor(
            n_estimators=100,
            max_depth=8,
            min_samples_leaf=0.01,
            min_samples_split=0.02,
            max_features=0.7,
            bootstrap=False,
            n_jobs=-1, random_state=42
        )

    elif type == 'XGBoost':
        model = XGBRegressor(
            n_estimators=100,
            learning_rate=0.10,
            max_depth=4,
            subsample=0.7,
            colsample_bytree=0.7,
            min_child_weight=10,       # combats noise
            reg_lambda=2.0,
            objective="reg:squarederror",
            n_jobs=-1, random_state=42
        )
    elif type == 'LightGBM':
        model = LGBMRegressor(
            verbosity = -1,
            n_estimators=100,
            learning_rate=0.10,
            max_depth=6,
            num_leaves=31,             # <= 2^max_depth for safety
            min_data_in_leaf=100,      # robust on small-signal data
            feature_fraction=0.7,
            bagging_fraction=0.7,
            bagging_freq=1,
            lambda_l2=5.0,
            extra_trees=True,          # adds randomness like ExtraTrees
            n_jobs=-1, random_state=42
        )
    if features:
        model.fit(X_train[features], y_train)
    else:
        model.fit(X_train, y_train)

    return model

In [301]:
def linear_calibrate(pred_train, y_train, *, dropna: bool = True, fit_intercept: bool = True):
    """
    Fit a scikit-learn LinearRegression on TRAIN predictions:
        y = alpha + beta * pred

    Returns a Pipeline that reshapes 1D inputs and applies the fitted LinearRegression.
    You can call .predict(...) on it with a 1D array/list/Series of predictions.

    Parameters
    ----------
    pred_train : array-like, shape (n_samples,)
        Model predictions on the training window (e.g., your tree's outputs).
    y_train : array-like, shape (n_samples,)
        Realized targets on the training window (e.g., next-day returns).
    dropna : bool, default True
        Drop pairs with NaN/Inf before fitting. If False and NaNs exist, raises ValueError.
    fit_intercept : bool, default True
        Passed to LinearRegression.

    Returns
    -------
    model : sklearn Pipeline
        Use model.predict(new_pred) to get calibrated predictions.
        Access alpha/beta via:
            alpha = model.named_steps['lr'].intercept_
            beta  = model.named_steps['lr'].coef_[0]
    """
    x = np.asarray(pred_train, dtype=float)
    y = np.asarray(y_train, dtype=float)
    if x.shape != y.shape:
        raise ValueError("pred_train and y_train must have the same shape")

    mask = np.isfinite(x) & np.isfinite(y)
    if not dropna and not mask.all():
        raise ValueError("NaNs/Infs present; set dropna=True to filter them out.")
    x, y = x[mask], y[mask]
    if x.size < 2:
        raise ValueError("Not enough data to calibrate (need ≥2 finite pairs).")

    # Pipeline so you can pass 1D arrays to .predict() without manual reshape
    model = Pipeline(steps=[
        ("reshape", FunctionTransformer(lambda z: np.asarray(z, dtype=float).reshape(-1, 1), validate=False)),
        ("lr", LinearRegression(fit_intercept=fit_intercept)),
    ])
    model.fit(x, y)
    return model


In [302]:
def predict(model, X, calibrate_model):
    """
    Generate predictions using the trained model.

    Parameters
    ----------
    model : Trained model
        The trained regression model.
    X : pd.DataFrame
        Features for prediction.

    Returns
    -------
    y_pred : np.ndarray
        Predicted values.
    """
    y_pred = model.predict(X)

    calibrated_pred = calibrate_model.predict(y_pred)
    return calibrated_pred

In [303]:
features_selected, feature_summary = feature_selection(X_train_cleaned, y_train, X_features_train_cleaned, y_features_train, w_corr=0.9, w_perm=0.1)

In [304]:
ridge = ridge_train(X_train_cleaned, y_train, alpha=1.0)
ridge_predict_train = ridge.predict(X_train_cleaned)
calibrate_model = linear_calibrate(ridge_predict_train, y_train)
ridge_predict_val = predict(ridge, X_val_cleaned, calibrate_model)
ridge_hit_rate = hit_rate(y_val, ridge_predict_val)
ridge_pearson = pearson_corr(y_val, ridge_predict_val)
print("Ridge Hit Rate:", ridge_hit_rate, "Pearson Correlation:", ridge_pearson)

Ridge Hit Rate: 0.5205784204671857 Pearson Correlation: 0.0075661210040566355


In [None]:
logistic = logistic_train(X_train_cleaned, y_train.apply(lambda x: 1 if x > 0 else 0), C=1.0)
logistic_predict_train = logistic.predict_proba(X_train_cleaned)
logistic_calibrate_model = linear_calibrate(logistic_predict_train[:,1], y_train)
logistic_predict_val = predict(logistic, X_val_cleaned, logistic_calibrate_model)
logistic_hit_rate = hit_rate(y_val, logistic_predict_val)
logistic_pearson = pearson_corr(y_val, logistic_predict_val)
print("Logistic Hit Rate:", logistic_hit_rate, "Pearson Correlation:", logistic_pearson)

Logistic Hit Rate: 0.4860956618464961 Pearson Correlation: -0.010489447890083457


In [306]:
lightgbm = trees_train(X_train_cleaned, y_train, features=features_selected, type='LightGBM')
lightgbm_predict_train = lightgbm.predict(X_train_cleaned[features_selected])
lightgbm_calibrate_model = linear_calibrate(lightgbm_predict_train, y_train)
lightgbm_predict_val = predict(lightgbm, X_val_cleaned[features_selected], lightgbm_calibrate_model)
lightgbm_hit_rate = hit_rate(y_val, lightgbm_predict_val)
lightgbm_pearson = pearson_corr(y_val, lightgbm_predict_val)
print("LightGBM Hit Rate:", lightgbm_hit_rate, "Pearson Correlation:", lightgbm_pearson)   

LightGBM Hit Rate: 0.5005561735261401 Pearson Correlation: 0.010728478686426124


In [307]:
rf = trees_train(X_train_cleaned, y_train, type='RandomForest')
rf_predict_train = rf.predict(X_train_cleaned)
rf_calibrate_model = linear_calibrate(rf_predict_train, y_train)        
rf_predict_val = predict(rf, X_val_cleaned, rf_calibrate_model)
rf_hit_rate = hit_rate(y_val, rf_predict_val)
rf_pearson = pearson_corr(y_val, rf_predict_val)
print("Random Forest Hit Rate:", rf_hit_rate, "Pearson Correlation:", rf_pearson)

Random Forest Hit Rate: 0.5072302558398221 Pearson Correlation: 0.0293201629869648


In [308]:
models = ['RandomForest', 'ExtraTrees', 'XGBoost', 'LightGBM']

In [309]:
ridge_all = ridge_train(X_train_cleaned, y_train, list(X_train_cleaned.columns), alpha=1.0)
ridge_all_predict_train = ridge_all.predict(X_train_cleaned)
calibrate_model = linear_calibrate(ridge_all_predict_train, y_train)
y_pred_ridge = predict(ridge_all, X_val_cleaned, calibrate_model)
pearson_score = pearson_corr(y_val, y_pred_ridge)
hitrate_score = hit_rate(y_val, y_pred_ridge)
print('Ridge All: Pearson Correlation: ', pearson_score, '  Hit Rate: ', hitrate_score)
for key, value in feature_types.items():
    ridge_subset = ridge_train(X_train_cleaned[value], y_train, alpha=1.0)
    ridge_subset_predict_train = ridge_subset.predict(X_train_cleaned[value])
    calibrate_model = linear_calibrate(ridge_subset_predict_train, y_train)
    y_pred_subset = predict(ridge_subset, X_val_cleaned[value], calibrate_model)
    pearson_score = pearson_corr(y_val, y_pred_subset)
    hitrate_score = hit_rate(y_val, y_pred_subset)
    print(f'Ridge - Feature Type: {key}, Pearson Correlation: {pearson_score:.4f}, Hit Rate: {hitrate_score:.4f}')

Ridge All: Pearson Correlation:  0.0075661210040566355   Hit Rate:  0.5205784204671857
Ridge - Feature Type: volatility, Pearson Correlation: 0.0536, Hit Rate: 0.5323
Ridge - Feature Type: market, Pearson Correlation: 0.0361, Hit Rate: 0.5050


KeyError: "['E7'] not in index"

In [None]:
for model_type in models:
    tree_all = trees_train(X_train_cleaned, y_train, type=model_type)
    tree_all_predict_train = tree_all.predict(X_train_cleaned)
    calibrate_model = linear_calibrate(tree_all_predict_train, y_train)
    y_pred_tree = predict(tree_all, X_val_cleaned, calibrate_model)
    pearson_score = pearson_corr(y_val, y_pred_tree)
    hitrate_score = hit_rate(y_val, y_pred_tree)
    print(f'{model_type} All: Pearson Correlation: {pearson_score:.4f}, Hit Rate: {hitrate_score:.4f}')
    for key, value in feature_types.items():
        tree_subset = trees_train(X_train_cleaned[value], y_train, type=model_type)
        tree_subset_predict_train = tree_subset.predict(X_train_cleaned[value])
        calibrate_model = linear_calibrate(tree_subset_predict_train, y_train)
        y_pred_subset = predict(tree_subset, X_val_cleaned[value], calibrate_model)
        pearson_score = pearson_corr(y_val, y_pred_subset)
        hitrate_score = hit_rate(y_val, y_pred_subset)
        print(f'{model_type} - Feature Type: {key}, Pearson Correlation: {pearson_score:.4f}, Hit Rate: {hitrate_score:.4f}')

RandomForest All: Pearson Correlation: 0.0240, Hit Rate: 0.5111
RandomForest - Feature Type: volatility, Pearson Correlation: 0.0303, Hit Rate: 0.5245
RandomForest - Feature Type: market, Pearson Correlation: 0.0281, Hit Rate: 0.5044
RandomForest - Feature Type: economic, Pearson Correlation: -0.0137, Hit Rate: 0.4989
RandomForest - Feature Type: interest, Pearson Correlation: 0.0044, Hit Rate: 0.4967
RandomForest - Feature Type: price, Pearson Correlation: 0.0355, Hit Rate: 0.4900
RandomForest - Feature Type: sentiment, Pearson Correlation: 0.0001, Hit Rate: 0.5195
RandomForest - Feature Type: binary, Pearson Correlation: 0.0067, Hit Rate: 0.4967
ExtraTrees All: Pearson Correlation: 0.0143, Hit Rate: 0.4967
ExtraTrees - Feature Type: volatility, Pearson Correlation: 0.0235, Hit Rate: 0.5239
ExtraTrees - Feature Type: market, Pearson Correlation: 0.0239, Hit Rate: 0.5000
ExtraTrees - Feature Type: economic, Pearson Correlation: -0.0147, Hit Rate: 0.5095
ExtraTrees - Feature Type: inter

In [None]:
y_val

7192   -0.004158
7193   -0.001529
7194    0.001474
7195    0.004647
7196    0.008136
          ...   
8985    0.001990
8986    0.001845
8987    0.002424
8988    0.007843
8989   -0.000368
Name: market_forward_excess_returns, Length: 1798, dtype: float64

In [None]:
y_val[y_val>0]

7194    0.001474
7195    0.004647
7196    0.008136
7200    0.004543
7202    0.005060
          ...   
8983    0.007887
8985    0.001990
8986    0.001845
8987    0.002424
8988    0.007843
Name: market_forward_excess_returns, Length: 942, dtype: float64