In [114]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from arch.univariate import ConstantMean, GARCH, StudentsT, Normal

import warnings

warnings.filterwarnings("ignore")  # nuke all warnings



In [115]:
X_train = pd.read_csv('X_train_enriched.csv')
y_train = pd.read_csv('y_train.csv')['market_forward_excess_returns']


In [116]:
X_features_train = X_train[-252:]
X_train = X_train[:-252]
y_features_train = y_train[-252:]
y_train = y_train[:-252]

In [117]:
X_train_cleaned = X_train.ffill().fillna(0)
X_features_train_cleaned = X_features_train.ffill().fillna(0)

In [118]:
good_features = ['D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'E1',
 'E10',
 'E11',
 'E12',
 'E13',
 'E14',
 'E15',
 'E16',
 'E17',
 'E18',
 'E19',
 'E2',
 'E20',
 'E3',
 'E4',
 'E5',
 'E6',
 'E8',
 'E9',
 'I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'M10',
 'M11',
 'M12',
 'M15',
 'M16',
 'M17',
 'M18',
 'M2',
 'M3',
 'M4',
 'M5',
 'M7',
 'M8',
 'M9',
 'P1',
 'P10',
 'P11',
 'P12',
 'P13',
 'P2',
 'P3',
 'P4',
 'P5',
 'P6',
 'P7',
 'P8',
 'P9',
 'S1',
 'S10',
 'S11',
 'S12',
 'S2',
 'S4',
 'S5',
 'S6',
 'S7',
 'S8',
 'S9',
 'V1',
 'V11',
 'V12',
 'V13',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8']

In [119]:
def feature_selection(X_tr, y_tr, X_va, y_va, *, k=30, top_corr=60,
                     use_extratrees=True, val_last=252, n_repeats=5, seed=42, w_corr=0.6, w_perm=0.4):
    # 1) keep a small validation slice
    if val_last is not None and len(X_va) > val_last:
        Xv = X_va.iloc[-val_last:].astype(np.float32).copy()
        yv = y_va.iloc[-val_last:].to_numpy()
    else:
        Xv = X_va.astype(np.float32).copy()
        yv = y_va.to_numpy()

    # 2) univariate Pearson on TRAIN; take top N
    corr_abs = X_tr.apply(lambda c: np.corrcoef(c, y_tr)[0,1], axis=0).abs().fillna(0.0)
    cand = corr_abs.sort_values(ascending=False).head(min(top_corr, X_tr.shape[1])).index.tolist()

    # 3) small, fast tree on TRAIN
    Tree = ExtraTreesRegressor if use_extratrees else RandomForestRegressor
    tree = Tree(
        n_estimators=100, max_depth=8, min_samples_leaf=0.01, max_features=0.7,
        n_jobs=-1, random_state=seed
    ).fit(X_tr[cand].astype(np.float32), y_tr.to_numpy())

    # 4) permutation importance on VALID (few repeats, parallel)
    pi = permutation_importance(tree, Xv[cand], yv, n_repeats=n_repeats,
                                random_state=seed, scoring="neg_mean_squared_error", n_jobs=-1)
    perm = pd.Series(np.clip(pi.importances_mean, 0, None), index=cand)

    # 5) combine by rank (robust to scaling)
    r_corr = corr_abs.loc[cand].rank(ascending=False)
    r_perm = perm.rank(ascending=False)
    score = (w_corr * r_corr + w_perm * r_perm).sort_values(ascending=False)

    selected = score.index[:min(k, len(score))].tolist()
    summary = pd.DataFrame({"corr_abs": corr_abs.loc[cand], "perm": perm, "score_rank": score}).loc[score.index]
    return selected, summary

In [120]:
def logistic_train(X_train, y_train, features = None, C=1.0):
    """
    Train a Logistic regression model and evaluate on validation set.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features.
    y_train : pd.Series
        Training target.
   
    C : float, default 1.0
        Inverse of regularization strength for Logistic regression.

    Returns
    -------
    model : LogisticRegression
        Trained Logistic regression model.
    
    """
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('logistic', LogisticRegression(C=C, max_iter=1000))
    ])

    if features:
        model.fit(X_train[features], y_train)
    else:
        model.fit(X_train, y_train)

    return model

In [121]:
def trees_train(X_train, y_train, features = None, type='RandomForest'):
    """
    Train a Tree Regressor and evaluate on validation set.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features.
    y_train : pd.Series
        Training target.
   
    n_estimators : int, default 100
        Number of trees in the forest.
    
    max_depth : int or None, default None
        Maximum depth of the tree.

    random_state : int, default 42
        Random seed for reproducibility.

    Returns
    -------
    model : Regressor
    
    """
    
    if type == 'RandomForest':
        model = RandomForestRegressor(n_estimators=100,
            max_depth=8,
            min_samples_leaf=0.01,     # 1% of samples per leaf (robust)
            min_samples_split=0.02,
            max_features=0.7,
            bootstrap=True,
            n_jobs=-1, random_state=42)
        
    elif type == 'ExtraTrees':
        model = ExtraTreesRegressor(
            n_estimators=100,
            max_depth=8,
            min_samples_leaf=0.01,
            min_samples_split=0.02,
            max_features=0.7,
            bootstrap=False,
            n_jobs=-1, random_state=42
        )

    elif type == 'XGBoost':
        model = XGBRegressor(
            n_estimators=100,
            learning_rate=0.10,
            max_depth=4,
            subsample=0.7,
            colsample_bytree=0.7,
            min_child_weight=10,       # combats noise
            reg_lambda=2.0,
            objective="reg:squarederror",
            n_jobs=-1, random_state=42
        )
    elif type == 'LightGBM':
        model = LGBMRegressor(
            verbosity = -1,
            n_estimators=100,
            learning_rate=0.10,
            max_depth=6,
            num_leaves=31,             # <= 2^max_depth for safety
            min_data_in_leaf=100,      # robust on small-signal data
            feature_fraction=0.7,
            bagging_fraction=0.7,
            bagging_freq=1,
            lambda_l2=5.0,
            extra_trees=True,          # adds randomness like ExtraTrees
            n_jobs=-1, random_state=42
        )
    if features:
        model.fit(X_train[features], y_train)
    else:
        model.fit(X_train, y_train)

    return model

In [122]:
def linear_calibrate(pred_train, y_train, *, dropna: bool = True, fit_intercept: bool = True):
    """
    Fit a scikit-learn LinearRegression on TRAIN predictions:
        y = alpha + beta * pred

    Returns a Pipeline that reshapes 1D inputs and applies the fitted LinearRegression.
    You can call .predict(...) on it with a 1D array/list/Series of predictions.

    Parameters
    ----------
    pred_train : array-like, shape (n_samples,)
        Model predictions on the training window (e.g., your tree's outputs).
    y_train : array-like, shape (n_samples,)
        Realized targets on the training window (e.g., next-day returns).
    dropna : bool, default True
        Drop pairs with NaN/Inf before fitting. If False and NaNs exist, raises ValueError.
    fit_intercept : bool, default True
        Passed to LinearRegression.

    Returns
    -------
    model : sklearn Pipeline
        Use model.predict(new_pred) to get calibrated predictions.
        Access alpha/beta via:
            alpha = model.named_steps['lr'].intercept_
            beta  = model.named_steps['lr'].coef_[0]
    """
    x = np.asarray(pred_train, dtype=float).flatten()
    y = np.asarray(y_train, dtype=float).flatten()
    if x.shape != y.shape:
        raise ValueError("pred_train and y_train must have the same shape")

    mask = np.isfinite(x) & np.isfinite(y)
    if not dropna and not mask.all():
        raise ValueError("NaNs/Infs present; set dropna=True to filter them out.")
    x, y = x[mask], y[mask]
    if x.size < 2:
        raise ValueError("Not enough data to calibrate (need ≥2 finite pairs).")

    # Pipeline so you can pass 1D arrays to .predict() without manual reshape
    model = Pipeline(steps=[
        ("reshape", FunctionTransformer(lambda z: np.asarray(z, dtype=float).reshape(-1, 1), validate=False)),
        ("lr", LinearRegression(fit_intercept=fit_intercept)),
    ])
    model.fit(x, y)
    return model


In [123]:
def mean_predict(model, X, calibrate_model):
    """
    Generate predictions using the trained model.

    Parameters
    ----------
    model : Trained model
        The trained regression model.
    X : pd.DataFrame
        Features for prediction.

    Returns
    -------
    y_pred : np.ndarray
        Predicted values.
    """
    y_pred = model.predict(X)

    calibrated_pred = calibrate_model.predict(y_pred)
    return calibrated_pred

In [124]:
def make_lag_features(df, cols, lags=(1, 5, 20), *, keep_original=True, dtype="float32"):
    """
    Returns a new DataFrame with optional originals + lagged copies.
    Uses shift(L), so expect NaNs at the head.
    """
    out = pd.DataFrame(index=df.index)
    if keep_original:
        out[cols] = df[cols]
    for L in lags:
        out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
    if dtype is not None:
        for c in out.columns:
            if pd.api.types.is_float_dtype(out[c]):
                out[c] = out[c].astype(dtype)
    return out

In [125]:
# 4) Rolling stats (+ optional z-scores), time-safe via past_only=True
def make_rolling_features(
    df, cols,
    windows=(5, 20),
    *,
    stats=("mean", "std"),     # any of: "mean","std","min","max","sum"
    zscore: bool = False,       # z_t = (x_t - mean_{past}) / (std_{past}+eps)
    past_only: bool = True,    # shift(1) inside rolling to avoid leakage
    min_periods: int = None,  # default = window size
    eps: float = 1e-9,
    dtype: str = "float32",
):
    """
    Builds rolling features for each col over each window.
    If past_only=True, the rolling window excludes the current row (safe for t+1 targets).
    """
    out = pd.DataFrame(index=df.index)
    for w in windows:
        mp = w if min_periods is None else min_periods
        base = df[cols].shift(1) if past_only else df[cols]
        roll = base.rolling(window=w, min_periods=mp)

        if "mean" in stats:
            m = roll.mean()
            out[[f"{c}_rollmean{w}" for c in cols]] = m
        if "std" in stats:
            s = roll.std(ddof=0)
            out[[f"{c}_rollstd{w}" for c in cols]] = s
        if "min" in stats:
            out[[f"{c}_rollmin{w}" for c in cols]] = roll.min()
        if "max" in stats:
            out[[f"{c}_rollmax{w}" for c in cols]] = roll.max()
        if "sum" in stats:
            out[[f"{c}_rollsum{w}" for c in cols]] = roll.sum()

        if zscore:
            # need mean & std; compute if not already available
            if "mean" in stats:
                m = out[[f"{c}_rollmean{w}" for c in cols]].copy()
                m.columns = cols
            else:
                m = roll.mean()
            if "std" in stats:
                s = out[[f"{c}_rollstd{w}" for c in cols]].copy()
                s.columns = cols
            else:
                s = roll.std(ddof=0)
            for c in cols:
                out[f"{c}_z{w}"] = (df[c] - m[c]) / (s[c] + eps)

    if dtype is not None:
        for c in out.columns:
            if pd.api.types.is_float_dtype(out[c]):
                out[c] = out[c].astype(dtype)
    return out

In [126]:
regression_features, regression_feature_summary = feature_selection(X_train_cleaned, y_train, X_features_train_cleaned, y_features_train, w_corr=0.9, w_perm=0.1, k = 30, top_corr=30)

In [127]:
logistic_y = [1 if x > 0 else 0 for x in list(np.asarray(y_train).flatten())]
logistic = logistic_train(X_train_cleaned, pd.Series(logistic_y), features = regression_features, C=1.0)
logistic_predict_train = logistic.predict_proba(X_train_cleaned[regression_features])
logistic_calibrate_model = linear_calibrate(logistic_predict_train[:,1], y_train)


In [128]:
btree_features, btree_feature_summary = feature_selection(X_train_cleaned, y_train, X_features_train_cleaned, y_features_train, w_corr=0.7, w_perm=0.3, k = 200, top_corr=200)

In [129]:
lightgbm = trees_train(X_train_cleaned, y_train, features=btree_features, type='LightGBM')
lightgbm_predict_train = lightgbm.predict(X_train_cleaned[btree_features])
lightgbm_calibrate_model = linear_calibrate(lightgbm_predict_train, y_train)


In [130]:
tree_features, tree_feature_summary = feature_selection(X_train_cleaned, y_train, X_features_train_cleaned, y_features_train, w_corr=0.7, w_perm=0.3, k = 30, top_corr=100)

In [131]:
rf = trees_train(X_train_cleaned, y_train, features=tree_features, type='RandomForest')
rf_predict_train = rf.predict(X_train_cleaned[tree_features])
rf_calibrate_model = linear_calibrate(rf_predict_train, y_train)  

In [132]:
def ewma_step(prev_var: float, prev_ret: float, lam: float = 0.94) -> float:
    """Given yesterday's variance v_t and return r_t (same units you trained on),
    return v_{t+1} = λ v_t + (1-λ) r_t^2."""
    #print(lam, prev_var, prev_ret)
    return lam * float(prev_var) + (1.0 - lam) * (float(prev_ret) ** 2)

In [None]:
def predict(test: pd.DataFrame) -> float:
    X_test = test.drop(columns = ['forward_returns', 'market_forward_excess_returns', 'risk_free_rate', 'date_id'])
    global X_hist
    global rf
    global rf_calibrate_model
    global logistic
    global logistic_calibrate_model
    global lightgbm
    global lightgbm_calibrate_model
    global regression_features
    global tree_features
    global btree_features
    global tree_var_features
    global tree_model
    global good_features
    global prev_var
    global prev_ret
    global alpha
    global beta
    global omega
    print(X_test)
    print(X_hist)
  
    X_hist = pd.concat([X_hist,X_test], ignore_index = True)
    #print(prev_var, prev_ret)

    X_enriched_hist = make_rolling_features(make_lag_features(X_hist, good_features, lags = (1, 2, 5, 20)), good_features)
    X_enriched_hist_cleaned = X_enriched_hist.ffill().fillna(0)
    X_enriched_test = X_enriched_hist_cleaned.iloc[-1:]
    logistic_mean = mean_predict(logistic, X_enriched_test[regression_features], logistic_calibrate_model)
    lightgbm_mean = mean_predict(lightgbm, X_enriched_test[btree_features], lightgbm_calibrate_model)
    rf_mean = mean_predict(rf, X_enriched_test[tree_features], rf_calibrate_model)
    ensemble_mean = (0.3*logistic_mean + 0.4*lightgbm_mean + 0.3*rf_mean)[0]
    ewma_var = ewma_step(prev_var, prev_ret)
    # garch_var = garch_step(prev_var, prev_ret, alpha, beta, omega)
    # tree_var = tree_step(tree_model, X_enriched_test[tree_var_features])
    #print(ewma_var, garch_var, tree_var)
    ensemble_var = (1*ewma_var)
    prev_ret = ensemble_mean
    prev_var = ensemble_var

    #print(ensemble_mean, ensemble_var)
    w = min(max(0, min(ensemble_mean/(8*ensemble_var), ensemble_mean*200)), 2)
    print(ensemble_mean, ensemble_var, ensemble_mean/(10*ensemble_var),w)
    return float(w), ensemble_mean, logistic_mean, lightgbm_mean, rf_mean

In [157]:
test_loc = 7192


In [158]:
raw_train = pd.read_csv('X_train.csv')

In [159]:
X_tests = pd.read_csv('train.csv').iloc[test_loc:]
y_tests = X_tests[['forward_returns', 'market_forward_excess_returns', 'risk_free_rate']]

In [160]:
X_features_train_cleaned

Unnamed: 0,D1_rollmean5,D2_rollmean5,D3_rollmean5,D4_rollmean5,D5_rollmean5,D6_rollmean5,D7_rollmean5,D8_rollmean5,D9_rollmean5,E1_rollmean5,...,V11_rollstd20,V12_rollstd20,V13_rollstd20,V2_rollstd20,V3_rollstd20,V4_rollstd20,V5_rollstd20,V6_rollstd20,V7_rollstd20,V8_rollstd20
6940,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.715571,...,0.112619,0.203303,0.075878,0.058565,0.137396,0.025455,0.539845,0.155436,0.121786,0.047750
6941,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.714518,...,0.108789,0.191063,0.076502,0.055130,0.140583,0.031152,0.586112,0.154636,0.124032,0.046827
6942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.713465,...,0.107945,0.194004,0.077238,0.052311,0.137758,0.031804,0.571162,0.157478,0.127434,0.047042
6943,0.0,0.0,0.0,0.0,0.0,-0.2,0.0,0.0,0.0,0.712414,...,0.107272,0.194644,0.078845,0.050736,0.134478,0.031201,0.589364,0.153907,0.132294,0.046997
6944,0.0,0.0,0.0,0.0,0.0,-0.4,0.0,0.0,0.0,0.711362,...,0.107225,0.194362,0.081285,0.049739,0.142204,0.031065,0.587173,0.159326,0.138731,0.047024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7187,0.0,0.0,0.2,0.0,0.2,0.0,0.2,0.0,0.4,1.068772,...,0.105407,0.169268,0.561259,0.034088,0.177390,0.020063,0.403754,0.191928,0.508592,0.006913
7188,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.2,1.067526,...,0.105115,0.170534,0.547991,0.034827,0.185225,0.019414,0.403754,0.192869,0.495477,0.007027
7189,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,1.066283,...,0.100689,0.167621,0.554014,0.034569,0.184154,0.019306,0.392002,0.193821,0.499186,0.006981
7190,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,1.065042,...,0.096758,0.165098,0.554706,0.035329,0.182496,0.020616,0.381425,0.197756,0.497592,0.007026


In [161]:
X_tests

Unnamed: 0,date_id,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,V3,V4,V5,V6,V7,V8,V9,forward_returns,risk_free_rate,market_forward_excess_returns
7192,7192,0,0,0,0,0,0,0,0,0,...,0.550265,0.875000,2.808941,0.128307,-0.710536,0.973545,-0.728618,-0.003771,0.000078,-0.004158
7193,7193,0,0,0,0,0,0,0,0,0,...,0.886243,0.870370,3.405066,0.074074,-0.499816,0.955357,-0.531049,-0.001143,0.000077,-0.001529
7194,7194,0,0,0,0,0,-1,0,0,0,...,0.599206,0.875000,2.969902,0.102513,-0.501887,0.957011,-0.492945,0.001859,0.000077,0.001474
7195,7195,0,0,0,0,0,-1,0,0,0,...,0.667328,0.867063,3.184580,0.138228,-0.566672,0.948082,-0.566723,0.005032,0.000076,0.004647
7196,7196,0,0,0,0,0,-1,0,0,0,...,0.697090,0.876323,3.309322,0.138889,-0.623320,0.970899,-0.620151,0.008522,0.000078,0.008136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8985,8985,0,0,0,0,0,0,0,0,0,...,0.469577,0.837963,1.226772,0.822751,-0.707361,0.142857,-0.649616,0.002457,0.000155,0.001990
8986,8986,0,0,0,0,0,0,0,0,0,...,0.671958,0.837963,0.785877,0.805556,-0.715692,0.196098,-0.668289,0.002312,0.000156,0.001845
8987,8987,0,0,1,0,0,0,0,0,0,...,0.481481,0.787698,0.834898,0.823413,-0.723949,0.133929,-0.670946,0.002891,0.000156,0.002424
8988,8988,0,0,0,0,0,0,0,0,0,...,0.655423,0.783730,0.994026,0.851852,-0.684937,0.101852,-0.646265,0.008310,0.000156,0.007843


In [162]:
prev_var = 1e-4
X_hist = raw_train[-25:]
alpha = 0.10080081885102102
beta = 0.880106446271156
omega = 0.007218758128689602
prev_ret = y_features_train.iloc[-1]
positions = []
predicted_returns = []
logistic_mean_list = []
lightgbm_mean_list = []
rf_mean_list = []
for i in range (1):

    w, pred_return, logistic_mean, lightgbm_mean, rf_mean = predict(X_tests.iloc[i:i+1])
    positions.append(w)
    predicted_returns.append(pred_return)
    logistic_mean_list.append(logistic_mean)
    lightgbm_mean_list.append(lightgbm_mean)
    rf_mean_list.append(rf_mean)


      D1  D2  D3  D4  D5  D6  D7  D8  D9        E1  ...       V12       V13  \
7167   1   1   1   0   0   0   0   0   0  1.090323  ...  0.763228 -0.622539   
7168   0   0   0   0   0   0   0   0   0  1.089036  ...  0.890873 -0.423329   
7169   0   0   0   0   0   0   0   0   0  1.087750  ...  0.872354 -0.621087   
7170   0   0   0   0   0   0   0   0   0  1.086468  ...  0.930556 -0.656546   
7171   0   0   0   0   0   0   0   0   0  1.085188  ...  0.909392 -0.545416   
7172   0   0   0   0   0   0   0   0   0  1.083910  ...  0.911376 -0.247013   
7173   0   0   0   0   0  -1   0   0   0  1.082635  ...  0.824735 -0.380386   
7174   0   0   0   0   0  -1   0   0   0  1.081362  ...  0.798942  0.170880   
7175   0   0   0   0   0  -1   0   0   0  1.080092  ...  0.761243 -0.087064   
7176   0   0   0   0   0  -1   0   0   0  1.078824  ...  0.891534  1.038756   
7177   0   0   0   0   0  -1   0   0   0  1.077559  ...  0.881944  0.497164   
7178   0   0   0   0   0   0   0   1   0  1.076296  

In [163]:
def score(solution: pd.DataFrame, submission) -> float:
    """
    Calculates a custom evaluation metric (volatility-adjusted Sharpe ratio).

    This metric penalizes strategies that take on significantly more volatility
    than the underlying market.

    Returns:
        float: The calculated adjusted Sharpe ratio.
    """

  
    solution = solution.copy()
    solution.loc[:, 'position'] = submission

    solution['strategy_returns'] = solution['risk_free_rate'] * (1 - solution['position']) + solution['position'] * solution['forward_returns']

    # Calculate strategy's Sharpe ratio
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()
    trading_days_per_yr = 252
    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)

    # Calculate market return and volatility
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / len(solution)) - 1
    market_std = solution['forward_returns'].std()


    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)


    # Calculate the volatility penalty
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
    vol_penalty = 1 + excess_vol

    # Calculate the return penalty
    return_gap = max(
        0,
        (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr,
    )
    return_penalty = 1 + (return_gap**2) / 100

    # Adjust the Sharpe ratio by the volatility and return penalty
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

In [164]:
score(y_tests, [1.0]*len(y_tests))

0.6420136080214519

In [165]:
score(y_tests, positions)

ValueError: Length of values (1) does not match length of index (1798)

In [None]:
hit_rate(y_tests['market_forward_excess_returns'], [1.0]*len(y_tests))

0.5239154616240267

In [None]:
hit_rate(y_tests['market_forward_excess_returns'], positions)

0.5509977827050998

In [None]:
pearson_corr(y_tests['market_forward_excess_returns'], means)

0.031636523472569685

In [None]:
def hit_rate(y_true, y_pred, *, dropna: bool = True, margin: float = 0.0, count_ties: bool = False) -> float:
    """
    Sign accuracy (hit rate): fraction of times sign(y_pred) == sign(y_true).

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length sequences of numbers.
    dropna : bool, default True
        If True, drop any pair with NaN in either array.
        If False and NaNs are present, returns np.nan.
    margin : float, default 0.0
        Treat predictions with |y_pred| <= margin as 0 (neutral band).
    count_ties : bool, default False
        If False, exclude any pair where sign is 0 on either side.
        If True, include pairs with sign==0 and count them as correct
        only when both are 0.

    Returns
    -------
    hit : float
        Proportion in [0,1], or np.nan if no eligible pairs.
    """
    a = np.asarray(y_true, dtype=float).flatten()
    b = np.asarray(y_pred, dtype=float).flatten()

    if a.shape != b.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    mask = np.isfinite(a) & np.isfinite(b)
    if not dropna and not mask.all():
        return np.nan
    a = a[mask]
    b = b[mask]

    # Apply neutral band to predictions
    if margin > 0:
        b = b.copy()
        b[np.abs(b) <= margin] = 0.0

    s_true = np.sign(a)
    s_pred = np.sign(b)

    if count_ties:
        eligible = np.ones_like(s_true, dtype=bool)
    else:
        eligible = (s_true != 0) & (s_pred != 0)

    if not np.any(eligible):
        return np.nan

    hits = (s_true[eligible] == s_pred[eligible]).mean()
    return float(hits)

In [None]:
def pearson_corr(y_true, y_pred):
    """
    Pearson correlation coefficient between y_true and y_pred.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length sequences of numbers.

    Returns
    -------
    corr : float
        Pearson correlation coefficient in [-1,1], or np.nan if undefined.
    """
    a = np.asarray(y_true, dtype=float).flatten()
    b = np.asarray(y_pred, dtype=float).flatten()

    if a.shape != b.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    mask = np.isfinite(a) & np.isfinite(b)
    a = a[mask]
    b = b[mask]

    if len(a) == 0:
        return np.nan

    corr, _ = pearsonr(a, b)
    return float(corr)