In [179]:
import pandas as pd
import numpy as np

In [None]:
X_train = pd.read_csv('X_train.csv')
X_val = pd.read_csv('X_val.csv')

In [182]:
X_train

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,E1,...,V2,V3,V4,V5,V6,V7,V8,V9,lagged_forward_returns,lagged_risk_free_rate
0,0,0,0,1,1,0,0,0,1,,...,,,,,,,,,,
1,0,0,0,1,1,0,0,0,1,,...,,,,,,,,,-0.002421,0.000301
2,0,0,0,1,0,0,0,0,1,,...,,,,,,,,,-0.008495,0.000303
3,0,0,0,1,0,0,0,0,0,,...,,,,,,,,,-0.009624,0.000301
4,0,0,0,1,0,0,0,0,0,,...,,,,,,,,,0.004662,0.000299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7187,0,0,0,0,0,0,0,0,0,1.065040,...,0.849868,0.539683,0.883598,1.476379,0.226852,-0.298147,0.988095,-0.398543,-0.007314,0.000077
7188,0,0,0,0,0,0,0,0,0,1.063801,...,0.861111,0.815476,0.888228,1.158328,0.219577,-0.583421,0.991071,-0.607360,0.009066,0.000076
7189,0,0,0,0,0,0,0,0,0,1.062565,...,0.841270,0.771164,0.869709,1.336420,0.126984,-0.691435,0.987434,-0.680801,0.000787,0.000077
7190,0,0,0,0,0,0,0,0,0,1.061331,...,0.827381,0.882275,0.859127,1.892457,0.109788,-0.513384,0.992063,-0.574819,-0.000894,0.000077


In [183]:
def hygiene(
    X_train,
    *,
    max_nan_pct: float = 0.50,   # drop columns with >50% NaNs on TRAIN
    numeric_only: bool = True
):
    """
    Returns:
      keep_cols : list[str]  -> columns with NaN fraction <= max_nan_pct
      report    : dict       -> quick stats of what was dropped/kept
    """
    cols = X_train.select_dtypes(include=[np.number]).columns if numeric_only else X_train.columns
    na_pct = X_train[cols].isna().mean()           # fraction of NaNs per column
    keep_cols = na_pct[na_pct <= max_nan_pct].index.tolist()
    dropped = na_pct[na_pct > max_nan_pct].sort_values(ascending=False)

    if not keep_cols:  # safety: don't return empty schema
        keep_cols = list(cols)

    report = {
        "cols_considered": int(len(cols)),
        "kept_cols": int(len(keep_cols)),
        "dropped_cols": int(dropped.size),
        "max_nan_pct": float(max_nan_pct),
        "dropped_top5": dropped.head(5).to_dict()
    }
    return keep_cols, report


In [184]:
# 3) Lag features (time-safe)
def make_lag_features(df, cols, lags=(1, 5, 20), *, keep_original=True, dtype="float32"):
    """
    Returns a new DataFrame with optional originals + lagged copies.
    Uses shift(L), so expect NaNs at the head.
    """
    out = pd.DataFrame(index=df.index)
    if keep_original:
        out[cols] = df[cols]
    for L in lags:
        out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
    if dtype is not None:
        for c in out.columns:
            if pd.api.types.is_float_dtype(out[c]):
                out[c] = out[c].astype(dtype)
    return out


In [185]:
# 4) Rolling stats (+ optional z-scores), time-safe via past_only=True
def make_rolling_features(
    df, cols,
    windows=(5, 20),
    *,
    stats=("mean", "std"),     # any of: "mean","std","min","max","sum"
    zscore: bool = False,       # z_t = (x_t - mean_{past}) / (std_{past}+eps)
    past_only: bool = True,    # shift(1) inside rolling to avoid leakage
    min_periods: int = None,  # default = window size
    eps: float = 1e-9,
    dtype: str = "float32",
):
    """
    Builds rolling features for each col over each window.
    If past_only=True, the rolling window excludes the current row (safe for t+1 targets).
    """
    out = pd.DataFrame(index=df.index)
    for w in windows:
        mp = w if min_periods is None else min_periods
        base = df[cols].shift(1) if past_only else df[cols]
        roll = base.rolling(window=w, min_periods=mp)

        if "mean" in stats:
            m = roll.mean()
            out[[f"{c}_rollmean{w}" for c in cols]] = m
        if "std" in stats:
            s = roll.std(ddof=0)
            out[[f"{c}_rollstd{w}" for c in cols]] = s
        if "min" in stats:
            out[[f"{c}_rollmin{w}" for c in cols]] = roll.min()
        if "max" in stats:
            out[[f"{c}_rollmax{w}" for c in cols]] = roll.max()
        if "sum" in stats:
            out[[f"{c}_rollsum{w}" for c in cols]] = roll.sum()

        if zscore:
            # need mean & std; compute if not already available
            if "mean" in stats:
                m = out[[f"{c}_rollmean{w}" for c in cols]].copy()
                m.columns = cols
            else:
                m = roll.mean()
            if "std" in stats:
                s = out[[f"{c}_rollstd{w}" for c in cols]].copy()
                s.columns = cols
            else:
                s = roll.std(ddof=0)
            for c in cols:
                out[f"{c}_z{w}"] = (df[c] - m[c]) / (s[c] + eps)

    if dtype is not None:
        for c in out.columns:
            if pd.api.types.is_float_dtype(out[c]):
                out[c] = out[c].astype(dtype)
    return out


In [186]:
good_features, report = hygiene(
    X_train)

In [187]:
report

{'cols_considered': 96,
 'kept_cols': 88,
 'dropped_cols': 8,
 'max_nan_pct': 0.5,
 'dropped_top5': {'E7': 0.9689933259176863,
  'V10': 0.8410734149054505,
  'S3': 0.7971357063403782,
  'M1': 0.771273637374861,
  'M13': 0.7703003337041157}}

In [188]:
X_train_enriched = make_rolling_features(make_lag_features(X_train, good_features, lags = (1, 2, 5, 20)), good_features)
X_val_enriched = make_rolling_features(make_lag_features(X_val, good_features, lags = (1, 2, 5, 20)), good_features)

  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in c

In [189]:
X_train_enriched.to_csv('X_train_enriched.csv', index = False)
X_val_enriched.to_csv('X_val_enriched.csv', index = False)

In [190]:
X_train_enriched

Unnamed: 0,D1_rollmean5,D2_rollmean5,D3_rollmean5,D4_rollmean5,D5_rollmean5,D6_rollmean5,D7_rollmean5,D8_rollmean5,D9_rollmean5,E1_rollmean5,...,V13_rollstd20,V2_rollstd20,V3_rollstd20,V4_rollstd20,V5_rollstd20,V6_rollstd20,V7_rollstd20,V8_rollstd20,lagged_forward_returns_rollstd20,lagged_risk_free_rate_rollstd20
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7187,0.0,0.0,0.2,0.0,0.2,0.0,0.2,0.0,0.4,1.068772,...,0.561259,0.034088,0.177390,0.020063,0.403754,0.191928,0.508592,0.006913,0.005591,6.806308e-07
7188,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.2,1.067526,...,0.547991,0.034827,0.185225,0.019414,0.403754,0.192869,0.495477,0.007027,0.005825,7.546882e-07
7189,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,1.066283,...,0.554014,0.034569,0.184154,0.019306,0.392002,0.193821,0.499186,0.006981,0.006108,7.876856e-07
7190,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,1.065042,...,0.554706,0.035329,0.182496,0.020616,0.381425,0.197756,0.497592,0.007026,0.006091,8.387273e-07


In [191]:
X_train_enriched['D1']

KeyError: 'D1'

In [None]:
make_rolling_features(X_train, good_features)['D1']

  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in cols]] = m
  out[[f"{c}_rollmean{w}" for c in

0       0
1       0
2       0
3       0
4       0
       ..
7187    0
7188    0
7189    0
7190    0
7191    0
Name: D1, Length: 7192, dtype: int64

In [None]:
test = pd.read_csv('test.csv')

In [None]:
X_test = test.drop(columns = ['lagged_forward_returns', 'lagged_market_forward_excess_returns', 'lagged_risk_free_rate', 'date_id' , 'is_scored']).iloc[:-1]
X_test

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,E1,...,V12,V13,V2,V3,V4,V5,V6,V7,V8,V9
0,0,0,0,0,1,0,0,1,0,1.577651,...,0.699074,-0.5024,0.882937,0.892196,0.828042,0.999172,0.759921,-0.803127,0.170966,-0.751909
1,0,0,0,0,1,0,0,1,0,1.575182,...,0.598545,-0.394268,0.863757,0.699074,0.831349,1.120336,0.556217,-0.686192,0.141865,-0.660326
2,0,0,0,0,1,0,0,0,1,1.57272,...,0.603836,-0.17042,0.848545,0.647487,0.832672,1.088992,0.665344,-0.459367,0.199405,-0.510979
3,0,0,0,0,1,0,0,0,1,1.570266,...,0.558862,-0.275099,0.826058,0.445767,0.835979,1.040988,0.594577,-0.561643,0.161706,-0.575997
4,0,0,0,0,0,0,1,0,1,1.567818,...,0.487434,-0.39548,0.80754,0.707672,0.839947,0.944593,0.715608,-0.692649,0.124669,-0.654045
5,0,0,0,0,0,0,0,0,0,1.565379,...,0.53373,-0.432282,0.785053,0.469577,0.837963,1.226772,0.822751,-0.707361,0.142857,-0.649616
6,0,0,0,0,0,0,0,0,0,1.562946,...,0.526455,-0.429506,0.767857,0.671958,0.837963,0.785877,0.805556,-0.715692,0.196098,-0.668289
7,0,0,1,0,0,0,0,0,0,1.56052,...,0.433532,-0.425462,0.734127,0.481481,0.787698,0.834898,0.823413,-0.723949,0.133929,-0.670946
8,0,0,0,0,0,0,0,0,0,1.558102,...,0.39418,-0.38517,0.695106,0.655423,0.78373,0.994026,0.851852,-0.684937,0.101852,-0.646265


In [None]:
y_test = test['lagged_market_forward_excess_returns'].iloc[1:]

In [None]:
X_hist = pd.concat([X_val, X_test], axis = 0).reset_index(drop = True)
X_hist

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,E1,...,V12,V13,V2,V3,V4,V5,V6,V7,V8,V9
0,0,0,0,0,0,0,0,0,0,1.058871,...,0.706019,-0.563259,0.807540,0.550265,0.875000,2.808941,0.128307,-0.710536,0.973545,-0.728618
1,0,0,0,0,0,0,0,0,0,1.057645,...,0.684854,-0.326752,0.785053,0.886243,0.870370,3.405066,0.074074,-0.499816,0.955357,-0.531049
2,0,0,0,0,0,-1,0,0,0,1.056420,...,0.701720,-0.305448,0.762566,0.599206,0.875000,2.969902,0.102513,-0.501887,0.957011,-0.492945
3,0,0,0,0,0,-1,0,0,0,1.055199,...,0.689153,-0.346416,0.731481,0.667328,0.867063,3.184580,0.138228,-0.566672,0.948082,-0.566723
4,0,0,0,0,0,-1,0,0,0,1.053979,...,0.838624,-0.407278,0.768519,0.697090,0.876323,3.309322,0.138889,-0.623320,0.970899,-0.620151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1802,0,0,0,0,0,0,1,0,1,1.567818,...,0.487434,-0.395480,0.807540,0.707672,0.839947,0.944593,0.715608,-0.692649,0.124669,-0.654045
1803,0,0,0,0,0,0,0,0,0,1.565379,...,0.533730,-0.432282,0.785053,0.469577,0.837963,1.226772,0.822751,-0.707361,0.142857,-0.649616
1804,0,0,0,0,0,0,0,0,0,1.562946,...,0.526455,-0.429506,0.767857,0.671958,0.837963,0.785877,0.805556,-0.715692,0.196098,-0.668289
1805,0,0,1,0,0,0,0,0,0,1.560520,...,0.433532,-0.425462,0.734127,0.481481,0.787698,0.834898,0.823413,-0.723949,0.133929,-0.670946


In [None]:
X_test_enriched = make_rolling_features(make_lag_features(X_hist, good_features, lags = (1, 2, 5, 20)), good_features).iloc[-9:]


  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in c

In [None]:
X_test_enriched.to_csv('X_enriched_test.csv', index = False)
y_test.to_csv('y_test.csv', index = False)

In [None]:
good_features

['D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'E1',
 'E10',
 'E11',
 'E12',
 'E13',
 'E14',
 'E15',
 'E16',
 'E17',
 'E18',
 'E19',
 'E2',
 'E20',
 'E3',
 'E4',
 'E5',
 'E6',
 'E8',
 'E9',
 'I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'M10',
 'M11',
 'M12',
 'M15',
 'M16',
 'M17',
 'M18',
 'M2',
 'M3',
 'M4',
 'M5',
 'M7',
 'M8',
 'M9',
 'P1',
 'P10',
 'P11',
 'P12',
 'P13',
 'P2',
 'P3',
 'P4',
 'P5',
 'P6',
 'P7',
 'P8',
 'P9',
 'S1',
 'S10',
 'S11',
 'S12',
 'S2',
 'S4',
 'S5',
 'S6',
 'S7',
 'S8',
 'S9',
 'V1',
 'V11',
 'V12',
 'V13',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8']