In [80]:
import pandas as pd
import numpy as np

In [81]:
X_train = pd.read_csv('X_train.csv')
X_val = pd.read_csv('X_val.csv')

In [82]:
X_train

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,E1,...,V12,V13,V2,V3,V4,V5,V6,V7,V8,V9
0,0,0,0,1,1,0,0,0,1,,...,,,,,,,,,,
1,0,0,0,1,1,0,0,0,1,,...,,,,,,,,,,
2,0,0,0,1,0,0,0,0,1,,...,,,,,,,,,,
3,0,0,0,1,0,0,0,0,0,,...,,,,,,,,,,
4,0,0,0,1,0,0,0,0,0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7187,0,0,0,0,0,0,0,0,0,1.065040,...,0.662698,-0.276683,0.849868,0.539683,0.883598,1.476379,0.226852,-0.298147,0.988095,-0.398543
7188,0,0,0,0,0,0,0,0,0,1.063801,...,0.730159,-0.563242,0.861111,0.815476,0.888228,1.158328,0.219577,-0.583421,0.991071,-0.607360
7189,0,0,0,0,0,0,0,0,0,1.062565,...,0.753968,-0.633782,0.841270,0.771164,0.869709,1.336420,0.126984,-0.691435,0.987434,-0.680801
7190,0,0,0,0,0,0,0,0,0,1.061331,...,0.737103,-0.416334,0.827381,0.882275,0.859127,1.892457,0.109788,-0.513384,0.992063,-0.574819


In [83]:
def hygiene(
    X_train,
    *,
    max_nan_pct: float = 0.50,   # drop columns with >50% NaNs on TRAIN
    numeric_only: bool = True
):
    """
    Returns:
      keep_cols : list[str]  -> columns with NaN fraction <= max_nan_pct
      report    : dict       -> quick stats of what was dropped/kept
    """
    cols = X_train.select_dtypes(include=[np.number]).columns if numeric_only else X_train.columns
    na_pct = X_train[cols].isna().mean()           # fraction of NaNs per column
    keep_cols = na_pct[na_pct <= max_nan_pct].index.tolist()
    dropped = na_pct[na_pct > max_nan_pct].sort_values(ascending=False)

    if not keep_cols:  # safety: don't return empty schema
        keep_cols = list(cols)

    report = {
        "cols_considered": int(len(cols)),
        "kept_cols": int(len(keep_cols)),
        "dropped_cols": int(dropped.size),
        "max_nan_pct": float(max_nan_pct),
        "dropped_top5": dropped.head(5).to_dict()
    }
    return keep_cols, report


In [84]:
# 3) Lag features (time-safe)
def make_lag_features(df, cols, lags=(1, 5, 20), *, keep_original=True, dtype="float32"):
    """
    Returns a new DataFrame with optional originals + lagged copies.
    Uses shift(L), so expect NaNs at the head.
    """
    out = pd.DataFrame(index=df.index)
    if keep_original:
        out[cols] = df[cols]
    for L in lags:
        out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
    if dtype is not None:
        for c in out.columns:
            if pd.api.types.is_float_dtype(out[c]):
                out[c] = out[c].astype(dtype)
    return out


In [85]:
# 4) Rolling stats (+ optional z-scores), time-safe via past_only=True
def make_rolling_features(
    df, cols,
    windows=(5, 20),
    *,
    stats=("mean", "std"),     # any of: "mean","std","min","max","sum"
    zscore: bool = False,       # z_t = (x_t - mean_{past}) / (std_{past}+eps)
    past_only: bool = True,    # shift(1) inside rolling to avoid leakage
    min_periods: int = None,  # default = window size
    eps: float = 1e-9,
    dtype: str = "float32",
):
    """
    Builds rolling features for each col over each window.
    If past_only=True, the rolling window excludes the current row (safe for t+1 targets).
    """
    out = pd.DataFrame(index=df.index)
    for w in windows:
        mp = w if min_periods is None else min_periods
        base = df[cols].shift(1) if past_only else df[cols]
        roll = base.rolling(window=w, min_periods=mp)

        if "mean" in stats:
            m = roll.mean()
            out[[f"{c}_rollmean{w}" for c in cols]] = m
        if "std" in stats:
            s = roll.std(ddof=0)
            out[[f"{c}_rollstd{w}" for c in cols]] = s
        if "min" in stats:
            out[[f"{c}_rollmin{w}" for c in cols]] = roll.min()
        if "max" in stats:
            out[[f"{c}_rollmax{w}" for c in cols]] = roll.max()
        if "sum" in stats:
            out[[f"{c}_rollsum{w}" for c in cols]] = roll.sum()

        if zscore:
            # need mean & std; compute if not already available
            if "mean" in stats:
                m = out[[f"{c}_rollmean{w}" for c in cols]].copy()
                m.columns = cols
            else:
                m = roll.mean()
            if "std" in stats:
                s = out[[f"{c}_rollstd{w}" for c in cols]].copy()
                s.columns = cols
            else:
                s = roll.std(ddof=0)
            for c in cols:
                out[f"{c}_z{w}"] = (df[c] - m[c]) / (s[c] + eps)

    if dtype is not None:
        for c in out.columns:
            if pd.api.types.is_float_dtype(out[c]):
                out[c] = out[c].astype(dtype)
    return out


In [86]:
good_features, report = hygiene(
    X_train)

In [87]:
report

{'cols_considered': 94,
 'kept_cols': 86,
 'dropped_cols': 8,
 'max_nan_pct': 0.5,
 'dropped_top5': {'E7': 0.9689933259176863,
  'V10': 0.8410734149054505,
  'S3': 0.7971357063403782,
  'M1': 0.771273637374861,
  'M13': 0.7703003337041157}}

In [90]:
X_train_enriched = make_rolling_features(make_lag_features(X_train, good_features, lags = (1, 2, 5, 20)), good_features)
X_val_enriched = make_rolling_features(make_lag_features(X_val, good_features, lags = (1, 2, 5, 20)), good_features)

  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in cols]] = df[cols].shift(L)
  out[[f"{c}_lag{L}" for c in c

In [95]:
X_train_enriched.to_csv('X_train_enriched.csv', index = False)
X_val_enriched.to_csv('X_val_enriched.csv', index = False)