In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone
from sklearn.metrics import root_mean_squared_error
from scipy.stats import spearmanr

In [None]:
class WalkForwardCV:
    """
    Simple walk-forward cross-validation using sklearn's TimeSeriesSplit.

    For every split a fresh clone of ``model`` is fitted on the training
    indices and scored on the test indices, so the estimator passed in is
    never fitted itself.

    Parameters
    ----------
    model : sklearn estimator
        Must have fit() and predict() methods and be clonable with
        ``sklearn.base.clone``.
    n_splits : int, default=5
        Number of folds.
    scoring : callable, optional
        Custom scoring function (y_true, y_pred) -> float.
        Defaults to root mean squared error.
    gap : int, default=0
        Number of samples to exclude between train and test to prevent leakage.
    """

    def __init__(self, model, n_splits=5, scoring=None, gap=0):
        self.model = model
        self.n_splits = n_splits
        # Fall back to RMSE only when no scorer was supplied at all.
        self.scoring = root_mean_squared_error if scoring is None else scoring
        self.gap = gap

    @staticmethod
    def _take(data, idx):
        """Select rows by integer position from pandas objects, arrays or list-likes."""
        if hasattr(data, "iloc"):
            return data.iloc[idx]
        # ndarray / list-like input: positional fancy indexing on an array view.
        return np.asarray(data)[idx]

    def evaluate(self, X, y, verbose=True):
        """Perform walk-forward CV and return fold scores.

        Parameters
        ----------
        X : pandas.DataFrame or numpy.ndarray
            Feature matrix (time-ordered, not shuffled).
            Shape: (n_samples, n_features)
            Must be aligned with `y`.

        y : pandas.Series, numpy.ndarray, or list-like
            Target variable corresponding to X.
            Shape: (n_samples,)

        verbose : bool, default=True
            If True, print the score of every fold followed by the average
            score. If False, nothing is printed.

        Returns
        -------
        list
            One score per fold, in the order the folds were produced.

        Raises
        ------
        ValueError
            If `X` and `y` do not contain the same number of samples.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(X)} and {len(y)}."
            )

        tscv = TimeSeriesSplit(n_splits=self.n_splits, gap=self.gap)
        scores = []

        for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
            X_train, X_test = self._take(X, train_idx), self._take(X, test_idx)
            y_train, y_test = self._take(y, train_idx), self._take(y, test_idx)

            # Fit an unfitted copy so folds never share learned state.
            model = clone(self.model)
            model.fit(X_train, y_train)
            preds = model.predict(X_test)

            score = self.scoring(y_test, preds)
            scores.append(score)

            if verbose:
                print(f"Fold {fold}: {score:.4f}")

        if verbose:
            print(f"Average score: {np.mean(scores):.4f}")
        return scores
    
    # Example usage:
    # X = np.random.randn(1000, 10)
    # y = np.random.randn(1000)

    # # Custom metric: Spearman correlation
    # def spearman_corr(y_true, y_pred):
    #     return spearmanr(y_true, y_pred).correlation

    # # Instantiate and run
    # cv = WalkForwardCV(model=RandomForestRegressor(), n_splits=5, scoring=spearman_corr)
    # scores = cv.evaluate(X, y)

In [6]:
# Read the prepared training table from disk.
train_df = pd.read_csv('train_stationalized.csv')

# Column to predict.
target = 'forward_returns'

# Feature columns are picked by the first character of their name.
features = [
    name
    for name in train_df.columns
    if name[0] in ('D', 'E', 'I', 'M', 'P', 'S', 'V')
]

y = train_df[target]
X = train_df[features]

In [11]:
# Preview the feature matrix; the rendered output abbreviates the frame with "..." rows and columns.
X

Unnamed: 0,E1,E10,E11,E12,E13,E14,E15,E16,E17,E18,...,V8,D1,D2,D3,D4,D5,D6,D7,D8,D9
0,1.590988,0.315807,1.346614,-1.398602,0.005291,0.005291,0.964383,0.021722,0.001348,-1.113150,...,1.525426,0,0,0,1,1,0,0,0,1
1,1.588434,0.315476,1.123464,-1.191637,0.004960,0.004960,0.817643,0.022282,0.001900,-1.111688,...,1.525426,0,0,0,1,1,0,0,0,1
2,1.585888,0.315146,0.945888,-1.030759,0.004630,0.004630,0.687338,0.022843,0.002452,-1.110228,...,1.525426,0,0,0,1,0,0,1,0,1
3,1.583349,0.314815,0.795139,-0.898777,0.004299,0.004299,0.565628,0.023404,0.003005,-1.108771,...,1.525426,0,0,0,1,0,0,0,0,0
4,1.580818,0.314484,0.659882,-0.786384,0.003968,0.003968,0.444979,0.023966,0.003558,-1.107315,...,1.525426,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3353,1.565379,0.184524,1.608723,1.608723,0.005952,0.005952,-3.164382,-0.083496,-0.572447,0.223638,...,0.395739,0,0,0,0,0,0,0,0,0
3354,1.562946,0.184193,1.292477,1.292477,0.005622,0.005622,-2.127981,-0.083542,-0.572080,0.222910,...,1.385570,0,0,0,0,0,0,0,0,0
3355,1.560520,0.183862,1.057111,1.057111,0.005291,0.005291,-1.640118,-0.083874,-0.572016,0.222211,...,0.068362,0,0,1,0,0,0,0,0,0
3356,1.558102,0.183532,0.865309,0.865309,0.004960,0.004960,-1.330068,-0.084206,-0.571952,0.221513,...,-0.713060,0,0,0,0,0,0,0,0,0


In [15]:
# Seed the forest so fold scores are reproducible across runs; every fold's
# clone inherits the same random_state.
cv = WalkForwardCV(model=RandomForestRegressor(random_state=42), n_splits=5)
scores = cv.evaluate(X, y)


Fold 1: 0.0092
Fold 2: 0.0080
Fold 3: 0.0140
Fold 4: 0.0121
Fold 5: 0.0093
Average score: 0.0105
