In [7]:
import os

import numpy as np
import pandas as pd

# Location of the S&P 500 historical-data CSV.
# Set the SPX_CSV_PATH environment variable to load a copy stored elsewhere;
# when it is unset, the original local download path is used unchanged.
file_path = os.environ.get(
    "SPX_CSV_PATH",
    "/Users/sambrown04/Documents/570/Market_Regime/sp-500-historical-data/versions/1/SPX.csv",
)
df = pd.read_csv(file_path)

In [35]:
def regime_labels(df, N = 21, threshold = 0.05):
    """Label each row by the size and sign of its N-row forward return.

    For row position ``i`` the forward return is
    ``Close[i + N] / Close[i] - 1``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``'Close'`` column. Rows are used in their stored
        order (presumably chronological -- confirm against the loader).
    N : int, default 21
        Look-ahead horizon, measured in rows.
    threshold : float, default 0.05
        Forward return at or beyond which a row is labelled bull/bear.

    Returns
    -------
    pd.Series
        Integer Series named ``'Regime'`` with ``max(len(df) - N, 0)``
        entries, indexed by row position ``0, 1, 2, ...`` (not by
        ``df.index``). Values are ``1`` (bull) when the forward return is
        ``>= threshold``, ``-1`` (bear) when it is ``<= -threshold``, and
        ``0`` (sideways) otherwise, including when the return is NaN.
        The final ``N`` rows of ``df`` receive no label because their
        forward price is not available.

    Raises
    ------
    ValueError
        If ``N`` is negative.
    """
    if N < 0:
        raise ValueError("N must be non-negative")

    close = df['Close'].to_numpy(dtype=float)
    n_labels = max(len(close) - N, 0)

    # Pair every labelled row with the close N rows ahead of it.
    start_close = close[:n_labels]
    end_close = close[N:N + n_labels]
    forward_return = end_close / start_close - 1

    # np.select takes the first matching condition, mirroring the original
    # if / elif / else priority; NaN returns match neither and fall to 0.
    labels = np.select(
        [forward_return >= threshold, forward_return <= -threshold],
        [1, -1],
        default=0,
    )

    return pd.Series(
        labels,
        index=pd.RangeIndex(n_labels),
        name='Regime',
        dtype='int64',
    )

In [53]:
# Label horizon in rows; shared by the label builder and the feature trim below
# so the two can never drift apart.
N = 21

# Number of leading rows discarded from both features and labels.
# NOTE(review): presumably the warm-up of a rolling-window feature -- confirm.
WARMUP_ROWS = 49

y = regime_labels(df, N = N)

# regime_labels yields no label for the last N rows, so drop them from the
# features. An explicit end bound (instead of `-N`) stays correct when N == 0.
X = df.iloc[WARMUP_ROWS:len(df) - N]
y = y.iloc[WARMUP_ROWS:]
assert y.shape[0] == X.shape[0], "Feature and Target dim error"

In [67]:
# Split into train, validation, and test sets by date range.
# TODO: decide where in the S&P 500 history to place the split boundaries so the model generalizes.

def train_validate_test(
    X: pd.DataFrame,
    y: pd.Series,
    train_range: tuple, # all of these pass as datetime
    val_range: tuple,
    test_range: tuple,
    date_col: str = "Date"
) -> tuple:
    """Split features and labels into train/validation/test sets by date.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame that includes the ``date_col`` column. It is copied,
        so the caller's frame is not modified.
    y : pd.Series
        Labels; must contain every index label present in ``X``.
    train_range, val_range, test_range : tuple
        ``(start, end)`` pairs, each bound anything ``pd.Timestamp`` accepts
        (date string, ``datetime``, ``Timestamp``). Both bounds are inclusive.
    date_col : str, default "Date"
        Name of the column in ``X`` holding the row dates.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``. The feature
        frames have ``date_col`` removed.

    Raises
    ------
    KeyError
        If ``date_col`` is not a column of ``X``, or ``y`` lacks a label
        that appears in ``X.index``.
    AssertionError
        If the aligned ``y`` differs in length from ``X`` or contains NaNs.
    """
    if date_col not in X.columns:
        raise KeyError(f"date column {date_col!r} not found in X")

    X = X.copy()

    # Parse dates from the frame that was passed in rather than from any
    # notebook-global frame, so the function depends only on its arguments.
    X[date_col] = pd.to_datetime(X[date_col])

    # helper func to create masks
    def _mask_maker(frame, date_range):
        start, end = (pd.Timestamp(bound) for bound in date_range)
        return (frame[date_col] >= start) & (frame[date_col] <= end)

    # Create masks using function
    train_mask = _mask_maker(X, train_range)
    val_mask = _mask_maker(X, val_range)
    test_mask = _mask_maker(X, test_range)

    # Align labels to the feature rows so the boolean masks (indexed like X)
    # select matching rows from both.
    y = y.loc[X.index]

    assert len(y) == len(X), "dimension error"
    assert not y.isna().any(), "y NaNs"

    # build the splits
    X_train = X.loc[train_mask].drop(columns=[date_col])
    y_train = y.loc[train_mask]

    X_val = X.loc[val_mask].drop(columns=[date_col])
    y_val = y.loc[val_mask]

    X_test = X.loc[test_mask].drop(columns=[date_col])
    y_test = y.loc[test_mask]

    return X_train, y_train, X_val, y_val, X_test, y_test

In [69]:
# Inclusive (start, end) date windows for each split.
train_range = ("1995-01-01", "2012-12-31")
val_range = ("2013-01-01", "2016-12-31")
test_range = ("2017-01-01", "2020-12-31")

# Carve the features and labels into the three date-based splits.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = train_validate_test(
    X=X,
    y=y,
    train_range=train_range,
    val_range=val_range,
    test_range=test_range,
)

In [71]:
# Training split dimensions as (rows, columns).
X_train.shape

(4532, 19)

In [75]:
# Validation split dimensions as (rows, columns).
X_val.shape

(1008, 19)

In [77]:
# Test split dimensions as (rows, columns).
X_test.shape

(947, 19)