In [23]:
import pandas as pd
import numpy as np
from sklearn.kernel_approximation import RBFSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv('data/data.csv')
data['date'] = pd.to_datetime(data['yyyymm'].astype(str), format='%Y%m')
data['Index'] = data['Index'].str.replace(',', '').astype(float)
data['returns'] = data['Index'].pct_change().shift(-1)
data = data.drop(columns=['yyyymm'])
data = data[['date'] + [col for col in data.columns if col != 'date']]
data = data[data['date'] >= '1926-01-01']
data = data.set_index('date')
data.head()

Unnamed: 0_level_0,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,Rfree,infl,ltr,corpr,svar,csp,CRSP_SPvw,CRSP_SPvwx,returns
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1926-01-01,12.74,0.6075,1.249,0.387671,0.0349,0.0482,0.0609,0.0374,,0.002925,0.0,0.0138,0.0072,0.000857,,-0.001783,-0.00398,-0.043956
1926-02-01,12.18,0.615,1.248,0.394363,0.0318,0.0477,0.0602,0.0372,,0.002908,0.0,0.0063,0.0045,0.001099,,-0.033296,-0.037876,-0.059113
1926-03-01,11.46,0.6225,1.248,0.494091,0.0314,0.0479,0.0605,0.0371,,0.00265,-0.005587,0.0041,0.0084,0.006016,,-0.057708,-0.062007,0.022688
1926-04-01,11.72,0.63,1.247,0.482917,0.0308,0.0474,0.0598,0.0368,,0.002617,0.005618,0.0076,0.0097,0.001466,,0.038522,0.034856,0.007679
1926-05-01,11.81,0.6375,1.246,0.48386,0.0317,0.0471,0.0586,0.0369,,0.002567,-0.005587,0.0014,0.0044,0.00069,,0.013623,0.00907,0.043184


In [3]:
data[['csp', 'ntis']] = data[['csp', 'ntis']].interpolate(method='time')
data[['csp', 'ntis']] = data[['csp', 'ntis']].ffill()
data.dropna(inplace=True)
data.head()

Unnamed: 0_level_0,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,Rfree,infl,ltr,corpr,svar,csp,CRSP_SPvw,CRSP_SPvwx,returns
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1937-05-01,16.26,0.81,1.15,0.489382,0.0041,0.0333,0.0484,0.0282,0.028278,0.0006,0.006993,0.0053,0.004,0.003593,0.003398,-0.005699,-0.011686,-0.052891
1937-06-01,15.4,0.84,1.17,0.504961,0.0036,0.0328,0.0493,0.0285,0.031266,0.0003,0.0,-0.0018,0.0053,0.002984,0.004206,-0.053803,-0.057853,0.102597
1937-07-01,16.98,0.8167,1.1867,0.464649,0.0028,0.0325,0.0491,0.0277,0.027225,0.0003,0.006944,0.0138,0.0039,0.002982,0.00363,0.098991,0.098012,-0.055359
1937-08-01,16.04,0.7933,1.2033,0.481935,0.0029,0.0324,0.0492,0.0286,0.031064,0.0002,0.0,-0.0104,-0.0017,0.002141,0.003325,-0.051183,-0.056291,-0.142145
1937-09-01,13.76,0.77,1.22,0.553147,0.0031,0.0328,0.0516,0.0284,0.035847,0.0004,0.006897,0.0045,0.0025,0.015821,0.003339,-0.13961,-0.143683,-0.101744


In [5]:
# Interpolate missing values using time-based interpolation
data[['csp', 'ntis']] = data[['csp', 'ntis']].interpolate(method='time')
data[['csp', 'ntis']] = data[['csp', 'ntis']].ffill()
data.dropna(inplace=True)
data.head()

Unnamed: 0_level_0,Index,D12,E12,b/m,tbl,AAA,BAA,lty,ntis,Rfree,infl,ltr,corpr,svar,csp,CRSP_SPvw,CRSP_SPvwx,returns
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1937-05-01,16.26,0.81,1.15,0.489382,0.0041,0.0333,0.0484,0.0282,0.028278,0.0006,0.006993,0.0053,0.004,0.003593,0.003398,-0.005699,-0.011686,-0.052891
1937-06-01,15.4,0.84,1.17,0.504961,0.0036,0.0328,0.0493,0.0285,0.031266,0.0003,0.0,-0.0018,0.0053,0.002984,0.004206,-0.053803,-0.057853,0.102597
1937-07-01,16.98,0.8167,1.1867,0.464649,0.0028,0.0325,0.0491,0.0277,0.027225,0.0003,0.006944,0.0138,0.0039,0.002982,0.00363,0.098991,0.098012,-0.055359
1937-08-01,16.04,0.7933,1.2033,0.481935,0.0029,0.0324,0.0492,0.0286,0.031064,0.0002,0.0,-0.0104,-0.0017,0.002141,0.003325,-0.051183,-0.056291,-0.142145
1937-09-01,13.76,0.77,1.22,0.553147,0.0031,0.0328,0.0516,0.0284,0.035847,0.0004,0.006897,0.0045,0.0025,0.015821,0.003339,-0.13961,-0.143683,-0.101744


In [43]:
buy_and_hold = data['Index'].pct_change()
buy_and_hold = buy_and_hold.cumsum()
sharp_ratio_buy_and_hold = np.mean(buy_and_hold - 0.03) / np.std(buy_and_hold)
sharp_ratio_buy_and_hold

1.4511649703346714

In [63]:
number_of_features = 10000
features = data.drop(columns=['returns'])
target = data['returns']
scaled_features = StandardScaler().fit_transform(features)
rff = RBFSampler(gamma=1.0, n_components=number_of_features, random_state=42)
rff_features = rff.fit_transform(scaled_features)
rff_df = pd.DataFrame(rff_features, columns=[f'RFF_{i}' for i in range(1, number_of_features+1)])

In [64]:
X = rff_df.values
Y = data['returns'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

beta = np.linalg.lstsq(X_train, Y_train, rcond=None)[0]
Y_train_pred = X_train @ beta
Y_test_pred = X_test @ beta
predictions_df = pd.DataFrame({
    'Actual_Test_Returns': Y_test,
    'Predicted_Test_Returns': Y_test_pred
})

In [65]:
positions = np.sign(Y_test_pred)
pnl = positions * Y_test
cumulative_pnl = np.cumsum(pnl)
trading_results = pd.DataFrame({
    'Actual_Returns': Y_test,
    'Predicted_Returns': Y_test_pred,
    'Positions': positions,
    'PnL': pnl,
    'Cumulative_PnL': cumulative_pnl
})
trading_results

Unnamed: 0,Actual_Returns,Predicted_Returns,Positions,PnL,Cumulative_PnL
0,0.048520,0.002719,1.0,0.048520,0.048520
1,0.038595,0.016576,1.0,0.038595,0.087115
2,0.035464,-0.004203,-1.0,-0.035464,0.051651
3,-0.035228,-0.027158,-1.0,0.035228,0.086879
4,0.110364,0.009104,1.0,0.110364,0.197243
...,...,...,...,...,...
307,-0.085349,0.010452,1.0,-0.085349,2.134477
308,0.020509,0.000409,1.0,0.020509,2.154986
309,-0.101795,0.002084,1.0,-0.101795,2.053191
310,0.023201,0.019282,1.0,0.023201,2.076393


In [66]:
exc_ret = trading_results['Cumulative_PnL'] - 0.03
sharp_ratio = np.mean(exc_ret) / np.std(exc_ret)
alpha = np.mean((trading_results['Cumulative_PnL'].values - 0.03) - (buy_and_hold[len(buy_and_hold) - len(trading_results):].values - 0.03))
win_rate = np.sum(np.sign(trading_results['Actual_Returns']) == np.sign(trading_results['Predicted_Returns'])) / len(trading_results)
win_rate, sharp_ratio, alpha

(0.5608974358974359, 1.918364612036545, -4.348402241924515)

In [67]:
alpha = 100
ridge_model = Ridge(alpha=alpha)
ridge_model.fit(X_train, Y_train)

Y_train_pred = ridge_model.predict(X_train)
Y_test_pred = ridge_model.predict(X_test)

# Evaluate the model
train_mse = mean_squared_error(Y_train, Y_train_pred)
test_mse = mean_squared_error(Y_test, Y_test_pred)

print(f"Train MSE: {train_mse}")
print(f"Test MSE: {test_mse}")

predictions_df = pd.DataFrame({
    'Actual_Test_Returns': Y_test,
    'Predicted_Test_Returns': Y_test_pred
})

Train MSE: 0.002017394509812748
Test MSE: 0.00203635549048664


In [68]:
positions = np.sign(Y_test_pred)
pnl = positions * Y_test
cumulative_pnl = np.cumsum(pnl)
trading_results = pd.DataFrame({
    'Actual_Returns': Y_test,
    'Predicted_Returns': Y_test_pred,
    'Positions': positions,
    'PnL': pnl,
    'Cumulative_PnL': cumulative_pnl
})
trading_results

Unnamed: 0,Actual_Returns,Predicted_Returns,Positions,PnL,Cumulative_PnL
0,0.048520,0.005484,1.0,0.048520,0.048520
1,0.038595,0.005261,1.0,0.038595,0.087115
2,0.035464,0.005546,1.0,0.035464,0.122580
3,-0.035228,0.005061,1.0,-0.035228,0.087351
4,0.110364,0.005578,1.0,0.110364,0.197715
...,...,...,...,...,...
307,-0.085349,0.005658,1.0,-0.085349,2.770471
308,0.020509,0.005523,1.0,0.020509,2.790980
309,-0.101795,0.005612,1.0,-0.101795,2.689185
310,0.023201,0.005767,1.0,0.023201,2.712386


In [69]:
exc_ret = trading_results['Cumulative_PnL'] - 0.03
sharp_ratio = np.mean(exc_ret) / np.std(exc_ret)
alpha = np.mean((trading_results['Cumulative_PnL'].values - 0.03) - (buy_and_hold[len(buy_and_hold) - len(trading_results):].values - 0.03))
win_rate = np.sum(np.sign(trading_results['Actual_Returns']) == np.sign(trading_results['Predicted_Returns'])) / len(trading_results)
win_rate, sharp_ratio, alpha

(0.625, 1.9902754636583349, -4.022353037149507)