In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("train_enriched.csv")
dummy_features = [col for col in df.columns if col[0] == 'D']
standard_features = [col for col in df.columns if col[0] in ['E','I','M','P','S','V']] + ['forward_returns_lag1', 'forward_returns_lag5']
features = dummy_features + standard_features
target = ['forward_returns']

In [3]:
df

Unnamed: 0,date_id,forward_returns,risk_free_rate,market_forward_excess_returns,E1,E10,E11,E12,E13,E14,...,D2,D3,D4,D5,D6,D7,D8,D9,forward_returns_lag1,forward_returns_lag5
0,2047,0.011194,0.000198,0.010682,1.942390,0.996693,0.018849,0.002976,0.016204,0.002976,...,0,0,1,0,0,1,0,0,-0.000622,0.000636
1,2048,-0.003382,0.000200,-0.003895,1.938593,0.997024,0.018519,0.002646,0.015873,0.002646,...,0,0,1,0,0,0,0,0,0.011194,0.016529
2,2049,0.009564,0.000199,0.009054,1.934812,0.997354,0.018188,0.002315,0.015542,0.002315,...,0,0,1,0,0,0,0,0,-0.003382,0.007505
3,2050,-0.000916,0.000202,-0.001431,1.931047,0.997685,0.017857,0.001984,0.015212,0.001984,...,0,0,1,0,0,0,0,0,0.009564,-0.001241
4,2051,0.004283,0.000201,0.003769,1.927299,0.998016,0.017526,0.001653,0.014881,0.001653,...,0,0,1,0,0,0,0,0,-0.000916,-0.000622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6938,8985,0.002457,0.000155,0.001990,1.565379,0.184524,0.019180,0.019180,0.005952,0.005952,...,0,0,0,0,0,0,0,0,-0.002896,-0.005964
6939,8986,0.002312,0.000156,0.001845,1.562946,0.184193,0.018849,0.018849,0.005622,0.005622,...,0,0,0,0,0,0,0,0,0.002457,-0.007410
6940,8987,0.002891,0.000156,0.002424,1.560520,0.183862,0.018519,0.018519,0.005291,0.005291,...,0,1,0,0,0,0,0,0,0.002312,0.005420
6941,8988,0.008310,0.000156,0.007843,1.558102,0.183532,0.018188,0.018188,0.004960,0.004960,...,0,0,0,0,0,0,0,0,0.002891,0.008357


### 1. Filter Methods for feature selection (statistical/corr/MI)

In [None]:
from sklearn.feature_selection import mutual_info_regression, VarianceThreshold
from scipy.stats import spearmanr

In [5]:
X = df.copy()[features]
y = df.copy()[target]

In [20]:
# Correlation-based ranking
def rank_features_by_correlation(df, features, target):
    corr_results = []
    for i, col in enumerate(features):
        corr, pval = spearmanr(df[col], df[target])
        corr_results.append({
            'feature': col,
            'spearman_corr': corr,
            'p_value': pval,
            'abs_corr': abs(corr)
        })
    corr_df = pd.DataFrame(corr_results).sort_values(by='abs_corr', ascending=False).reset_index(drop=True)
    return corr_df

In [21]:
a = rank_features_by_correlation(df, features, target)

In [22]:
a

Unnamed: 0,feature,spearman_corr,p_value,abs_corr
0,M4_lag1,-0.049095,0.000043,0.049095
1,M4_roll_mean_5,-0.048016,0.000063,0.048016
2,M4,-0.046643,0.000101,0.046643
3,P5_lag1,-0.044636,0.000199,0.044636
4,V13,0.043271,0.000310,0.043271
...,...,...,...,...
807,I5_lag1,-0.000127,0.991563,0.000127
808,E2_lag10,0.000123,0.991829,0.000123
809,E2_lag5,0.000113,0.992474,0.000113
810,M9_roll_mean_15,0.000082,0.994545,0.000082


In [28]:
# Mutual Information-based ranking
def rank_features_by_MI(df, features, target, n_neighbors=3):
    mi = mutual_info_regression(df[features], df[target].values.ravel(), n_neighbors=n_neighbors)
    mi_df = pd.DataFrame({
        'feature': features,
        'mutual_info': mi}) \
        .sort_values(by='mutual_info', ascending=False).reset_index(drop=True)
    return mi_df

In [29]:
a = rank_features_by_MI(df, features, target)
a

Unnamed: 0,feature,mutual_info
0,E19_roll_mean_5,0.130913
1,E19,0.129498
2,V7_roll_mean_5,0.128396
3,V7_lag1,0.119387
4,E19_roll_mean_15,0.118469
...,...,...
807,S11_lag1,0.000000
808,E6_roll_mean_15,0.000000
809,E6_roll_std_15,0.000000
810,E6_pctchg,0.000000


### 2. Wrapper Methods (forward/backward/recursive selection)

In [12]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector, RFECV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [11]:
X = df.copy()[features][:-1000]
y = df.copy()[target][:-1000]

In [None]:
# without feature selection
base_reg = LinearRegression()

pipe_sfs = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', base_reg)
])

tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(
    pipe_sfs, 
    X, 
    y, 
    cv=tscv,
    scoring="neg_mean_squared_error"
)

print("CV scores:", scores)
print("Mean score:", scores.mean())

# pipe_sfs.fit(X, y)
# sfs_step = pipe_sfs.named_steps["sfs"]
# support_mask = sfs_step.get_support()
# selected_features = [f for f, keep in zip(features, support_mask) if keep]
# print("Selected features (SFS):", selected_features)

CV scores: [-1.43443921e+24 -2.61889311e+03 -6.32433758e+01 -1.36027923e+03
 -2.87606455e-04]
Mean score: -2.8688784183292958e+23


In [None]:
# with Sequential Feature Selector (SFS)
base_reg = LinearRegression()

pipe_sfs = Pipeline([
    ('scaler', StandardScaler()),
    ('sfs', SequentialFeatureSelector(base_reg, n_features_to_select=30, direction='forward')),
    ('regressor', base_reg)
])

tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(
    pipe_sfs, 
    X, 
    y, 
    cv=tscv,
    scoring="neg_mean_squared_error"
)

print("CV scores:", scores)
print("Mean score:", scores.mean())

# pipe_sfs.fit(X, y)
# sfs_step = pipe_sfs.named_steps["sfs"]
# support_mask = sfs_step.get_support()
# selected_features = [f for f, keep in zip(features, support_mask) if keep]
# print("Selected features (SFS):", selected_features)

CV scores: [-0.00022039 -0.00023723 -0.00060195 -0.00011108 -0.00010975]
Mean score: -0.00025608008190074075
Selected features (SFS): ['D1', 'D6', 'P12', 'S2', 'S6', 'V13', 'E10_lag10', 'E10_roll_mean_30', 'E11_roll_mean_15', 'E11_roll_std_15', 'E13_lag1', 'E13_roll_std_15', 'E13_roll_mean_30', 'E5_pctchg', 'E8_pctchg', 'I1_roll_std_30', 'M4_lag1', 'P13_pctchg', 'P4_roll_std_30', 'S10_roll_std_5', 'S2_roll_mean_5', 'S2_roll_std_5', 'S5_roll_mean_5', 'S5_roll_mean_30', 'V11_pctchg', 'V13_lag1', 'V4_pctchg', 'V5_roll_std_30', 'V6_roll_std_5', 'V7_roll_std_5']


In [16]:
# with Sequential Feature Selector (SFS) on Random Forest regressor
base_reg = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

pipe_sfs = Pipeline([
    ('scaler', StandardScaler()),
    ('sfs', SequentialFeatureSelector(base_reg, n_features_to_select=30, direction='forward')),
    ('regressor', base_reg)
])
                         
tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(
    pipe_sfs, 
    X, 
    y.values.ravel(), 
    cv=tscv,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

print("CV scores:", scores)
print("Mean score:", scores.mean())

KeyboardInterrupt: 

In [None]:
# with recursive feature elimination (RFECV)
pipe_rfecv = Pipeline([
    ("scaler", StandardScaler()),
    ("rfecv", RFECV(
        estimator=base_reg,
        step=1,
        cv=tscv,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    )),
    ("reg", LinearRegression())
])

scores_rfecv = cross_val_score(
    pipe_rfecv,
    X,
    y,
    cv=tscv,
    scoring="neg_mean_squared_error"
)

print("CV scores (RFECV):", scores_rfecv)
print("Mean score (RFECV):", scores_rfecv.mean())

# pipe_rfecv.fit(X, y)
# rfecv_step = pipe_rfecv.named_steps["rfecv"]
# support_mask = rfecv_step.get_support()
# selected_features_rfecv = [f for f, keep in zip(features, support_mask) if keep]
# print("Selected features (RFECV):", selected_features_rfecv)
 

CV scores (RFECV): [-7.76309469e-05 -2.11976798e-04 -7.17036106e-05 -1.02384337e-04
 -1.08137664e-04]
Mean score (RFECV): -0.00011436667130141206


### Test with a simple Linear Regression

In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error


In [None]:
# without feature selection
X = df.copy()[features]
y = df.copy()[target]

tscv = TimeSeriesSplit(n_splits=5)
mse_scores = []
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)
    print("MSE for fold:", mse)
print("MSE without feature selection:", np.mean(mse_scores)) 

MSE for fold: 5.3890774259840045e+22
MSE for fold: 1439.217165597082
MSE for fold: 3864.2164070011895
MSE for fold: 0.0028170580039279117
MSE for fold: 0.0011514353759978014
MSE without feature selection: 1.077815485196801e+22


In [56]:
# with feature selection (example using top 30 features from correlation ranking)
tscv = TimeSeriesSplit(n_splits=5)
scores = []
for train_index, test_index in tscv.split(X):
    df_train, df_test = df.copy().iloc[train_index], df.copy().iloc[test_index]
    top_features = rank_features_by_MI(df_train, features, target).head(20)['feature']

    X_train = df_train[top_features]
    X_test = df_test[top_features]
    y_train = df_train[target]
    y_test = df_test[target]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    corr_coef = spearmanr(y_test, y_pred)[0]
    scores.append([mse, corr_coef])
    print("MSE for fold:", mse)
    print("Corr for fold:", corr_coef)
print("MSE with feature selection:", np.mean(scores, axis=0)[0])
print("Corr with feature selection:", np.mean(scores, axis=0)[1])

MSE for fold: 0.000935776103876099
Corr for fold: 0.003053815877709928
MSE for fold: 0.0006444741562944553
Corr for fold: 0.07414404456467874
MSE for fold: 9.716096571710187e-05
Corr for fold: 0.0016512270610485596
MSE for fold: 0.0002475422932544207
Corr for fold: 0.011243898715374713
MSE for fold: 0.00016056441926691928
Corr for fold: 0.025574196325290795
MSE with feature selection: 0.0004171035876817992
Corr with feature selection: 0.02313343650882055


### can try Lasso as a feature selector

In [115]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr

In [56]:
X = df[features]
y = df[target]
X_train = X.iloc[:5000]
y_train = y.iloc[:5000]
X_test = X.iloc[5000:]
y_test = y.iloc[5000:]

In [130]:
X_train_scaled = (X_train - X_train.mean()) / X_train.std()
X_test_scaled = (X_test - X_train.mean()) / X_train.std()

In [142]:
lasso = Lasso(alpha = 0.0001)
lasso.fit(X_train_scaled,y_train)

0,1,2
,alpha,0.0001
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [143]:
y_train_pred = lasso.predict(X_train_scaled)
y_test_pred = lasso.predict(X_test_scaled)

In [149]:
y_train_pred

array([ 0.00011477, -0.0022524 , -0.00089482, ...,  0.0021856 ,
        0.0030783 ,  0.00221145], shape=(5000,))

In [150]:
y_train

Unnamed: 0,forward_returns
0,0.011194
1,-0.003382
2,0.009564
3,-0.000916
4,0.004283
...,...
4995,0.001765
4996,-0.000112
4997,-0.004086
4998,0.008327


In [153]:
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f"Lasso train data Mean Squared Error: {mse_train}")
print(f"Lasso test data Mean Squared Error: {mse_test}")

Lasso train data Mean Squared Error: 0.00011859914819505049
Lasso test data Mean Squared Error: 0.00013986593297867552


In [None]:

R2 = r2_score(y_test, y_test_pred)
print(f"Lasso test data R2 Score: {R2}")

Lasso test data R2 Score: -0.1582320636066734


In [112]:
rho, pval = spearmanr(y_test, lasso.predict(X_test))
print(f"Lasso Spearman's rho: {rho}, p-value: {pval}")

Lasso Spearman's rho: nan, p-value: nan


  rho, pval = spearmanr(y_test, lasso.predict(X_test))


In [113]:
accuracy = np.mean(np.sign(lasso.predict(X_test)) == np.sign(y_test.squeeze()))
print("Directional accuracy:", accuracy)

Directional accuracy: 0.5532681420483788


In [101]:
baseline = np.array([np.mean(y_test)]*len(y_test))
accuracy = np.mean(np.sign(baseline) == np.sign(y_test.squeeze()))
mse = mean_squared_error(y_test, baseline)
R2 = r2_score(y_test, baseline)
print("Baseline directional accuracy:", accuracy)
print(f"Baseline R2 Score: {R2}")
print(f"Baseline Mean Squared Error: {mse}")


Baseline directional accuracy: 0.5532681420483788
Baseline R2 Score: 0.0
Baseline Mean Squared Error: 0.00012075812557212446
