In [128]:
import pandas as pd
import numpy as np

In [129]:
df = pd.read_csv('train_enriched.csv')
dummy_features = [col for col in df.columns if col[0] == 'D']
standard_features = [col for col in df.columns if col[0] in ['E','I','M','P','S','V']] + ['forward_returns_lag1', 'forward_returns_lag5']
features = dummy_features + standard_features
target = ['forward_returns']

In [130]:
X = df.copy().drop(columns = ['forward_returns', 'market_forward_excess_returns', 'risk_free_rate', "D1", 'date_id'])[:-1000]
y = df.copy()[target][:-1000]

### 1. Filter Methods for feature selection (statistical/corr/MI)

In [51]:
from sklearn.feature_selection import mutual_info_regression, VarianceThreshold
from scipy.stats import spearmanr

In [52]:
def rank_features_by_correlation(X, y):
    corr_results = []
    for col in X.columns:
        corr, pval = spearmanr(X[col], y)
        if np.isnan(corr):
            corr = 0.0
        corr_results.append({
            'feature': col,
            'spearman_corr': corr,
            'p_value': pval,
            'abs_corr': abs(corr)
        })
    corr_df = pd.DataFrame(corr_results).sort_values(by='abs_corr', ascending=False).reset_index(drop=True)
    return corr_df

In [53]:
a = rank_features_by_correlation(X, y)

In [54]:
a

Unnamed: 0,feature,spearman_corr,p_value,abs_corr
0,M4,-0.057941,0.000005,0.057941
1,forward_returns_win5_mean,-0.057532,0.000006,0.057532
2,M4_roll_mean_5,-0.055711,0.000012,0.055711
3,M4_lag1,-0.054628,0.000017,0.054628
4,V13,0.052845,0.000032,0.052845
...,...,...,...,...
584,E4_roll_std_20,-0.000067,0.995785,0.000067
585,I5_roll_mean_5,-0.000051,0.996804,0.000051
586,V3_roll_mean_5,-0.000023,0.998573,0.000023
587,I5_roll_mean_20,0.000013,0.999184,0.000013


### 2. Wrapper Methods (forward/backward/recursive selection)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector, RFECV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

In [124]:
# with Sequential Feature Selector (SFS)
base_reg = Ridge()
tscv = TimeSeriesSplit(n_splits=5)

pipe_sfs = Pipeline([
    ('scaler', StandardScaler()),
    ('sfs', SequentialFeatureSelector(base_reg, n_features_to_select=30, direction='forward')),
    # ("rfecv", RFECV(
    #     estimator=base_reg,
    #     step=1,
    #     cv=tscv,
    #     scoring="neg_mean_squared_error",
    #     n_jobs=-1
    # )),
    ('regressor', base_reg)
])

scores = cross_val_score(
    pipe_sfs, 
    X, 
    y, 
    cv=tscv,
    scoring="neg_mean_squared_error"
)

print("CV scores:", scores)
print("Mean score:", scores.mean())

# pipe_sfs.fit(X, y)
# sfs_step = pipe_sfs.named_steps["sfs"]
# support_mask = sfs_step.get_support()
# selected_features = [f for f, keep in zip(features, support_mask) if keep]
# print("Selected features (SFS):", selected_features)

CV scores: [-9.68219330e+00 -3.48091761e-04 -1.19909106e-04 -8.64036521e-05
 -1.30888872e-04]
Mean score: -1.9365757180370788


In [125]:
import warnings

In [138]:
def forward_selection(X, y, model, k):
    features = []
    for i in range (k):
        print(i)
        best_feature = None
        best_score = -np.inf
        for feature in X.columns:
            if feature in features:
                continue
            current_features = features + [feature]
            X_subset = X[current_features]
            scores = cross_val_score(
                model,
                X_subset,
                y,
                cv=tscv,
                scoring=make_scorer(lambda y_true, y_pred: spearmanr(y_true, y_pred)[0])
            )
            mean_score = scores.mean()
            if mean_score > best_score:
                best_score = mean_score
                best_feature = feature
        features.append(best_feature)
    return features

In [139]:
tscv = TimeSeriesSplit(n_splits=5)
results = {}
scores = {}

warnings.filterwarnings("ignore")
Ks = [10, 20, 30, 40, 50, 100, 200, 300, 400, 500]
for k in Ks:
    results[k] = []
i = 1
for train_index, test_index in tscv.split(X):
    print(f"Processing fold...{i}")
    i += 1
    X_train, X_test, y_train, y_test = X.copy().iloc[train_index], X.copy().iloc[test_index], y.copy().iloc[train_index], y.copy().iloc[test_index]
    model = Ridge(alpha=1.0)
    all_features = forward_selection(X, y, model, k=500)
    X_train = (X_train - X_train.mean()) / (X_train.std()+1e-8)
    X_test = (X_test - X_train.mean()) / (X_train.std()+1e-8)
    for k in Ks:
        selected_features = all_features[:k]
        X_train_fs = X_train[selected_features]
        X_test_fs = X_test[selected_features]
        
        model.fit(X_train_fs, y_train)
        y_pred = model.predict(X_test_fs)
        
        corr_coef = spearmanr(y_test, y_pred)[0]
        results[k].append(corr_coef)

for k in Ks:
    scores[k] = {'mean': np.mean(results[k]), 'std': np.std(results[k])}
scores_df = pd.DataFrame.from_dict(scores, orient='index').reset_index().rename(columns={'index': 'num_features'})
    


Processing fold...1
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
27

KeyboardInterrupt: 

### 3. Model embedded feature importance as feature selector (faster)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import spearmanr

In [None]:
X = df.copy()[features][:-1000]
y = df.copy()[target][:-1000]

In [None]:
def spearman_corr(y_true, y_pred):
    rho = spearmanr(y_true, y_pred).correlation
    # In case of constant predictions or NaNs
    if np.isnan(rho):
        return 0.0
    return rho

spearman_scorer = make_scorer(spearman_corr, greater_is_better=True)

In [None]:
# with model-embedded feature importance metric on Random Forest regressor
rf_for_fs = RandomForestRegressor(
    n_estimators=200,  
    max_depth=6,
    max_features=0.8,
    random_state=42,
    n_jobs=-1
)

rf_for_fs.fit(X, y.values.ravel())
importances = rf_for_fs.feature_importances_
idx_sorted = np.argsort(importances)[::-1]   # descending

results = []
param_grid = [30, 50, 60 , 70, 80, 90, 100, 200, 250, 300, 400, 500]
tscv = TimeSeriesSplit(n_splits=5)
for k in param_grid:
    top_features = X.columns[idx_sorted[:k]]

    base_reg = RandomForestRegressor(
        n_estimators=300,
        max_depth=6,
        min_samples_leaf=0.03,     # 3% of samples per leaf (robust)
        min_samples_split=0.02,
        max_features=0.3,
        bootstrap=True,
        n_jobs=-1, random_state=42
    )

    scores = cross_val_score(
        base_reg, 
        X[top_features], 
        y.values.ravel(), 
        cv=tscv,
        scoring=spearman_scorer,
        n_jobs=-1
    )

    results.append({
        'n_features': k,
        'top_features': top_features,
        'cv_scores': scores,
        'mean_cv_score': scores.mean(),
        'std_cv_score': scores.std()
    })


In [None]:
results_df = pd.DataFrame(results).sort_values(by='n_features', ascending=False).reset_index(drop=True)
results_df

Unnamed: 0,n_features,top_features,cv_scores,mean_cv_score,std_cv_score
0,500,"Index(['M4', 'E19_lag5', 'M4_lag1', 'V13', 'P7...","[0.037035034543911466, 0.008719123541622886, 0...",0.04122,0.030013
1,400,"Index(['M4', 'E19_lag5', 'M4_lag1', 'V13', 'P7...","[0.05685620343183619, 0.005957724203884055, 0....",0.043864,0.034445
2,300,"Index(['M4', 'E19_lag5', 'M4_lag1', 'V13', 'P7...","[0.0382951908655449, 0.014138534330916936, 0.0...",0.038054,0.031437
3,250,"Index(['M4', 'E19_lag5', 'M4_lag1', 'V13', 'P7...","[0.04592113768513597, 0.013305162488552442, 0....",0.047371,0.029923
4,200,"Index(['M4', 'E19_lag5', 'M4_lag1', 'V13', 'P7...","[0.026982193195125872, 0.016895115762500208, 0...",0.040562,0.030352
5,100,"Index(['M4', 'E19_lag5', 'M4_lag1', 'V13', 'P7...","[0.04694524943253772, 0.03947899064285702, 0.0...",0.055156,0.023314
6,90,"Index(['M4', 'E19_lag5', 'M4_lag1', 'V13', 'P7...","[0.0438769108628562, 0.04195520826044026, 0.04...",0.054111,0.025253
7,80,"Index(['M4', 'E19_lag5', 'M4_lag1', 'V13', 'P7...","[0.041593239557252644, 0.046338829450116026, 0...",0.051078,0.019876
8,70,"Index(['M4', 'E19_lag5', 'M4_lag1', 'V13', 'P7...","[0.01614835276115398, 0.05847194319314159, 0.0...",0.052633,0.026007
9,60,"Index(['M4', 'E19_lag5', 'M4_lag1', 'V13', 'P7...","[0.021882789448023625, 0.031148360059066325, 0...",0.048625,0.022076


In [None]:
# without feature selection
base_reg = RandomForestRegressor(
    n_estimators=300,
        max_depth=6,
        min_samples_leaf=0.03,     # 3% of samples per leaf (robust)
        min_samples_split=0.02,
        max_features=0.3,
        bootstrap=True,
        n_jobs=-1, random_state=42
)

scores = cross_val_score(
    base_reg, 
    X, 
    y.values.ravel(), 
    cv=tscv,
    scoring=spearman_scorer,
    n_jobs=-1
)

print("CV scores:", scores)
print("Mean score:", scores.mean())


CV scores: [0.04980389 0.00076541 0.03533868 0.03396514 0.10135229]
Mean score: 0.0442450833741991


#### 3.1 Model-embedded feature selection on ExtraTreesRegressor

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesRegressor
from scipy.stats import spearmanr

In [None]:
X = df.copy()[features][:-1000]
y = df.copy()[target][:-1000]

In [None]:
def spearman_corr(y_true, y_pred):
    rho = spearmanr(y_true, y_pred).correlation
    # In case of constant predictions or NaNs
    if np.isnan(rho):
        return 0.0
    return rho

spearman_scorer = make_scorer(spearman_corr, greater_is_better=True)

In [None]:
etr_for_fs = ExtraTreesRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

etr_for_fs.fit(X, y.values.ravel())
importances = etr_for_fs.feature_importances_
idx_sorted = np.argsort(importances)[::-1]

results = []
param_grid = [10, 30, 50, 60 , 70, 80, 90, 100, 200, 250, 300, 400, 500, 800]
tscv = TimeSeriesSplit(n_splits=5)
for k in param_grid:
    top_features = X.columns[idx_sorted[:k]]

    base_reg = ExtraTreesRegressor(
        n_estimators=100,
        max_depth=6,
        random_state=42,
        n_jobs=-1,
        max_features = 0.7
    )

    scores = cross_val_score(
        base_reg, 
        X[top_features], 
        y.values.ravel(), 
        cv=tscv,
        scoring=spearman_scorer,
        n_jobs=-1
    )

    results.append({
        'n_features': k,
        'cv_scores': scores,
        'mean_cv_score': scores.mean(),
        'std_cv_score': scores.std()
    })

In [None]:
pd.DataFrame(results).sort_values(by='n_features', ascending=False).reset_index(drop=True)

Unnamed: 0,n_features,cv_scores,mean_cv_score,std_cv_score
0,800,"[0.032782761125028974, -0.031468165818326004, ...",0.023679,0.038823
1,500,"[0.02473313416855679, -0.0328031345956066, 0.0...",0.012449,0.024452
2,400,"[0.024545739501237552, -0.026349984672757953, ...",0.026918,0.029555
3,300,"[0.046745755249769864, -0.03191356991592486, 0...",0.020092,0.028058
4,250,"[0.04734584741687069, -0.019746577088783253, -...",0.02185,0.029625
5,200,"[0.07629809904704443, -0.009892938969350886, 0...",0.033214,0.031928
6,100,"[0.05508610301445022, 0.012795632278135866, 0....",0.043723,0.037595
7,90,"[0.04966905840599008, 0.03348487674774283, 0.0...",0.036186,0.014076
8,80,"[0.055672394505194185, 0.0165389829768065, 0.0...",0.037696,0.020945
9,70,"[0.03484714650653129, 0.03070366891785172, 0.0...",0.022623,0.025302


#### 3.2 On Linear models (Ridge regression)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.linear_model import Ridge, Lasso
from scipy.stats import spearmanr
from scipy.stats import ConstantInputWarning
import warnings

In [None]:
tscv = TimeSeriesSplit(n_splits=5)
scores = []

warnings.filterwarnings("ignore", category=ConstantInputWarning)
params_grid = [10, 30, 50, 60 , 70, 80, 90, 100, 200, 250, 300, 400, 500, 800]
for k in params_grid:
    corr_results = []
    for train_index, test_index in tscv.split(X):
        df_train, df_test = df.copy().iloc[train_index], df.copy().iloc[test_index]
        top_features = rank_features_by_correlation(X, y).head(k)['feature']

        X_train = df_train[top_features]
        X_test = df_test[top_features]
        y_train = df_train[target]
        y_test = df_test[target]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model = Ridge(alpha=1.0)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        corr_coef = spearmanr(y_test, y_pred)[0]
        corr_results.append(corr_coef)
    scores.append({
        'n_features': k,
        'corr_coef_cv': corr_results,
        'mean_corr_coef': np.mean(corr_results),
        'std_corr_coef': np.std(corr_results)
    })


KeyboardInterrupt: 

In [None]:
pd.DataFrame(scores)

Unnamed: 0,n_features,corr_coef_cv,mean_corr_coef,std_corr_coef
0,10,"[0.036769540515562985, 0.09981471345780991, 0....",0.058449,0.024438
1,30,"[0.07275569489221315, 0.03878458801861362, 0.0...",0.058788,0.015189
2,50,"[0.016454574524987674, 0.03316750620460767, 0....",0.04988,0.021408
3,60,"[0.030544591662363656, 0.05432062172867081, 0....",0.057662,0.015071
4,70,"[0.0028876177979875704, 0.05172643658389003, 0...",0.05472,0.028793
5,80,"[0.004892753878048705, 0.040931206106223014, 0...",0.049901,0.026212
6,90,"[-0.010699869781018119, 0.03252858614966022, 0...",0.041226,0.028947
7,100,"[-0.022989621366879355, 0.06238375997274751, 0...",0.023896,0.038866
8,200,"[0.0008041633611424673, 0.028777780508837086, ...",0.009215,0.012701
9,250,"[-0.005931565715486464, 0.02645308056495282, 0...",0.017179,0.018136


In [None]:
pd.DataFrame(scores).sort_values(by='n_features', ascending=False).reset_index(drop=True)

Unnamed: 0,n_features,corr_coef_cv,mean_corr_coef,std_corr_coef
0,800,"[-0.04211158608400803, -0.03194137098759129, 0...",0.007209,0.038685
1,500,"[-0.0361913095996777, 0.021889417507179824, 0....",0.024207,0.032986
2,400,"[-0.017294620340542036, 0.029477176235823514, ...",0.017993,0.020362
3,300,"[-0.04209637135394843, 0.03547425397316912, 0....",0.012488,0.029818
4,250,"[-0.005931565715486464, 0.02645308056495282, 0...",0.017179,0.018136
5,200,"[0.0008041633611424673, 0.028777780508837086, ...",0.009215,0.012701
6,100,"[-0.022989621366879355, 0.06238375997274751, 0...",0.023896,0.038866
7,90,"[-0.010699869781018119, 0.03252858614966022, 0...",0.041226,0.028947
8,80,"[0.004892753878048705, 0.040931206106223014, 0...",0.049901,0.026212
9,70,"[0.0028876177979875704, 0.05172643658389003, 0...",0.05472,0.028793


In [None]:
# with feature selection (example using top 30 features from correlation ranking)
tscv = TimeSeriesSplit(n_splits=5)
scores = []

warnings.filterwarnings("ignore", category=ConstantInputWarning)
params_grid = [10, 30, 50, 60 , 70, 80, 90, 100, 200, 250, 300, 400, 500, 800]
for k in params_grid:
    corr_results = []
    for train_index, test_index in tscv.split(X):
        df_train, df_test = df.copy().iloc[train_index], df.copy().iloc[test_index]
        top_features = rank_features_by_correlation(df_train, features, target).head(30)['feature']

        X_train = df_train[top_features]
        X_test = df_test[top_features]
        y_train = df_train[target]
        y_test = df_test[target]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model = Lasso(alpha=1.0)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        corr_coef = spearmanr(y_test, y_pred)[0]
        corr_results.append(corr_coef)
    scores.append({
        'n_features': k,
        'corr_coef_cv': corr_results,
        'mean_corr_coef': np.mean(corr_results)
    })


TypeError: rank_features_by_correlation() takes 2 positional arguments but 3 were given

In [None]:
pd.DataFrame(results).sort_values(by='n_features', ascending=False).reset_index(drop=True)

Unnamed: 0,n_features,cv_scores,mean_cv_score
0,800,"[0.03165082134289388, -0.02646468126343698, 0....",0.025436
1,500,"[0.014918806571137408, -0.03700128010130046, 0...",0.015506
2,400,"[0.03038710634024615, -0.03816872852794713, 0....",0.010033
3,300,"[0.03181974919319583, -0.020766085670216924, 0...",0.026563
4,250,"[0.06706396237751512, -0.025181806851276693, 0...",0.033384
5,200,"[0.06920917909208689, -0.004040739955559865, 0...",0.042673
6,100,"[0.06377349641381479, 0.05300612481893907, -0....",0.044116
7,90,"[0.053092937019486905, 0.028548517313154897, 0...",0.044125
8,80,"[0.05918449512246523, 0.028229668461496736, 0....",0.04128
9,70,"[0.042843535595029514, 0.006481091301538243, 0...",0.03675


### Test with a simple Linear Regression

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error


In [None]:
# without feature selection
X = df.copy()[features]
y = df.copy()[target]

tscv = TimeSeriesSplit(n_splits=5)
mse_scores = []
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)
    print("MSE for fold:", mse)
print("MSE without feature selection:", np.mean(mse_scores)) 

MSE for fold: 4.448523737375691e+23
MSE for fold: 78.6685818228171
MSE for fold: 226.18401356297505
MSE for fold: 5068.088537482515
MSE for fold: 0.00022046746844485435
MSE without feature selection: 8.897047474751382e+22


In [None]:
# with feature selection (example using top 30 features from correlation ranking)
tscv = TimeSeriesSplit(n_splits=5)
scores = []

for train_index, test_index in tscv.split(X):
    df_train, df_test = df.copy().iloc[train_index], df.copy().iloc[test_index]
    top_features = rank_features_by_correlation(df_train, features, target).head(20)['feature']

    X_train = df_train[top_features]
    X_test = df_test[top_features]
    y_train = df_train[target]
    y_test = df_test[target]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    corr_coef = spearmanr(y_test, y_pred)[0]
    scores.append(corr_coef)
    print("Corr for fold:", corr_coef)
print("Corr with feature selection:", np.mean(scores, axis=0)[1])

Corr for fold: 0.01577476454262383
Corr for fold: 0.04775367293561232
Corr for fold: 0.07070455065968329
Corr for fold: 0.05893174471213953
Corr for fold: 0.0662900454452452


IndexError: invalid index to scalar variable.

### can try Lasso as a feature selector

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr

In [None]:
X = df[features]
y = df[target]
X_train = X.iloc[:5000]
y_train = y.iloc[:5000]
X_test = X.iloc[5000:]
y_test = y.iloc[5000:]

In [None]:
X_train_scaled = (X_train - X_train.mean()) / X_train.std()
X_test_scaled = (X_test - X_train.mean()) / X_train.std()

In [None]:
lasso = Lasso(alpha = 0.0001)
lasso.fit(X_train_scaled,y_train)

0,1,2
,alpha,0.0001
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [None]:
y_train_pred = lasso.predict(X_train_scaled)
y_test_pred = lasso.predict(X_test_scaled)

In [None]:
y_train_pred

array([ 0.00011477, -0.0022524 , -0.00089482, ...,  0.0021856 ,
        0.0030783 ,  0.00221145], shape=(5000,))

In [None]:
y_train

Unnamed: 0,forward_returns
0,0.011194
1,-0.003382
2,0.009564
3,-0.000916
4,0.004283
...,...
4995,0.001765
4996,-0.000112
4997,-0.004086
4998,0.008327


In [None]:
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f"Lasso train data Mean Squared Error: {mse_train}")
print(f"Lasso test data Mean Squared Error: {mse_test}")

Lasso train data Mean Squared Error: 0.00011859914819505049
Lasso test data Mean Squared Error: 0.00013986593297867552


In [None]:

R2 = r2_score(y_test, y_test_pred)
print(f"Lasso test data R2 Score: {R2}")

Lasso test data R2 Score: -0.1582320636066734


In [None]:
rho, pval = spearmanr(y_test, lasso.predict(X_test))
print(f"Lasso Spearman's rho: {rho}, p-value: {pval}")

Lasso Spearman's rho: nan, p-value: nan


  rho, pval = spearmanr(y_test, lasso.predict(X_test))


In [None]:
accuracy = np.mean(np.sign(lasso.predict(X_test)) == np.sign(y_test.squeeze()))
print("Directional accuracy:", accuracy)

Directional accuracy: 0.5532681420483788


In [None]:
baseline = np.array([np.mean(y_test)]*len(y_test))
accuracy = np.mean(np.sign(baseline) == np.sign(y_test.squeeze()))
mse = mean_squared_error(y_test, baseline)
R2 = r2_score(y_test, baseline)
print("Baseline directional accuracy:", accuracy)
print(f"Baseline R2 Score: {R2}")
print(f"Baseline Mean Squared Error: {mse}")


Baseline directional accuracy: 0.5532681420483788
Baseline R2 Score: 0.0
Baseline Mean Squared Error: 0.00012075812557212446
