In [1]:
import pandas as pd
import numpy as np
from tstools import FeatureEnricher
from sklearn.decomposition import KernelPCA

In [16]:
# Load the imputed training table.
train_df = pd.read_csv('train_imputed.csv')

# Column families are picked by the first character of the column name:
# 'D' columns form dummy_features; 'E', 'I', 'M', 'P', 'S', 'V' columns form standard_features.
dummy_features = [name for name in train_df.columns if name[0] == 'D']
standard_features = [
    name
    for name in train_df.columns
    if name[0] in ('E', 'I', 'M', 'P', 'S', 'V')
]

# Name of the prediction target column, kept as a one-element list.
target = ['forward_returns']

### Make additional features

In [17]:
# Configure the time-series enricher and apply it to the standard feature columns.
# The displayed result contains derived columns named like `<col>_lag1`,
# `<col>_roll_mean_5`, `<col>_roll_std_5` and `<col>_pctchg`.
ts_enricher = FeatureEnricher(
    lags=[1, 5, 10],
    rolling_windows=[5, 15, 30],
    add_pct_change=True,
)
train_df_enriched = ts_enricher.transform(train_df, standard_features)

In [18]:
# Show the enriched frame as the cell output (the displayed index starts at 29).
train_df_enriched

Unnamed: 0,E1,E10,E11,E12,E13,E14,E15,E16,E17,E18,...,V8_lag1,V8_lag5,V8_lag10,V8_roll_mean_5,V8_roll_std_5,V8_roll_mean_15,V8_roll_std_15,V8_roll_mean_30,V8_roll_std_30,V8_pctchg
29,1.942390,0.996693,0.018849,0.002976,0.016204,0.002976,0.003638,0.218648,0.481752,-0.280715,...,0.000661,0.000661,0.000661,0.000661,0.000000,0.000661,0.000000,0.000661,0.000000,0.000000
30,1.938593,0.997024,0.018519,0.002646,0.015873,0.002646,0.003307,0.217855,0.480794,-0.281153,...,0.000661,0.000661,0.000661,0.000661,0.000000,0.000661,0.000000,0.000661,0.000000,0.000000
31,1.934812,0.997354,0.018188,0.002315,0.015542,0.002315,0.002976,0.217062,0.479836,-0.281591,...,0.000661,0.000661,0.000661,0.000661,0.000000,0.000661,0.000000,0.000661,0.000000,0.000000
32,1.931047,0.997685,0.017857,0.001984,0.015212,0.001984,0.002646,0.216268,0.478877,-0.282030,...,0.000661,0.000661,0.000661,0.000661,0.000000,0.000661,0.000000,0.000661,0.000000,0.000000
33,1.927299,0.998016,0.017526,0.001653,0.014881,0.001653,0.002315,0.215474,0.477919,-0.282468,...,0.000661,0.000661,0.000661,0.000661,0.000000,0.000661,0.000000,0.000661,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6967,1.565379,0.184524,0.019180,0.019180,0.005952,0.005952,0.911376,-0.083496,-0.572447,0.223638,...,0.124669,0.170966,0.109127,0.154101,0.028516,0.121274,0.044973,0.077370,0.062256,0.145889
6968,1.562946,0.184193,0.018849,0.018849,0.005622,0.005622,0.911706,-0.083542,-0.572080,0.222910,...,0.142857,0.141865,0.064153,0.164947,0.032705,0.126874,0.048815,0.083477,0.064652,0.372685
6969,1.560520,0.183862,0.018519,0.018519,0.005291,0.005291,0.912037,-0.083874,-0.572016,0.222211,...,0.196098,0.199405,0.098876,0.151852,0.028266,0.132363,0.044151,0.087776,0.063529,-0.317032
6970,1.558102,0.183532,0.018188,0.018188,0.004960,0.004960,0.912368,-0.084206,-0.571952,0.221513,...,0.133929,0.161706,0.057870,0.139881,0.034937,0.128880,0.044374,0.091127,0.061429,-0.239506


In [19]:
# Reduce the enriched feature frame to 5 non-linear components with an RBF-kernel PCA.
# random_state is fixed so the projection is reproducible across kernel restarts
# (scikit-learn uses it when the 'arpack' or 'randomized' eigen-solver is selected).
# NOTE(review): the inputs are not rescaled in this notebook before the RBF kernel is
# applied — confirm that 'train_imputed.csv' is already on comparable scales.
kpca = KernelPCA(
    n_components=5,
    kernel='rbf',
    random_state=42,
)

X_kpca = kpca.fit_transform(train_df_enriched)

# Wrap the component array in a DataFrame that shares the enriched frame's index,
# so the components can be joined back to the features row-for-row.
kpca_features = pd.DataFrame(
    X_kpca,
    columns=[f'kpca_{i+1}' for i in range(X_kpca.shape[1])],
    index=train_df_enriched.index,
)

In [20]:
# Show the kernel-PCA component frame (columns kpca_1 .. kpca_5) as the cell output.
kpca_features

Unnamed: 0,kpca_1,kpca_2,kpca_3,kpca_4,kpca_5
29,-0.211952,-0.321313,0.306368,-0.070926,0.060542
30,-0.213137,-0.322343,0.305833,-0.073102,0.058708
31,-0.147462,-0.296300,0.270879,-0.072801,0.047209
32,-0.188694,-0.317626,0.291546,-0.076237,0.046600
33,-0.212619,-0.328513,0.309443,-0.080442,0.041222
...,...,...,...,...,...
6967,-0.354057,-0.186218,0.056856,0.078440,0.082835
6968,-0.305506,-0.180620,0.046429,0.047961,0.073913
6969,-0.404858,-0.188392,0.061307,0.068892,0.081982
6970,-0.397783,-0.187337,0.061327,0.071374,0.088999


In [21]:
# Attach the kernel-PCA components to the enriched features, matching rows on the
# index and keeping every row of the enriched frame (left join).
train_df_enriched1 = train_df_enriched.merge(
    kpca_features,
    how='left',
    left_index=True,
    right_index=True,
)

In [22]:
# Bring back the columns of train_df that are not in standard_features,
# again matching on the index with a left join.
train_df_enriched2 = train_df_enriched1.merge(
    train_df.drop(columns=standard_features),
    how='left',
    left_index=True,
    right_index=True,
)

In [25]:
# Persist the final frame to CSV; index=False means the row labels are not written.
train_df_enriched2.to_csv('train_enriched.csv', index=False)

In [11]:
# Pearson correlation between every enriched feature (rows) and every kernel-PCA
# component (columns). DataFrame.corrwith is the pandas built-in for column-wise
# correlation against a Series: it aligns on the index and ignores NaN pairs,
# replacing the manual apply + np.corrcoef(...)[0, 1] construction.
corr_matrix = pd.DataFrame({
    kpca_col: train_df_enriched.corrwith(kpca_features[kpca_col])
    for kpca_col in kpca_features.columns
})

corr_matrix

Unnamed: 0,kpca_1,kpca_2,kpca_3,kpca_4,kpca_5
E1,-0.170955,-0.025756,0.059751,-0.017589,0.164639
E10,0.025240,-0.161708,-0.102168,0.091794,-0.111192
E11,0.060822,0.219471,0.185152,0.183670,0.179818
E12,0.067553,0.212133,0.198017,0.213180,0.174849
E13,0.000657,-0.144161,0.108128,0.058359,0.096053
...,...,...,...,...,...
V8_roll_mean_15,0.033314,-0.266389,0.104867,-0.113405,-0.244074
V8_roll_std_15,-0.022989,-0.310124,0.067077,-0.041819,-0.035269
V8_roll_mean_30,0.033539,-0.267990,0.100374,-0.106571,-0.246941
V8_roll_std_30,-0.035698,-0.308095,0.077457,-0.064274,-0.041668


In [12]:
# Rank the enriched features by the absolute value of their correlation with kpca_5,
# strongest first.
(
    corr_matrix['kpca_5']
    .abs()
    .sort_values(ascending=False)
)

M10_roll_mean_5     0.557136
M10_roll_mean_15    0.556205
M10_lag1            0.555256
M10                 0.555178
M10_roll_mean_30    0.552875
                      ...   
I9_pctchg           0.000412
E10_roll_std_15     0.000367
S2_pctchg           0.000340
I2_pctchg           0.000332
P10_pctchg          0.000090
Name: kpca_5, Length: 803, dtype: float64