# Author: Ryan Butler

Goal: Figure out how to implement subset selection

In [2]:
import consts, test_helper
import helper
import importlib
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import sklearn, scipy, requests
import random

In [3]:
# General Constants
# USER is the key used to look up this machine's data location in consts.PATH_MAP.
USER = "RYAN"
FILE_PATH = consts.PATH_MAP[USER]

# Re-exported from consts so cells can refer to it without the module prefix.
# NOTE(review): presumably the name of the response column — confirm in consts.
RESPONSE_NAME = consts.RESPONSE_NAME

# Model-type labels.
# NOTE(review): the cells visible here pass string literals ("Lasso", "XGBoost",
# 'LASSO', 'xgboost') to test_helper.Regression instead of these constants.
LASSO = "LASSO"
XGBOOST = "XGBOOST"

ALPHA = 0.05 # Set significance level

In [4]:
# Both values are passed to data.update_and_get_train_df when the training frame is built:
# TEST_START as a YYYYMMDD string, TRAIN_TEST_GAP as its movingBack_dayCount argument.
# NOTE(review): TRAIN_TEST_GAP is presumably a number of days between the end of
# training data and TEST_START — confirm against test_helper.
TEST_START  = "20180201"
TRAIN_TEST_GAP = 31

In [5]:
# Re-import test_helper so that edits made to the module are picked up by this kernel.
importlib.reload(test_helper)
data = test_helper.Data(FILE_PATH)
# Training frame, built relative to TEST_START with movingBack_dayCount=TRAIN_TEST_GAP and
# years_count=1 (the exact windowing is implemented in test_helper and not visible here).
train_df = data.update_and_get_train_df(FILE_PATH, TEST_START, movingBack_dayCount=TRAIN_TEST_GAP, years_count=1)
# NOTE(review): the evaluation window is hard-coded to 20150701..20150801, which is earlier
# than TEST_START ("20180201") — confirm this is the intended test period.
test_dfs = data.get_df_between_date(data_path = FILE_PATH, start_date="20150701", end_date="20150801")

No YYYYMMDD datetime matched.

Filtered File Dates: ['data.20170103_1200', 'data.20170104_1200', 'data.20170105_1200', 'data.20170106_1200', 'data.20170109_1200', 'data.20170110_1200', 'data.20170111_1200', 'data.20170112_1200', 'data.20170113_1200', 'data.20170117_1200', 'data.20170118_1200', 'data.20170119_1200', 'data.20170120_1200', 'data.20170123_1200', 'data.20170124_1200', 'data.20170125_1200', 'data.20170126_1200', 'data.20170127_1200', 'data.20170130_1200', 'data.20170131_1200', 'data.20170201_1200', 'data.20170202_1200', 'data.20170203_1200', 'data.20170206_1200', 'data.20170207_1200', 'data.20170208_1200', 'data.20170209_1200', 'data.20170210_1200', 'data.20170213_1200', 'data.20170214_1200', 'data.20170215_1200', 'data.20170216_1200', 'data.20170217_1200', 'data.20170221_1200', 'data.20170222_1200', 'data.20170223_1200', 'data.20170224_1200', 'data.20170227_1200', 'data.20170228_1200', 'data.20170301_1200', 'data.20170302_1200', 'data.20170303_1200', 'data.20170306_1200', '

In [5]:
# Get test DFs


### Predictors we cannot use:
- Equity ID (`eqid`): an arbitrarily assigned identifier
- `tonight`: the response variable

In [6]:
# all_cols = list(train_df.columns)
# group = (list(filter(lambda c: re.match("relntrds70tr_.*",c),all_cols)))
# group_corr_matrix = train_df[group].corr()
# min_coer_predictors(group_corr_matrix, group)

In [7]:
# all_cols = list(train_df.columns)
# group = (list(filter(lambda c: re.match("relvol_.*",c),all_cols)))
# group_corr_matrix = train_df[group].corr()
# min_coer_predictors(group_corr_matrix, group)

Two helper functions that find the least-correlated pair of predictors within a group of related columns

In [6]:
def min_coer_predictors(df, group):
    """Return the pair of predictors in ``group`` with the smallest absolute correlation.

    Parameters
    ----------
    df : pandas.DataFrame
        Square correlation matrix whose row and column labels are predictor
        names (e.g. the result of ``DataFrame.corr()``). It is assumed to be
        symmetric, so each unordered pair is inspected only once.
    group : iterable of str
        Predictor names to compare; every name must label a row and a column of ``df``.

    Returns
    -------
    tuple
        ``(pred1, pred2)`` with ``pred1`` appearing before ``pred2`` in ``group``.
        On ties the first pair encountered wins. ``(None, None)`` is returned
        when ``group`` has fewer than two entries or every pairwise
        correlation is NaN.
    """
    names = list(group)
    pred1 = None
    pred2 = None
    # Start from +inf rather than 1 so that a group whose predictors are all
    # perfectly correlated (|corr| == 1) still yields a pair instead of (None, None).
    min_corr = float("inf")
    for position, predictor1 in enumerate(names):
        # Only look at predictors after `predictor1`: this skips the self-pair
        # (always 1) and the mirrored pair (same value in a symmetric matrix).
        for predictor2 in names[position + 1:]:
            corr = abs(df.loc[predictor1, predictor2])
            # NaN compares False here, so undefined correlations are skipped.
            if corr < min_corr:
                pred1 = predictor1
                pred2 = predictor2
                min_corr = corr
    return pred1, pred2

def group_cols(regex: str, all_cols_df: pd.DataFrame):
    """Find the two least-correlated columns among those whose name matches ``regex``.

    Parameters
    ----------
    regex : str
        Regular expression applied with ``re.match`` semantics, i.e. anchored
        at the start of each column name.
    all_cols_df : pandas.DataFrame
        Frame supplying both the candidate column names and the data used to
        compute their pairwise correlations.

    Returns
    -------
    tuple
        Whatever ``min_coer_predictors`` returns for the matching columns:
        a pair of column names.
    """
    pattern = re.compile(regex)
    group = [col for col in all_cols_df.columns if pattern.match(col)]
    # Correlate the frame that was passed in (previously the global `train_df`
    # was used here, silently ignoring the `all_cols_df` argument).
    group_corr_matrix = all_cols_df[group].corr()
    return min_coer_predictors(group_corr_matrix, group)

In [7]:
# One pattern per predictor family; group_cols applies each with re.match,
# so a pattern only matches columns whose name starts with it.
# NOTE(review): r"rrirpnxm_l.*" only covers the rrirpnxm_l* columns — confirm this is intended.
GROUP_REGEXES = [
    r"(cft).*|momr.*",
    r"qe_",
    r"moorelvol_open_",
    r"mocrelvol_open_.*",
    r"relntrds70tr_.*",
    r"relvol_.*",
    r"relavgts70tr_.*",
    r"rel2retcfrtxm_.*",
    r"rrretstr_.*",
    r"rvdelta_.*",
    r"rrirpnxm_l.*",
    r"llirpnxm.*",
    r"tr_.*",
    r"nnetticksrelmultstdev.*",
    r"nnetticksrelrrsign",
    r"nsameticksrelmultstdev",
    r"nsameticksrelrrsign_",
]

# Collect the least-correlated pair from every family, in pattern order.
best_columns = []
for regex in GROUP_REGEXES:
    p1, p2 = group_cols(regex, train_df)
    best_columns += [p1, p2]

In [8]:
# Minimally correlated columns (2 per group, 34 in total) and an equally sized
# random sample of columns to serve as the baseline.
FEATURE_COL_NAMES = best_columns
column_names = list(train_df.columns)

N_RESPONSE_COLS = 8  # 8 cols before are response. Can't use them
RANDOM_SEED = 0      # fixed seed so a fresh kernel reproduces the same random baseline

# A private Random instance keeps the draw reproducible without reseeding the
# global `random` state that other cells sample from.
RANDOM_COLUMN_NAMES = random.Random(RANDOM_SEED).sample(
    column_names[N_RESPONSE_COLS:], len(FEATURE_COL_NAMES)
)

In [11]:
# Default test_helper.Regression (reported as "OLS" in the cell output) trained on
# the minimally correlated feature set.
ols_model = test_helper.Regression(data_path = FILE_PATH)
ols_model.train(train_df, feature_col_names = FEATURE_COL_NAMES)
# Last expression of the cell, so the returned metrics tuple
# (response correlation, mean return, scale factor) is displayed.
ols_model.get_metric(dataframes = test_dfs)

No YYYYMMDD datetime matched.

You're using: OLS.

Features being used: ['cftorrrelstd_open_0', 'momr10d_open_0', 'qe_nt0_open_0', 'qe_prv2_open_0', 'moorelvol_open_0', 'moorelvol_open_17to21', 'mocrelvol_open_1', 'mocrelvol_open_18to22', 'relntrds70tr_dy_7to11', 'relntrds70tr_md_1', 'relvol_lst15_0', 'relvol_nt_12to16', 'relavgts70tr_close_12to16', 'relavgts70tr_close_1to3', 'rel2retcfrtxm_lst15_0', 'rel2retcfrtxm_nt_1', 'rrretstr_dy_7to11', 'rrretstr_nt_1to3', 'rvdelta_am_1', 'rvdelta_lsthrx15_0', 'rrirpnxm_lst15_0', 'rrirpnxm_lsthrx15_0', 'llirpnxm_am_1', 'llirpnxm_lsthrx15_0', 'tr_nt_0', 'tr_nt_12to16', 'nnetticksrelmultstdev_lsthrx15_0', 'nnetticksrelmultstdev_nt_4to6', 'nnetticksrelrrsign_nt_17to21', 'nnetticksrelrrsign_pm_1', 'nsameticksrelmultstdev_nt_17to21', 'nsameticksrelmultstdev_pm_1', 'nsameticksrelrrsign_dy_12to16', 'nsameticksrelrrsign_lst15_0']
response_corr = 0.02317092359060354
mean_return = 0.00022756024780829908
scale factor = 0.9648677776859903
response_corr = 0.0

(0.02317092359060354, 0.00022756024780829908, 0.9648677776859903)

In [12]:
# Same default regression, now trained on the random baseline columns.
# Note: this rebinds `ols_model`, so the model from the previous cell is no longer reachable.
ols_model = test_helper.Regression(data_path = FILE_PATH)
ols_model.train(train_df, feature_col_names = RANDOM_COLUMN_NAMES)
# Last expression of the cell, so the returned metrics tuple is displayed.
ols_model.get_metric(dataframes = test_dfs)

No YYYYMMDD datetime matched.

You're using: OLS.

Features being used: ['rvdelta_am_1', 'rrretstr_dy_4to6', 'nsameticksrelrrsign_nt_17to21', 'relntrds70tr_md_1', 'rrirpnxm_nt_0', 'relntrds70tr_lst15_0', 'llirpnxm_nt_17to21', 'nsameticksrelmultstdev_dy_17to21', 'nsameticksrelmultstdev_pm_1', 'tr_lsthrx15_0', 'rvdelta_dy_7to11', 'rrirpnxm_md_1', 'rrretstr_am_1', 'relntrds70tr_toxhr_0', 'tr_nt_4to6', 'rvdelta_nt_1to3', 'relntrds70tr_dy_4to6', 'rvdelta_nt_1', 'rrretstr_nt_1to3', 'rrirpnxm_dy_7to11', 'rvdelta_lst15_0', 'rrretstr_nt_7to11', 'llirpnxm_nt_1', 'relvol_nt_0', 'rvdelta_lsthrx15_0', 'llirpnxm_nt_4to6', 'rrirpnxm_dy_4to6', 'nsameticksrelmultstdev_md_1', 'nsameticksrelrrsign_nt_1to3', 'nnetticksrelrrsign_am_1', 'relntrds70tr_nt_0', 'liqlog_open_0', 'rrretstr_pm_1', 'nnetticksrelrrsign_nt_1to3']
response_corr = 0.044583569514859915
mean_return = 0.0003009560742270175
scale factor = 0.8980192318304716
response_corr = 0.044583569514859915
mean_return = 0.0003009560742270175
scale fact

(0.044583569514859915, 0.0003009560742270175, 0.8980192318304716)

In [13]:
# Lasso regression trained on the minimally correlated feature set.
lasso_model = test_helper.Regression(regression_type="Lasso", data_path=FILE_PATH)
lasso_model.train(train_df, feature_col_names = FEATURE_COL_NAMES)
# Last expression of the cell, so the returned metrics tuple
# (response correlation, mean return, scale factor) is displayed.
lasso_model.get_metric(dataframes = test_dfs)

No YYYYMMDD datetime matched.

You're using: LASSO.

Features being used: ['cftorrrelstd_open_0', 'momr10d_open_0', 'qe_nt0_open_0', 'qe_prv2_open_0', 'moorelvol_open_0', 'moorelvol_open_17to21', 'mocrelvol_open_1', 'mocrelvol_open_18to22', 'relntrds70tr_dy_7to11', 'relntrds70tr_md_1', 'relvol_lst15_0', 'relvol_nt_12to16', 'relavgts70tr_close_12to16', 'relavgts70tr_close_1to3', 'rel2retcfrtxm_lst15_0', 'rel2retcfrtxm_nt_1', 'rrretstr_dy_7to11', 'rrretstr_nt_1to3', 'rvdelta_am_1', 'rvdelta_lsthrx15_0', 'rrirpnxm_lst15_0', 'rrirpnxm_lsthrx15_0', 'llirpnxm_am_1', 'llirpnxm_lsthrx15_0', 'tr_nt_0', 'tr_nt_12to16', 'nnetticksrelmultstdev_lsthrx15_0', 'nnetticksrelmultstdev_nt_4to6', 'nnetticksrelrrsign_nt_17to21', 'nnetticksrelrrsign_pm_1', 'nsameticksrelmultstdev_nt_17to21', 'nsameticksrelmultstdev_pm_1', 'nsameticksrelrrsign_dy_12to16', 'nsameticksrelrrsign_lst15_0']
response_corr = -0.0008448045417282249
mean_return = -3.192025374517135e-05
scale factor = -0.05616739885540533
response_cor

(-0.0008448045417282249, -3.192025374517135e-05, -0.05616739885540533)

In [14]:
# Lasso regression trained on the random baseline columns.
# Note: this rebinds `lasso_model`, replacing the model fitted in the previous cell.
lasso_model = test_helper.Regression(regression_type="Lasso", data_path=FILE_PATH)
lasso_model.train(train_df, feature_col_names = RANDOM_COLUMN_NAMES)
# Last expression of the cell, so the returned metrics tuple is displayed.
lasso_model.get_metric(dataframes = test_dfs)

No YYYYMMDD datetime matched.

You're using: LASSO.

Features being used: ['rvdelta_am_1', 'rrretstr_dy_4to6', 'nsameticksrelrrsign_nt_17to21', 'relntrds70tr_md_1', 'rrirpnxm_nt_0', 'relntrds70tr_lst15_0', 'llirpnxm_nt_17to21', 'nsameticksrelmultstdev_dy_17to21', 'nsameticksrelmultstdev_pm_1', 'tr_lsthrx15_0', 'rvdelta_dy_7to11', 'rrirpnxm_md_1', 'rrretstr_am_1', 'relntrds70tr_toxhr_0', 'tr_nt_4to6', 'rvdelta_nt_1to3', 'relntrds70tr_dy_4to6', 'rvdelta_nt_1', 'rrretstr_nt_1to3', 'rrirpnxm_dy_7to11', 'rvdelta_lst15_0', 'rrretstr_nt_7to11', 'llirpnxm_nt_1', 'relvol_nt_0', 'rvdelta_lsthrx15_0', 'llirpnxm_nt_4to6', 'rrirpnxm_dy_4to6', 'nsameticksrelmultstdev_md_1', 'nsameticksrelrrsign_nt_1to3', 'nnetticksrelrrsign_am_1', 'relntrds70tr_nt_0', 'liqlog_open_0', 'rrretstr_pm_1', 'nnetticksrelrrsign_nt_1to3']
response_corr = 0.04217455044187878
mean_return = 0.00029171788379682905
scale factor = 1.2291349808739915
response_corr = 0.04217455044187878
mean_return = 0.00029171788379682905
scale fa

(0.04217455044187878, 0.00029171788379682905, 1.2291349808739915)

In [11]:
# XGBoost regression trained on the minimally correlated feature set.
xgb_model = test_helper.Regression(regression_type="XGBoost", data_path=FILE_PATH)
xgb_model.train(train_df, feature_col_names = FEATURE_COL_NAMES)
# get_metric returns (response correlation, mean return, scale factor); its own
# printing is turned off and only the mean return is reported below.
corr,mr,sf = xgb_model.get_metric(test_dfs, printMetrics=False)
print(f" Mean Return: {mr}")

No YYYYMMDD datetime matched.

You're using: XGBOOST.

Features being used: ['cftorrrelstd_open_0', 'momr10d_open_0', 'qe_nt0_open_0', 'qe_prv2_open_0', 'moorelvol_open_0', 'moorelvol_open_17to21', 'mocrelvol_open_1', 'mocrelvol_open_18to22', 'relntrds70tr_dy_7to11', 'relntrds70tr_md_1', 'relvol_lst15_0', 'relvol_nt_12to16', 'relavgts70tr_close_12to16', 'relavgts70tr_close_1to3', 'rel2retcfrtxm_lst15_0', 'rel2retcfrtxm_nt_1', 'rrretstr_dy_7to11', 'rrretstr_nt_1to3', 'rvdelta_am_1', 'rvdelta_lsthrx15_0', 'rrirpnxm_lst15_0', 'rrirpnxm_lsthrx15_0', 'llirpnxm_am_1', 'llirpnxm_lsthrx15_0', 'tr_nt_0', 'tr_nt_12to16', 'nnetticksrelmultstdev_lsthrx15_0', 'nnetticksrelmultstdev_nt_4to6', 'nnetticksrelrrsign_nt_17to21', 'nnetticksrelrrsign_pm_1', 'nsameticksrelmultstdev_nt_17to21', 'nsameticksrelmultstdev_pm_1', 'nsameticksrelrrsign_dy_12to16', 'nsameticksrelrrsign_lst15_0']
 Mean Return: 7.131090243368708e-05


In [10]:
# XGBoost regression trained on the random baseline columns.
# Note: this rebinds `xgb_model`, replacing the model fitted on FEATURE_COL_NAMES.
xgb_model = test_helper.Regression(regression_type="XGBoost", data_path=FILE_PATH)
xgb_model.train(train_df, feature_col_names = RANDOM_COLUMN_NAMES)
# Last expression of the cell, so the returned metrics tuple is displayed
# even though get_metric's own printing is turned off.
xgb_model.get_metric(test_dfs, printMetrics=False)

No YYYYMMDD datetime matched.

You're using: XGBOOST.

Features being used: ['nsameticksrelmultstdev_lsthrx15_0', 'nsameticksrelmultstdev_nt_0', 'nnetticksrelmultstdev_lst15_0', 'rrretstr_nt_1', 'rel2retcfrtxm_pm_1', 'nnetticksrelmultstdev_nt_17to21', 'rrretstr_nt_0', 'llirpnxm_dy_12to16', 'rvdelta_toxhr_0', 'tr_lst15_0', 'rrirpnxm_toxhr_0', 'tr_dy_12to16', 'relvol_dy_4to6', 'rel2retcfrtxm_toxhr_0', 'relvol_nt_1', 'llirpnxm_nt_4to6', 'rrirpnxm_lst15_0', 'rrirpnxm_nt_17to21', 'nsameticksrelrrsign_dy_12to16', 'rrretstr_pm_1', 'rvdelta_am_1', 'nnetticksrelmultstdev_dy_1to3', 'nsameticksrelmultstdev_md_1', 'nnetticksrelmultstdev_nt_12to16', 'rrretstr_dy_17to21', 'tr_nt_7to11', 'mocrelvol_open_13to17', 'rrretstr_nt_17to21', 'relntrds70tr_dy_7to11', 'nnetticksrelrrsign_am_1', 'nsameticksrelrrsign_lst15_0', 'tr_nt_17to21', 'rel2retcfrtxm_dy_1to3', 'nnetticksrelrrsign_pm_1']


(0.00874143268130256, 9.869090818199318e-05, 0.06824858)

## Null Hypothesis:  
There is no difference in mean returns between the model using our method of feature selection and a model using 34 random features.
## Alternative Hypothesis: 
Using my feature selection method results in higher mean returns than using 34 random features.

If we reject the null hypothesis at significance level `ALPHA`, the data support the claim that my method of feature selection results in higher mean returns. If we fail to reject it, we have no evidence that my method beats random feature selection.


In [13]:
def random_feats(model_type, *, numFeats = 10, numTrials = 30, seed = None):
    """Collect the mean return of models trained on random feature subsets.

    Each trial draws ``numFeats`` distinct columns from ``train_df`` (skipping
    the first 8, which are response columns), trains a fresh
    ``test_helper.Regression`` of type ``model_type`` on them and evaluates it
    on ``test_dfs``.

    Parameters
    ----------
    model_type : str
        Passed through as ``regression_type`` to ``test_helper.Regression``.
    numFeats : int, keyword-only
        Number of random feature columns per trial.
    numTrials : int, keyword-only
        Number of random subsets to evaluate (defaults to the original 30).
    seed : int or None, keyword-only
        When given, the subsets are drawn from a private ``random.Random(seed)``
        so the experiment is reproducible; when ``None`` the global ``random``
        module state is used, as before.

    Returns
    -------
    list
        One mean-return value per trial, in trial order.
    """
    # Fall back to the module-level generator so the default call behaves as it always did.
    rng = random if seed is None else random.Random(seed)
    # The candidate pool does not change between trials, so build it once.
    candidate_cols = list(train_df.columns)[8:]
    mean_returns = []
    for _ in range(numTrials):
        rand_feats = rng.sample(candidate_cols, numFeats)
        model = test_helper.Regression(regression_type=model_type, data_path=FILE_PATH)
        model.train(train_df, feature_col_names = rand_feats)
        # get_metric returns (response correlation, mean return, scale factor);
        # only the mean return is kept.
        _corr, mean_return, _scale = model.get_metric(test_dfs, printMetrics=False)
        mean_returns.append(mean_return)
    return mean_returns

    

In [14]:
# Fit Lasso once on the minimally correlated feature set and keep its mean return.
lasso_model = test_helper.Regression(regression_type="Lasso", data_path=FILE_PATH)
lasso_model.train(train_df, feature_col_names=FEATURE_COL_NAMES)
_, mr, _ = lasso_model.get_metric(dataframes=test_dfs)

# The selected-feature model gives a single value; repeat it 30 times so the list
# has the same length as the 30 random-feature trials collected below.
# NOTE(review): a constant list has zero variance — a one-sample test of the
# random-trial returns against `mr` may be more appropriate; confirm before running the t-test.
lasso_mean_returns = [mr] * 30
lasso_random_feats = random_feats('LASSO', numFeats=34)


No YYYYMMDD datetime matched.

You're using: LASSO.

Features being used: ['cftorrrelstd_open_0', 'momr10d_open_0', 'qe_nt0_open_0', 'qe_prv2_open_0', 'moorelvol_open_0', 'moorelvol_open_17to21', 'mocrelvol_open_1', 'mocrelvol_open_18to22', 'relntrds70tr_dy_7to11', 'relntrds70tr_md_1', 'relvol_lst15_0', 'relvol_nt_12to16', 'relavgts70tr_close_12to16', 'relavgts70tr_close_1to3', 'rel2retcfrtxm_lst15_0', 'rel2retcfrtxm_nt_1', 'rrretstr_dy_7to11', 'rrretstr_nt_1to3', 'rvdelta_am_1', 'rvdelta_lsthrx15_0', 'rrirpnxm_lst15_0', 'rrirpnxm_lsthrx15_0', 'llirpnxm_am_1', 'llirpnxm_lsthrx15_0', 'tr_nt_0', 'tr_nt_12to16', 'nnetticksrelmultstdev_lsthrx15_0', 'nnetticksrelmultstdev_nt_4to6', 'nnetticksrelrrsign_nt_17to21', 'nnetticksrelrrsign_pm_1', 'nsameticksrelmultstdev_nt_17to21', 'nsameticksrelmultstdev_pm_1', 'nsameticksrelrrsign_dy_12to16', 'nsameticksrelrrsign_lst15_0']
response_corr = -0.0008448045417282249
mean_return = -3.192025374517135e-05
scale factor = -0.05616739885540533
response_cor

In [None]:
# run t-test


### Key Takeaway: Taking the 2 least correlated features from each group does not produce a model that is equal to or better than one built from 34 random predictors

In [20]:
from sklearn.feature_selection import SequentialFeatureSelector

In [25]:
# Subset Selection: 
xgboost_regressor = test_helper.Regression(data_path=FILE_PATH, regression_type='xgboost')
model = xgboost_regressor.saved_model
result  = SequentialFeatureSelector(model, n_features_to_select = 20, direction = 'forward', cv=10)
type(result)

No YYYYMMDD datetime matched.

You're using: XGBOOST.



sklearn.feature_selection._sequential.SequentialFeatureSelector