In [1]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import math
from sklearn.svm import SVR, LinearSVR, SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

In [2]:
sys.path.insert(0, r'../../src')
from function_cache import *

In [3]:
T_0min = pd.read_csv('../../data/processed/price_before_close/T_0min.csv', index_col='Trading_Day', parse_dates=True)
TF_0min = pd.read_csv('../../data/processed/price_before_close/TF_0min.csv', index_col='Trading_Day', parse_dates=True)
T_5min = pd.read_csv('../../data/processed/price_before_close/T_5min.csv', index_col='Trading_Day', parse_dates=True)
TF_5min = pd.read_csv('../../data/processed/price_before_close/TF_5min.csv', index_col='Trading_Day', parse_dates=True)
T_10min = pd.read_csv('../../data/processed/price_before_close/T_10min.csv', index_col='Trading_Day', parse_dates=True)
TF_10min = pd.read_csv('../../data/processed/price_before_close/TF_10min.csv', index_col='Trading_Day', parse_dates=True)
T_15min = pd.read_csv('../../data/processed/price_before_close/T_15min.csv', index_col='Trading_Day', parse_dates=True)
TF_15min = pd.read_csv('../../data/processed/price_before_close/TF_15min.csv', index_col='Trading_Day', parse_dates=True)

In [4]:
T_features = pd.read_csv('../../data/processed/candidate_features/T_features.csv', index_col=0)
TF_features = pd.read_csv('../../data/processed/candidate_features/TF_features.csv', index_col=0)
T_spread_change = pd.read_csv('../../data/processed/different_ys/T_spread_change.csv', index_col=0)
TF_spread_change = pd.read_csv('../../data/processed/different_ys/TF_spread_change.csv', index_col=0)

In [7]:
def featImpMDI(fit, featNames):
    # feature importance based on IS mean impurity reduction
    df0 = {i:tree.feature_importances_ for i, tree in enumerate(fit.estimators_)}
    df0 = pd.DataFrame.from_dict(df0, orient='index')
    df0.columns = featNames
    df0 = df0.replace(0, np.nan)
    imp = pd.concat({'mean': df0.mean(), 'std': df0.std()*df0.shape[0]**-.5}, axis=1)
    imp /= imp['mean'].sum()
    return imp

In [5]:
features = pd.concat([T_features, TF_features])
spread_change = pd.concat([T_spread_change, TF_spread_change])

In [6]:
spread_sign = np.sign(spread_change)

X = features.values
y = spread_sign['0min_0min'].values

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [18]:
clf = RandomForestClassifier(n_estimators=100, max_features=1)
clf = clf.fit(X, y)
feature_importances = pd.Series(clf.feature_importances_, index=features.columns)
feature_importances.sort_values()

10d_spread_change                 0.021642
5d_spread_change                  0.024881
-1d_corr_act_price_def_price      0.025861
-1d_corr_act_price_def_volume     0.027683
10d_current_spread                0.028527
-1d_corr_act_OI_spread            0.030461
-1d_corr_act_OI_def_volume        0.031037
-1d_corr_def_volume_def_OI        0.031203
15d_spread_change                 0.031707
10d_spread_std_divided_by_std     0.031714
1d_z_diff_act_volume_act_OI       0.031962
-1d_std                           0.032547
-1d_corr_def_price_def_volume     0.032716
-1d_corr_act_volume_def_volume    0.032866
-1d_corr_act_volume_def_OI        0.033106
-1d_corr_act_OI_def_OI            0.033186
-1d_corr_act_price_act_OI         0.035796
-1d_corr_def_OI_spread            0.036446
10d_current_price                 0.040004
1d_z_diff_act_price_def_OI        0.041141
10d_def_volume_exp_mva            0.042340
-1d_corr_def_price_def_OI         0.043055
1d_z_diff_act_price_def_price     0.043973
-1d_corr_ac

In [19]:
clf.score(X, y)

1.0

In [20]:
clf.predict(X)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,  1., -1.,  1.,  1.,  1.,
        1.,  1.,  1., -1., -1., -1., -1., -1., -1., -1.,  1.,  1.,  1.,
        1., -1.,  1.,  1., -1.,  1., -1.,  1., -1.,  1.,  1., -1.,  1.])

In [17]:
y

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,  1., -1.,  1.,  1.,  1.,
        1.,  1.,  1., -1., -1., -1., -1., -1., -1., -1.,  1.,  1.,  1.,
        1., -1.,  1.,  1., -1.,  1., -1.,  1., -1.,  1.,  1., -1.,  1.])