In [1]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import statsmodels.discrete.discrete_model as sm

In [2]:
# Put the project's src directory (two levels up from this notebook) at the
# front of the import search path so the local module below can be found.
sys.path.insert(0, r'../../src')
# Wildcard import: every public name of function_cache lands in the notebook
# namespace. NOTE(review): which names are actually needed is not visible
# from this notebook section -- consider importing them explicitly.
from function_cache import *

In [3]:
def _read_close_snapshot(stem):
    """Load one CSV from data/processed/price_before_close.

    The frame is indexed by its ``Trading_Day`` column, parsed as dates.
    ``stem`` is the file name without the ``.csv`` extension.
    """
    return pd.read_csv(
        '../../data/processed/price_before_close/' + stem + '.csv',
        index_col='Trading_Day',
        parse_dates=True,
    )


# One frame per contract (T / TF) and per offset; the file names suggest the
# offsets are 0, 5, 10 and 15 minutes before the close -- TODO confirm.
T_0min = _read_close_snapshot('T_0min')
TF_0min = _read_close_snapshot('TF_0min')
T_5min = _read_close_snapshot('T_5min')
TF_5min = _read_close_snapshot('TF_5min')
T_10min = _read_close_snapshot('T_10min')
TF_10min = _read_close_snapshot('TF_10min')
T_15min = _read_close_snapshot('T_15min')
TF_15min = _read_close_snapshot('TF_15min')

In [4]:
def _read_processed(relative_path):
    """Load a CSV from data/processed, using its first column as the index."""
    return pd.read_csv('../../data/processed/' + relative_path, index_col=0)


# Candidate feature matrices, one per contract.
T_features = _read_processed('candidate_features/T_features.csv')
TF_features = _read_processed('candidate_features/TF_features.csv')

# Spread-change target tables, one per contract.
T_spread_change = _read_processed('different_ys/T_spread_change.csv')
TF_spread_change = _read_processed('different_ys/TF_spread_change.csv')

In [5]:
# Stack the two contracts row-wise (T rows first, then TF rows); the original
# index labels are kept as-is.
# NOTE(review): assumes the T and TF tables share the same columns, and that
# features and spread_change end up row-aligned -- verify.
features = pd.concat([T_features, TF_features], axis=0)
spread_change = pd.concat([T_spread_change, TF_spread_change], axis=0)

In [6]:
# Direction of every spread change: -1, 0 or +1 per cell.
spread_sign = np.sign(spread_change)

# Binary target from the '0min_0min' column: -1 is folded into 0, so class 0
# covers both negative and zero changes while class 1 is a positive change.
# `.values` can be a view on spread_sign's data, so relabelling it in place
# would silently rewrite spread_sign as well (and raises on a read-only array
# when pandas Copy-on-Write is enabled). Work on an explicit copy instead.
y = spread_sign['0min_0min'].values.copy()
y[y == -1] = 0

# Standardise every feature column to zero mean / unit variance. The fitted
# scaler is kept so the same transform can be reused later.
scaler = StandardScaler()
X = scaler.fit_transform(features.values)

In [7]:
# Features of interest. Other candidates that were previously tried here:
#   '-1d_corr_def_volume_spread', '1d_z_diff_act_price_def_price',
#   '20d_spread_change'
top_feat = [
    '-1d_corr_def_OI_spread',
    '10d_def_volume_exp_mva',
]

# Positional index of each selected feature within the feature matrix columns
# (first match wins; an unknown name raises ValueError).
column_names = list(features.columns)
feat_idx = [column_names.index(name) for name in top_feat]
feat_idx

[18, 25]

In [8]:
# L2-penalised logistic regression fitted on the full standardised data set.
clf = LogisticRegression(penalty='l2').fit(X, y)

# Accuracy on the very same rows the model was fitted on -- an in-sample
# figure, not a held-out estimate.
clf.score(X, y)



0.9230769230769231

In [9]:
# Predicted class probabilities for the rows the model was fitted on
# (two columns, one per class of the binary target).
clf.predict_proba(X)

array([[0.59172404, 0.40827596],
       [0.05694669, 0.94305331],
       [0.06054197, 0.93945803],
       [0.1035494 , 0.8964506 ],
       [0.23604287, 0.76395713],
       [0.03721279, 0.96278721],
       [0.1471091 , 0.8528909 ],
       [0.89350043, 0.10649957],
       [0.43921623, 0.56078377],
       [0.61491158, 0.38508842],
       [0.14452277, 0.85547723],
       [0.17990077, 0.82009923],
       [0.10471707, 0.89528293],
       [0.11243085, 0.88756915],
       [0.14552053, 0.85447947],
       [0.06076669, 0.93923331],
       [0.94141037, 0.05858963],
       [0.81592506, 0.18407494],
       [0.90858031, 0.09141969],
       [0.94497216, 0.05502784],
       [0.64589729, 0.35410271],
       [0.84990185, 0.15009815],
       [0.38624879, 0.61375121],
       [0.15055584, 0.84944416],
       [0.07700369, 0.92299631],
       [0.02208508, 0.97791492],
       [0.0574811 , 0.9425189 ],
       [0.82431705, 0.17568295],
       [0.13180551, 0.86819449],
       [0.29866916, 0.70133084],
       [0.

In [10]:
# Feature-importance proxy: absolute value of each fitted coefficient,
# labelled with the corresponding feature name. X was standardised before
# fitting, so the magnitudes are on a common scale.
coef_magnitude = np.abs(clf.coef_[0])
FI_sr = pd.Series(data=coef_magnitude, index=features.columns)
FI_sr

5d_spread_change                  0.471027
10d_spread_change                 0.476790
15d_spread_change                 0.034541
20d_spread_change                 0.664142
10d_act_price_exp_mva             0.267740
-1d_corr_act_price_def_OI         0.063464
-1d_corr_def_price_def_OI         0.011010
10d_spread_std_divided_by_std     0.598247
10d_current_price                 0.275725
1d_z_diff_act_price_def_price     0.729329
-1d_corr_act_price_def_volume     0.174968
-1d_corr_def_price_def_volume     0.167137
10d_current_spread                0.006676
-1d_std                           0.256544
-1d_corr_act_price_def_price      0.713832
-1d_corr_act_OI_def_volume        0.232550
-1d_corr_act_OI_def_OI            0.059327
-1d_corr_def_volume_def_OI        0.001227
-1d_corr_def_OI_spread            0.561008
-1d_corr_def_volume_spread        0.195247
1d_z_diff_act_price_def_OI        0.729619
-1d_corr_act_volume_def_volume    0.620428
-1d_corr_act_volume_def_OI        0.198545
-1d_corr_ac

In [11]:
# Persist the coefficient magnitudes next to the notebook, with a header row.
FI_sr.to_csv(path_or_buf='Logistic_Reg_L2.csv', header=True)