In [1]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import statsmodels.discrete.discrete_model as sm

In [2]:
# Put the relative `../../src` directory first on the import path so `function_cache` resolves.
sys.path.insert(0, r'../../src')
# NOTE(review): wildcard import - the names this brings into the notebook are not visible
# here; prefer importing the needed names explicitly once they are confirmed.
from function_cache import *

In [3]:
# Directory holding the price-before-close CSV files read below.
PRICE_BEFORE_CLOSE_DIR = '../../data/processed/price_before_close'


def _read_price_before_close(name):
    """Read one price-before-close CSV into a DataFrame.

    Parameters
    ----------
    name : str
        File stem inside ``PRICE_BEFORE_CLOSE_DIR`` (for example ``'T_0min'``).

    Returns
    -------
    pandas.DataFrame
        The file contents, indexed by the ``Trading_Day`` column with the
        index parsed as dates.
    """
    return pd.read_csv(
        PRICE_BEFORE_CLOSE_DIR + '/' + name + '.csv',
        index_col='Trading_Day',
        parse_dates=True,
    )


# One frame per file; every file is read with identical options through the helper.
T_0min = _read_price_before_close('T_0min')
TF_0min = _read_price_before_close('TF_0min')
T_5min = _read_price_before_close('T_5min')
TF_5min = _read_price_before_close('TF_5min')
T_10min = _read_price_before_close('T_10min')
TF_10min = _read_price_before_close('TF_10min')
T_15min = _read_price_before_close('T_15min')
TF_15min = _read_price_before_close('TF_15min')

In [4]:
# Candidate feature tables; the first CSV column becomes the index.
T_features, TF_features = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../data/processed/candidate_features/T_features.csv',
        '../../data/processed/candidate_features/TF_features.csv',
    )
)
# Spread-change tables, read with the same options.
T_spread_change, TF_spread_change = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../data/processed/different_ys/T_spread_change.csv',
        '../../data/processed/different_ys/TF_spread_change.csv',
    )
)

In [5]:
# Stack the T frame on top of the TF frame row-wise; original index labels are kept.
features = pd.concat((T_features, TF_features), axis=0)
spread_change = pd.concat((T_spread_change, TF_spread_change), axis=0)

In [6]:
# Element-wise sign of every spread change: -1, 0 or +1.
spread_sign = np.sign(spread_change)

X = features.values
# Work on an explicit copy: `.values` can be a view of `spread_sign` (so the in-place
# recode below would silently rewrite that frame) or a read-only array under pandas
# copy-on-write (so the assignment would raise).
y = spread_sign['0min_0min'].values.copy()
# Binary target: -1 is recoded to 0, so class 0 holds both negative and zero changes.
y[y == -1] = 0

# Standardise every feature column; the scaler is fitted on all rows of X.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [7]:
# Features to locate by column position. Alternates that were commented out in this cell:
# '-1d_corr_def_volume_spread', '1d_z_diff_act_price_def_price', '20d_spread_change'
top_feat = [
    '-1d_corr_def_OI_spread',
    '10d_def_volume_exp_mva',
]
# Position of each selected name within the columns of `features`.
feat_idx = list(map(list(features.columns).index, top_feat))
feat_idx

[18, 25]

In [8]:
# L1-penalised logistic regression. The solver is named explicitly because the default
# solver in scikit-learn >= 0.22 (lbfgs) rejects penalty='l1' with a ValueError, while
# liblinear supports it. random_state pins liblinear's data shuffling so reruns give
# the same coefficients.
clf = LogisticRegression(penalty='l1', solver='liblinear', random_state=0)
clf.fit(X, y)
# In-sample accuracy: scored on the same rows the model was fitted on.
clf.score(X, y)



0.8974358974358975

In [33]:
# Per-class probability estimates for the same rows the model was fitted on.
pred = clf.predict_proba(X)

In [24]:
# Hard class labels for the fitted rows. Stored under their own name so the probability
# matrix held in `pred` is not overwritten: the later `np.argmax(pred, 1)` needs a 2-D
# array and fails on this 1-D label vector when the notebook is run top to bottom.
pred_labels = clf.predict(X)

In [10]:
# Importance of each feature = magnitude of its fitted coefficient (first row of coef_),
# labelled with the matching column name from `features`.
FI_sr = pd.Series(data=np.absolute(clf.coef_[0]), index=features.columns)
FI_sr

5d_spread_change                  0.192002
10d_spread_change                 0.000000
15d_spread_change                 0.000000
20d_spread_change                 0.000000
10d_act_price_exp_mva             0.000000
-1d_corr_act_price_def_OI         0.000000
-1d_corr_def_price_def_OI         0.000000
10d_spread_std_divided_by_std     0.303499
10d_current_price                 0.000000
1d_z_diff_act_price_def_price     0.618395
-1d_corr_act_price_def_volume     0.000000
-1d_corr_def_price_def_volume     0.000000
10d_current_spread                0.000000
-1d_std                           0.000000
-1d_corr_act_price_def_price      0.565512
-1d_corr_act_OI_def_volume        0.000000
-1d_corr_act_OI_def_OI            0.000000
-1d_corr_def_volume_def_OI        0.000000
-1d_corr_def_OI_spread            0.907547
-1d_corr_def_volume_spread        0.000000
1d_z_diff_act_price_def_OI        0.755141
-1d_corr_act_volume_def_volume    0.229390
-1d_corr_act_volume_def_OI        0.000000
-1d_corr_ac

In [28]:
# Write the coefficient magnitudes to a CSV in the current working directory, with a header row.
FI_sr.to_csv('Logistic_Reg_L1.csv', header=True)

In [31]:
from scipy.stats import mode

In [36]:
# Column index of the largest value in each row; `pred` must be 2-D for axis 1 to exist.
pred.argmax(axis=1)

array([1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1], dtype=int64)

In [37]:
def ensemble_vote(pred_ensemble, probability=True):
    """Combine the predictions of several models into one label per sample.

    Parameters
    ----------
    pred_ensemble : sequence of array-like
        One entry per model, all of the same shape. With ``probability=True``
        each entry is a 2-D ``(n_samples, n_classes)`` score matrix; with
        ``probability=False`` each entry is an array of hard labels.
    probability : bool, default True
        ``True``: soft vote - scores are summed over the models and the column
        index of the largest total is returned for every sample.
        ``False``: hard vote - the most frequent label across the models is
        returned for every sample (ties are resolved by ``scipy.stats.mode``).

    Returns
    -------
    numpy.ndarray
        Soft vote: 1-D array of winning column indices, one per sample.
        Hard vote: array shaped like a single model's label array.

    Raises
    ------
    ValueError
        If ``pred_ensemble`` holds no predictions at all.
    """
    stacked = np.asarray(pred_ensemble)
    if stacked.size == 0:
        raise ValueError("pred_ensemble must contain at least one non-empty prediction array")

    if probability:
        # Sum over the model axis, then pick the best-scoring column per sample.
        return np.argmax(stacked.sum(axis=0), axis=1)

    # Depending on the scipy version, mode() keeps the reduced axis (shape (1, n)) or
    # drops it (shape (n,)). Reshaping to one model's shape gives the same result on
    # both, and matches the per-sample shape the soft-vote branch returns.
    majority = mode(stacked, axis=0).mode
    return np.asarray(majority).reshape(stacked.shape[1:])