In [144]:
import sys
sys.path.append('..')
#from project_helper import VolFeatures, FuturesCloseData, TradeModel
from modules import opts

In [145]:
from __future__ import absolute_import


import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.api import OLS

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import TimeSeriesSplit

from tqdm import tqdm
from pandas.plotting import scatter_matrix
from sklearn.model_selection import GridSearchCV
#import seaborn as sns


import sys
sys.path.append('..')
from modules.project_helper import VolFeatures, FuturesCloseData, TradeModel

import warnings
warnings.filterwarnings('ignore')



In [146]:
import pickle
file = open("../data/features/full_features.pkl",'rb')
full_features = pickle.load(file)

In [147]:
instrument_list = ['ES', 'NQ', 'CD', 'EC', 'JY', 'MP', 'TY', 'US', 'C', 'S', 'W', 'CL', 'GC']
x_dict={}
y_dict={}
for inst in instrument_list:
    #y_dict[inst] = 2 * (full_features[inst][inst] >=0)- 1
    y_dict[inst] = (full_features[inst][inst]>=0).astype(int)    
    x_dict[inst] = full_features[inst].drop([inst], axis=1)

In [6]:
class MLModel:
    def __init__(self,model,inst,x_dict,y_dict):
        self.inst = inst
        self.x = x_dict[inst]
        self.y = y_dict[inst]

        self.model = model
        self.accuracy_train = None
        self.accuracy_test = None
    
    def split_data(self):
        self.X_train, \
        self.X_test, \
        self.y_train, \
        self.y_test = train_test_split(self.x, self.y, test_size=0.50, shuffle=False)


    def train_model(self):
        #self.model = OLS(self.y_train, self.X_train)
        #self.model = self.model.fit()
        self.model = self.model(random_state=0).fit(self.X_train,self.y_train)


    def evaluate_model(self):
        self.accuracy_train = self.model.score(self.X_train, self.y_train)
        self.accuracy_test = self.model.score(self.X_test, self.y_test)

        
    
class ModelBuilder:
    def __init__(self,inst,x_dict,y_dict):
        #self.model = None
        self.logistic_model = MLModel(LogisticRegression,inst,x_dict,y_dict)
        self.rf_model = MLModel(RandomForestClassifier,inst,x_dict,y_dict)
        self.tree_model = MLModel(DecisionTreeClassifier,inst,x_dict,y_dict)
        self.boosted_tree_model = MLModel(GradientBoostingClassifier,inst,x_dict,y_dict)

        self.models = {'logistic':self.logistic_model,
                 'rf':self.rf_model,
                 'tree':self.tree_model,
                    'boosted_tree':self.boosted_tree_model}


        self.accuracy_train = None
        self.accuracy_test = None
        #ml_models = [LinearRegression, ]

        
    def run(self):
        {model.split_data() for model in self.models.values()}
        {model.train_model() for model in self.models.values()}
        {model.evaluate_model() for model in self.models.values()}
        return self


In [7]:
models = {inst: ModelBuilder(inst,x_dict,y_dict) for inst in instrument_list}
models = {inst: model.run() for inst,model in models.items()}

In [8]:

logistic = pd.DataFrame([(inst, model.models['logistic'].accuracy_train) for inst,model in models.items()],columns=['asset','logistic']).set_index('asset')
tree = pd.DataFrame([(inst, model.models['tree'].accuracy_train) for inst,model in models.items()],columns=['asset','tree']).set_index('asset')
rf = pd.DataFrame([(inst, model.models['rf'].accuracy_train) for inst,model in models.items()],columns=['asset','rf']).set_index('asset')
boosted_tree = pd.DataFrame([(inst, model.models['boosted_tree'].accuracy_train) for inst,model in models.items()],columns=['asset','boosted_tree']).set_index('asset')

train_accuracies = logistic.join(tree).join(rf).join(boosted_tree)
train_accuracies

Unnamed: 0_level_0,logistic,tree,rf,boosted_tree
asset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ES,0.60745,1.0,0.979943,0.982808
NQ,0.616046,1.0,0.985673,0.985673
CD,0.593123,1.0,0.979943,0.982808
EC,0.621777,1.0,0.977077,0.979943
JY,0.604585,1.0,0.971347,0.982808
MP,0.590258,1.0,0.982808,1.0
TY,0.590258,1.0,0.979943,0.994269
US,0.610315,1.0,0.974212,0.988539
C,0.610315,1.0,0.982808,0.979943
S,0.598854,1.0,0.979943,0.985673


In [9]:

logistic = pd.DataFrame([(inst, model.models['logistic'].accuracy_test) for inst,model in models.items()],columns=['asset','logistic']).set_index('asset')
tree = pd.DataFrame([(inst, model.models['tree'].accuracy_test) for inst,model in models.items()],columns=['asset','tree']).set_index('asset')
rf = pd.DataFrame([(inst, model.models['rf'].accuracy_test) for inst,model in models.items()],columns=['asset','rf']).set_index('asset')
boosted_tree = pd.DataFrame([(inst, model.models['boosted_tree'].accuracy_test) for inst,model in models.items()],columns=['asset','boosted_tree']).set_index('asset')

test_accuracies = logistic.join(tree).join(rf).join(bohttp://localhost:8888/notebooks/notebooks_modelling/all_models.ipynb#osted_tree)
test_accuracies.to_csv('test_accuracies.csv')

In [10]:
imp_df = pd.DataFrame()
for inst in instrument_list:
    feature_imp_df = pd.DataFrame(models[inst].models['tree'].model.feature_importances_,columns=[inst])
    feature_imp_df.index = x_dict[inst].columns
    imp_df = imp_df.join(feature_imp_df,how='outer')

In [9]:
imp_df.to_csv('imp_df.csv')

In [17]:
X,\
X_test, \
y, \
y_test = train_test_split(x_dict['ES'], y_dict['ES'], test_size=0.80, shuffle=False)

In [148]:
instrument_list = ['ES', 'NQ', 'CD', 'EC', 'JY', 'MP', 'TY', 'US', 'C', 'S', 'W', 'CL', 'GC']

C_list = [0.001, 0.01, 0.1,1,10,100,1000,10000,100000]
accuracy = pd.DataFrame(columns=C_list, index=instrument_list)
f1score = pd.DataFrame(columns=C_list, index=instrument_list)
plong = pd.DataFrame(columns=C_list, index=instrument_list)

n_splits = 5
for inst in tqdm(instrument_list):
    for c in tqdm(C_list):
        X, X_test, y, y_test = train_test_split(x_dict[inst], y_dict[inst], test_size=0.20, shuffle=False)
        
        tm = TradeModel(model=LogisticRegression, C=c)
        tscv = TimeSeriesSplit(n_splits=n_splits)
        time_split = tscv.split(X)
        ac = 0
        f1 = 0
        p = 0
        for train_index, valid_index in time_split:
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]    
            tm.fit(X_train, y_train)
            ac = ac + tm.model.score(X_valid, y_valid)
            f1 = f1 + f1_score(tm.model.predict(X_valid),y_valid)
            p = p + tm.model.predict(X_valid).mean()

        accuracy.loc[inst][c] = ( round(ac / n_splits, 3) ) 
        f1score.loc[inst][c] = ( round(f1 / n_splits, 3) ) 
        plong.loc[inst][c] = ( round(p / n_splits, 3) ) 
        
        #strat_models[inst] = tm
        #strat_rets[inst], strat_cum_rets[inst] = tm.strategy_returns(X_test, y_dict[inst][-len(X_test):], cutoff=0.55)
        #strat_sharpes[inst] = tm.sharpe(X_test, y_dict[inst][-len(X_test):],cutoff=0.55)
        #print('{} sharpe {}:'.format(inst, strat_sharpes[inst]))



  0%|          | 0/13 [00:00<?, ?it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
 44%|████▍     | 4/9 [00:00<00:00, 33.69it/s][A
 78%|███████▊  | 7/9 [00:00<00:00, 30.30it/s][A
100%|██████████| 9/9 [00:00<00:00, 17.83it/s][A
  8%|▊         | 1/13 [00:00<00:06,  1.97it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
 44%|████▍     | 4/9 [00:00<00:00, 39.23it/s][A
 78%|███████▊  | 7/9 [00:00<00:00, 33.50it/s][A
100%|██████████| 9/9 [00:00<00:00, 19.61it/s][A
 15%|█▌        | 2/13 [00:00<00:05,  2.02it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
 44%|████▍     | 4/9 [00:00<00:00, 36.59it/s][A
 78%|███████▊  | 7/9 [00:00<00:00, 30.81it/s][A
100%|██████████| 9/9 [00:00<00:00, 17.82it/s][A
 23%|██▎       | 3/13 [00:01<00:04,  2.01it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A
 44%|████▍     | 4/9 [00:00<00:00, 39.01it/s][A
 78%|███████▊  | 7/9 [00:00<00:00, 32.91it/s][A
100%|██████████| 9/9 [00:00<00:00, 17.91it/s][A
 31%|███       | 4/13 [00:01<00:04,  2.00it/s]
  0%|          | 0/9 

In [149]:
accuracy

Unnamed: 0,0.001,0.010,0.100,1.000,10.000,100.000,1000.000,10000.000,100000.000
ES,0.54,0.561,0.57,0.531,0.52,0.533,0.544,0.533,0.529
NQ,0.533,0.566,0.561,0.548,0.546,0.553,0.544,0.523,0.501
CD,0.525,0.529,0.51,0.471,0.449,0.46,0.467,0.46,0.454
EC,0.49,0.499,0.499,0.518,0.497,0.514,0.501,0.488,0.505
JY,0.548,0.525,0.525,0.512,0.51,0.512,0.518,0.52,0.529
MP,0.503,0.51,0.51,0.505,0.52,0.538,0.523,0.54,0.542
TY,0.6,0.576,0.548,0.525,0.525,0.501,0.503,0.51,0.508
US,0.561,0.535,0.523,0.51,0.501,0.503,0.486,0.49,0.48
C,0.535,0.551,0.533,0.523,0.535,0.523,0.527,0.52,0.525
S,0.501,0.508,0.51,0.508,0.501,0.499,0.492,0.495,0.488


In [150]:
f1score

Unnamed: 0,0.001,0.010,0.100,1.000,10.000,100.000,1000.000,10000.000,100000.000
ES,0.67,0.707,0.714,0.657,0.62,0.617,0.619,0.596,0.575
NQ,0.675,0.722,0.717,0.667,0.626,0.619,0.61,0.585,0.551
CD,0.463,0.378,0.373,0.384,0.38,0.402,0.416,0.417,0.435
EC,0.514,0.532,0.487,0.495,0.497,0.506,0.502,0.516,0.535
JY,0.561,0.553,0.555,0.521,0.511,0.52,0.532,0.538,0.544
MP,0.609,0.637,0.594,0.534,0.52,0.53,0.529,0.566,0.577
TY,0.622,0.618,0.595,0.526,0.521,0.493,0.504,0.506,0.499
US,0.676,0.679,0.659,0.587,0.539,0.544,0.53,0.519,0.521
C,0.589,0.607,0.589,0.58,0.597,0.578,0.585,0.579,0.584
S,0.422,0.447,0.474,0.477,0.436,0.452,0.456,0.454,0.437


In [151]:
plong

Unnamed: 0,0.001,0.010,0.100,1.000,10.000,100.000,1000.000,10000.000,100000.000
ES,0.841,0.931,0.935,0.798,0.688,0.645,0.622,0.576,0.542
NQ,0.89,1.0,0.991,0.798,0.658,0.617,0.609,0.596,0.561
CD,0.428,0.316,0.357,0.413,0.426,0.441,0.452,0.462,0.499
EC,0.551,0.581,0.499,0.467,0.505,0.497,0.518,0.578,0.587
JY,0.548,0.594,0.606,0.559,0.548,0.568,0.587,0.589,0.585
MP,0.766,0.845,0.695,0.557,0.49,0.473,0.505,0.548,0.572
TY,0.557,0.615,0.617,0.503,0.503,0.488,0.499,0.492,0.482
US,0.824,0.918,0.871,0.665,0.57,0.576,0.581,0.551,0.574
C,0.637,0.652,0.639,0.637,0.658,0.637,0.645,0.643,0.647
S,0.469,0.484,0.49,0.484,0.413,0.449,0.482,0.462,0.443


In [8]:
instrument_list = ['ES', 'NQ', 'CD', 'EC', 'JY', 'MP', 'TY', 'US', 'C', 'S', 'W', 'CL', 'GC']
strat_sharpes = {}
strat_rets = {}
strat_cum_rets = {}
strat_models = {}
depth_list = range(5,15)
max_features_list = range(4,23,3)
accuracies = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
f1scores = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
plong = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
n_splits = 5
for inst in tqdm([instrument_list[0]]):
    for dl in tqdm(depth_list):
        for mf in max_features_list:

            X, X_test, y, y_test = train_test_split(x_dict[inst], y_dict[inst], test_size=0.20, shuffle=False)
            tm = TradeModel(n_estimators=1000, max_features=mf, max_depth=dl, criterion ='entropy')
    
            tscv = TimeSeriesSplit(n_splits=n_splits)
            time_split = tscv.split(X)
            ac = 0
            f1 = 0
            p = 0
            for train_index, valid_index in time_split:
                X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
                y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]    
                tm.fit(X_train, y_train)
                ac = ac + tm.model.score(X_valid, y_valid)
                f1 = f1 + f1_score(tm.model.predict(X_valid),y_valid)
                p = p + tm.model.predict(X_valid).mean()

            accuracies[inst].loc[dl,mf] = round(ac / n_splits,3)
            f1scores[inst].loc[dl,mf] = round(f1 / n_splits,3)
            plong.loc[inst][dl,mf] = ( round(p / n_splits, 3) ) 
  


  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:37<03:47, 37.99s/it][A
 29%|██▊       | 2/7 [01:17<03:12, 38.40s/it][A
 43%|████▎     | 3/7 [01:58<02:37, 39.33s/it][A
 57%|█████▋    | 4/7 [02:43<02:02, 40.81s/it][A
 71%|███████▏  | 5/7 [03:29<01:25, 42.62s/it][A
 86%|████████▌ | 6/7 [04:15<00:43, 43.49s/it][A
100%|██████████| 7/7 [05:01<00:00, 43.01s/it][A
100%|██████████| 1/1 [05:01<00:00, 301.07s/it]


In [51]:
strat_sharpes['S']

Unnamed: 0,0.01,0.10,1.00,10.00,100.00,1000.00,10000.00,100000.00
0.5,0.0855926,0.0669582,0.045771,0.0384284,0.0507326,0.0685316,0.044581,0.0506452
0.51,0.0784206,0.070487,0.0517867,0.0410994,0.0470711,0.0590907,0.046055,0.0502515
0.52,0.0469216,0.0823921,0.0531957,0.0531957,0.0577591,0.049581,0.054797,0.0405843
0.53,0.0233577,0.0709701,0.062969,0.0505193,0.0575032,0.0613069,0.0525983,0.0357615
0.54,0.0577617,0.0614797,0.0469661,0.0681494,0.0475409,0.0698795,0.0497001,0.0367312
0.55,0.0002364,0.0649171,0.0564104,0.0538254,0.0472184,0.0748084,0.0488707,0.0348592
0.56,-0.0187035,0.0702023,0.0426884,0.0468479,0.043401,0.0648009,0.0503157,0.0188386
0.57,-0.057735,0.0731689,0.0378165,0.0430764,0.047582,0.0506912,0.0335073,0.0343163


In [10]:
accuracies['ES']

Unnamed: 0,5,8,11,14,17
5,0.584946,0.593548,0.597849,0.612903,0.6
6,0.589247,0.602151,0.604301,0.608602,0.612903
7,0.604301,0.597849,0.6,0.6,0.602151
8,0.595699,0.595699,0.602151,0.602151,0.615054
9,0.597849,0.591398,0.608602,0.602151,0.602151
10,0.602151,0.606452,0.602151,0.604301,0.604301
11,0.582796,0.6,0.606452,0.602151,0.610753


In [11]:
precisions['ES']

Unnamed: 0,5,8,11,14,17
5,0.886717,0.869398,0.85444,0.855762,0.825887
6,0.87297,0.861282,0.839481,0.830803,0.825712
7,0.876366,0.854615,0.826994,0.825844,0.808331
8,0.854377,0.825193,0.829052,0.811602,0.804521
9,0.842576,0.831301,0.828877,0.819613,0.807294
10,0.858747,0.840628,0.815305,0.812035,0.800907
11,0.835496,0.834872,0.817379,0.808331,0.811602


In [12]:
recalls['ES']

Unnamed: 0,5,8,11,14,17
5,0.606366,0.613877,0.619353,0.632006,0.626041
6,0.611589,0.623582,0.628151,0.633183,0.636684
7,0.623752,0.621922,0.627545,0.628413,0.632797
8,0.620877,0.622803,0.626729,0.632744,0.643979
9,0.624085,0.620321,0.631676,0.629904,0.631491
10,0.622094,0.628525,0.63199,0.633282,0.635497
11,0.61261,0.625718,0.633867,0.632043,0.638152


In [76]:
instrument_list = ['ES', 'NQ', 'CD', 'EC', 'JY', 'MP', 'TY', 'US', 'C', 'S', 'W', 'CL', 'GC']
strat_sharpes = {}
strat_rets = {}
strat_cum_rets = {}
strat_models = {}
depth_list = range(5,12)
max_features_list = range(5,18,3)
parameters = {'max_features':max_features_list, 'max_depth':depth_list}
accuracies = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
precisions = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
n_splits = 5
for inst in tqdm(instrument_list):
            
            


            X, X_test, y, y_test = train_test_split(x_dict[inst], y_dict[inst], test_size=0.20, shuffle=False)
            clf = GridSearchCV(RandomForestClassifier(n_estimators=1000,criterion="entropy"), 
                               param_grid=parameters, 
                               cv=TimeSeriesSplit(n_splits=n_splits),
                              scoring = ['accuracy', 'precision'],
                              refit=False)
            clf.fit(X,y)
            accuracies = clf.cv_results_['mean_test_score']
            precisions = clf.cv_results_['mean_test_precision']
    
        
        #strat_models[inst] = tm
        #strat_rets[inst], strat_cum_rets[inst] = tm.strategy_returns(X_test, y_dict[inst][-len(X_test):], cutoff=0.55)
        #strat_sharpes[inst] = tm.sharpe(X_test, y_dict[inst][-len(X_test):],cutoff=0.55)
        #print('{} sharpe {}:'.format(inst, strat_sharpes[inst]))

sorted(clf.cv_results_.keys())






  0%|          | 0/13 [00:00<?, ?it/s][A[A[A[A[A

KeyError: 'mean_test_score'