In [1]:
import sys
sys.path.append('..')
#from project_helper import VolFeatures, FuturesCloseData, TradeModel
from modules import opts

In [165]:
from __future__ import absolute_import

import pickle
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import r2_score, accuracy_score, f1_score, log_loss

from tqdm import tqdm

import sys
sys.path.append('..')
from modules.project_helper import VolFeatures, FuturesCloseData, TradeModel

import warnings
warnings.filterwarnings('ignore')



In [3]:
import pickle
file = open("../data/features/full_features.pkl",'rb')
full_features = pickle.load(file)

In [4]:
instrument_list = ['ES', 'NQ', 'CD', 'EC', 'JY', 'MP', 'TY', 'US', 'C', 'S', 'W', 'CL', 'GC']
x_dict={}
y_dict={}
for inst in instrument_list:
    #y_dict[inst] = 2 * (full_features[inst][inst] >=0)- 1
    y_dict[inst] = (full_features[inst][inst]>=0).astype(int)    
    x_dict[inst] = full_features[inst].drop([inst], axis=1)

In [6]:
class MLModel:
    def __init__(self,model,inst,x_dict,y_dict):
        self.inst = inst
        self.x = x_dict[inst]
        self.y = y_dict[inst]

        self.model = model
        self.accuracy_train = None
        self.accuracy_test = None
    
    def split_data(self):
        self.X_train, \
        self.X_test, \
        self.y_train, \
        self.y_test = train_test_split(self.x, self.y, test_size=0.50, shuffle=False)


    def train_model(self):
        #self.model = OLS(self.y_train, self.X_train)
        #self.model = self.model.fit()
        self.model = self.model(random_state=0).fit(self.X_train,self.y_train)


    def evaluate_model(self):
        self.accuracy_train = self.model.score(self.X_train, self.y_train)
        self.accuracy_test = self.model.score(self.X_test, self.y_test)

        
    
class ModelBuilder:
    def __init__(self,inst,x_dict,y_dict):
        #self.model = None
        self.logistic_model = MLModel(LogisticRegression,inst,x_dict,y_dict)
        self.rf_model = MLModel(RandomForestClassifier,inst,x_dict,y_dict)
        self.tree_model = MLModel(DecisionTreeClassifier,inst,x_dict,y_dict)
        self.boosted_tree_model = MLModel(GradientBoostingClassifier,inst,x_dict,y_dict)

        self.models = {'logistic':self.logistic_model,
                 'rf':self.rf_model,
                 'tree':self.tree_model,
                    'boosted_tree':self.boosted_tree_model}


        self.accuracy_train = None
        self.accuracy_test = None
        #ml_models = [LinearRegression, ]

        
    def run(self):
        {model.split_data() for model in self.models.values()}
        {model.train_model() for model in self.models.values()}
        {model.evaluate_model() for model in self.models.values()}
        return self


In [7]:
models = {inst: ModelBuilder(inst,x_dict,y_dict) for inst in instrument_list}
models = {inst: model.run() for inst,model in models.items()}

In [8]:

logistic = pd.DataFrame([(inst, model.models['logistic'].accuracy_train) for inst,model in models.items()],columns=['asset','logistic']).set_index('asset')
tree = pd.DataFrame([(inst, model.models['tree'].accuracy_train) for inst,model in models.items()],columns=['asset','tree']).set_index('asset')
rf = pd.DataFrame([(inst, model.models['rf'].accuracy_train) for inst,model in models.items()],columns=['asset','rf']).set_index('asset')
boosted_tree = pd.DataFrame([(inst, model.models['boosted_tree'].accuracy_train) for inst,model in models.items()],columns=['asset','boosted_tree']).set_index('asset')

train_accuracies = logistic.join(tree).join(rf).join(boosted_tree)
train_accuracies

Unnamed: 0_level_0,logistic,tree,rf,boosted_tree
asset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ES,0.60745,1.0,0.979943,0.982808
NQ,0.616046,1.0,0.985673,0.985673
CD,0.593123,1.0,0.979943,0.982808
EC,0.621777,1.0,0.977077,0.979943
JY,0.604585,1.0,0.971347,0.982808
MP,0.590258,1.0,0.982808,1.0
TY,0.590258,1.0,0.979943,0.994269
US,0.610315,1.0,0.974212,0.988539
C,0.610315,1.0,0.982808,0.979943
S,0.598854,1.0,0.979943,0.985673


In [9]:

logistic = pd.DataFrame([(inst, model.models['logistic'].accuracy_test) for inst,model in models.items()],columns=['asset','logistic']).set_index('asset')
tree = pd.DataFrame([(inst, model.models['tree'].accuracy_test) for inst,model in models.items()],columns=['asset','tree']).set_index('asset')
rf = pd.DataFrame([(inst, model.models['rf'].accuracy_test) for inst,model in models.items()],columns=['asset','rf']).set_index('asset')
boosted_tree = pd.DataFrame([(inst, model.models['boosted_tree'].accuracy_test) for inst,model in models.items()],columns=['asset','boosted_tree']).set_index('asset')

test_accuracies = logistic.join(tree).join(rf).join(bohttp://localhost:8888/notebooks/notebooks_modelling/all_models.ipynb#osted_tree)
test_accuracies.to_csv('test_accuracies.csv')

In [10]:
imp_df = pd.DataFrame()
for inst in instrument_list:
    feature_imp_df = pd.DataFrame(models[inst].models['tree'].model.feature_importances_,columns=[inst])
    feature_imp_df.index = x_dict[inst].columns
    imp_df = imp_df.join(feature_imp_df,how='outer')

In [9]:
imp_df.to_csv('imp_df.csv')

In [None]:

for inst in tqdm(instrument_list):
    X, X_test, y, y_test = train_test_split(x_dict[inst], y_dict[inst], test_size=0.20, shuffle=False)


In [141]:
instrument_list = ['ES', 'NQ', 'CD', 'EC', 'JY', 'MP', 'TY', 'US', 'C', 'S', 'W', 'CL', 'GC']

C_list = [0.001, 0.01, 0.1,1,10,100,1000,10000,100000]
accuracy_logreg = pd.DataFrame(columns=C_list, index=instrument_list)
f1score_logreg = pd.DataFrame(columns=C_list, index=instrument_list)
logloss_logreg = pd.DataFrame(columns=C_list, index=instrument_list)
plong_logreg = pd.DataFrame(columns=C_list, index=instrument_list)

n_splits = 5
for inst in instrument_list:
    for c in C_list:
        
        X, X_test, y, y_test = train_test_split(x_dict[inst], y_dict[inst], test_size=0.20, shuffle=False)
        X_test = X_test[2:]
        y_test = y_test[2:]
        
        tm = TradeModel(model=LogisticRegression, C=c)
        tscv = TimeSeriesSplit(n_splits=n_splits)
        time_split = tscv.split(X)
        ac = 0
        f1 = 0
        p = 0
        ll = 0
        for train_index, valid_index in time_split:
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index] 
            X_valid = X_valid[2:]
            y_valid = y_valid[2:]
            tm.fit(X_train, y_train)
            ac = ac + tm.model.score(X_valid, y_valid)
            f1 = f1 + f1_score(tm.model.predict(X_valid),y_valid)
            p = p + tm.model.predict(X_valid).mean()
            ll = ll + log_loss(y_valid, tm.model.predict(X_valid))

        accuracy_logreg.loc[inst][c] = ( round(ac / n_splits, 3) ) 
        f1score_logreg.loc[inst][c] = ( round(f1 / n_splits, 3) ) 
        plong_logreg.loc[inst][c] = ( round(p / n_splits, 3) ) 
        logloss_logreg.loc[inst][c] = ( round(ll / n_splits, 3) ) 
    cv_logreg = logloss_logreg.astype('float').idxmin(axis=1)

        

In [150]:
logloss_logreg

Unnamed: 0,0.001,0.010,0.100,1.000,10.000,100.000,1000.000,10000.000,100000.000
ES,15.789,15.03,14.727,16.169,16.548,16.093,15.638,16.017,16.245
NQ,16.093,14.954,15.106,15.562,15.638,15.258,15.486,16.245,17.004
CD,16.473,16.245,16.928,18.446,19.129,18.826,18.522,18.75,19.053
EC,17.535,17.232,17.232,16.473,17.383,16.776,17.232,17.687,17.156
JY,15.638,16.473,16.473,16.928,16.852,16.852,16.624,16.624,16.321
MP,17.08,16.928,17.004,17.156,16.7,16.093,16.548,15.941,15.941
TY,13.816,14.651,15.638,16.548,16.548,17.308,17.156,16.928,17.004
US,15.03,16.017,16.473,16.928,17.232,17.156,17.763,17.535,17.991
C,16.093,15.562,15.941,16.321,16.017,16.321,16.245,16.397,16.321
S,17.308,17.08,16.852,16.776,17.08,17.08,17.232,17.308,17.535


In [164]:
cv_logreg

ES        0.100
NQ        0.010
CD        0.010
EC        1.000
JY        0.001
MP    10000.000
TY        0.001
US        0.001
C         0.010
S         1.000
W        10.000
CL        0.100
GC    10000.000
dtype: float64

In [163]:
logloss_logreg

Unnamed: 0,0.001,0.010,0.100,1.000,10.000,100.000,1000.000,10000.000,100000.000
ES,15.789,15.03,14.727,16.169,16.548,16.093,15.638,16.017,16.245
NQ,16.093,14.954,15.106,15.562,15.638,15.258,15.486,16.245,17.004
CD,16.473,16.245,16.928,18.446,19.129,18.826,18.522,18.75,19.053
EC,17.535,17.232,17.232,16.473,17.383,16.776,17.232,17.687,17.156
JY,15.638,16.473,16.473,16.928,16.852,16.852,16.624,16.624,16.321
MP,17.08,16.928,17.004,17.156,16.7,16.093,16.548,15.941,15.941
TY,13.816,14.651,15.638,16.548,16.548,17.308,17.156,16.928,17.004
US,15.03,16.017,16.473,16.928,17.232,17.156,17.763,17.535,17.991
C,16.093,15.562,15.941,16.321,16.017,16.321,16.245,16.397,16.321
S,17.308,17.08,16.852,16.776,17.08,17.08,17.232,17.308,17.535


In [171]:
with open('cv_logreg.pickle', 'wb') as handle:
    pickle.dump(accuracy_logreg, handle)
    pickle.dump(f1score_logreg, handle)
    pickle.dump(plong_logreg, handle)
    pickle.dump(logloss_logreg, handle)
    pickle.dump(cv_logreg, handle)

In [146]:

depth_list = range(4,15)
max_features_list = list(range(3,7,1)) + list(range(7,26,3))  
accuracies_rf = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
f1scores_rf = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
logloss_rf = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
plong_rf = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
cv_rf = pd.DataFrame(index = instrument_list, columns = ['max_depth','max_features'])
n_splits = 5
for inst in tqdm(instrument_list):
    for dl in depth_list:
        for mf in max_features_list:

            X, X_test, y, y_test = train_test_split(x_dict[inst], y_dict[inst], test_size=0.20, shuffle=False)
            tm = TradeModel(n_estimators=1000, max_features=mf, max_depth=dl, criterion ='entropy')
    
            tscv = TimeSeriesSplit(n_splits=n_splits)
            time_split = tscv.split(X)
            ac = 0
            f1 = 0
            p = 0
            ll = 0
            for train_index, valid_index in time_split:
                X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
                y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]   
                X_valid = X_valid[2:]
                y_valid = y_valid[2:]
                tm.fit(X_train, y_train)
                ac = ac + tm.model.score(X_valid, y_valid)
                f1 = f1 + f1_score(tm.model.predict(X_valid),y_valid)
                p = p + tm.model.predict(X_valid).mean()
                ll = ll + log_loss(y_valid, tm.model.predict(X_valid))

            accuracies_rf[inst].loc[dl,mf] = round(ac / n_splits,3)
            f1scores_rf[inst].loc[dl,mf] = round(f1 / n_splits,3)
            plong_rf[inst].loc[dl,mf] = ( round(p / n_splits, 3) ) 
            logloss_rf[inst].loc[dl,mf] = ( round(ll / n_splits, 3) )
    
    x1 = logloss_rf[inst].astype('float').min(axis=1).idxmin()
    x2 = logloss_rf[inst].astype('float').loc[x1].idxmin()
    cv_rf.loc[inst] = np.array([x1,x2])
  











  0%|          | 0/13 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A








  8%|▊         | 1/13 [16:42<3:20:33, 1002.76s/it][A[A[A[A[A[A[A[A[A








 15%|█▌        | 2/13 [33:27<3:03:56, 1003.32s/it][A[A[A[A[A[A[A[A[A








 23%|██▎       | 3/13 [50:20<2:47:43, 1006.35s/it][A[A[A[A[A[A[A[A[A








 31%|███       | 4/13 [1:07:07<2:30:58, 1006.53s/it][A[A[A[A[A[A[A[A[A








 38%|███▊      | 5/13 [1:24:59<2:16:47, 1025.96s/it][A[A[A[A[A[A[A[A[A








 46%|████▌     | 6/13 [1:42:46<2:01:09, 1038.44s/it][A[A[A[A[A[A[A[A[A








 54%|█████▍    | 7/13 [2:04:40<1:52:06, 1121.08s/it][A[A[A[A[A[A[A[A[A








 62%|██████▏   | 8/13 [2:25:35<1:36:46, 1161.27s/it][A[A[A[A[A[A[A[A[A








 69%|██████▉   | 9/13 [2:42:38<1:14:39, 1119.90s/it][A[A[A[A[A[A[A[A[A








 77%|███████▋  | 10/13 [2:59:49<54:38, 1092.98s/it] [A[A[A[A[A[A[A[A[A








 85%|████████▍ | 11/13 [3:17:01<35:49, 1074

In [181]:
cv_rf

Unnamed: 0,max_depth,max_features
ES,4,13
NQ,4,3
CD,13,6
EC,12,6
JY,4,22
MP,8,19
TY,7,13
US,5,3
C,8,19
S,13,10


In [151]:
logloss_rf['ES']

Unnamed: 0,3,4,5,6,7,10,13,16,19,22,25
4,14.803,15.182,14.954,14.727,14.651,14.803,14.423,14.803,14.879,14.803,14.651
5,14.879,14.879,14.727,15.03,14.954,14.803,14.651,14.954,15.03,15.258,14.954
6,15.182,14.954,15.106,15.41,14.727,15.258,14.954,15.334,15.258,15.182,15.334
7,14.879,15.106,14.954,14.954,14.651,15.334,15.41,15.182,14.954,15.258,15.106
8,15.182,15.182,15.182,15.03,15.106,15.334,15.562,15.106,15.41,15.713,15.258
9,14.879,14.879,14.879,15.182,14.803,15.334,15.182,15.334,15.03,14.803,15.258
10,15.334,15.258,15.258,15.334,14.879,14.954,15.106,15.486,14.954,15.258,15.182
11,15.03,14.954,15.486,14.954,14.878,15.41,15.258,15.334,15.41,15.334,14.878
12,15.182,15.182,14.954,15.182,15.41,15.03,15.03,15.182,15.41,15.258,15.182
13,15.562,15.258,15.562,15.03,15.258,15.638,15.334,15.258,14.954,15.03,15.258


In [182]:
with open('cv_rf.pickle', 'wb') as handle:
    pickle.dump(accuracies_rf, handle)
    pickle.dump(f1scores_rf, handle)
    pickle.dump(plong_rf, handle)
    pickle.dump(logloss_rf, handle)
    pickle.dump(cv_rf, handle)

In [145]:

depth_list = range(4,15)
max_features_list = list(range(3,7,1)) + list(range(7,26,3)) 
accuracies_boost = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
f1scores_boost = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
logloss_boost = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
plong_boost = {inst:pd.DataFrame(columns=max_features_list, index=depth_list) for inst in instrument_list}
cv_boost = pd.DataFrame(index = instrument_list, columns = ['max_depth','max_features'])
n_splits = 5
for inst in tqdm(instrument_list):
    for dl in depth_list:
        for mf in max_features_list:

            X, X_test, y, y_test = train_test_split(x_dict[inst], y_dict[inst], test_size=0.20, shuffle=False)
            tm = TradeModel(model=GradientBoostingClassifier, 
                            n_estimators=1000, 
                            max_features=mf, 
                            max_depth=dl, 
                            )
    
            tscv = TimeSeriesSplit(n_splits=n_splits)
            time_split = tscv.split(X)
            ac = 0
            f1 = 0
            p = 0
            ll = 0
            for train_index, valid_index in time_split:
                X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
                y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]  
                X_valid = X_valid[2:]
                y_valid = y_valid[2:]
                tm.fit(X_train, y_train)
                ac = ac + tm.model.score(X_valid, y_valid)
                f1 = f1 + f1_score(tm.model.predict(X_valid),y_valid)
                p = p + tm.model.predict(X_valid).mean()
                ll = ll + log_loss(y_valid, tm.model.predict(X_valid))

            accuracies_boost[inst].loc[dl,mf] = round(ac / n_splits,3)
            f1scores_boost[inst].loc[dl,mf] = round(f1 / n_splits,3)
            plong_boost[inst].loc[dl,mf] = ( round(p / n_splits, 3) ) 
            logloss_boost[inst].loc[dl,mf] = ( round(ll / n_splits, 3) )
        
    x1 = logloss_boost[inst].astype('float').min(axis=1).idxmin()
    x2 = logloss_boost[inst].astype('float').loc[x1].idxmin()
    cv_boost.loc[inst] = np.array([x1,x2])
  










  0%|          | 0/13 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A








  8%|▊         | 1/13 [04:11<50:16, 251.36s/it][A[A[A[A[A[A[A[A[A








 15%|█▌        | 2/13 [08:24<46:12, 252.01s/it][A[A[A[A[A[A[A[A[A








 23%|██▎       | 3/13 [12:40<42:10, 253.03s/it][A[A[A[A[A[A[A[A[A








 31%|███       | 4/13 [22:20<52:40, 351.20s/it][A[A[A[A[A[A[A[A[A








 38%|███▊      | 5/13 [26:40<43:09, 323.72s/it][A[A[A[A[A[A[A[A[A








 46%|████▌     | 6/13 [30:55<35:22, 303.22s/it][A[A[A[A[A[A[A[A[A








 54%|█████▍    | 7/13 [38:17<34:28, 344.75s/it][A[A[A[A[A[A[A[A[A








 62%|██████▏   | 8/13 [42:48<26:53, 322.74s/it][A[A[A[A[A[A[A[A[A








 69%|██████▉   | 9/13 [47:10<20:17, 304.47s/it][A[A[A[A[A[A[A[A[A








 77%|███████▋  | 10/13 [51:28<14:31, 290.51s/it][A[A[A[A[A[A[A[A[A








 85%|████████▍ | 11/13 [55:49<09:23, 281.84s/it][A[A[A[A[A[A[A[A[A








 9

In [159]:
cv_boost

Unnamed: 0,max_depth,max_features
ES,6,16
NQ,14,5
CD,8,6
EC,11,3
JY,13,16
MP,11,22
TY,8,16
US,11,16
C,11,5
S,6,10


In [161]:
logloss_boost['C']

Unnamed: 0,3,4,5,6,7,10,13,16,19,22,25
4,16.017,16.624,15.486,15.713,16.017,16.093,16.7,16.624,16.245,16.7,16.397
5,16.624,16.7,16.245,16.624,16.852,16.093,15.941,17.004,16.473,16.928,16.776
6,16.321,16.017,16.017,15.865,16.321,16.093,15.941,16.7,16.397,16.548,16.093
7,16.397,16.017,16.928,15.789,16.928,15.638,16.245,16.473,16.7,16.852,16.776
8,16.245,16.093,16.7,16.093,17.156,16.7,16.624,16.7,15.865,16.473,15.713
9,16.548,16.624,16.093,16.624,15.258,16.397,16.321,16.397,16.624,16.852,16.245
10,17.156,15.41,15.865,16.017,16.245,16.169,16.852,16.321,15.789,16.017,16.473
11,16.548,17.535,15.182,16.624,16.852,16.852,16.928,15.941,16.928,16.624,16.473
12,16.7,16.548,17.156,16.093,16.776,16.473,16.017,15.789,16.093,16.093,16.397
13,16.548,15.258,16.548,16.321,16.397,17.232,16.548,16.017,17.08,16.852,15.334


In [173]:
with open('cv_boost.pickle', 'wb') as handle:
    pickle.dump(accuracies_boost, handle)
    pickle.dump(f1scores_boost, handle)
    pickle.dump(plong_boost, handle)
    pickle.dump(logloss_boost, handle)
    pickle.dump(cv_boost, handle)