In [37]:
import sys
sys.path.append('..') #to add top-level to path

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, accuracy_score, f1_score

from pandas.plotting import scatter_matrix
#import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


from modules.project_helper import VolFeatures, FuturesCloseData

In [2]:
import pickle

# Load the pre-computed feature frames, keyed by instrument symbol.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files produced by this project.
# The context manager closes the handle even if unpickling raises.
with open("../data/features/full_features.pkl", 'rb') as file:
    full_features = pickle.load(file)

In [3]:
instrument_list = ['ES', 'NQ', 'CD', 'EC', 'JY', 'MP', 'TY', 'US', 'C', 'S', 'W', 'CL', 'GC']

# Per-instrument containers: feature matrix, binary direction label and the
# raw return series the label is derived from.
x_dict = {}
y_dict = {}
y_returns = {}

for inst in instrument_list:
    # The column named after the instrument holds its own return; every other
    # column of the frame is used as a feature.
    returns = full_features[inst][inst]
    x_dict[inst] = full_features[inst].drop([inst], axis=1)

    # Label is 1 for a non-negative return and 0 otherwise (kept as {0, 1},
    # not {-1, 1}).
    y_dict[inst] = (returns >= 0).astype(int)

    # Keep only the returns whose index label also appears in the features.
    y_returns[inst] = returns[returns.index.isin(x_dict[inst].index)]

    # Sanity check: returns and features must line up row for row.
    if (y_returns[inst].index != x_dict[inst].index).any():
        raise Exception('Returns and X indices dont match')


In [54]:
class MLModel:
    """Fit one classifier class on one instrument and score it out of sample.

    Parameters
    ----------
    model : callable
        Estimator class (not an instance). It is called as
        ``model(random_state=0, **hyper_parameters)`` and the result must
        provide ``fit``, ``predict``, ``predict_proba`` and ``score``.
    inst : hashable
        Key used to look up this instrument in the three dictionaries below.
    x_dict, y_dict, y_returns : mapping
        Per-instrument features, class labels and return series.
    hyper_parameters : dict, optional
        Extra keyword arguments for the estimator constructor. ``None``
        (the default) means no extra arguments.
    """

    def __init__(self, model, inst, x_dict, y_dict, y_returns, hyper_parameters=None):
        self.inst = inst
        self.x = x_dict[inst]
        self.y = y_dict[inst]
        self.y_returns = y_returns[inst]
        # A fresh dict per instance: a ``{}`` default would be one object
        # shared by every model created without explicit hyper-parameters.
        self.hyper_parameters = {} if hyper_parameters is None else hyper_parameters

        # The estimator class is kept separately so train_model() can be
        # called more than once; ``self.model`` is the class until training
        # and the fitted estimator afterwards.
        self._model_factory = model
        self.model = model

        # Evaluation metrics, filled in by evaluate_model().
        self.accuracy_train = None
        self.accuracy_test = None
        self.sharpe = None
        self.f1_test = None
        self.prediction_ratio = None

        # Per-period and cumulative strategy returns (strategy_returns()).
        self.strat_rets = None
        self.strat_rets_cum = None

        # Predictions and the derived trading position.
        self.train_predictions = None
        self.test_predictions = None
        self.position = None

        # Mean predicted label on each split (generate_predictions()).
        self.train_class_balance = None
        self.test_class_balance = None

    def split_data(self):
        """Split features, labels and returns without shuffling.

        With ``shuffle=False`` and ``test_size=0.20`` the final 20% of the
        rows become the test set, so row order is preserved.
        """
        (
            self.X_train,
            self.X_test,
            self.y_train,
            self.y_test,
            self.y_returns_train,
            self.y_returns_test,
        ) = train_test_split(self.x, self.y, self.y_returns, test_size=0.20, shuffle=False)

    def train_model(self):
        """Instantiate the estimator with ``random_state=0`` and fit it on the training split."""
        estimator = self._model_factory(random_state=0, **self.hyper_parameters)
        self.model = estimator.fit(self.X_train, self.y_train)

    def evaluate_sharpe(self, cutoff=0.55):
        """Store the Sharpe ratio of the test-set strategy in ``self.sharpe``.

        The ratio is scaled by ``sqrt(252)`` (presumably daily data; confirm
        against the feature pipeline). When the strategy returns have no
        dispersion, e.g. the model never trades, the ratio is undefined and
        ``NaN`` is stored instead of relying on a division by zero.
        """
        rets = self.strategy_returns(cutoff)[0]
        volatility = np.std(rets)
        if volatility > 0:
            self.sharpe = np.sqrt(252) * np.mean(rets) / volatility
        else:
            # Covers zero volatility as well as NaN (empty or all-NaN returns).
            self.sharpe = np.nan

    def get_position(self, cutoff=0.55):
        """Return the test-set position: 1 long, -1 short, 0 flat.

        Predictions in {0, 1} are mapped to {-1, 1}; rows whose highest class
        probability is not strictly above ``cutoff`` are set to 0.
        """
        # converting predictions from {0,1} to {-1,1}, short/long
        self.position = 2 * self.model.predict(self.X_test) - 1
        confidence = self.model.predict_proba(self.X_test).max(axis=1)
        self.position[confidence <= cutoff] = 0
        return self.position

    def strategy_returns(self, cutoff=0.55):
        """Return ``(per-period, cumulative)`` strategy returns on the test split.

        The final observation is dropped from both the positions and the
        returns. The cumulative series is a plain running sum, which is only
        a true cumulative return if the inputs are log returns.
        """
        x = self.get_position(cutoff=cutoff)[:-1]
        y = self.y_returns_test[:-1]  # make sure returns are logs
        self.strat_rets = x * y
        self.strat_rets_cum = self.strat_rets.cumsum()
        return self.strat_rets, self.strat_rets_cum

    def evaluate_model(self):
        """Compute accuracy, F1, Sharpe ratio and the mean predicted label.

        Predictions are generated first if generate_predictions() has not
        been called yet, so this method no longer depends on call order.
        """
        if self.test_predictions is None:
            self.generate_predictions()
        self.accuracy_train = self.model.score(self.X_train, self.y_train)
        self.accuracy_test = self.model.score(self.X_test, self.y_test)
        self.f1_test = f1_score(self.y_test, self.test_predictions)
        self.evaluate_sharpe()
        self.prediction_ratio = np.mean(self.test_predictions)

    def generate_predictions(self):
        """Predict labels for both splits and record the mean predicted label of each."""
        self.train_predictions = self.model.predict(self.X_train)
        self.test_predictions = self.model.predict(self.X_test)

        self.train_class_balance = np.mean(self.train_predictions)
        self.test_class_balance = np.mean(self.test_predictions)
    



        
class AssetModels:
    """Train the four candidate classifiers for one instrument and pick the best.

    Parameters
    ----------
    inst : hashable
        Instrument key into ``x_dict``, ``y_dict`` and ``y_returns``.
    x_dict, y_dict, y_returns : mapping
        Per-instrument features, labels and returns, passed on to ``MLModel``.
    hyper_parameters : dict
        Optional constructor arguments per model name (``'logistic'``,
        ``'rf'``, ``'tree'``, ``'boosted_tree'``); missing or empty entries
        mean "use the estimator defaults".
    """

    def __init__(self, inst, x_dict, y_dict, y_returns, hyper_parameters):
        hyper_parameters = hyper_parameters or {}
        # Insertion order matters: on an accuracy tie, idxmax() in
        # get_best_model() returns the first model listed here.
        model_classes = {
            'logistic': LogisticRegression,
            'rf': RandomForestClassifier,
            'tree': DecisionTreeClassifier,
            'boosted_tree': GradientBoostingClassifier,
        }
        self.ml_models = {
            name: MLModel(model_class, inst, x_dict, y_dict, y_returns,
                          hyper_parameters.get(name) or {})
            for name, model_class in model_classes.items()
        }

        # Named shortcuts to the same objects held in ``ml_models``.
        self.logistic_model = self.ml_models['logistic']
        self.rf_model = self.ml_models['rf']
        self.tree_model = self.ml_models['tree']
        self.boosted_tree_model = self.ml_models['boosted_tree']

        # Winner by test accuracy and its metrics (get_best_model()).
        self.best_model_name = None
        self.best_model = None
        self.best_model_accuracy = None
        self.best_model_sharpe = None
        self.best_model_f1 = None
        self.best_model_prediction_ratio = None

        # One-column frames indexed by model name (get_best_model()).
        self.accuracies_train = None
        self.accuracies_test = None
        self.sharpe_values = None
        self.f1_scores = None
        self.prediction_ratios = None

    def _metric_frame(self, attribute, column):
        """Collect one metric attribute from every model into a one-column frame."""
        return pd.DataFrame.from_dict(
            {name: getattr(model, attribute) for name, model in self.ml_models.items()},
            orient='index', columns=[column]
        )

    def get_best_model(self):
        """Tabulate every model's metrics and select the one with the highest test accuracy."""
        self.accuracies_test = self._metric_frame('accuracy_test', 'test_accuracy')
        # The training table used to be labelled 'test_accuracy' as well,
        # which made the two frames indistinguishable once joined or printed.
        self.accuracies_train = self._metric_frame('accuracy_train', 'train_accuracy')
        self.sharpe_values = self._metric_frame('sharpe', 'sharpe')
        self.f1_scores = self._metric_frame('f1_test', 'f1_score')
        self.prediction_ratios = self._metric_frame('prediction_ratio', 'prediction_ratio')

        self.best_model_name = self.accuracies_test.idxmax().tolist()[0]
        self.best_model = self.ml_models.get(self.best_model_name)

        winner = [self.best_model_name]
        # The model name becomes a regular 'best_model' column so the row can
        # later be re-indexed by instrument without losing it.
        self.best_model_accuracy = (
            self.accuracies_test.loc[winner]
            .rename_axis('best_model')
            .reset_index()
        )
        self.best_model_sharpe = self.sharpe_values.loc[winner]
        self.best_model_f1 = self.f1_scores.loc[winner]
        self.best_model_prediction_ratio = self.prediction_ratios.loc[winner]

    def run(self):
        """Split, train, predict and evaluate every model, then pick the best.

        Each stage is completed for all models before the next stage starts.
        """
        for model in self.ml_models.values():
            model.split_data()
        for model in self.ml_models.values():
            model.train_model()
        for model in self.ml_models.values():
            model.generate_predictions()
        for model in self.ml_models.values():
            model.evaluate_model()
        self.get_best_model()

        
class ModelBuildier:
    """Run ``AssetModels`` for every instrument and gather the metrics into two tables.

    Parameters
    ----------
    x_dict, y_dict, y_returns : mapping
        Per-instrument features, labels and returns.
    instrument_list : iterable
        Instruments to model; each must be a key of the three mappings.
    hyper_parameters : dict, optional
        ``{instrument: {model_name: {param: value}}}``. Instruments or models
        that are missing fall back to the estimator defaults.

    Attributes
    ----------
    accuracies_best : pandas.DataFrame
        One row per instrument describing its best model.
    accuracies_all : pandas.DataFrame
        One row per (instrument, model), indexed by ``'asset'``.
    """

    def __init__(self, x_dict, y_dict, y_returns, instrument_list, hyper_parameters=None):
        # ``None`` instead of a shared mutable ``{}`` default.
        hyper_parameters = {} if hyper_parameters is None else hyper_parameters
        self.x_dict = x_dict
        self.y_dict = y_dict
        self.y_returns = y_returns
        self.hyper_parameters = hyper_parameters
        self.instrument_list = instrument_list
        self.asset_models = {
            inst: AssetModels(inst, x_dict, y_dict, y_returns,
                              hyper_parameters.get(inst) or {})
            for inst in instrument_list
        }

        self.accuracies_best = pd.DataFrame()
        self.accuracies_all = pd.DataFrame()

    def get_accuracies(self):
        """Rebuild ``accuracies_best`` and ``accuracies_all`` from the fitted asset models.

        Both tables are rebuilt from scratch on every call, so calling this
        (or ``run``) again does not duplicate rows. The frames stored on the
        ``AssetModels`` objects are copied before being relabelled and are
        left untouched.
        """
        best_rows = []
        all_rows = []
        # Iterate the instruments this builder was created with, not whatever
        # ``instrument_list`` happens to exist in the notebook namespace.
        for inst in self.instrument_list:
            models = self.asset_models[inst]

            # Relabel each one-row "best model" frame by instrument, then
            # join them side by side into a single summary row.
            best_row = None
            for frame in (models.best_model_accuracy,
                          models.best_model_sharpe,
                          models.best_model_f1,
                          models.best_model_prediction_ratio):
                part = frame.copy()
                part.index = [inst]
                best_row = part if best_row is None else best_row.join(part)
            best_rows.append(best_row)

            # Every model's metrics for this instrument, one row per model.
            per_model = (
                models.accuracies_test
                .join(models.sharpe_values)
                .join(models.f1_scores)
                .join(models.prediction_ratios)
                .rename_axis('model')
                .reset_index()
            )
            per_model['asset'] = inst
            all_rows.append(per_model)

        if best_rows:
            # One concat at the end replaces DataFrame.append, which was
            # removed in pandas 2.0 and re-copied the table on every iteration.
            self.accuracies_best = pd.concat(best_rows)
            self.accuracies_all = pd.concat(all_rows).set_index('asset')
        else:
            self.accuracies_best = pd.DataFrame()
            self.accuracies_all = pd.DataFrame()

    def run(self):
        """Fit and evaluate all models for every instrument, then build the summary tables."""
        for model in self.asset_models.values():
            model.run()
        self.get_accuracies()
        
        

In [55]:
# Hyper-parameters per instrument, shaped {instrument: {model_name: {param: value}}}.
# Only the logistic regression's ``C`` is set here; the literal table below
# lists that value for each instrument.
hp = {
    symbol: {'logistic': {'C': c_value}}
    for symbol, c_value in [
        ('ES', 0.1),
        ('NQ', 1),
        ('CD', 0.1),
        ('EC', 100000),
        ('JY', 100000),
        ('MP', 0.1),
        ('TY', 0.1),
        ('US', 0.1),
        ('C', 1),
        ('S', 1000),
        ('W', 10),
        ('CL', 0.01),
        ('GC', 0.1),
    ]
}


In [56]:
# Build one set of candidate models per instrument, then fit and evaluate
# them all and assemble the summary tables.
model_builder = ModelBuildier(
    x_dict=x_dict,
    y_dict=y_dict,
    y_returns=y_returns,
    instrument_list=instrument_list,
    hyper_parameters=hp,
)
model_builder.run()

## Best Model Metrics

In [60]:
# One row per instrument: the model with the highest test accuracy and its metrics.
model_builder.accuracies_best

Unnamed: 0,best_model,test_accuracy,sharpe,f1_score,prediction_ratio
ES,rf,0.55,0.219635,0.676923,0.85
NQ,logistic,0.557143,0.074584,0.670213,0.8
CD,logistic,0.507143,-0.775839,0.188235,0.078571
EC,tree,0.507143,-0.314249,0.448,0.414286
JY,logistic,0.492857,1.378888,0.503497,0.521429
MP,tree,0.571429,1.657035,0.594595,0.535714
TY,tree,0.514286,-0.291475,0.521127,0.428571
US,logistic,0.485714,-1.669515,0.604396,0.714286
C,tree,0.571429,1.971582,0.620253,0.621429
S,logistic,0.507143,,0.56051,0.614286


## All Model Metrics

In [61]:
# One row per (instrument, model) pair, indexed by instrument ('asset').
model_builder.accuracies_all

Unnamed: 0_level_0,model,test_accuracy,sharpe,f1_score,prediction_ratio
asset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ES,logistic,0.542857,0.502561,0.703704,1.0
ES,rf,0.55,0.219635,0.676923,0.85
ES,tree,0.535714,0.401223,0.606061,0.635714
ES,boosted_tree,0.492857,-0.736929,0.612022,0.764286
NQ,logistic,0.557143,0.074584,0.670213,0.8
NQ,rf,0.457143,-1.272991,0.552941,0.671429
NQ,tree,0.471429,-1.598417,0.506667,0.528571
NQ,boosted_tree,0.435714,-3.156523,0.526946,0.65
CD,logistic,0.507143,-0.775839,0.188235,0.078571
CD,rf,0.457143,-2.403768,0.377049,0.342857


In [8]:
# imp_df = pd.DataFrame()
# for inst in instrument_list:
#     feature_imp_df = pd.DataFrame(models[inst].models['tree'].model.feature_importances_,columns=[inst])
#     feature_imp_df.index = x_dict[inst].columns
#     imp_df = imp_df.join(feature_imp_df,how='outer')

In [209]:
# imp_df.to_csv('imp_df.csv')