# Importing libraries

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
from pickle import load
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

from useful_functions import *  # our own set of functions

# Importing data

In [2]:
Scaled_Data_X = np.load('input/Scaled_Data_X.npy')
Scaled_Data_Y = np.load('input/Scaled_Data_Y.npy')
stock_returns = pd.read_pickle('input/stock_returns.pkl')
df = pd.read_pickle('input/df.pkl')
lag = load(open('input/lag.pkl', 'rb'))

In [3]:
Scaler_X = load(open('data_objects/Scaler_X.pkl', 'rb'))
Scaler_Y = load(open('data_objects/Scaler_Y.pkl', 'rb'))

## Walk-Forward Optimizatoin

Walk-forward optimization is one of the most popular validation techniques used by financial researchers to undergo decision making for algorithmic trading. We have performed non-anchored type of Walk-forward. The process is the following:

1. For each set in the hyperparameter search space, train it on the training set. Calclute the Modified Information Ratio for the training set.
2. Validate them on the validation set. Calclute the Modified Information Ratio for the validatoin set
3. Calculte the absolute difference between the two modified information ratio
4. Use the set of hyperparameter where the difference is the lowest and the Modified Information Ratio on the Validation set is not 0.
5. Predict the model using the set of hyperparameter as defined in 4. trained on the training set on the test set

In case of our research problem, we set the Training Data to 250 trading days, and the Validation and Testing Data to 75 trading days each.

In [4]:
Training_Bars = 250 
Validation_Bars = 75
Testing_Bars = 75

## Random Forest

Random Forest is not a very computationally expensive model compared to others for example Neural Networks, we decided to perform Grid Search for tuning the hyperparameters. The hyperparameter search space is selected after deep analysis. Multiple hyperparamter tunning processes were conducted before arriving to this search space.

In [5]:
n_estimators = [300]
criterion = ['entropy']
max_depth = [5, 8] # 2
min_samples_split = [30, 60]
min_samples_leaf = [20, 40]
min_weight_fraction_leaf = [0]
max_features = [60]
max_leaf_nodes = [None]
min_impurity_decrease = [0.0025, 0.0015]
bootstrap = [False] # Was True
oob_score = [False]
n_jobs = [-1]

digit = 8

config = [
    {
    'n_estimators': ne,
    'criterion': c,
    'max_depth': md,
    'min_samples_split': mss,
    'min_samples_leaf': msl,
    'min_weight_fraction_leaf': mwfl,
    'max_features': mf,
    'max_leaf_nodes': mln,
    'min_impurity_decrease': mid,
    'bootstrap': b,
    'oob_score': os,
    'n_jobs': nj,
    }
    for ne in n_estimators
    for c in criterion
    for md in max_depth
    for mss in min_samples_split
    for msl in min_samples_leaf
    for mwfl in min_weight_fraction_leaf
    for mf in max_features
    for mln in max_leaf_nodes
    for mid in min_impurity_decrease
    for b in bootstrap
    for os in oob_score
    for nj in n_jobs
]

len(config)

16

In [6]:
ranges = list(range(Training_Bars, len(Scaled_Data_Y) - Testing_Bars, Validation_Bars))

df_all_score = {}

ALL_MODEL_PREDICTIONS = np.zeros((0,1)).flatten()
for i in range(0, len(ranges)): 

    train_X, train_y, train_stock_returns = Scaled_Data_X[ranges[i]-Training_Bars:ranges[i]], Scaled_Data_Y[ranges[i]-Training_Bars:ranges[i]], stock_returns[ranges[i]-Training_Bars:ranges[i]]
    val_X, val_Y, val_stock_returns = Scaled_Data_X[ranges[i]:ranges[i]+Validation_Bars], Scaled_Data_Y[ranges[i]:ranges[i]+Validation_Bars], stock_returns[ranges[i]:ranges[i]+Validation_Bars]
    test_X, test_y, test_stock_returns = Scaled_Data_X[ranges[i]+Validation_Bars:ranges[i]+Validation_Bars+Testing_Bars], Scaled_Data_Y[ranges[i]+Validation_Bars:ranges[i]+Validation_Bars+Testing_Bars], stock_returns[ranges[i]+Validation_Bars:ranges[i]+Validation_Bars+Testing_Bars]
    
    training_IR2 = []
    validation_IR2 = []
    testing_IR2 = []
    config_all = []
    for num, cfg in enumerate(config):
        model_cfg = RandomForestClassifier(
                                    n_estimators = cfg['n_estimators'],
                                    criterion = cfg['criterion'],
                                    max_depth = cfg['max_depth'],
                                    min_samples_split = cfg['min_samples_split'],
                                    min_samples_leaf = cfg['min_samples_leaf'],
                                    min_weight_fraction_leaf = cfg['min_weight_fraction_leaf'],
                                    max_features = cfg['max_features'],
                                    max_leaf_nodes = cfg['max_leaf_nodes'],
                                    min_impurity_decrease = cfg['min_impurity_decrease'],
                                    bootstrap = cfg['bootstrap'],
                                    oob_score = cfg['oob_score'],
                                    n_jobs = cfg['n_jobs'],
                                    random_state = i+num+digit
                                    )
        
        model_cfg.fit(train_X, train_y)

        cfg_IR2_train = get_eqline_IR2(train_stock_returns, Scaler_Y.inverse_transform(model_cfg.predict(train_X)))
        cfg_IR2_val = get_eqline_IR2(val_stock_returns, Scaler_Y.inverse_transform(model_cfg.predict(val_X)))
        cfg_IR2_test = get_eqline_IR2(test_stock_returns, Scaler_Y.inverse_transform(model_cfg.predict(test_X)))

        training_IR2.append(cfg_IR2_train)
        validation_IR2.append(cfg_IR2_val)
        testing_IR2.append(cfg_IR2_test)

        config_all.append(cfg)
    

    df_finding_the_best_configuration = pd.DataFrame(
        data = {
            'Config': config_all,
            'Training IR2': training_IR2,
            'Validation IR2': validation_IR2,
            'Testing IR2': testing_IR2
        }
    )

    df_all_score[f'id_{i}'] = df_finding_the_best_configuration

    df_finding_the_best_configuration['custom_score'] =  abs(df_finding_the_best_configuration['Validation IR2'] - df_finding_the_best_configuration['Training IR2'])
    df_finding_the_best_configuration['custom_score'] = np.where(df_finding_the_best_configuration['Validation IR2']==0, np.nan, df_finding_the_best_configuration['custom_score'])

    try:    
        best_config = df_finding_the_best_configuration.loc[df_finding_the_best_configuration['custom_score'].idxmin()]['Config']
        best_config_index = df_finding_the_best_configuration['custom_score'].idxmin()
    except:
        best_config = df_finding_the_best_configuration.loc[df_finding_the_best_configuration['Training IR2'].idxmax()]['Config']  
        best_config_index = df_finding_the_best_configuration['Training IR2'].idxmax()

    print(f'The Random Seed Is: {i+best_config_index}')
    print(f'The Best Model Configuration Is: {best_config}')
    print(f"Training IR2: {df_all_score[f'id_{i}'][df_all_score[f'id_{i}']['Config']==best_config]['Training IR2'].values}, \
          Validation IR2: {df_all_score[f'id_{i}'][df_all_score[f'id_{i}']['Config']==best_config]['Validation IR2'].values}, \
          Testing IR2: {df_all_score[f'id_{i}'][df_all_score[f'id_{i}']['Config']==best_config]['Testing IR2'].values}")

    # Fit the model on the combined data Train and Val 
    combined_train_val_X = np.concatenate([train_X, val_X]) 
    combined_train_val_Y = np.concatenate([train_y, val_Y]) 
    
    model = RandomForestClassifier(
                                n_estimators = best_config['n_estimators'],
                                criterion = best_config['criterion'],
                                max_depth = best_config['max_depth'],
                                min_samples_split = best_config['min_samples_split'],
                                min_samples_leaf = best_config['min_samples_leaf'],
                                min_weight_fraction_leaf = best_config['min_weight_fraction_leaf'],
                                max_features = best_config['max_features'],
                                max_leaf_nodes = best_config['max_leaf_nodes'],
                                min_impurity_decrease = best_config['min_impurity_decrease'],
                                bootstrap = best_config['bootstrap'],
                                oob_score = best_config['oob_score'],
                                n_jobs = best_config['n_jobs'],
                                random_state = i+best_config_index+digit,
                                )
    
    model.fit(train_X, train_y)

    PREDICTIONS = model.predict(test_X)
    print(get_eqline_IR2(test_stock_returns, Scaler_Y.inverse_transform(PREDICTIONS)))
    ALL_MODEL_PREDICTIONS = np.concatenate((ALL_MODEL_PREDICTIONS, PREDICTIONS))

    print(f'ID_{i} - DONE')

The Random Seed Is: 7
The Best Model Configuration Is: {'n_estimators': 300, 'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 60, 'min_samples_leaf': 40, 'min_weight_fraction_leaf': 0, 'max_features': 60, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0015, 'bootstrap': False, 'oob_score': False, 'n_jobs': -1}
Training IR2: [0.94805931],           Validation IR2: [55.86990629],           Testing IR2: [158.53768724]
158.5376872399038
ID_0 - DONE
The Random Seed Is: 13
The Best Model Configuration Is: {'n_estimators': 300, 'criterion': 'entropy', 'max_depth': 8, 'min_samples_split': 60, 'min_samples_leaf': 20, 'min_weight_fraction_leaf': 0, 'max_features': 60, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0025, 'bootstrap': False, 'oob_score': False, 'n_jobs': -1}
Training IR2: [6.28124631],           Validation IR2: [5.46496481],           Testing IR2: [32.83931231]
32.83931231469978
ID_1 - DONE
The Random Seed Is: 16
The Best Model Configuration Is: {'n_estimators'

Converting ALL_MODEL_PREDICTIONS to a series and export it to pickle:

In [7]:
pd.Series(Scaler_Y.inverse_transform(ALL_MODEL_PREDICTIONS.astype(int)), index=df.index[Training_Bars+Validation_Bars+lag:]).to_pickle('./output/RandomForestModel.pkl')