# Importing libraries

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
from pickle import load
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

from useful_functions import *  # our own set of functions

# Importing data

In [2]:
# Load the model inputs: feature/target arrays, the return series and the source frame.
Scaled_Data_X = np.load('input/Scaled_Data_X.npy')
Scaled_Data_Y = np.load('input/Scaled_Data_Y.npy')
stock_returns = pd.read_pickle('input/stock_returns.pkl')
df = pd.read_pickle('input/df.pkl')
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# The context manager guarantees the file handle is closed (the bare open() leaked it).
with open('input/lag.pkl', 'rb') as lag_file:
    lag = load(lag_file)

In [3]:
# Load the fitted scaler objects. Context managers close the file handles,
# which the previous `load(open(...))` form left open.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open('data_objects/Scaler_X.pkl', 'rb') as scaler_x_file:
    Scaler_X = load(scaler_x_file)
with open('data_objects/Scaler_Y.pkl', 'rb') as scaler_y_file:
    Scaler_Y = load(scaler_y_file)

In [4]:
def get_label(log_return):
    """Map a log return to a direction label.

    Parameters
    ----------
    log_return : float
        The value to classify.

    Returns
    -------
    str or float
        'up' if the value is above 0.001, 'down' if it is below -0.001,
        'same' if it lies in [-0.001, 0.001] (bounds inclusive), and
        ``np.nan`` when none of the comparisons hold (e.g. NaN input).
    """
    if log_return > 0.001:
        return 'up'
    if log_return < -0.001:
        return 'down'
    if -0.001 <= log_return <= 0.001:
        return 'same'
    # `np.NaN` was removed in NumPy 2.0; the lowercase `np.nan` exists in every version.
    return np.nan

# Walk forward optimization parameters setting

Walk-forward optimization is one of the most popular validation techniques used by financial researchers to support decision making in algorithmic trading. We perform the non-anchored type of walk-forward optimization. The process is as follows:

1. For each set of hyperparameters in the search space, train the model on the training set and calculate the Modified Information Ratio on the training set.
2. Evaluate the trained model on the validation set and calculate the Modified Information Ratio on the validation set.
3. Calculate the absolute difference between the two Modified Information Ratios.
4. Select the set of hyperparameters for which this difference is the lowest and the Modified Information Ratio on the validation set is not 0.
5. Fit a model with the hyperparameters selected in step 4 and use it to predict on the test set (in the code below, this final model is fit on the training and validation sets combined).

In case of our research problem, we set the Training Data to 250 trading days, and the Validation and Testing Data to 75 trading days each.

In [5]:
# Walk-forward window lengths, in trading days: training, validation, testing.
Training_Bars, Validation_Bars, Testing_Bars = 250, 75, 75

In [6]:
# Boundary positions between each training window and its validation window:
# start after the first training window and advance one validation window at a time.
ranges = [
    boundary
    for boundary in range(Training_Bars, len(Scaled_Data_Y) - Testing_Bars, Validation_Bars)
]
len(ranges)

33

# Linear regression

We tune two hyperparameters:
- `alpha` - regularization term used in elastic net regularization, 
- `l1_ratio` - weight of L1 (lasso) in elastic net.

In [7]:
# Hyperparameter search space for the elastic net.
alpha = [np.power(10., exponent) for exponent in range(-5, 6)]  # regularization terms: 1e-5 ... 1e5
l1_ratio = [0, 0.25, 0.5, 0.75, 1]  # L1 regularization weight

# Full grid of (alpha, l1_ratio) combinations; alpha varies slowest.
config = [
    dict(alpha=a, l1_ratio=lr)
    for a in alpha
    for lr in l1_ratio
]

len(config)

55

In [8]:
# Element-wise version of get_label for arrays of predictions.
# Declaring otypes=[object] stops np.vectorize from inferring the output dtype
# from a trial call on the first element, and lets it accept empty inputs
# (without otypes, a size-0 array raises ValueError). Labels come back as
# Python objects, so a NaN label stays a float NaN instead of becoming 'nan'.
v_get_label = np.vectorize(get_label, otypes=[object])

Because linear regression is a model for numerical output, after making predictions we transform them to classes the same way we did when we were calculating values of our label (see `get_label` function in `useful_functions.py`).

In [9]:
RF = np.random.RandomState(seed=4015)  # not passed to the ElasticNet models below

ranges = list(range(Training_Bars, len(Scaled_Data_Y) - Testing_Bars, Validation_Bars))

# Per-window table of every configuration's scores, keyed 'id_<window index>'.
df_all_score = {}

# Out-of-sample predictions of all windows, concatenated in window order.
ALL_MODEL_PREDICTIONS = np.zeros((0,1)).flatten()
for i in range(0, len(ranges)):

    # Non-anchored walk-forward: consecutive train | validation | test windows.
    train_slice = slice(ranges[i] - Training_Bars, ranges[i])
    val_slice = slice(ranges[i], ranges[i] + Validation_Bars)
    test_slice = slice(ranges[i] + Validation_Bars, ranges[i] + Validation_Bars + Testing_Bars)

    train_X, train_y, train_stock_returns = Scaled_Data_X[train_slice], Scaled_Data_Y[train_slice], stock_returns[train_slice]
    val_X, val_Y, val_stock_returns = Scaled_Data_X[val_slice], Scaled_Data_Y[val_slice], stock_returns[val_slice]
    test_X, test_y, test_stock_returns = Scaled_Data_X[test_slice], Scaled_Data_Y[test_slice], stock_returns[test_slice]

    training_IR2 = []
    validation_IR2 = []
    testing_IR2 = []
    config_all = []
    for cfg in config:
        model_cfg = ElasticNet(alpha=cfg['alpha'],
                               l1_ratio=cfg['l1_ratio'])

        model_cfg.fit(train_X, train_y)

        # Numeric predictions are converted to class labels before scoring.
        cfg_IR2_train = get_eqline_IR2(train_stock_returns, v_get_label(model_cfg.predict(train_X)).astype(object))
        cfg_IR2_val = get_eqline_IR2(val_stock_returns, v_get_label(model_cfg.predict(val_X)).astype(object))
        cfg_IR2_test = get_eqline_IR2(test_stock_returns, v_get_label(model_cfg.predict(test_X)).astype(object))

        training_IR2.append(cfg_IR2_train)
        validation_IR2.append(cfg_IR2_val)
        testing_IR2.append(cfg_IR2_test)

        config_all.append(cfg)

    df_finding_the_best_configuration = pd.DataFrame(
        data = {
            'Config': config_all,
            'Training IR2': training_IR2,
            'Validation IR2': validation_IR2,
            'Testing IR2': testing_IR2
        }
    )

    df_all_score[f'id_{i}'] = df_finding_the_best_configuration

    # custom_score = |validation IR2 - training IR2|; configurations whose
    # validation IR2 is exactly 0 are excluded by setting their score to NaN.
    df_finding_the_best_configuration['custom_score'] =  abs(df_finding_the_best_configuration['Validation IR2'] - df_finding_the_best_configuration['Training IR2'])
    df_finding_the_best_configuration['custom_score'] = np.where(df_finding_the_best_configuration['Validation IR2']==0, np.nan, df_finding_the_best_configuration['custom_score'])

    # Explicit check instead of a bare `except:` so that only the intended
    # "no usable score" case triggers the fallback; any other error surfaces.
    custom_scores = df_finding_the_best_configuration['custom_score']
    if custom_scores.notna().any():
        best_config = df_finding_the_best_configuration.loc[custom_scores.idxmin()]['Config']
    else:
        # Every custom_score is NaN: fall back to the highest training IR2.
        best_config = df_finding_the_best_configuration.loc[df_finding_the_best_configuration['Training IR2'].idxmax()]['Config']

    print(f'The Best Model Configuration Is: {best_config}')
    print(f"Training IR2: {df_all_score[f'id_{i}'][df_all_score[f'id_{i}']['Config']==best_config]['Training IR2'].values}, \
          Validation IR2: {df_all_score[f'id_{i}'][df_all_score[f'id_{i}']['Config']==best_config]['Validation IR2'].values}, \
          Testing IR2: {df_all_score[f'id_{i}'][df_all_score[f'id_{i}']['Config']==best_config]['Testing IR2'].values}")

    # Fit the model on the combined data Train and Val
    combined_train_val_X = np.concatenate([train_X, val_X])
    combined_train_val_Y = np.concatenate([train_y, val_Y])

    model = ElasticNet(alpha=best_config['alpha'],
                       l1_ratio=best_config['l1_ratio'])

    model.fit(combined_train_val_X, combined_train_val_Y)

    PREDICTIONS = model.predict(test_X)
    ALL_MODEL_PREDICTIONS = np.concatenate((ALL_MODEL_PREDICTIONS, PREDICTIONS))

    print(f'ID_{i} - DONE')

The Best Model Configuration Is: {'alpha': 1e-05, 'l1_ratio': 0.25}
Training IR2: [0.04948973],           Validation IR2: [0.77190656],           Testing IR2: [277.16034084]
ID_0 - DONE
The Best Model Configuration Is: {'alpha': 0.01, 'l1_ratio': 0.5}
Training IR2: [2.98637954],           Validation IR2: [74.13220131],           Testing IR2: [15.89342985]
ID_1 - DONE
The Best Model Configuration Is: {'alpha': 0.01, 'l1_ratio': 0}
Training IR2: [7.20774328],           Validation IR2: [7.17708415],           Testing IR2: [4.14902269]
ID_2 - DONE
The Best Model Configuration Is: {'alpha': 0.001, 'l1_ratio': 1}
Training IR2: [6.59701424],           Validation IR2: [6.70309997],           Testing IR2: [0.24354051]
ID_3 - DONE
The Best Model Configuration Is: {'alpha': 0.001, 'l1_ratio': 0}
Training IR2: [10.29090622],           Validation IR2: [0.24354051],           Testing IR2: [0]
ID_4 - DONE
The Best Model Configuration Is: {'alpha': 0.0001, 'l1_ratio': 0.5}
Training IR2: [9.00036858], 

In [10]:
# Take one window's score table and expand its 'Config' dicts into columns.
# NOTE(review): the variables are named `id0*` but the key read is 'id_1' - confirm which window is intended.
id0 = df_all_score['id_1']
id0_condig_df = pd.json_normalize(id0['Config'])  # one column per config key
id0_df = pd.concat([id0, id0_condig_df], axis='columns').drop(columns=['Config'])

In [11]:
# Order configurations from the smallest train/validation gap to the largest.
sorted_id0_df = id0_df.sort_values('custom_score', ascending=True)

We save our predictions for later analysis.

In [12]:
# Label the concatenated numeric predictions and index them by the rows of `df`
# that start after the first training and validation windows, shifted by `lag`.
pred_pos_linear_regression = pd.Series(
    data=v_get_label(ALL_MODEL_PREDICTIONS).astype(object),
    index=df[
        Training_Bars + Validation_Bars + lag:
        Training_Bars + Validation_Bars + lag + len(ALL_MODEL_PREDICTIONS)
    ].index,
)

In [13]:
# Persist the linear-regression position labels.
pd.to_pickle(pred_pos_linear_regression, 'output/lin_reg.pkl')

## Logit regression

We tune three hyperparameters:
- `C` - inverse of regularization term used in elastic net regularization, 
- `l1_ratio` - weight of L1 (lasso) in elastic net,
- `multi_class` - option to fit the logit model, for `ovr` one-versus-rest approach is applied, for `multinomial` the loss minimised is the multinomial loss fit across the entire probability distribution.

In [14]:
# Hyperparameter search space for the logistic regression.
alpha = [np.power(10., exponent) for exponent in range(-5, 6)]  # regularization terms: 1e-5 ... 1e5
l1_ratio = [0, 0.25, 0.5, 0.75, 1]  # L1 regularization weight
penalty = ['elasticnet']
solver = ['saga']
multi_class = ['multinomial', 'ovr']

# Full grid over all five lists; alpha varies slowest, multi_class fastest.
config = [
    dict(alpha=a, l1_ratio=lr, penalty=pen, solver=solv, multi_class=mtc)
    for a in alpha
    for lr in l1_ratio
    for pen in penalty
    for solv in solver
    for mtc in multi_class
]

len(config)

110

In [15]:
RF = np.random.RandomState(seed=4015)  # shared random state passed to every LogisticRegression below

ranges = list(range(Training_Bars, len(Scaled_Data_Y) - Testing_Bars, Validation_Bars))

# Per-window table of every configuration's scores, keyed 'id_<window index>'.
df_all_score = {}

# Out-of-sample predictions of all windows, concatenated in window order.
ALL_MODEL_PREDICTIONS = np.zeros((0,1)).flatten()
for i in range(0, len(ranges)):

    # Non-anchored walk-forward: consecutive train | validation | test windows.
    train_slice = slice(ranges[i] - Training_Bars, ranges[i])
    val_slice = slice(ranges[i], ranges[i] + Validation_Bars)
    test_slice = slice(ranges[i] + Validation_Bars, ranges[i] + Validation_Bars + Testing_Bars)

    train_X, train_y, train_stock_returns = Scaled_Data_X[train_slice], Scaled_Data_Y[train_slice], stock_returns[train_slice]
    val_X, val_Y, val_stock_returns = Scaled_Data_X[val_slice], Scaled_Data_Y[val_slice], stock_returns[val_slice]
    test_X, test_y, test_stock_returns = Scaled_Data_X[test_slice], Scaled_Data_Y[test_slice], stock_returns[test_slice]

    training_IR2 = []
    validation_IR2 = []
    testing_IR2 = []
    config_all = []
    for cfg in config:
        # The grid stores the regularization term `alpha`; scikit-learn expects its inverse, C.
        model_cfg = LogisticRegression(penalty=cfg['penalty'],
                                       C=1/cfg['alpha'],
                                       l1_ratio=cfg['l1_ratio'],
                                       solver=cfg['solver'],
                                       multi_class=cfg['multi_class'],
                                       random_state=RF)

        model_cfg.fit(train_X, train_y)

        # Predicted classes are passed through Scaler_Y.inverse_transform before scoring.
        cfg_IR2_train = get_eqline_IR2(train_stock_returns, Scaler_Y.inverse_transform(model_cfg.predict(train_X)))
        cfg_IR2_val = get_eqline_IR2(val_stock_returns, Scaler_Y.inverse_transform(model_cfg.predict(val_X)))
        cfg_IR2_test = get_eqline_IR2(test_stock_returns, Scaler_Y.inverse_transform(model_cfg.predict(test_X)))

        training_IR2.append(cfg_IR2_train)
        validation_IR2.append(cfg_IR2_val)
        testing_IR2.append(cfg_IR2_test)

        config_all.append(cfg)

    df_finding_the_best_configuration = pd.DataFrame(
        data = {
            'Config': config_all,
            'Training IR2': training_IR2,
            'Validation IR2': validation_IR2,
            'Testing IR2': testing_IR2
        }
    )

    df_all_score[f'id_{i}'] = df_finding_the_best_configuration

    # custom_score = |validation IR2 - training IR2|; configurations whose
    # validation IR2 is exactly 0 are excluded by setting their score to NaN.
    df_finding_the_best_configuration['custom_score'] =  abs(df_finding_the_best_configuration['Validation IR2'] - df_finding_the_best_configuration['Training IR2'])
    df_finding_the_best_configuration['custom_score'] = np.where(df_finding_the_best_configuration['Validation IR2']==0, np.nan, df_finding_the_best_configuration['custom_score'])

    # Explicit check instead of a bare `except:` so that only the intended
    # "no usable score" case triggers the fallback; any other error surfaces.
    custom_scores = df_finding_the_best_configuration['custom_score']
    if custom_scores.notna().any():
        best_config = df_finding_the_best_configuration.loc[custom_scores.idxmin()]['Config']
    else:
        # Every custom_score is NaN: fall back to the highest training IR2.
        best_config = df_finding_the_best_configuration.loc[df_finding_the_best_configuration['Training IR2'].idxmax()]['Config']

    print(f'The Best Model Configuration Is: {best_config}')
    print(f"Training IR2: {df_all_score[f'id_{i}'][df_all_score[f'id_{i}']['Config']==best_config]['Training IR2'].values}, \
          Validation IR2: {df_all_score[f'id_{i}'][df_all_score[f'id_{i}']['Config']==best_config]['Validation IR2'].values}, \
          Testing IR2: {df_all_score[f'id_{i}'][df_all_score[f'id_{i}']['Config']==best_config]['Testing IR2'].values}")

    # Fit the model on the combined data Train and Val
    combined_train_val_X = np.concatenate([train_X, val_X])
    combined_train_val_Y = np.concatenate([train_y, val_Y])

    model = LogisticRegression(penalty=best_config['penalty'],
                               C=1/best_config['alpha'],
                               l1_ratio=best_config['l1_ratio'],
                               solver=best_config['solver'],
                               multi_class=best_config['multi_class'],
                               random_state=RF)

    model.fit(combined_train_val_X, combined_train_val_Y)

    PREDICTIONS = model.predict(test_X)
    ALL_MODEL_PREDICTIONS = np.concatenate((ALL_MODEL_PREDICTIONS, PREDICTIONS))

    print(f'ID_{i} - DONE')

The Best Model Configuration Is: {'alpha': 10000.0, 'l1_ratio': 0, 'penalty': 'elasticnet', 'solver': 'saga', 'multi_class': 'multinomial'}
Training IR2: [0.00530282],           Validation IR2: [1.76404714],           Testing IR2: [100.24578774]
ID_0 - DONE
The Best Model Configuration Is: {'alpha': 10.0, 'l1_ratio': 1, 'penalty': 'elasticnet', 'solver': 'saga', 'multi_class': 'multinomial'}
Training IR2: [10.92011257],           Validation IR2: [68.01075106],           Testing IR2: [1.06148676]
ID_1 - DONE
The Best Model Configuration Is: {'alpha': 1.0, 'l1_ratio': 0.5, 'penalty': 'elasticnet', 'solver': 'saga', 'multi_class': 'multinomial'}
Training IR2: [36.8943877],           Validation IR2: [36.98389035],           Testing IR2: [11.36442378]
ID_2 - DONE
The Best Model Configuration Is: {'alpha': 100.0, 'l1_ratio': 0.25, 'penalty': 'elasticnet', 'solver': 'saga', 'multi_class': 'multinomial'}
Training IR2: [6.27997788],           Validation IR2: [6.68375054],           Testing IR2:

We save our data for later use.

In [16]:
# Convert the concatenated class predictions back through Scaler_Y and index them by
# the rows of `df` that start after the first training and validation windows, shifted by `lag`.
pred_pos_logit_regression = pd.Series(
    data=Scaler_Y.inverse_transform(ALL_MODEL_PREDICTIONS.astype(int)),
    index=df[
        Training_Bars + Validation_Bars + lag:
        Training_Bars + Validation_Bars + lag + len(ALL_MODEL_PREDICTIONS)
    ].index,
)

In [17]:
# Persist the logit-regression position labels.
pd.to_pickle(pred_pos_logit_regression, 'output/log_reg.pkl')