In [1]:
import os
import pickle
import joblib
import numpy as np
import xgboost
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Input, IntegerLookup, Activation, Embedding, Concatenate, Reshape, CategoryEncoding
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
import keras_tuner as kt

In [2]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Load the data

In [3]:
with open('JPX_Preprocessing/X_train.pickle', 'rb') as f1:
    X_train = pickle.load(f1)
with open('JPX_Preprocessing/X_test.pickle', 'rb') as f2:
    X_test = pickle.load(f2)
with open('JPX_Preprocessing/stocks_train.pickle', 'rb') as f3:
    stocks_train = pickle.load(f3)
with open('JPX_Preprocessing/stocks_test.pickle', 'rb') as f4:
    stocks_test = pickle.load(f4)
with open('JPX_Preprocessing/y_train.pickle', 'rb') as f5:
    y_train = pickle.load(f5)
with open('JPX_Preprocessing/y_test.pickle', 'rb') as f6:
    y_test = pickle.load(f6)

# Build two model: XGB-Reg & NN

## 1) XGBoost - Regressor

The RandomizedSearchCV from sklearn is used to tune the hyperparameter

The created test set is used for cross validation with the class PredefinedSplit from sklearn

In [3]:
X_con = np.concatenate((X_train, X_test), axis=0)  # concat for predefined split
y_con = np.concatenate((y_train, y_test), axis=0)

In [4]:
test_fold = [-1 for _ in range(len(X_train))] + [0 for _ in range(len(X_test))]

-1 will not be used as a validationset; 0 is the first and only validation set

In [5]:
ps = PredefinedSplit(test_fold)

In [6]:
xgb = xgboost.XGBRegressor()

In [7]:
xgb_params = {
 'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
 'max_depth' : [ 3, 4, 5, 6, 8, 10, 12, 15],
 'min_child_weight' : [ 1, 3, 5, 7 ],
 'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 'colsample_bytree' : [ 0.3, 0.4, 0.5 , 0.7 ]
}

In [9]:
xgb_search = RandomizedSearchCV(xgb, xgb_params, cv=ps, verbose=3, scoring='neg_mean_squared_error', n_iter=50)

In [10]:
xgb_search.fit(X_con, y_con)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV] min_child_weight=1, max_depth=3, learning_rate=0.2, gamma=0.0, colsample_bytree=0.7 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  min_child_weight=1, max_depth=3, learning_rate=0.2, gamma=0.0, colsample_bytree=0.7, score=-0.000, total=  19.3s
[CV] min_child_weight=3, max_depth=15, learning_rate=0.15, gamma=0.4, colsample_bytree=0.4 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.3s remaining:    0.0s


[CV]  min_child_weight=3, max_depth=15, learning_rate=0.15, gamma=0.4, colsample_bytree=0.4, score=-0.000, total= 1.4min
[CV] min_child_weight=3, max_depth=5, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s


[CV]  min_child_weight=3, max_depth=5, learning_rate=0.05, gamma=0.3, colsample_bytree=0.5, score=-0.000, total=  26.1s
[CV] min_child_weight=5, max_depth=15, learning_rate=0.05, gamma=0.3, colsample_bytree=0.3 
[CV]  min_child_weight=5, max_depth=15, learning_rate=0.05, gamma=0.3, colsample_bytree=0.3, score=-0.000, total=  52.0s
[CV] min_child_weight=1, max_depth=3, learning_rate=0.05, gamma=0.2, colsample_bytree=0.5 
[CV]  min_child_weight=1, max_depth=3, learning_rate=0.05, gamma=0.2, colsample_bytree=0.5, score=-0.000, total=  18.0s
[CV] min_child_weight=3, max_depth=6, learning_rate=0.3, gamma=0.0, colsample_bytree=0.4 
[CV]  min_child_weight=3, max_depth=6, learning_rate=0.3, gamma=0.0, colsample_bytree=0.4, score=-0.000, total=  35.3s
[CV] min_child_weight=3, max_depth=10, learning_rate=0.2, gamma=0.4, colsample_bytree=0.3 
[CV]  min_child_weight=3, max_depth=10, learning_rate=0.2, gamma=0.4, colsample_bytree=0.3, score=-0.000, total=  50.0s
[CV] min_child_weight=7, max_depth=3

[CV]  min_child_weight=5, max_depth=10, learning_rate=0.25, gamma=0.4, colsample_bytree=0.7, score=-0.000, total=  58.6s
[CV] min_child_weight=1, max_depth=3, learning_rate=0.25, gamma=0.2, colsample_bytree=0.4 
[CV]  min_child_weight=1, max_depth=3, learning_rate=0.25, gamma=0.2, colsample_bytree=0.4, score=-0.000, total=  19.2s
[CV] min_child_weight=1, max_depth=4, learning_rate=0.2, gamma=0.4, colsample_bytree=0.4 
[CV]  min_child_weight=1, max_depth=4, learning_rate=0.2, gamma=0.4, colsample_bytree=0.4, score=-0.000, total=  26.1s
[CV] min_child_weight=3, max_depth=5, learning_rate=0.25, gamma=0.4, colsample_bytree=0.3 
[CV]  min_child_weight=3, max_depth=5, learning_rate=0.25, gamma=0.4, colsample_bytree=0.3, score=-0.000, total=  28.4s
[CV] min_child_weight=3, max_depth=6, learning_rate=0.3, gamma=0.3, colsample_bytree=0.7 
[CV]  min_child_weight=3, max_depth=6, learning_rate=0.3, gamma=0.3, colsample_bytree=0.7, score=-0.000, total=  39.5s
[CV] min_child_weight=5, max_depth=15, 

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 32.6min finished


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_chi...
                                          reg_alpha=None, reg_lambda=None,
                                          scale_pos_weight=None, subsample=None,
                                          tree_method=None,
             

In [11]:
xgb_search.best_params_

{'min_child_weight': 1,
 'max_depth': 4,
 'learning_rate': 0.1,
 'gamma': 0.1,
 'colsample_bytree': 0.5}

In [12]:
xgb_search.best_score_

-0.00044652685755863786

In [13]:
xgb_2 = xgboost.XGBRegressor()

In [17]:
xgb_params_2 = {
 'learning_rate' : [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15],
 'max_depth' : [3, 4, 5],
 'min_child_weight' : [1, 2],
 'gamma': [0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175],
 'colsample_bytree' : [0.45, 0.5, 0.55, 0.6]
}

In [18]:
xgb_search_2 = RandomizedSearchCV(xgb_2, xgb_params_2, cv=ps, verbose=3, scoring='neg_mean_squared_error', n_iter=50)

In [19]:
xgb_search_2.fit(X_con, y_con)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV] min_child_weight=1, max_depth=3, learning_rate=0.12, gamma=0.025, colsample_bytree=0.55 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  min_child_weight=1, max_depth=3, learning_rate=0.12, gamma=0.025, colsample_bytree=0.55, score=-0.000, total=  16.8s
[CV] min_child_weight=1, max_depth=5, learning_rate=0.13, gamma=0.175, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.8s remaining:    0.0s


[CV]  min_child_weight=1, max_depth=5, learning_rate=0.13, gamma=0.175, colsample_bytree=0.5, score=-0.000, total=  28.2s
[CV] min_child_weight=2, max_depth=5, learning_rate=0.1, gamma=0.125, colsample_bytree=0.45 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   45.0s remaining:    0.0s


[CV]  min_child_weight=2, max_depth=5, learning_rate=0.1, gamma=0.125, colsample_bytree=0.45, score=-0.000, total=  29.1s
[CV] min_child_weight=2, max_depth=5, learning_rate=0.11, gamma=0.1, colsample_bytree=0.5 
[CV]  min_child_weight=2, max_depth=5, learning_rate=0.11, gamma=0.1, colsample_bytree=0.5, score=-0.000, total=  28.4s
[CV] min_child_weight=2, max_depth=5, learning_rate=0.12, gamma=0.15, colsample_bytree=0.5 
[CV]  min_child_weight=2, max_depth=5, learning_rate=0.12, gamma=0.15, colsample_bytree=0.5, score=-0.000, total=  29.6s
[CV] min_child_weight=1, max_depth=4, learning_rate=0.07, gamma=0.125, colsample_bytree=0.55 
[CV]  min_child_weight=1, max_depth=4, learning_rate=0.07, gamma=0.125, colsample_bytree=0.55, score=-0.000, total=  23.8s
[CV] min_child_weight=2, max_depth=5, learning_rate=0.07, gamma=0.05, colsample_bytree=0.6 
[CV]  min_child_weight=2, max_depth=5, learning_rate=0.07, gamma=0.05, colsample_bytree=0.6, score=-0.000, total=  29.3s
[CV] min_child_weight=2,

[CV]  min_child_weight=1, max_depth=4, learning_rate=0.14, gamma=0.075, colsample_bytree=0.45, score=-0.000, total=  22.5s
[CV] min_child_weight=1, max_depth=5, learning_rate=0.1, gamma=0.075, colsample_bytree=0.6 
[CV]  min_child_weight=1, max_depth=5, learning_rate=0.1, gamma=0.075, colsample_bytree=0.6, score=-0.000, total=  28.5s
[CV] min_child_weight=1, max_depth=4, learning_rate=0.07, gamma=0.125, colsample_bytree=0.45 
[CV]  min_child_weight=1, max_depth=4, learning_rate=0.07, gamma=0.125, colsample_bytree=0.45, score=-0.000, total=  21.1s
[CV] min_child_weight=2, max_depth=5, learning_rate=0.09, gamma=0.125, colsample_bytree=0.45 
[CV]  min_child_weight=2, max_depth=5, learning_rate=0.09, gamma=0.125, colsample_bytree=0.45, score=-0.000, total=  26.2s
[CV] min_child_weight=2, max_depth=3, learning_rate=0.15, gamma=0.175, colsample_bytree=0.45 
[CV]  min_child_weight=2, max_depth=3, learning_rate=0.15, gamma=0.175, colsample_bytree=0.45, score=-0.000, total=  17.3s
[CV] min_chil

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 19.1min finished


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_chi...
                                          scale_pos_weight=None, subsample=None,
                                          tree_method=None,
                                          validate_parameters=None,
                    

In [20]:
xgb_search_2.best_params_

{'min_child_weight': 1,
 'max_depth': 3,
 'learning_rate': 0.12,
 'gamma': 0.025,
 'colsample_bytree': 0.55}

In [22]:
xgb_search_2.best_score_

-0.00044646894093602896

### Fit the XGB regressor with all data and the best hyperparameters

In [23]:
xgb_best = xgboost.XGBRegressor(**xgb_search_2.best_params_)

In [24]:
xgb_best.fit(X_con, y_con)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.55,
             enable_categorical=False, gamma=0.025, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.12, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=32, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

### Save the trained model

In [26]:
joblib.dump(xgb_best, 'XGB_Regressor.joblib')

['XGB_Regressor.joblib']

## Now to the Neural Network

The stocks will be fed separately through a preprocessing layer (Integer Lookup)

In [27]:
stocks_unique = np.unique(stocks_train)  # all 2000 stocks for adapting
maxlen_stocks = len(stocks_unique) # 2000
stock_lookup_layer = IntegerLookup(max_tokens=maxlen_stocks)
stock_lookup_layer.adapt(stocks_unique)  # fit the layer with the data

In [28]:
def build_model_1(hp):
    stock_inputs = Input(shape=(1,), dtype=tf.uint16)  # Inputs into the model
    price_inputs = Input(shape=(5,), dtype=tf.float32)
    
    stocks = stock_lookup_layer(stock_inputs)  # preprocessinglayer creating onehot outputs
    stocks = Embedding(maxlen_stocks, hp.Int('Emb_Out', 8, 64, 8), input_length=1)(stocks)
    stocks = Reshape((-1, ))(stocks)
    stocks = Dense(units=hp.Int('Stock_Units', 8, 128, 8), activation='relu')(stocks)
    
    drop_X = hp.Float('Drop_Rate_X', 0.1, 0.5, 0.1)
    drop_Con = hp.Float('Drop_Rate_Con', 0.1, 0.5, 0.1)
    add_Con_Layers = hp.Int('Add_Con_Layers', 1, 2, 1)
    
    X = Dense(units=hp.Choice('X_1_Units', [32, 64, 128, 256, 512]))(price_inputs)
    X = BatchNormalization()(X)
    X = Activation('relu')(X)
    X = Dropout(drop_X)(X)
    
    if hp.Boolean('2nd_X_Layer'):
        X = Dense(units=hp.Choice('X_2_Units', [16, 32, 64, 128]))(price_inputs)
        X = BatchNormalization()(X)
        X = Activation('relu')(X)
        X = Dropout(drop_X)(X)
    
    con = Concatenate(axis=1)([stocks, X])
    
    con = Dense(units=hp.Choice('Con_1_Units', [64, 128, 256, 512]))(con)
    con = BatchNormalization()(con)
    con = Activation('relu')(con)
    con = Dropout(drop_Con)(con)
    
    for i in range(add_Con_Layers):
        con = Dense(units=hp.Choice(f'Con_{i+2}_Units', [8, 16, 32, 64, 128, 256]))(con)
        con = BatchNormalization()(con)
        con = Activation('relu')(con)
        con = Dropout(drop_Con)(con)
    
    output = Dense(1)(con)
    
    rmse = RootMeanSquaredError(name='rmse')
    
    model = keras.Model(inputs=[stock_inputs, price_inputs], outputs=[output])
    
    lr = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
    
    model.compile(loss='mse', optimizer=Adam(learning_rate=lr), metrics=[rmse])
    
    return model

In [29]:
tuner_1 = kt.RandomSearch(
    hypermodel=build_model_1,
    objective='val_loss',
    max_trials=120,
    directory='NN_Logs_Tuner_1')

INFO:tensorflow:Reloading Oracle from existing project NN_Logs_Tuner_1/untitled_project/oracle.json
INFO:tensorflow:Reloading Tuner from NN_Logs_Tuner_1/untitled_project/tuner0.json


In [35]:
tuner_1.search((stocks_train, X_train), y_train, epochs=3, batch_size=1024,
                validation_data=((stocks_test, X_test), y_test))

Trial 120 Complete [00h 00m 19s]
val_loss: 0.00044662586878985167

Best val_loss So Far: 0.00044647062895819545
Total elapsed time: 00h 35m 29s
INFO:tensorflow:Oracle triggered exit


In [36]:
tuner_1.results_summary(3)

Results summary
Results in NN_Logs_Tuner_1/untitled_project
Showing 3 best trials
Objective(name='val_loss', direction='min')
Trial summary
Hyperparameters:
Emb_Out: 40
Stock_Units: 56
Drop_Rate_X: 0.2
Drop_Rate_Con: 0.2
Add_Con_Layers: 1
X_1_Units: 64
2nd_X_Layer: False
Con_1_Units: 512
Con_2_Units: 16
lr: 0.0004910533133090028
X_2_Units: 64
Con_3_Units: 8
Score: 0.00044647062895819545
Trial summary
Hyperparameters:
Emb_Out: 56
Stock_Units: 72
Drop_Rate_X: 0.30000000000000004
Drop_Rate_Con: 0.30000000000000004
Add_Con_Layers: 1
X_1_Units: 128
2nd_X_Layer: True
Con_1_Units: 512
Con_2_Units: 16
lr: 0.0021979153094558064
X_2_Units: 64
Con_3_Units: 32
Score: 0.00044650660129263997
Trial summary
Hyperparameters:
Emb_Out: 40
Stock_Units: 32
Drop_Rate_X: 0.30000000000000004
Drop_Rate_Con: 0.30000000000000004
Add_Con_Layers: 1
X_1_Units: 256
2nd_X_Layer: True
Con_1_Units: 512
Con_2_Units: 32
lr: 0.002768554810672086
X_2_Units: 128
Con_3_Units: 8
Score: 0.00044654367957264185


In [37]:
def build_model_2(hp):
    stock_inputs = Input(shape=(1,), dtype=tf.uint16)  # Inputs into the model
    price_inputs = Input(shape=(5,), dtype=tf.float32)
    
    stocks = stock_lookup_layer(stock_inputs)  # preprocessinglayer creating onehot outputs
    stocks = Embedding(maxlen_stocks, 40, input_length=1)(stocks)
    stocks = Reshape((-1, ))(stocks)
    stocks = Dense(units=hp.Int('Stock_Units', 32, 72, 8), activation='relu')(stocks)
    
    X = Dense(units=hp.Choice('X_1_Units', [64, 128, 256]))(price_inputs)
    X = BatchNormalization()(X)
    X = Activation('relu')(X)
    X = Dropout(0.25)(X)
    
    if hp.Boolean('2nd_X_Layer'):
        X = Dense(units=hp.Choice('X_2_Units', [32, 64, 128]))(price_inputs)
        X = BatchNormalization()(X)
        X = Activation('relu')(X)
        X = Dropout(0.25)(X)
    
    con = Concatenate(axis=1)([stocks, X])
    
    con = Dense(units=512)(con)
    con = BatchNormalization()(con)
    con = Activation('relu')(con)
    con = Dropout(0.25)(con)
    
    con = Dense(units=hp.Choice('Con_2_Units', [16, 32]))(con)
    con = BatchNormalization()(con)
    con = Activation('relu')(con)
    con = Dropout(0.25)(con)
    
    output = Dense(1)(con)
    
    rmse = RootMeanSquaredError(name='rmse')
    
    model = keras.Model(inputs=[stock_inputs, price_inputs], outputs=[output])
    
    lr = hp.Float("lr", min_value=0.0005, max_value=0.002, sampling="log")
    
    model.compile(loss='mse', optimizer=Adam(learning_rate=lr), metrics=[rmse])
    
    return model

In [38]:
tuner_2 = kt.RandomSearch(
    hypermodel=build_model_2,
    objective='val_loss',
    max_trials=100,
    directory='NN_Logs_Tuner_2')

In [39]:
tuner_2.search((stocks_train, X_train), y_train, epochs=3, batch_size=1024,
                validation_data=((stocks_test, X_test), y_test))

Trial 100 Complete [00h 00m 16s]
val_loss: 0.00044659030390903354

Best val_loss So Far: 0.00044644103036262095
Total elapsed time: 00h 28m 11s
INFO:tensorflow:Oracle triggered exit


In [40]:
tuner_2.results_summary(3)

Results summary
Results in NN_Logs_Tuner_2/untitled_project
Showing 3 best trials
Objective(name='val_loss', direction='min')
Trial summary
Hyperparameters:
Stock_Units: 56
X_1_Units: 128
2nd_X_Layer: False
Con_2_Units: 32
lr: 0.0019828883793758215
X_2_Units: 128
Score: 0.00044644103036262095
Trial summary
Hyperparameters:
Stock_Units: 56
X_1_Units: 128
2nd_X_Layer: True
Con_2_Units: 32
lr: 0.0011599879176878356
X_2_Units: 32
Score: 0.00044646902824752033
Trial summary
Hyperparameters:
Stock_Units: 64
X_1_Units: 128
2nd_X_Layer: False
Con_2_Units: 32
lr: 0.0011452275069150644
X_2_Units: 32
Score: 0.0004464886733330786


In [43]:
def build_best_model():
    stock_inputs = Input(shape=(1,), dtype=tf.uint16)  # Inputs into the model
    price_inputs = Input(shape=(5,), dtype=tf.float32)
    
    stocks = stock_lookup_layer(stock_inputs)  # preprocessinglayer creating onehot outputs
    stocks = Embedding(maxlen_stocks, 40, input_length=1)(stocks)
    stocks = Reshape((-1, ))(stocks)
    stocks = Dense(units=56, activation='relu')(stocks)
    
    X = Dense(units=128)(price_inputs)
    X = BatchNormalization()(X)
    X = Activation('relu')(X)
    X = Dropout(0.25)(X)
    
    con = Concatenate(axis=1)([stocks, X])
    
    con = Dense(units=512)(con)
    con = BatchNormalization()(con)
    con = Activation('relu')(con)
    con = Dropout(0.25)(con)
    
    con = Dense(units=32)(con)
    con = BatchNormalization()(con)
    con = Activation('relu')(con)
    con = Dropout(0.25)(con)
    
    output = Dense(1)(con)
    
    rmse = RootMeanSquaredError(name='rmse')
    
    model = keras.Model(inputs=[stock_inputs, price_inputs], outputs=[output])
    
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.001), metrics=[rmse])
    
    return model

In [44]:
best_model = build_best_model()

In [48]:
es = EarlyStopping(patience=8, restore_best_weights=True)
mc = ModelCheckpoint(filepath='Best_NN_Weights.tf', save_best_only=True)
tb = TensorBoard(log_dir='TensorBoard_Logs_1')
cb = [es, mc, tb]

In [49]:
best_model.fit((stocks_train, X_train), y_train, epochs=1000, batch_size=1024, callbacks=cb,
                validation_data=((stocks_test, X_test), y_test))

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000


<keras.callbacks.History at 0x7f4860799a90>

In [50]:
%load_ext tensorboard
%tensorboard --logdir=./TensorBoard_Logs_1 --port=6009