In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

from numpy import mean
from numpy import std
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
# link to drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def initialise(data):
  data.index = data["date"]
  data.index = pd.to_datetime(data.index)
  data.drop(columns=["date"], inplace=True)
  return data

data_level = initialise(pd.read_csv('/content/drive/MyDrive/Msc Thesis/code/data/Low resolution/data_level_mapped_selection10.csv'))
data_slope = initialise(pd.read_csv('/content/drive/MyDrive/Msc Thesis/code/data/Low resolution/data_slope_mapped_selection10.csv'))
data_curve = initialise(pd.read_csv('/content/drive/MyDrive/Msc Thesis/code/data/Low resolution/data_curve_mapped_selection10.csv'))

data_level.head()

Unnamed: 0_level_0,secid,-80,-50,-20,level,slope,curve,permno,beta,betasq,...,cash,cinvest,roaq,roavol,ms,baspread,maxret,retvol,std_dolvol,std_turn
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996-01-31,100862,0.307688,0.315627,0.358095,0.315627,0.042468,0.017264,54594,-0.502126,-0.886445,...,-0.868511,0.071798,0.396279,-0.953113,0.0,-0.689049,-0.662673,-0.619551,0.327234,-0.936622
1996-01-31,100871,0.705411,0.615432,0.65116,0.615432,0.035728,0.062854,50906,0.551365,0.193703,...,-0.738289,0.080067,0.538981,-0.900059,0.75,-0.112575,-0.39013,-0.223204,-0.342296,-0.748948
1996-01-31,100892,0.27009,0.216768,0.216508,0.216768,-0.00026,0.026531,57904,-0.53673,-0.902575,...,-0.958006,0.066337,0.367446,-0.998951,0.25,-0.786859,-0.947305,-0.877289,-0.536138,-0.955821
1996-01-31,100896,0.430796,0.436291,0.429517,0.436291,-0.006774,-0.006135,77520,0.349363,-0.1018,...,-0.968644,0.060739,0.459243,-0.859083,-0.25,-0.747736,-0.649524,-0.703492,-0.331142,-0.818216
1996-01-31,100903,0.262405,0.252683,0.373839,0.252683,0.121156,0.065439,80303,-0.186662,-0.682642,...,-0.584473,0.077826,0.605842,-0.875346,-0.5,-0.807682,-0.825541,-0.737079,-0.400503,-0.866238


In [4]:
def samplesplitting(data,train,validate,oos,i):
    train_data    = data[(data.index.year >= 1996 )         & (data.index.year < i - validate)]
    validate_data = data[(data.index.year >= i - validate)  & (data.index.year < i)]
    total_train   = data[(data.index.year >= 1996 )         & (data.index.year < i)]
    oos_data      = data[(data.index.year == i)]
    pred_data     = data[(data.index.year >= 1996 + train + validate) & (data.index.year <= i)]

    print('train=%d, validate=%d, oos=%d' % (len(train_data), len(validate_data), len(oos_data)))
    return train_data, validate_data, oos_data, total_train, pred_data

def split_xy(data, kind):
  data_x = data.iloc[:,8:]
  data_y = data[kind]
  return data_x, data_y

# Level old

In [None]:
# Complete number of observations is 982285
# We take the initial train, validate and oos sample to be 8, 5 and 13 years respectively (Ratios similar to Gu Kelly Xiu)
# On average one month contains 3000 options, therefore we loop over 12 * 3000 options, which is similar to looping over a year
trainsize = 8 * 3000 * 12
validatesize = 5 * 3000 * 12
oossize = 3000 * 12

# Initialise predictionstring
IV = 'level'
oos_pred_level = pd.DataFrame()

# Hyperparameters
max_depth = [4, 6]
n_estimators = [200, 500]
eta = [0.1]
# subsample = [0.7]
# colsample_bytree = [0.8]
# reg_alpha = 0 #0.1
# reg_lambda = 1

for window in range (0,13):
  print('Iteration: ', window)
  best = float("inf")

  # Optimisation method 1 expanding window

  # Sample splitting
  train, validate, oos, total_train, total_oos = samplesplitting(data_level, trainsize, validatesize, oossize, window)
  train_x, train_y              = split_xy(train,IV)
  validate_x, validate_y        = split_xy(validate,IV)
  oos_x, oos_y                  = split_xy(oos,IV)
  total_train_x, total_train_y  = split_xy(total_train,IV)

  # Tuning
  for i in range(len(max_depth)):
    for j in range(len(n_estimators)):
      for k in range(len(eta)):
        model = XGBRegressor(max_depth = max_depth[i], n_estimators=n_estimators[j], eta=eta[k])
        model.fit(train_x, train_y)
        pred_val= model.predict(validate_x)
        error = np.sqrt(((pred_val - validate_y) ** 2).mean())
        if error < best:
          best = error
          max_depth_final = max_depth[i]
          n_estimators_final = n_estimators[j]
          eta_final = eta[k]

          y_hat = pd.DataFrame(model.predict(oos_x))

  # Fit model
  print('Max Depth: ', max_depth_final)
  print('Nr of trees: ', n_estimators_final)
  print('Learning rate: ', eta_final)
  # model_final = XGBRegressor(max_depth = max_depth_final,
  #                            n_estimators = n_estimators_final,
  #                            eta = eta_final)
  # model_final.fit(total_train_x, total_train_y)
  # y_hat = pd.DataFrame(model_final.predict(oos_x))
  print('Yearly:',r2_score(oos_y, y_hat))
  oos_pred_level = pd.concat([oos_pred_level, y_hat], ignore_index=True)
  print('Total:',r2_score(total_oos[IV], oos_pred_level))

oos_pred_level.to_csv('/content/drive/MyDrive/Msc Thesis/code/Results/XGBoosting/level_predictions_xgb.csv', index=True)


Iteration:  0
train=288000, validate=180000, oos=36000
Max Depth:  6
Nr of trees:  500
Learning rate:  0.1
Yearly: 0.45448851551540614
Total: 0.45448851551540614
Iteration:  1
train=324000, validate=180000, oos=36000
Max Depth:  4
Nr of trees:  200
Learning rate:  0.1
Yearly: 0.47187150033721803
Total: 0.46304039071576153
Iteration:  2
train=360000, validate=180000, oos=36000
Max Depth:  6
Nr of trees:  500
Learning rate:  0.1
Yearly: 0.31483574693169425
Total: 0.4224497651822653
Iteration:  3
train=396000, validate=180000, oos=36000
Max Depth:  6
Nr of trees:  200
Learning rate:  0.1
Yearly: 0.4826565564755215
Total: 0.44018061377868245
Iteration:  4
train=432000, validate=180000, oos=36000
Max Depth:  4
Nr of trees:  200
Learning rate:  0.1
Yearly: 0.2561241094930582
Total: 0.4076957406586599
Iteration:  5
train=468000, validate=180000, oos=36000
Max Depth:  6
Nr of trees:  200
Learning rate:  0.1
Yearly: 0.5016001924670601
Total: 0.4234775238951639
Iteration:  6
train=504000, valida

In [None]:
oos_pred_level.to_csv('/content/drive/MyDrive/Msc Thesis/code/Results/XGBoosting/level_predictions_xgb.csv', index=True)

# Level New

In [None]:
# Complete number of observations is 982285
# We take the initial train, validate and oos sample to be 8, 5 and 13 years respectively (Ratios similar to Gu Kelly Xiu)
# On average one month contains 3000 options, therefore we loop over 12 * 3000 options, which is similar to looping over a year
# trainsize = 8 * 3000 * 12
# validatesize = 5 * 3000 * 12
# oossize = 3000 * 12
trainsize = 11
validatesize = 5
oossize = 1

# Initialise predictionstring
IV = 'level'
oos_pred_level = pd.DataFrame()

# # Hyperparameters
# param_grid = {
#     'max_depth': [11, 13] ,# [9, 11], #[7, 9],
#     'n_estimators': [600], # [400, 600], #[200, 500],
#     'learning_rate': [0.01], #[0.01, 0.001] # was (0.1, 0.01)
      # 'n_jobs': [-1]
# }

window = 2021
train, validate, oos, total_train, total_oos = samplesplitting(data_level, trainsize, validatesize, oossize, window)
total_train_x, total_train_y  = split_xy(total_train,IV)

# model = XGBRegressor()
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2,  verbose=10)
# grid_search.fit(total_train_x, total_train_y)
# best_params = grid_search.best_params_
best_params = {'learning_rate': 0.01, 'max_depth': 11, 'n_estimators': 700} #{'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500}
print("Best Hyperparameters:", best_params)
best_model = XGBRegressor(**best_params)

feature_importance = np.zeros(total_train_x.shape[1])

for window in range(2012, 2021+1):
  print('Iteration: ', window)
  best = float("inf")

  # Sample splitting
  train, validate, oos, total_train, total_oos = samplesplitting(data_level, trainsize, validatesize, oossize, window)
  train_x, train_y              = split_xy(train,IV)
  validate_x, validate_y        = split_xy(validate,IV)
  oos_x, oos_y                  = split_xy(oos,IV)
  total_train_x, total_train_y  = split_xy(total_train,IV)

  best_model.fit(total_train_x, total_train_y)
  y_hat = pd.DataFrame(best_model.predict(oos_x))

  # Obtain feature importance
  feature_importance += best_model.feature_importances_

  print('Yearly:',r2_score(oos_y, y_hat))
  oos_pred_level = pd.concat([oos_pred_level, y_hat], ignore_index=True)
  print('Total:',r2_score(total_oos[IV], oos_pred_level))

feature_importance_mean = feature_importance / 10
np.savetxt('/content/drive/MyDrive/Msc Thesis/code/Results/XGBoosting/feature_importance_mean_level_yearly.csv', feature_importance_mean, delimiter=',')
oos_pred_level.to_csv('/content/drive/MyDrive/Msc Thesis/code/Results/XGBoosting/level_predictions_xgb_yearly.csv', index=True)

train=504261, validate=148707, oos=27899
Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 11, 'n_estimators': 700}
Iteration:  2012
train=210839, validate=158189, oos=33864
Yearly: 0.3627130588183748
Total: 0.3627130588183748
Iteration:  2013
train=241135, validate=161757, oos=34182
Yearly: 0.4282467236085986
Total: 0.40204854512250676
Iteration:  2014
train=272284, validate=164790, oos=34113
Yearly: 0.44183377672856483
Total: 0.4167397064215309
Iteration:  2015
train=303150, validate=168037, oos=33074
Yearly: 0.4263864764075437
Total: 0.41965766449878716
Iteration:  2016
train=335237, validate=169024, oos=31906
Yearly: 0.3855759281768646
Total: 0.413355960730607
Iteration:  2017
train=369028, validate=167139, oos=30972
Yearly: 0.42138351973931887
Total: 0.41745325724023774
Iteration:  2018
train=402892, validate=164247, oos=30002
Yearly: 0.4946783964209196
Total: 0.4278037029040015
Iteration:  2019
train=437074, validate=160067, oos=28538
Yearly: 0.5707166661319294
Total: 0.

# Slope

In [5]:
trainsize = 11
validatesize = 5
oossize = 1

# Initialise predictionstring
IV = 'slope'
oos_pred_slope = pd.DataFrame()

# # Hyperparameters
# param_grid = {
#     'max_depth': [3],#[3,5,7],#[7, 11] ,
#     'n_estimators': [700],#[400, 600],
#     'learning_rate': [0.01],
      # 'n_jobs': [-1]
# }

window = 2021
train, validate, oos, total_train, total_oos = samplesplitting(data_slope, trainsize, validatesize, oossize, window)
total_train_x, total_train_y  = split_xy(total_train,IV)

# model = XGBRegressor()
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2,  verbose=10)
# grid_search.fit(total_train_x, total_train_y)
# best_params = grid_search.best_params_
best_params = {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 700} #{'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500, 'n_jobs': -1}
print("Best Hyperparameters:", best_params)
best_model = XGBRegressor(**best_params)

feature_importance = np.zeros(total_train_x.shape[1])

for window in range(2012, 2021+1):
  print('Iteration: ', window)
  best = float("inf")

  # Sample splitting
  train, validate, oos, total_train, total_oos = samplesplitting(data_slope, trainsize, validatesize, oossize, window)
  train_x, train_y              = split_xy(train,IV)
  validate_x, validate_y        = split_xy(validate,IV)
  oos_x, oos_y                  = split_xy(oos,IV)
  total_train_x, total_train_y  = split_xy(total_train,IV)

  best_model.fit(total_train_x, total_train_y)
  y_hat = pd.DataFrame(best_model.predict(oos_x))

  # Obtain feature importance
  feature_importance += best_model.feature_importances_

  print('Yearly:',r2_score(oos_y, y_hat))
  oos_pred_slope = pd.concat([oos_pred_slope, y_hat], ignore_index=True)
  print('Total:',r2_score(total_oos[IV], oos_pred_slope))

feature_importance_mean = feature_importance / 10
np.savetxt('/content/drive/MyDrive/Msc Thesis/code/Results/XGBoosting/feature_importance_mean_slope_yearly.csv', feature_importance_mean, delimiter=',')
oos_pred_slope.to_csv('/content/drive/MyDrive/Msc Thesis/code/Results/XGBoosting/slope_predictions_xgb_yearly.csv', index=True)

train=504261, validate=148707, oos=27899
Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 700}
Iteration:  2012
train=210839, validate=158189, oos=33864
Yearly: 0.015355886662705931
Total: 0.015355886662705931
Iteration:  2013
train=241135, validate=161757, oos=34182
Yearly: 0.0184076803595129
Total: 0.020900566089374295
Iteration:  2014
train=272284, validate=164790, oos=34113
Yearly: 0.004076872452754343
Total: 0.022126578447140943
Iteration:  2015
train=303150, validate=168037, oos=33074
Yearly: 0.03352052558502916
Total: 0.029962517412287504
Iteration:  2016
train=335237, validate=169024, oos=31906
Yearly: 0.028498141860209825
Total: 0.030637592673746994
Iteration:  2017
train=369028, validate=167139, oos=30972
Yearly: 0.028327150353237518
Total: 0.031094321867793595
Iteration:  2018
train=402892, validate=164247, oos=30002
Yearly: 0.03123784259434692
Total: 0.031209560886287013
Iteration:  2019
train=437074, validate=160067, oos=28538
Yearly: 0.0433076

In [None]:
oos_pred_slope = pd.read_csv('/content/drive/MyDrive/Msc Thesis/code/Results/XGBoosting/slope_predictions_xgb3.csv')
oos_pred_slope.head()

# **Curvature**

In [6]:
trainsize = 11
validatesize = 5
oossize = 1

# Initialise predictionstring
IV = 'curve'
oos_pred_curve = pd.DataFrame()

# Hyperparameters
# param_grid = {
#     'max_depth': [5], #[3,5,7],#
#     'n_estimators': [300, 500],# [500]
#     'learning_rate': [0.01, 0.1], #[0.01],
#     'n_jobs': [-1]
# }

window = 2021
train, validate, oos, total_train, total_oos = samplesplitting(data_curve, trainsize, validatesize, oossize, window)
total_train_x, total_train_y  = split_xy(total_train,IV)

# model = XGBRegressor()
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2,  verbose=10)
# grid_search.fit(total_train_x, total_train_y)
# best_params = grid_search.best_params_
best_params = {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500, 'n_jobs': -1}
print("Best Hyperparameters:", best_params)
best_model = XGBRegressor(**best_params)

feature_importance = np.zeros(total_train_x.shape[1])

for window in range(2012, 2021+1):
  print('Iteration: ', window)
  best = float("inf")

  # Sample splitting
  train, validate, oos, total_train, total_oos = samplesplitting(data_curve, trainsize, validatesize, oossize, window)
  train_x, train_y              = split_xy(train,IV)
  validate_x, validate_y        = split_xy(validate,IV)
  oos_x, oos_y                  = split_xy(oos,IV)
  total_train_x, total_train_y  = split_xy(total_train,IV)

  best_model.fit(total_train_x, total_train_y)
  y_hat = pd.DataFrame(best_model.predict(oos_x))

  # Obtain feature importance
  feature_importance += best_model.feature_importances_

  print('Yearly:',r2_score(oos_y, y_hat))
  oos_pred_curve = pd.concat([oos_pred_curve, y_hat], ignore_index=True)
  print('Total:',r2_score(total_oos[IV], oos_pred_curve))

feature_importance_mean = feature_importance / 10
np.savetxt('/content/drive/MyDrive/Msc Thesis/code/Results/XGBoosting/feature_importance_mean_curve_yearly.csv', feature_importance_mean, delimiter=',')
oos_pred_curve.to_csv('/content/drive/MyDrive/Msc Thesis/code/Results/XGBoosting/curve_predictions_xgb_yearly.csv', index=True)

train=504261, validate=148707, oos=27899
Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500, 'n_jobs': -1}
Iteration:  2012
train=210839, validate=158189, oos=33864
Yearly: 0.007089884374277378
Total: 0.007089884374277378
Iteration:  2013
train=241135, validate=161757, oos=34182
Yearly: 0.014025113209116702
Total: 0.012948931821928134
Iteration:  2014
train=272284, validate=164790, oos=34113
Yearly: -0.004883966210775137
Total: 0.01364079971403065
Iteration:  2015
train=303150, validate=168037, oos=33074
Yearly: 0.017577853772373087
Total: 0.018803884125296655
Iteration:  2016
train=335237, validate=169024, oos=31906
Yearly: -0.04291795289815381
Total: 0.014532207448196055
Iteration:  2017
train=369028, validate=167139, oos=30972
Yearly: -0.0018721598262165795
Total: 0.012222619890702502
Iteration:  2018
train=402892, validate=164247, oos=30002
Yearly: 0.012050310164516742
Total: 0.012570054181704537
Iteration:  2019
train=437074, validate=160067, oos=285