In [1]:
import numpy as np
np.random.seed(1)
import pandas as pd
import scipy

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

from functools import partial

import matplotlib.pyplot as plt
import seaborn as sns

from xgb_utils import *

In [2]:
data = pd.read_csv('clean_monthly_data.csv')
data

Unnamed: 0,Date,unemployment_rate,CPI,treasury_yield,GDP_growth,SP500_return,AZN,BMY,JNJ,LLY,MRK,NVO,NVS,PFE,ROG,inflation_change,unemp_change,treasury_yield_change
0,2000-02-01,4.1,170.000,6.661000,0.496560,-1.522563,-12.828964,-13.228004,-16.339821,-11.121498,-21.701151,2.220031,3.838386,-11.226228,54.440789,1.000,-0.1,-0.141500
1,2000-03-01,4.0,171.000,6.519500,0.511258,9.413333,22.264136,-0.218329,-2.079067,5.804243,0.913712,8.390897,6.420237,14.101954,6.922258,1.000,-0.1,-0.141500
2,2000-04-01,3.8,170.900,6.256522,1.327803,-3.266805,5.567379,-8.205683,17.437698,23.153694,12.400712,-0.097663,2.559423,15.213674,7.370518,-0.100,-0.2,-0.262978
3,2000-05-01,4.0,171.200,5.990526,-0.181797,-1.572223,-0.148357,5.395746,8.484832,-1.296597,7.374072,20.863985,5.169310,5.638019,-8.163265,0.300,0.2,-0.265995
4,2000-06-01,4.0,172.200,6.440455,0.305565,1.728613,10.549735,5.788826,14.239888,31.641749,3.078671,2.813690,8.474599,8.076012,13.131313,1.000,0.0,0.449928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,2023-01-01,3.4,300.536,3.616190,0.390254,6.776820,-3.584079,0.972908,-7.489384,-5.928822,-2.549213,2.541749,-0.110227,-13.817335,16.968326,1.546,-0.1,-0.274810
276,2023-02-01,3.6,301.648,3.531500,-0.442183,-2.514271,-0.290649,-4.328217,-6.217115,-9.568502,-1.089288,1.592445,-7.172811,-7.286115,5.451681,1.112,0.2,-0.084690
277,2023-03-01,3.5,301.808,3.746842,-0.442183,3.313488,8.035329,0.507544,1.862736,10.703390,0.141189,12.873250,9.367574,0.566924,11.025813,0.160,-0.1,0.215342
278,2023-04-01,3.4,302.918,3.663043,-0.442183,1.985238,5.489119,-3.664707,5.612908,15.269915,9.289214,5.836894,16.334413,-4.681371,-1.517467,1.110,-0.1,-0.083799


In [3]:
macros = ['unemployment_rate', 'GDP_growth', 'SP500_return', 'inflation_change', 'unemp_change', 'treasury_yield_change']

for m in macros:
    data[f'{m}_lag'] = data[m].shift(1)
    data[f'{m}_lag2'] = data[m].shift(2)
    data[f'{m}_lag3'] = data[m].shift(3)

In [4]:
data['Quater_dummy'] = 0
data['Quater_rippel'] = 0
data['Date'] = pd.to_datetime(data['Date'])
for index, row in data.iterrows():
    if row['Date'].month in [3, 6, 9, 12]:
        data.loc[index, 'Quater_dummy'] = 1
    if row['Date'].month in [1, 4, 7, 10]:
        data.loc[index, 'Quater_dummy'] = 1


In [5]:
tickers = ['NVS', 'AZN', 'BMY', 'JNJ', 'LLY', 'MRK', 'NVO', 'PFE', 'ROG']

for t in tickers:
    data[f'{t}_lag'] = data[t].shift(1)
    data[f'{t}_lag2'] = data[t].shift(2)

#### 1. Macros Only

In [6]:
data_macro = data.copy()
to_exclude = []

for t in tickers:
    to_exclude.append(t)
    to_exclude.append(f'{t}_lag')
    to_exclude.append(f'{t}_lag2')

for m in macros:
    to_exclude.append(m)


to_exclude.append('Date')

1 month

In [7]:
dates1m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01']

params = {'n_estimators': [100, 150, 200],
          'max_depth': [3, 5, 7],
          'learning_rate': [0.01, 0.1]}

In [8]:
nvs_loss = get_model_performance(data_macro, 'NVS', 0.05, to_exclude, dates1m, 1)
nvs_loss

0.5217945068580359

In [9]:
nvs_loss

0.5217945068580359

In [10]:
for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates1m, 1)
    print(f'{t}-1m-loss: {loss} \n')

NVS-1m-loss: 0.521789679820792 

AZN-1m-loss: 0.7566584695173182 

BMY-1m-loss: 0.5064353070286455 

JNJ-1m-loss: 0.5991419985175034 

LLY-1m-loss: 0.7669115458664167 

MRK-1m-loss: 0.6250140156744843 

NVO-1m-loss: 0.8333365649373826 

PFE-1m-loss: 1.2386725598427137 

ROG-1m-loss: 4.1156308458245805 



In [11]:
dates3m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates3m, 3)
    print(f'{t}-3m-loss: {loss} \n')

NVS-3m-loss: 1.5607745232682038 

AZN-3m-loss: 2.339165162053782 

BMY-3m-loss: 1.5119184993375459 

JNJ-3m-loss: 1.680783402195689 

LLY-3m-loss: 2.5592187436600478 

MRK-3m-loss: 1.9432344913193702 

NVO-3m-loss: 2.7244543075448617 

PFE-3m-loss: 3.954572734392637 

ROG-3m-loss: 14.146679550359584 



In [12]:
dates6m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates6m, 6)
    print(f'{t}-6m-loss: {loss} \n')

NVS-6m-loss: 3.1431742013414166 

AZN-6m-loss: 4.658144534251645 

BMY-6m-loss: 3.143999807183176 

JNJ-6m-loss: 3.354044482862048 

LLY-6m-loss: 5.2212184569802655 

MRK-6m-loss: 4.108864832661672 

NVO-6m-loss: 4.968251098743439 

PFE-6m-loss: 9.770474283723841 

ROG-6m-loss: 38.7025861842007 



In [13]:
dates9m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates9m, 9)
    print(f'{t}-9m-loss: {loss} \n')

NVS-9m-loss: 4.609245658905412 

AZN-9m-loss: 7.146215308231334 

BMY-9m-loss: 4.802153151249902 

JNJ-9m-loss: 3.770194020343789 

LLY-9m-loss: 7.541364925590523 

MRK-9m-loss: 5.9923697574479755 

NVO-9m-loss: 8.404083973357446 

PFE-9m-loss: 15.122608777244654 

ROG-9m-loss: 47.016219480192184 



In [14]:
dates12m = ['2022-05-01']#, '2022-06-01', '2022-07-01', '2022-08-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates12m, 12)
    print(f'{t}-12m-loss: {loss} \n')

NVS-12m-loss: 6.1413816305177 

AZN-12m-loss: 8.674257380367584 

BMY-12m-loss: 6.045331566766581 

JNJ-12m-loss: 4.34369485390347 

LLY-12m-loss: 10.435938087737512 

MRK-12m-loss: 7.577803154301937 

NVO-12m-loss: 11.600213121480419 

PFE-12m-loss: 28.948339596063192 

ROG-12m-loss: 47.67868071703029 



#### 2. Ticker data

In [15]:
data_stocks = data.copy()
to_exclude2 = []

for m in macros:
    to_exclude2.append(m)
    to_exclude2.append(f'{m}_lag')
    to_exclude2.append(f'{m}_lag2')
    to_exclude2.append(f'{m}_lag3')

for t in tickers:
    to_exclude2.append(t)


to_exclude2.append('Date')

In [16]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates1m, 1)
    print(f'{t}-1m-loss: {loss}')

NVS-1m-loss: 0.5700806431319664
AZN-1m-loss: 0.734669072582622
BMY-1m-loss: 0.4967935838554775
JNJ-1m-loss: 0.5704925734143715
LLY-1m-loss: 0.6961034351790486
MRK-1m-loss: 0.5933983595775069
NVO-1m-loss: 0.6320326867919637
PFE-1m-loss: 0.9097740562511705
ROG-1m-loss: 3.958289186422867


In [17]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates3m, 3)
    print(f'{t}-3m-loss: {loss}')

NVS-3m-loss: 1.6130321646730992
AZN-3m-loss: 2.4085111378262405
BMY-3m-loss: 1.4894008099105336
JNJ-3m-loss: 2.026918342188585
LLY-3m-loss: 2.0077462769314094
MRK-3m-loss: 1.8157416710740826
NVO-3m-loss: 2.0101102474330954
PFE-3m-loss: 3.1031981834508637
ROG-3m-loss: 14.066043874780169


In [18]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates6m, 6)
    print(f'{t}-6m-loss: {loss}')

NVS-6m-loss: 3.182153322373202
AZN-6m-loss: 4.984198988461185
BMY-6m-loss: 3.0540434456939862
JNJ-6m-loss: 3.7096900652787537
LLY-6m-loss: 3.893652478891512
MRK-6m-loss: 3.736103698353191
NVO-6m-loss: 4.204725323044245
PFE-6m-loss: 6.601819557893541
ROG-6m-loss: 38.32655355825408


In [19]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates9m, 9)
    print(f'{t}-9m-loss: {loss}')

NVS-9m-loss: 4.387599475028342
AZN-9m-loss: 7.511688672459899
BMY-9m-loss: 4.5754657224728055
JNJ-9m-loss: 5.929051572227044
LLY-9m-loss: 5.891831608679676
MRK-9m-loss: 5.420934154400189
NVO-9m-loss: 6.0177983643903445
PFE-9m-loss: 9.553063218619032
ROG-9m-loss: 44.59209209473438


In [20]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates12m, 12)
    print(f'{t}-12m-loss: {loss}')

NVS-12m-loss: 6.518956651659676
AZN-12m-loss: 9.025976879592067
BMY-12m-loss: 6.082053790974383
JNJ-12m-loss: 7.522618259311169
LLY-12m-loss: 8.05881843530952
MRK-12m-loss: 6.978077972929193
NVO-12m-loss: 7.485982269754692
PFE-12m-loss: 10.90816947396195
ROG-12m-loss: 46.80406302040529


3. All Covariates

In [21]:
covs = data.copy()
to_exclude3 = ['Date']

for m in macros:
    to_exclude3.append(m)

for t in tickers:
    to_exclude3.append(t)

In [22]:
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates1m, 1)
    print(f'{t}-1m-loss: {loss}')

NVS-1m-loss: 0.540977066174191
AZN-1m-loss: 0.746224052835912
BMY-1m-loss: 0.5149412715739979
JNJ-1m-loss: 0.6129846718801543
LLY-1m-loss: 0.8049843219656475
MRK-1m-loss: 0.5937419662371566
NVO-1m-loss: 0.629031539490111
PFE-1m-loss: 0.9019506220754225
ROG-1m-loss: 4.033481702300313


In [23]:
dates3m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01']
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates3m, 3)
    print(f'{t}-3m-loss: {loss}')

NVS-3m-loss: 1.6286504351421431
AZN-3m-loss: 2.5357634868854437
BMY-3m-loss: 1.54312820667973
JNJ-3m-loss: 1.8727404036232944
LLY-3m-loss: 2.166047286124091
MRK-3m-loss: 1.895854990021266
NVO-3m-loss: 2.000439758082165
PFE-3m-loss: 2.9843994019508338
ROG-3m-loss: 13.912583233181262


In [24]:
dates6m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01']
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates6m, 6)
    print(f'{t}-6m-loss: {loss}')

NVS-6m-loss: 3.264650698329685
AZN-6m-loss: 5.163962170403846
BMY-6m-loss: 3.1726181846867547
JNJ-6m-loss: 3.5841116949256095
LLY-6m-loss: 4.213468121627476
MRK-6m-loss: 3.8840471566293893
NVO-6m-loss: 4.181955531472537
PFE-6m-loss: 6.627550236004956
ROG-6m-loss: 37.502391227871826


In [25]:
dates9m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01'] 
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates9m, 9)
    print(f'{t}-9m-loss: {loss}')

NVS-9m-loss: 4.694773347027297
AZN-9m-loss: 7.865460584820135
BMY-9m-loss: 4.650072329533792
JNJ-9m-loss: 5.6311241927070075
LLY-9m-loss: 5.910582547938102
MRK-9m-loss: 5.680333419781423
NVO-9m-loss: 5.968886225980706
PFE-9m-loss: 9.203823242632755
ROG-9m-loss: 43.752969545589586


In [26]:
dates12m = ['2022-05-01']
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates12m, 12)
    print(f'{t}-12m-loss: {loss}')

NVS-12m-loss: 6.5298076919942
AZN-12m-loss: 9.857400892373896
BMY-12m-loss: 6.082053790974383
JNJ-12m-loss: 5.795183151885968
LLY-12m-loss: 8.704428260766152
MRK-12m-loss: 7.266565179382988
NVO-12m-loss: 7.485982269754692
PFE-12m-loss: 10.383454896034257
ROG-12m-loss: 46.1563522199452
