In [1]:
import numpy as np
np.random.seed(1)
import pandas as pd
import scipy

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

from functools import partial

import matplotlib.pyplot as plt
import seaborn as sns

from xgb_utils import *

In [2]:
data = pd.read_csv('clean_monthly_data.csv')
data

Unnamed: 0,Date,unemployment_rate,CPI,treasury_yield,GDP_growth,SP500_return,AZN,BMY,JNJ,LLY,MRK,NVO,NVS,PFE,ROG,inflation_change,unemp_change,treasury_yield_change
0,2000-02-01,4.1,170.000,6.661000,0.496560,-1.522563,-12.828964,-13.228004,-16.339821,-11.121498,-21.701151,2.220031,3.838386,-11.226228,54.440789,1.000,-0.1,-0.141500
1,2000-03-01,4.0,171.000,6.519500,0.511258,9.413333,22.264136,-0.218329,-2.079067,5.804243,0.913712,8.390897,6.420237,14.101954,6.922258,1.000,-0.1,-0.141500
2,2000-04-01,3.8,170.900,6.256522,1.327803,-3.266805,5.567379,-8.205683,17.437698,23.153694,12.400712,-0.097663,2.559423,15.213674,7.370518,-0.100,-0.2,-0.262978
3,2000-05-01,4.0,171.200,5.990526,-0.181797,-1.572223,-0.148357,5.395746,8.484832,-1.296597,7.374072,20.863985,5.169310,5.638019,-8.163265,0.300,0.2,-0.265995
4,2000-06-01,4.0,172.200,6.440455,0.305565,1.728613,10.549735,5.788826,14.239888,31.641749,3.078671,2.813690,8.474599,8.076012,13.131313,1.000,0.0,0.449928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,2023-01-01,3.4,300.536,3.616190,0.390254,6.776820,-3.584079,0.972908,-7.489384,-5.928822,-2.549213,2.541749,-0.110227,-13.817335,16.968326,1.546,-0.1,-0.274810
276,2023-02-01,3.6,301.648,3.531500,-0.442183,-2.514271,-0.290649,-4.328217,-6.217115,-9.568502,-1.089288,1.592445,-7.172811,-7.286115,5.451681,1.112,0.2,-0.084690
277,2023-03-01,3.5,301.808,3.746842,-0.442183,3.313488,8.035329,0.507544,1.862736,10.703390,0.141189,12.873250,9.367574,0.566924,11.025813,0.160,-0.1,0.215342
278,2023-04-01,3.4,302.918,3.663043,-0.442183,1.985238,5.489119,-3.664707,5.612908,15.269915,9.289214,5.836894,16.334413,-4.681371,-1.517467,1.110,-0.1,-0.083799


In [3]:
macros = ['unemployment_rate', 'GDP_growth', 'SP500_return', 'inflation_change', 'unemp_change', 'treasury_yield_change']

for m in macros:
    data[f'{m}_lag'] = data[m].shift(1)
    data[f'{m}_lag2'] = data[m].shift(2)

In [4]:
data['Quater_dummy'] = 0
data['Quater_rippel'] = 0
data['Date'] = pd.to_datetime(data['Date'])
for index, row in data.iterrows():
    if row['Date'].month in [3, 6, 9, 12]:
        data.loc[index, 'Quater_dummy'] = 1
    if row['Date'].month in [1, 4, 7, 10]:
        data.loc[index, 'Quater_dummy'] = 1


In [5]:
tickers = ['NVS', 'AZN', 'BMY', 'JNJ', 'LLY', 'MRK', 'NVO', 'PFE', 'ROG']

for t in tickers:
    data[f'{t}_lag'] = data[t].shift(1)
    data[f'{t}_lag2'] = data[t].shift(2)

#### 1. Macros Only

In [6]:
data_macro = data.copy()
to_exclude = []

for t in tickers:
    to_exclude.append(t)
    to_exclude.append(f'{t}_lag')
    to_exclude.append(f'{t}_lag2')

for m in macros:
    to_exclude.append(m)


to_exclude.append('Date')

1 month

In [7]:
dates1m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01']

params = {'n_estimators': [100, 150, 200],
          'max_depth': [3, 5, 7],
          'learning_rate': [0.01, 0.1]}

In [8]:
nvs_loss = get_model_performance(data_macro, 'NVS', 0.05, to_exclude, dates1m, 1)
nvs_loss

0.5239553478560977

In [9]:
nvs_loss

0.5239553478560977

In [10]:
for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates1m, 1)
    print(f'{t}-1m-loss: {loss} \n')

NVS-1m-loss: 0.5239559216495931 

AZN-1m-loss: 0.7549644387361227 

BMY-1m-loss: 0.5419768858606927 

JNJ-1m-loss: 0.4901810858423284 

LLY-1m-loss: 0.709682770815563 

MRK-1m-loss: 0.6250140018098828 

NVO-1m-loss: 0.5442496894780305 

PFE-1m-loss: 0.8359548137478647 

ROG-1m-loss: 3.727945998717294 



In [11]:
dates3m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates3m, 3)
    print(f'{t}-3m-loss: {loss} \n')

NVS-3m-loss: 1.5661282088197122 

AZN-3m-loss: 3.840173678050801 

BMY-3m-loss: 1.826695351318741 

JNJ-3m-loss: 1.4072060701943898 

LLY-3m-loss: 2.057243672410013 

MRK-3m-loss: 1.9432344497255656 

NVO-3m-loss: 1.8304341168478622 

PFE-3m-loss: 3.407969091468851 

ROG-3m-loss: 13.322920795471157 



In [12]:
dates6m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates6m, 6)
    print(f'{t}-6m-loss: {loss} \n')

NVS-6m-loss: 3.1687691185402818 

AZN-6m-loss: 7.204899218205503 

BMY-6m-loss: 4.276191158368844 

JNJ-6m-loss: 2.4195996200756347 

LLY-6m-loss: 3.980943187785424 

MRK-6m-loss: 4.108864749474063 

NVO-6m-loss: 3.827016876095626 

PFE-6m-loss: 6.799134375728973 

ROG-6m-loss: 35.425585719597294 



In [13]:
dates9m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates9m, 9)
    print(f'{t}-9m-loss: {loss} \n')

NVS-9m-loss: 4.38673806823542 

AZN-9m-loss: 8.547382100712772 

BMY-9m-loss: 5.268729767336063 

JNJ-9m-loss: 3.2656714137658147 

LLY-9m-loss: 5.922216346150734 

MRK-9m-loss: 5.992369632666562 

NVO-9m-loss: 5.380677251542048 

PFE-9m-loss: 9.79468818724598 

ROG-9m-loss: 42.697001141521554 



In [14]:
dates12m = ['2022-05-01']#, '2022-06-01', '2022-07-01', '2022-08-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates12m, 12)
    print(f'{t}-12m-loss: {loss} \n')

NVS-12m-loss: 6.13896146631126 

AZN-12m-loss: 8.886552337110487 

BMY-12m-loss: 5.522447148382341 

JNJ-12m-loss: 4.0551859684158655 

LLY-12m-loss: 7.847175779100317 

MRK-12m-loss: 7.577802987926716 

NVO-12m-loss: 6.527368330385726 

PFE-12m-loss: 11.486160957994306 

ROG-12m-loss: 42.77728389596288 



#### 2. Ticker data

In [15]:
data_stocks = data.copy()
to_exclude2 = []

for m in macros:
    to_exclude2.append(m)
    to_exclude2.append(f'{m}_lag')
    to_exclude2.append(f'{m}_lag2')

for t in tickers:
    to_exclude2.append(t)


to_exclude2.append('Date')

In [16]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates1m, 1)
    print(f'{t}-1m-loss: {loss}')

NVS-1m-loss: 0.5803374065931469
AZN-1m-loss: 0.7358063956929111
BMY-1m-loss: 0.5057580128180901
JNJ-1m-loss: 0.5768586489189557
LLY-1m-loss: 0.6748760254719305
MRK-1m-loss: 0.5963140811155355
NVO-1m-loss: 0.6320091865172993
PFE-1m-loss: 0.8875392596994515
ROG-1m-loss: 3.9550240920112323


In [17]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates3m, 3)
    print(f'{t}-3m-loss: {loss}')

NVS-3m-loss: 1.6402179356300877
AZN-3m-loss: 2.376837862079168
BMY-3m-loss: 1.5325136641029364
JNJ-3m-loss: 2.028404638862919
LLY-3m-loss: 1.969083194430258
MRK-3m-loss: 1.797002554513521
NVO-3m-loss: 2.010587279067211
PFE-3m-loss: 3.0186560990631435
ROG-3m-loss: 14.067851034378004


In [18]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates6m, 6)
    print(f'{t}-6m-loss: {loss}')

NVS-6m-loss: 3.041988445970439
AZN-6m-loss: 4.958200910873312
BMY-6m-loss: 3.153370257831155
JNJ-6m-loss: 3.709121595933786
LLY-6m-loss: 3.803489737701281
MRK-6m-loss: 3.7200848847229446
NVO-6m-loss: 4.205287294485195
PFE-6m-loss: 6.430450532903488
ROG-6m-loss: 38.36617731070508


In [19]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates9m, 9)
    print(f'{t}-9m-loss: {loss}')

NVS-9m-loss: 4.49333143768882
AZN-9m-loss: 7.5075582510616865
BMY-9m-loss: 4.721587675652011
JNJ-9m-loss: 5.926453443736458
LLY-9m-loss: 5.895437282052227
MRK-9m-loss: 5.432897200144502
NVO-9m-loss: 6.018568369631833
PFE-9m-loss: 9.164318003641498
ROG-9m-loss: 44.576784711869735


In [20]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates12m, 12)
    print(f'{t}-12m-loss: {loss}')

NVS-12m-loss: 7.4929077868004175
AZN-12m-loss: 9.009188016236195
BMY-12m-loss: 6.082053384560985
JNJ-12m-loss: 7.522624581708163
LLY-12m-loss: 8.102253037789099
MRK-12m-loss: 7.0505440580714644
NVO-12m-loss: 7.488226256280775
PFE-12m-loss: 10.540177684888027
ROG-12m-loss: 46.98335667251931


3. All Covariates

In [21]:
covs = data.copy()
to_exclude3 = ['Date']

for m in macros:
    to_exclude3.append(m)

for t in tickers:
    to_exclude3.append(t)

In [22]:
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates1m, 1)
    print(f'{t}-1m-loss: {loss}')

NVS-1m-loss: 0.5173433288116877
AZN-1m-loss: 0.765374563825267
BMY-1m-loss: 0.501349733805026
JNJ-1m-loss: 0.5750175212326086
LLY-1m-loss: 0.6910964589521451
MRK-1m-loss: 0.6067842576227013
NVO-1m-loss: 0.6289385630323657
PFE-1m-loss: 0.9065718142465173
ROG-1m-loss: 4.027524707503015


In [23]:
dates3m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01']
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates3m, 3)
    print(f'{t}-3m-loss: {loss}')

NVS-3m-loss: 1.5542470205636516
AZN-3m-loss: 2.4831394258782984
BMY-3m-loss: 1.5325136641029364
JNJ-3m-loss: 1.9709052757454404
LLY-3m-loss: 2.0275584746730133
MRK-3m-loss: 1.83434694155276
NVO-3m-loss: 2.00066603871881
PFE-3m-loss: 3.10608565067614
ROG-3m-loss: 13.891951513204223


In [24]:
dates6m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01']
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates6m, 6)
    print(f'{t}-6m-loss: {loss}')

NVS-6m-loss: 3.1687691185402818
AZN-6m-loss: 5.207271918052003
BMY-6m-loss: 3.153370257831155
JNJ-6m-loss: 3.829529027276521
LLY-6m-loss: 3.9077707820647016
MRK-6m-loss: 3.8528156726522407
NVO-6m-loss: 4.1818006221410045
PFE-6m-loss: 6.608829219021631
ROG-6m-loss: 37.2783193203398


In [25]:
dates9m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01'] 
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates9m, 9)
    print(f'{t}-9m-loss: {loss}')

NVS-9m-loss: 4.38673806823542
AZN-9m-loss: 7.863942504330467
BMY-9m-loss: 4.721587675652011
JNJ-9m-loss: 6.015470341046031
LLY-9m-loss: 5.943836502886198
MRK-9m-loss: 5.671022455286314
NVO-9m-loss: 5.969657598083135
PFE-9m-loss: 9.492992079558016
ROG-9m-loss: 43.80659929150879


In [26]:
dates12m = ['2022-05-01']
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates12m, 12)
    print(f'{t}-12m-loss: {loss}')

NVS-12m-loss: 6.13896146631126
AZN-12m-loss: 9.189254935993628
BMY-12m-loss: 6.082053384560985
JNJ-12m-loss: 7.522624581708163
LLY-12m-loss: 8.226739052944357
MRK-12m-loss: 7.303754971615112
NVO-12m-loss: 7.488226256280775
PFE-12m-loss: 10.88343828778542
ROG-12m-loss: 46.91995500345865
