In [1]:
import numpy as np
np.random.seed(1)
import pandas as pd
import scipy

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

from functools import partial

import matplotlib.pyplot as plt
import seaborn as sns

from xgb_utils import *

In [2]:
data = pd.read_csv('clean_monthly_data.csv')
data

Unnamed: 0,Date,unemployment_rate,CPI,treasury_yield,GDP_growth,SP500_return,AZN,BMY,JNJ,LLY,MRK,NVO,NVS,PFE,ROG,inflation_change,unemp_change,treasury_yield_change
0,2000-02-01,4.1,170.000,6.661000,0.496560,-1.522563,-12.828964,-13.228004,-16.339821,-11.121498,-21.701151,2.220031,3.838386,-11.226228,54.440789,1.000,-0.1,-0.141500
1,2000-03-01,4.0,171.000,6.519500,0.511258,9.413333,22.264136,-0.218329,-2.079067,5.804243,0.913712,8.390897,6.420237,14.101954,6.922258,1.000,-0.1,-0.141500
2,2000-04-01,3.8,170.900,6.256522,1.327803,-3.266805,5.567379,-8.205683,17.437698,23.153694,12.400712,-0.097663,2.559423,15.213674,7.370518,-0.100,-0.2,-0.262978
3,2000-05-01,4.0,171.200,5.990526,-0.181797,-1.572223,-0.148357,5.395746,8.484832,-1.296597,7.374072,20.863985,5.169310,5.638019,-8.163265,0.300,0.2,-0.265995
4,2000-06-01,4.0,172.200,6.440455,0.305565,1.728613,10.549735,5.788826,14.239888,31.641749,3.078671,2.813690,8.474599,8.076012,13.131313,1.000,0.0,0.449928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,2023-01-01,3.4,300.536,3.616190,0.390254,6.776820,-3.584079,0.972908,-7.489384,-5.928822,-2.549213,2.541749,-0.110227,-13.817335,16.968326,1.546,-0.1,-0.274810
276,2023-02-01,3.6,301.648,3.531500,-0.442183,-2.514271,-0.290649,-4.328217,-6.217115,-9.568502,-1.089288,1.592445,-7.172811,-7.286115,5.451681,1.112,0.2,-0.084690
277,2023-03-01,3.5,301.808,3.746842,-0.442183,3.313488,8.035329,0.507544,1.862736,10.703390,0.141189,12.873250,9.367574,0.566924,11.025813,0.160,-0.1,0.215342
278,2023-04-01,3.4,302.918,3.663043,-0.442183,1.985238,5.489119,-3.664707,5.612908,15.269915,9.289214,5.836894,16.334413,-4.681371,-1.517467,1.110,-0.1,-0.083799


In [3]:
macros = ['unemployment_rate', 'GDP_growth', 'SP500_return', 'inflation_change', 'unemp_change', 'treasury_yield_change']

for m in macros:
    data[f'{m}_lag'] = data[m].shift(1)
    data[f'{m}_lag2'] = data[m].shift(2)

In [4]:
data['Quater_dummy'] = 0
data['Quater_rippel'] = 0
data['Date'] = pd.to_datetime(data['Date'])
for index, row in data.iterrows():
    if row['Date'].month in [3, 6, 9, 12]:
        data.loc[index, 'Quater_dummy'] = 1
    if row['Date'].month in [1, 4, 7, 10]:
        data.loc[index, 'Quater_dummy'] = 1


In [5]:
tickers = ['NVS', 'AZN', 'BMY', 'JNJ', 'LLY', 'MRK', 'NVO', 'PFE', 'ROG']

for t in tickers:
    data[f'{t}_lag'] = data[t].shift(1)
    data[f'{t}_lag2'] = data[t].shift(2)

#### 1. Macros Only

In [6]:
data_macro = data.copy()
to_exclude = []

for t in tickers:
    to_exclude.append(t)
    to_exclude.append(f'{t}_lag')
    to_exclude.append(f'{t}_lag2')


to_exclude.append('Date')

1 month

In [8]:
dates1m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01']

params = {'n_estimators': [100, 150, 200],
          'max_depth': [3, 5, 7],
          'learning_rate': [0.01, 0.1]}

In [8]:
for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates1m, 1, params)
    print(f'{t}-1m-loss: {loss}')

NVS-1m-loss: 3.2618896020753922
AZN-1m-loss: 2.3879767866548227
BMY-1m-loss: 3.3772761038209924
JNJ-1m-loss: 2.775884336227969
LLY-1m-loss: 2.724265567092514
MRK-1m-loss: 1.7247076488973418
NVO-1m-loss: 2.2703321926043105
PFE-1m-loss: 4.4454702710296905
ROG-1m-loss: 6.529772320564978


In [9]:
dates3m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates3m, 3, params)
    print(f'{t}-3m-loss: {loss}')

NVS-3m-loss: 7.6856894581403505
AZN-3m-loss: 7.913814541466462
BMY-3m-loss: 10.320303291058712
JNJ-3m-loss: 8.557777771943062
LLY-3m-loss: 9.370765301931797
MRK-3m-loss: 4.76074037038329
NVO-3m-loss: 6.529063424849665
PFE-3m-loss: 14.277869771777205
ROG-3m-loss: 22.197556289628714


In [10]:
dates6m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates6m, 6, params)
    print(f'{t}-6m-loss: {loss}')

NVS-6m-loss: 13.922965018790793
AZN-6m-loss: 14.953414871997783
BMY-6m-loss: 19.592622280901292
JNJ-6m-loss: 16.75052847277061
LLY-6m-loss: 17.71223617093269
MRK-6m-loss: 7.481201076526001
NVO-6m-loss: 11.496488756000781
PFE-6m-loss: 29.437149808066383
ROG-6m-loss: 57.1847587508464


In [11]:
dates9m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates9m, 9, params)
    print(f'{t}-9m-loss: {loss}')

NVS-9m-loss: 23.008403478224373
AZN-9m-loss: 24.5686933164365
BMY-9m-loss: 30.009063820880193
JNJ-9m-loss: 26.82202020808235
LLY-9m-loss: 28.328063299648246
MRK-9m-loss: 12.057306156432961
NVO-9m-loss: 18.67089724702614
PFE-9m-loss: 47.23267264501412
ROG-9m-loss: 70.91790710996713


In [12]:
dates12m = ['2022-05-01']#, '2022-06-01', '2022-07-01', '2022-08-01']

for t in tickers:
    loss = get_model_performance(data_macro, t, 0.05, to_exclude, dates12m, 12, params)
    print(f'{t}-12m-loss: {loss}')

NVS-12m-loss: 34.7126594865153
AZN-12m-loss: 30.705399675387714
BMY-12m-loss: 42.87319951579293
JNJ-12m-loss: 35.917879442837766
LLY-12m-loss: 32.12677166131159
MRK-12m-loss: 19.9878014080006
NVO-12m-loss: 26.04278140727248
PFE-12m-loss: 60.53200796572177
ROG-12m-loss: 78.03032275572728


#### 2. Ticker data

In [18]:
data_stocks = data.copy()
to_exclude2 = []

for m in macros:
    to_exclude2.append(m)
    to_exclude2.append(f'{m}_lag')
    to_exclude2.append(f'{m}_lag2')

for t in tickers:
    to_exclude2.append(t)


to_exclude2.append('Date')

In [19]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates1m, 1, params)
    print(f'{t}-1m-loss: {loss}')

NVS-1m-loss: 3.414639907122538
AZN-1m-loss: 2.4748912778635566
BMY-1m-loss: 3.218429476513577
JNJ-1m-loss: 2.8672552235381374
LLY-1m-loss: 2.547822486898083
MRK-1m-loss: 2.1477671400466423
NVO-1m-loss: 2.253368906513552
PFE-1m-loss: 4.295173307153142
ROG-1m-loss: 6.658262253019601


In [20]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates3m, 3, params)
    print(f'{t}-3m-loss: {loss}')

NVS-3m-loss: 8.396974854419883
AZN-3m-loss: 8.347388410096144
BMY-3m-loss: 10.381359601222254
JNJ-3m-loss: 8.661862897360054
LLY-3m-loss: 8.987574628591302
MRK-3m-loss: 6.70548003549375
NVO-3m-loss: 6.553720027384607
PFE-3m-loss: 14.132171488955342
ROG-3m-loss: 21.987254376205744


In [21]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates6m, 6, params)
    print(f'{t}-6m-loss: {loss}')

NVS-6m-loss: 14.636931830989855
AZN-6m-loss: 15.755465305316772
BMY-6m-loss: 20.28829824784301
JNJ-6m-loss: 16.093740106974575
LLY-6m-loss: 16.55997568370876
MRK-6m-loss: 12.922695897970929
NVO-6m-loss: 10.84030722490124
PFE-6m-loss: 26.22467566102721
ROG-6m-loss: 54.683840796184356


In [22]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates9m, 9, params)
    print(f'{t}-9m-loss: {loss}')

NVS-9m-loss: 25.73808214841266
AZN-9m-loss: 25.779581452788236
BMY-9m-loss: 30.36335889222024
JNJ-9m-loss: 26.27263051583249
LLY-9m-loss: 26.405230127405023
MRK-9m-loss: 16.15267108443479
NVO-9m-loss: 18.23960896578994
PFE-9m-loss: 41.662805873575294
ROG-9m-loss: 69.35851588018414


In [23]:
for t in tickers:
    loss = get_model_performance(data_stocks, t, 0.05, to_exclude2, dates12m, 12, params)
    print(f'{t}-12m-loss: {loss}')

NVS-12m-loss: 38.28203700916028
AZN-12m-loss: 29.775583430299854
BMY-12m-loss: 39.48609007062669
JNJ-12m-loss: 35.792903991057514
LLY-12m-loss: 29.37599455417286
MRK-12m-loss: 24.211954757944728
NVO-12m-loss: 25.817952858966862
PFE-12m-loss: 52.26057140479323
ROG-12m-loss: 79.54549250071508


3. All Covariates

In [6]:
covs = data.copy()
to_exclude3 = ['Date']

for m in macros:
    to_exclude3.append(m)

for t in tickers:
    to_exclude3.append(t)

In [25]:
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates1m, 1, params)
    print(f'{t}-1m-loss: {loss}')

NVS-1m-loss: 3.5138862606384227
AZN-1m-loss: 2.507503844520777
BMY-1m-loss: 3.1283990986378263
JNJ-1m-loss: 2.9195623146883727
LLY-1m-loss: 2.4998406369522725
MRK-1m-loss: 2.054256575208234
NVO-1m-loss: 2.189967488495661
PFE-1m-loss: 3.8124814432517407
ROG-1m-loss: 6.008071505005657


In [9]:
dates3m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01']
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates3m, 3, params)
    print(f'{t}-3m-loss: {loss}')

NVS-3m-loss: 8.26193286778823
AZN-3m-loss: 8.442398745087226
BMY-3m-loss: 10.072357140845728
JNJ-3m-loss: 8.792612575112543
LLY-3m-loss: 8.900752986848358
MRK-3m-loss: 6.270732341015224
NVO-3m-loss: 6.190083281700504
PFE-3m-loss: 13.005150728095813
ROG-3m-loss: 20.126271352644416


In [10]:
dates6m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01', '2022-11-01']
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates6m, 6, params)
    print(f'{t}-6m-loss: {loss}')

NVS-6m-loss: 14.219684305403234
AZN-6m-loss: 16.012362878002637
BMY-6m-loss: 19.867364442662062
JNJ-6m-loss: 16.369703787986104
LLY-6m-loss: 17.145991501218617
MRK-6m-loss: 11.878342356262596
NVO-6m-loss: 10.445144671849276
PFE-6m-loss: 25.892143315164727
ROG-6m-loss: 52.48571713997574


In [11]:
dates9m = ['2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01'] 
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates9m, 9, params)
    print(f'{t}-9m-loss: {loss}')

NVS-9m-loss: 24.042448797456327
AZN-9m-loss: 26.063307441804294
BMY-9m-loss: 29.680066249927812
JNJ-9m-loss: 27.131697044531414
LLY-9m-loss: 27.937611964240997
MRK-9m-loss: 15.228998397200218
NVO-9m-loss: 17.74900231595003
PFE-9m-loss: 41.97860162214687
ROG-9m-loss: 64.8640274165691


In [13]:
dates12m = ['2022-05-01']
for t in tickers:
    loss = get_model_performance(covs, t, 0.05, to_exclude3, dates12m, 12, params)
    print(f'{t}-12m-loss: {loss}')

NVS-12m-loss: 37.00437721749844
AZN-12m-loss: 29.831463652876426
BMY-12m-loss: 38.21431049310663
JNJ-12m-loss: 36.623623744189175
LLY-12m-loss: 31.167928749284883
MRK-12m-loss: 22.713154728204564
NVO-12m-loss: 24.692007247140165
PFE-12m-loss: 52.50643643191302
ROG-12m-loss: 72.09701574702993
