In [34]:
%matplotlib inline
import warnings
from random import shuffle
from time import time
import numpy as np
import pandas as pd
from itertools import product
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns
import shap
import xgboost as xgb
from xgboost.callback import reset_learning_rate

In [35]:
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
idx = pd.IndexSlice
np.random.seed(42)

In [36]:
def get_data(start='2000', end='2018', holding_period=1, dropna=False):
    idx = pd.IndexSlice
    target = f'target_{holding_period}m'
    with pd.HDFStore('data.h5') as store:
        df = store['data']

    if start is not None and end is not None:
        df = df.loc[idx[:, start: end], :]
    if dropna:
        df = df.dropna()

    y = (df[target] > 0).astype(int)
    X = df.drop([c for c in df.columns if c.startswith('target')], axis=1)
    return y, X

In [37]:
def get_one_hot_data(df, cols=('year', 'month', 'age', 'msize')):
    cols = list(cols)
    df = pd.get_dummies(df,
                        columns=cols + ['sector'],
                        prefix=cols + [''],
                        prefix_sep=['_'] * len(cols) + [''])
    return df.rename(columns={c: c.replace('.0', '').replace(' ', '_').lower() for c in df.columns})

In [41]:
def get_holdout_set(target, features, period=6):
    idx = pd.IndexSlice
    label = target.name
    dates = np.sort(target.index.get_level_values('date').unique())
    cv_start, cv_end = dates[0], dates[-period - 2]
    holdout_start, holdout_end = dates[-period - 1], dates[-1]

    df = features.join(target.to_frame())
    train = df.loc[idx[:, cv_start: cv_end], :]
    y_train, X_train = train[label], train.drop(label, axis=1)

    test = df.loc[idx[:, holdout_start: holdout_end], :]
    y_test, X_test = test[label], test.drop(label, axis=1)
    return y_train, X_train, y_test, X_test

In [42]:
y, X = get_data()
X = get_one_hot_data(X)
y_train, X_train, y_test, X_test = get_holdout_set(target=y, features=X)

In [43]:
dtrain = xgb.DMatrix(label=y_train,
                     data=X_train,
                     nthread=-1)
dtest = xgb.DMatrix(label=y_test,
                    data=X_test,
                    nthread=-1)

In [47]:
with pd.HDFStore('results.h5') as store:
#     print(store.info())
    results = store['xgboost/dummies']
results.head(10)

Unnamed: 0,rounds,train,valid,booster,learning_rate,gamma,max_depth,colsample_bytree,time
0,59.166667,0.798142,0.683618,gbtree,0.1,1,9,1.0,85.372013
1,59.166667,0.798142,0.683618,gbtree,0.1,5,9,1.0,94.299514
2,59.166667,0.798142,0.683618,gbtree,0.1,0,9,1.0,75.323257
3,47.666667,0.790726,0.681356,dart,0.1,1,9,0.8,564.30003
4,47.666667,0.790726,0.681356,dart,0.1,5,9,0.8,541.779476
5,47.666667,0.790726,0.681356,dart,0.1,0,9,0.8,490.855656
6,53.833333,0.847023,0.679781,gbtree,0.1,0,11,0.8,133.131528
7,53.833333,0.847023,0.679781,gbtree,0.1,1,11,0.8,136.267916
8,53.833333,0.847023,0.679781,gbtree,0.1,5,11,0.8,128.874094
9,44.583333,0.790091,0.679346,dart,0.1,0,9,1.0,435.602095


In [50]:
params = dict(
    booster='gbtree',
    objective='gpu:binary:logistic',
    eval_metric=['logloss', 'auc'],
    tree_method='gpu_hist',
    max_depth=9,
    learning_rate=0.1,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    reg_alpha=0,
    reg_lambda=1,
    silent=1,
    seed=42,
)

In [51]:
scores = {}
model = xgb.train(params=params,
                  dtrain=dtrain,
                  evals=list(zip([dtrain, dtest], ['train', 'test'])),
                  verbose_eval=False,
                  num_boost_round=60,
                  early_stopping_rounds=None,
                  evals_result=scores)

In [53]:
y_pred = model.predict(dtest)
roc_auc_score(y_true=y_test, y_score=y_pred)

0.6535846505578833

In [55]:
# load JS visualization code to notebook
shap.initjs()

# explain the model's predictions using SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test.iloc[:1000])

# visualize the first prediction's explanation
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])

In [60]:
shap.force_plot(explainer.expected_value, shap_values[:1000,:], X_test.iloc[:1000])