In [15]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from time import time
import sys, os
from pathlib import Path

import pandas as pd
from scipy.stats import spearmanr

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

import seaborn as sns
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from utils4t import MultipleTimeSeriesCV
sns.set_style('whitegrid')
YEAR = 252
idx = pd.IndexSlice
scope_params = ['lookahead', 'train_length', 'test_length']
daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n', 'daily_ic_median', 'daily_ic_median_n']
lgb_train_params = ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf']
catboost_train_params = ['max_depth', 'min_child_samples']

In [16]:
# Generate LightGBM predictions
base_params = dict(boosting='gbdt', objective='regression', verbose=-1, num_threads=6, device='cpu')

categoricals = ['year', 'month', 'sector', 'weekday']
lookahead = 1
store = Path('data/predictions.h5')

In [17]:
data = pd.read_hdf('data/data.h5', 'model_data').sort_index()
labels = sorted(data.filter(like='_fwd').columns)
features = data.columns.difference(labels).tolist()
label = f'r{lookahead:02}_fwd'
data = data.loc[idx[:, '2010':], features + [label]].dropna()
for feature in categoricals:
    data[feature] = pd.factorize(data[feature], sort=True)[0]

lgb_data = lgb.Dataset(data=data[features], label=data[label], categorical_feature=categoricals,
                       free_raw_data=False)

In [18]:
# Generate predictions
lgb_ic = pd.read_hdf('data/model_tuning.h5', 'lgb/ic')
lgb_daily_ic = pd.read_hdf('data/model_tuning.h5', 'lgb/daily_ic')

def get_lgb_params(data, t=5, best=0):
    param_cols = scope_params[1:] + lgb_train_params + ['boost_rounds']
    df = data[data.lookahead==t].sort_values('ic', ascending=False).iloc[best]
    return df.loc[param_cols]

for position in range(10):
    params = get_lgb_params(lgb_daily_ic, t=lookahead, best=position)

    params = params.to_dict()

    for p in ['min_data_in_leaf', 'num_leaves']:
        params[p] = int(params[p])

    train_length = int(params.pop('train_length'))
    test_length = int(params.pop('test_length'))
    num_boost_round = int(params.pop('boost_rounds'))
    params.update(base_params)

    print(f'\nPosition: {position:02}')

    # 1-year out-of-sample period
    n_splits = int(YEAR / test_length)
    cv = MultipleTimeSeriesCV(n_splits=n_splits, test_period_length=test_length, lookahead=lookahead,
                              train_period_length=train_length)

    predictions = []
    start = time()
    for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
        print(i, end=' ', flush=True)
        lgb_train = lgb_data.subset(used_indices=train_idx.tolist(), params=params).construct()

        model = lgb.train(params=params, train_set=lgb_train, num_boost_round=num_boost_round,
                          verbose_eval=False)

        test_set = data.iloc[test_idx, :]
        y_test = test_set.loc[:, label].to_frame('y_test')
        y_pred = model.predict(test_set.loc[:, model.feature_name()])
        predictions.append(y_test.assign(prediction=y_pred))

    if position == 0:
        test_predictions = (pd.concat(predictions)
                            .rename(columns={'prediction': position}))
    else:
        test_predictions[position] = pd.concat(predictions).prediction

by_day = test_predictions.groupby(level='date')
for position in range(10):
    if position == 0:
        ic_by_day = by_day.apply(lambda x: spearmanr(
            x.y_test, x[position])[0]).to_frame()
    else:
        ic_by_day[position] = by_day.apply(
            lambda x: spearmanr(x.y_test, x[position])[0])

print(ic_by_day.describe())
test_predictions.to_hdf(store, f'lgb/test/{lookahead:02}')


Position: 00
1 2 3 4 
Position: 01
1 2 3 4 
Position: 02
1 2 3 4 
Position: 03
1 2 3 4 
Position: 04
1 2 3 4 
Position: 05
1 2 3 4 
Position: 06
1 2 3 4 
Position: 07
1 2 3 4 
Position: 08
1 2 3 4 
Position: 09
1 2 3 4                 0           1           2           3           4           5  \
count  252.000000  252.000000  252.000000  252.000000  252.000000  252.000000   
mean     0.010675    0.011592    0.010631    0.008189    0.007743    0.006275   
std      0.113152    0.113917    0.113226    0.118084    0.118385    0.119454   
min     -0.272160   -0.285236   -0.259701   -0.327668   -0.318522   -0.316593   
25%     -0.063408   -0.065001   -0.066844   -0.065090   -0.066343   -0.068451   
50%      0.011066    0.013032    0.012221   -0.001202   -0.001485    0.000511   
75%      0.080009    0.087169    0.084750    0.079256    0.076535    0.073616   
max      0.314746    0.309612    0.318184    0.405628    0.398021    0.405234   

                6           7           8         

In [19]:
# Generate CatBoost predictions
lookaheads = [1, 5, 21]
label_dict = dict(zip(lookaheads, labels))

lookahead = 1
store = Path('data/predictions.h5')

data = pd.read_hdf('data/data.h5', 'model_data').sort_index()
labels = sorted(data.filter(like='_fwd').columns)
features = data.columns.difference(labels).tolist()
label = f'r{lookahead:02}_fwd'

data = data.loc[idx[:, '2010':], features + [label]].dropna()
for feature in categoricals:
    data[feature] = pd.factorize(data[feature], sort=True)[0]

cat_cols_idx = [data.columns.get_loc(c) for c in categoricals]
catboost_data = Pool(label=data[label], data=data.drop(label, axis=1), cat_features=cat_cols_idx)

In [20]:
# Generate predictions
catboost_ic = pd.read_hdf('data/model_tuning.h5', 'catboost/ic')
catboost_ic_avg = pd.read_hdf('data/model_tuning.h5', 'catboost/daily_ic')

def get_cb_params(data, t=5, best=0):
    param_cols = scope_params[1:] + catboost_train_params + ['boost_rounds']
    df = data[data.lookahead==t].sort_values('ic', ascending=False).iloc[best]
    return df.loc[param_cols]

for position in range(10):
    params = get_cb_params(catboost_ic_avg, t=lookahead, best=position)

    params = params.to_dict()

    for p in ['max_depth', 'min_child_samples']:
        params[p] = int(params[p])

    train_length = int(params.pop('train_length'))
    test_length = int(params.pop('test_length'))
    num_boost_round = int(params.pop('boost_rounds'))
    params['task_type'] = 'GPU'
    params['thread_count'] = -1

    print(f'\nPosition: {position:02}')

    # 1-year out-of-sample period
    n_splits = int(YEAR / test_length)
    cv = MultipleTimeSeriesCV(n_splits=n_splits, test_period_length=test_length, lookahead=lookahead,
                              train_period_length=train_length)

    predictions = []
    start = time()
    for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
        print(i, end=' ', flush=True)
        train_set = catboost_data.slice(train_idx.tolist())

        model = CatBoostRegressor(**params)
        model.fit(X=train_set, verbose_eval=False)

        test_set = data.iloc[test_idx, :]
        y_test = test_set.loc[:, label].to_frame('y_test')
        y_pred = model.predict(test_set.loc[:, model.feature_names_])
        predictions.append(y_test.assign(prediction=y_pred))

    if position == 0:
        test_predictions = (pd.concat(predictions)
                            .rename(columns={'prediction': position}))
    else:
        test_predictions[position] = pd.concat(predictions).prediction

by_day = test_predictions.groupby(level='date')
for position in range(10):
    if position == 0:
        ic_by_day = by_day.apply(lambda x: spearmanr(x.y_test, x[position])[0]).to_frame()
    else:
        ic_by_day[position] = by_day.apply(lambda x: spearmanr(x.y_test, x[position])[0])

print(ic_by_day.describe())
test_predictions.to_hdf(store, f'catboost/test/{lookahead:02}')


Position: 00
1 2 3 4 
Position: 01
1 2 3 4 
Position: 02
1 2 3 4 
Position: 03
1 2 3 4 
Position: 04
1 2 3 4 
Position: 05
1 2 3 4 
Position: 06
1 2 3 4 
Position: 07
1 2 3 4 
Position: 08
1 2 3 4 
Position: 09
1 2 3 4                 0           1           2           3           4           5  \
count  252.000000  252.000000  252.000000  252.000000  252.000000  252.000000   
mean     0.016724    0.016368    0.015685    0.016137    0.016227    0.016137   
std      0.097576    0.099424    0.098993    0.098963    0.098876    0.098963   
min     -0.286148   -0.286148   -0.290912   -0.286148   -0.286148   -0.286148   
25%     -0.043081   -0.047206   -0.047474   -0.046847   -0.047474   -0.046847   
50%      0.014120    0.019711    0.017389    0.020700    0.019711    0.020700   
75%      0.079402    0.080934    0.081175    0.081175    0.081175    0.081175   
max      0.311490    0.336327    0.311490    0.311490    0.311490    0.311490   

                6           7           8         