In [2]:
from time import time
import numpy as np
import pandas as pd
import warnings

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from itertools import product
import joblib
from pathlib import Path

warnings.filterwarnings('ignore')
np.random.seed(42)

In [3]:
def get_one_hot_data(df, cols=('year', 'month', 'age', 'msize')):
    cols = list(cols)
    df = pd.get_dummies(df, columns=cols + ['sector'], prefix=cols + [''], prefix_sep=['_'] * len(cols) + [''])
    return df.rename(columns={c: c.replace('.0', '').replace(' ', '_').lower() for c in df.columns})

In [4]:
# Create holdout test set
def get_holdout_set(target, features, period=6):
    idx = pd.IndexSlice
    label = target.name
    dates = np.sort(target.index.get_level_values('date').unique())
    cv_start, cv_end = dates[0], dates[-period - 2]
    holdout_start, holdout_end = dates[-period - 1], dates[-1]

    df = features.join(target.to_frame())
    train = df.loc[idx[:, cv_start: cv_end], :]
    y_train, X_train = train[label], train.drop(label, axis=1)

    test = df.loc[idx[:, holdout_start: holdout_end], :]
    y_test, X_test = test[label], test.drop(label, axis=1)
    return y_train, X_train, y_test, X_test

# Custom TimeSeriesSplit
class OneStepTimeSeriesSplit:
    """Generates tuples of train_idx, test_idx pairs Assumes the index contains a level labeled 'date'"""

    def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
        self.n_splits = n_splits
        self.test_period_length = test_period_length
        self.shuffle = shuffle
        self.test_end = n_splits * test_period_length

    @staticmethod
    def chunks(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def split(self, X, y=None, groups=None):
        unique_dates = (X.index.get_level_values('date').unique().sort_values(ascending=False)
        [:self.test_end])

        dates = X.reset_index()[['date']]
        for test_date in self.chunks(unique_dates, self.test_period_length):
            train_idx = dates[dates.date < min(test_date)].index
            test_idx = dates[dates.date.isin(test_date)].index
            if self.shuffle:
                np.random.shuffle(list(train_idx))
            yield train_idx, test_idx

    def get_n_splits(self, X, y, groups=None):
        return self.n_splits

# Instantiate GradientBoostingClassifier
# loss{‘deviance’, ‘exponential’} : deviance for logistic regression, exponential for AdaBoost
gb_clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                    criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                    init=None, random_state=None, max_features=None, verbose=0,
                                    max_leaf_nodes=None, warm_start=False,
                                    validation_fraction=0.1, n_iter_no_change=None, tol=0.0001)

In [5]:
DATA_STORE = Path('data/assets.h5')

def get_data(start='2010', end='2018', holding_period=1, dropna=False):
    idx = pd.IndexSlice
    target = f'target_{holding_period}m'
    with pd.HDFStore(DATA_STORE) as store:
        print(store.info())
        df = store['engineered_features']

    print(df.tail())

    if start is not None and end is not None:
        df = df.loc[idx[:, start: end], :]
    if dropna:
        df = df.dropna()

    y = (df[target] > 0).astype(int)
    X = df.drop([c for c in df.columns if c.startswith('target')], axis=1)
    return y, X

n_splits = 12
cv = OneStepTimeSeriesSplit(n_splits=n_splits)
y, features = get_data()
X = get_one_hot_data(features).dropna()

y, X, y_test, X_test = get_holdout_set(target=y, features=X)

data_path = Path('data')
if not data_path.exists():
    data_path.mkdir()

with pd.HDFStore(data_path / 'tuning_sklearn_gbm.h5') as store:
    store.put('holdout/features', X_test)
    store.put('holdout/target', y_test)
    store.put('cv/target', y)
    store.put('cv/features', X)

<class 'pandas.io.pytables.HDFStore'>
File path: data\assets.h5
/engineered_features                        frame        (shape->[358914,33])                                                                
/quandl/wiki/prices                         frame        (shape->[15389314,12])                                                              
/quandl/wiki/stocks                         frame        (shape->[1,2])                                                                      
/sp500/fred                                 frame        (shape->[2608,1])                                                                   
/sp500/stocks                               frame        (shape->[503,7])                                                                    
/sp500/stooq                                frame        (shape->[18517,5])                                                                  
/stooq/jp/tse/stocks/prices                 frame_table  (typ->appendable_multi,nrow

In [6]:
X.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,return_1m,return_2m,return_3m,return_6m,return_9m,return_12m,mkt-rf,smb,hml,rmw,...,consumer_non-durables,consumer_services,energy,finance,health_care,miscellaneous,public_utilities,technology,transportation,unknown
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ZUMZ,2017-03-31,-0.102941,-0.044637,-0.057387,0.002759,0.027704,-0.007044,1.005997,0.130166,-0.826775,2.247905,...,0,1,0,0,0,0,0,0,0,0
ZUMZ,2017-04-30,-0.019126,-0.061969,-0.036208,-0.035159,0.006192,0.005633,1.198188,-0.329949,-0.566054,1.931023,...,0,1,0,0,0,0,0,0,0,0
ZUMZ,2017-05-31,-0.206128,-0.117567,-0.112718,-0.088212,-0.017474,-0.003599,1.260955,-0.792299,-0.087712,1.830295,...,0,1,0,0,0,0,0,0,0,0
ZUMZ,2017-06-30,-0.133333,-0.170529,-0.122854,-0.09071,-0.040993,-0.0122,1.210833,-0.605,0.128989,1.819012,...,0,1,0,0,0,0,0,0,0,0
ZUMZ,2017-07-31,0.02834,-0.055951,-0.108927,-0.073281,-0.060403,-0.023912,1.269729,-0.7086,0.121143,1.964291,...,0,1,0,0,0,0,0,0,0,0


In [11]:
# Parameter Grid - reduced several params coz of time-consuming from original params
param_grid = dict(learning_rate=[.01, .1], max_depth=list(range(4, 9, 4)), max_features=['sqrt', 1],
        min_impurity_decrease=[.01], min_samples_split=[10, 50], n_estimators=[100], subsample=[.8, 1])
all_params = list(product(*param_grid.values()))
print('# Models = :', len(all_params))

# Models = : 32


In [12]:
gs = GridSearchCV(gb_clf, param_grid, cv=cv, scoring='roc_auc', verbose=3, n_jobs=-1, return_train_score=True)

start = time()
gs.fit(X=X, y=y)
done = time()

print(f'Done in {done:.2f}s')
joblib.dump(gs, 'results/sklearn_gbm_gridsearch.joblib')

Fitting 12 folds for each of 32 candidates, totalling 384 fits
Done in 1684421634.52s


['results/sklearn_gbm_gridsearch.joblib']