In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')
from src import utils

In [2]:
DATA        = Path('data')
RAW         = DATA/'raw'
INTERIM     = DATA/'interim'
PROCESSED   = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

In [3]:
from src.utils import get_weeks
# week_labels = get_weeks(day_from=20160104, num_weeks=121)[52:]
week_labels = get_weeks(day_from=20160104, num_weeks=121)[96:]
# week_labels = get_weeks(day_from=20160104, num_weeks=121)[104:]

In [4]:
print(week_labels)

[20171106, 20171113, 20171120, 20171127, 20171204, 20171211, 20171218, 20171225, 20180101, 20180108, 20180115, 20180122, 20180129, 20180205, 20180212, 20180219, 20180226, 20180305, 20180312, 20180319, 20180326, 20180402, 20180409, 20180416, 20180423]


In [5]:
%%time
weeks = pd.DataFrame()
for name in week_labels[:-1]:
    weeks = pd.concat([weeks, pd.read_feather(
        PROCESSED/f'SVD_17-18_72f/week_{name}_SVD_diffscount.feather')])

CPU times: user 8min 8s, sys: 2min 41s, total: 10min 50s
Wall time: 5min 37s


In [6]:
test = pd.read_feather(
    PROCESSED/f'SVD_17-18_72f/week_{week_labels[-1]}_SVD_diffscount.feather')

## Preprocessing

In [7]:
cat_cols = ['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 
            'TickerIdx', 'Seniority', 'Currency', 'ActivityGroup', 
            'Region_y', 'Activity', 'RiskCaptain', 'Owner', 
            'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType']

In [8]:
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']
target_col = 'CustomerInterest'
pred_col = 'PredictionIdx'

In [9]:
%%time
from src.utils import apply_cats
for col in cat_cols:
    test[col] = test[col].astype('category').cat.as_ordered()
apply_cats(weeks, test)

for col in cat_cols:
    weeks[col] = weeks[col].cat.codes
    test[col] = test[col].cat.codes

CPU times: user 48.5 s, sys: 5.76 s, total: 54.3 s
Wall time: 24.9 s


## Model

In [10]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

In [11]:
%%time
val_set = []
train_auc = []
val_auc = []
for i in range(-5, -1):
    train, val = weeks[weeks.TradeDateKey<week_labels[i]], \
                 weeks[(weeks.TradeDateKey<week_labels[i+1]) & \
                       (weeks.TradeDateKey>=week_labels[i])]
    print(train.TradeDateKey.min(), train.TradeDateKey.max(), 
          val.TradeDateKey.unique())
    val_set.append(val.TradeDateKey.unique()[0])
    y_train = train[target_col]
    train.drop(id_cols + [target_col], axis=1, inplace=True)
    y_val = val[target_col]
    val.drop(id_cols + [target_col], axis=1, inplace=True)

    model = LGBMClassifier(n_estimators=400)
    model.fit(train, y_train, eval_metric='auc', verbose=20,
              eval_set=[(val, y_val)], early_stopping_rounds=30)
    
    y_pred = model.predict_proba(train)[:,1]
    train_auc.append(roc_auc_score(y_train, y_pred))
    print('Train AUC: ', train_auc[-1])
    y_pred = model.predict_proba(val)[:,1]
    val_auc.append(roc_auc_score(y_val, y_pred))
    print('Val AUC: ', val_auc[-1])
    print()
    
    del model, train, y_train, val, y_val, y_pred

20171106 20180319 [20180326]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training until validation scores don't improve for 30 rounds.
[20]	valid_0's auc: 0.771599
[40]	valid_0's auc: 0.806475
[60]	valid_0's auc: 0.816678
[80]	valid_0's auc: 0.822398
[100]	valid_0's auc: 0.824564
[120]	valid_0's auc: 0.826187
[140]	valid_0's auc: 0.826628
[160]	valid_0's auc: 0.827237
[180]	valid_0's auc: 0.826049
Early stopping, best iteration is:
[163]	valid_0's auc: 0.827352
Train AUC:  0.7980312197668828
Val AUC:  0.8273520913252462

20171106 20180326 [20180402]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training until validation scores don't improve for 30 rounds.
[20]	valid_0's auc: 0.789657
[40]	valid_0's auc: 0.809884
[60]	valid_0's auc: 0.815839
[80]	valid_0's auc: 0.81836
[100]	valid_0's auc: 0.820041
[120]	valid_0's auc: 0.819974
[140]	valid_0's auc: 0.821246
[160]	valid_0's auc: 0.821957
[180]	valid_0's auc: 0.822164
[200]	valid_0's auc: 0.82264
[220]	valid_0's auc: 0.823098
[240]	valid_0's auc: 0.824027
[260]	valid_0's auc: 0.824055
Early stopping, best iteration is:
[231]	valid_0's auc: 0.824267
Train AUC:  0.8062227442047145
Val AUC:  0.824267377403749

20171106 20180402 [20180409]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training until validation scores don't improve for 30 rounds.
[20]	valid_0's auc: 0.819693
[40]	valid_0's auc: 0.833246
[60]	valid_0's auc: 0.839031
[80]	valid_0's auc: 0.842935
[100]	valid_0's auc: 0.844152
[120]	valid_0's auc: 0.846077
[140]	valid_0's auc: 0.84763
[160]	valid_0's auc: 0.848228
[180]	valid_0's auc: 0.848309
[200]	valid_0's auc: 0.848596
[220]	valid_0's auc: 0.849268
[240]	valid_0's auc: 0.849654
[260]	valid_0's auc: 0.850118
[280]	valid_0's auc: 0.850131
[300]	valid_0's auc: 0.850979
[320]	valid_0's auc: 0.851299
[340]	valid_0's auc: 0.851385
[360]	valid_0's auc: 0.8515
[380]	valid_0's auc: 0.851636
[400]	valid_0's auc: 0.851637
Did not meet early stopping. Best iteration is:
[376]	valid_0's auc: 0.851642
Train AUC:  0.8161902543359266
Val AUC:  0.8516418834039483

20171106 20180409 [20180416]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training until validation scores don't improve for 30 rounds.
[20]	valid_0's auc: 0.841922
[40]	valid_0's auc: 0.854103
[60]	valid_0's auc: 0.859743
[80]	valid_0's auc: 0.864081
[100]	valid_0's auc: 0.86642
[120]	valid_0's auc: 0.868242
[140]	valid_0's auc: 0.867831
[160]	valid_0's auc: 0.868615
Early stopping, best iteration is:
[137]	valid_0's auc: 0.869374
Train AUC:  0.8038221068804319
Val AUC:  0.8693741344750291

CPU times: user 2h 20min 40s, sys: 3min 2s, total: 2h 23min 42s
Wall time: 13min 10s


In [12]:
results = pd.DataFrame()
results['val_set'] = [20180326, 20180402, 20180409, 20180416]
results['train_auc'] = [0.7980312197668828, 0.8062227442047145, 0.8161902543359266, 0.8038221068804319]
results['val_auc'] = [0.8273520913252462, 0.824267377403749, 0.8516418834039483, 0.8693741344750291]
results['iterations'] = [163, 231, 376, 137]

In [13]:
results

Unnamed: 0,val_set,train_auc,val_auc,iterations
0,20180326,0.798031,0.827352,163
1,20180402,0.806223,0.824267,231
2,20180409,0.81619,0.851642,376
3,20180416,0.803822,0.869374,137
