In [1]:
# Reload imported modules automatically before each cell runs (autoreload
# mode 2), so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for high-resolution ('retina') figures.
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Display settings: let pandas show up to 1000 columns and 400 rows before
# truncating, and apply seaborn's default plot styling.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move two levels up (presumably to the project root) so that the `src`
# package and the relative 'data/...' paths used below resolve.
# The guard keeps this cell idempotent: an unconditional os.chdir('../..')
# climbs two more directories every time the cell is re-run, after which
# `from src import ...` and every relative path break.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [2]:
# Data directories, all relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / subdir for subdir in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
from src.utils import get_weeks

# get_weeks is asked for 121 weekly labels starting at 20160104; only the tail
# from FIRST_WEEK onwards is kept.
# With FIRST_WEEK = 96 that leaves 25 labels, 20171106 .. 20180423.
# Other offsets that were tried: 52 and 104.
FIRST_WEEK = 96
all_week_labels = get_weeks(day_from=20160104, num_weeks=121)
week_labels = all_week_labels[FIRST_WEEK:]

In [4]:
# Sanity check of the selected labels: all but the last are loaded as
# training/validation weeks, the last one is loaded separately as `test`.
print(week_labels)

[20171106, 20171113, 20171120, 20171127, 20171204, 20171211, 20171218, 20171225, 20180101, 20180108, 20180115, 20180122, 20180129, 20180205, 20180212, 20180219, 20180226, 20180305, 20180312, 20180319, 20180326, 20180402, 20180409, 20180416, 20180423]


In [5]:
%%time
weeks = pd.DataFrame()
for name in week_labels[:-1]:
    weeks = pd.concat([weeks, pd.read_feather(
        PROCESSED/f'SVD_17-18_72f/week_{name}_SVD_diffscount.feather')])

CPU times: user 8min 8s, sys: 2min 41s, total: 10min 50s
Wall time: 5min 37s


In [6]:
# The last week label is held out of `weeks` and loaded on its own as `test`.
test_path = PROCESSED/f'SVD_17-18_72f/week_{week_labels[-1]}_SVD_diffscount.feather'
test = pd.read_feather(test_path)

## Preprocessing

In [7]:
# Columns that the preprocessing step converts to categoricals and then
# replaces with integer category codes. Order is kept as originally listed.
cat_cols = [
    'BuySell',
    'Sector',
    'Subsector',
    'Region_x',
    'Country',
    'TickerIdx',
    'Seniority',
    'Currency',
    'ActivityGroup',
    'Region_y',
    'Activity',
    'RiskCaptain',
    'Owner',
    'IndustrySector',
    'IndustrySubgroup',
    'MarketIssue',
    'CouponType',
]

In [8]:
# Identifier columns: removed from the feature matrix before fitting.
id_cols = [
    "TradeDateKey",
    "CustomerIdx",
    "IsinIdx",
]
# Binary label the classifier is trained to predict.
target_col = "CustomerInterest"
# Name of the prediction-id column (not referenced in the cells shown here).
pred_col = "PredictionIdx"

In [9]:
%%time
from src.utils import apply_cats
for col in cat_cols:
    test[col] = test[col].astype('category').cat.as_ordered()
apply_cats(weeks, test)

for col in cat_cols:
    weeks[col] = weeks[col].cat.codes
    test[col] = test[col].cat.codes

CPU times: user 48.5 s, sys: 5.76 s, total: 54.3 s
Wall time: 24.9 s


## Model

In [14]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from src.utils import alert

In [15]:
%%time
val_set = []
train_auc = []
val_auc = []
for i in range(-5, -1):
    train, val = weeks[weeks.TradeDateKey<week_labels[i]], \
                 weeks[(weeks.TradeDateKey<week_labels[i+1]) & \
                       (weeks.TradeDateKey>=week_labels[i])]
    print(train.TradeDateKey.min(), train.TradeDateKey.max(), 
          val.TradeDateKey.unique())
    val_set.append(val.TradeDateKey.unique()[0])
    y_train = train[target_col]
    train.drop(id_cols + [target_col], axis=1, inplace=True)
    y_val = val[target_col]
    val.drop(id_cols + [target_col], axis=1, inplace=True)

    model = LGBMClassifier(n_estimators=400, max_depth=30, 
                           random_state=42, reg_alpha=1, reg_lambda=1,
                           colsample_by_tree=0.8)
    model.fit(train, y_train, eval_metric='auc', verbose=20,
              eval_set=[(val, y_val)], early_stopping_rounds=30)
    
    y_pred = model.predict_proba(train)[:,1]
    train_auc.append(roc_auc_score(y_train, y_pred))
    print('Train AUC: ', train_auc[-1])
    y_pred = model.predict_proba(val)[:,1]
    val_auc.append(roc_auc_score(y_val, y_pred))
    print('Val AUC: ', val_auc[-1])
    print()
    
    del model, train, y_train, val, y_val, y_pred
alert()

20171106 20180319 [20180326]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training until validation scores don't improve for 30 rounds.
[20]	valid_0's auc: 0.768711
[40]	valid_0's auc: 0.807894
[60]	valid_0's auc: 0.818104
[80]	valid_0's auc: 0.823093
[100]	valid_0's auc: 0.824779
[120]	valid_0's auc: 0.825918
[140]	valid_0's auc: 0.826365
[160]	valid_0's auc: 0.826911
[180]	valid_0's auc: 0.826778
[200]	valid_0's auc: 0.827526
[220]	valid_0's auc: 0.82819
[240]	valid_0's auc: 0.828357
[260]	valid_0's auc: 0.828797
[280]	valid_0's auc: 0.829041
[300]	valid_0's auc: 0.82939
[320]	valid_0's auc: 0.829768
[340]	valid_0's auc: 0.830076
[360]	valid_0's auc: 0.830191
[380]	valid_0's auc: 0.830427
[400]	valid_0's auc: 0.83076
Did not meet early stopping. Best iteration is:
[397]	valid_0's auc: 0.830858
Train AUC:  0.814085351545663
Val AUC:  0.8308576246577456

20171106 20180326 [20180402]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training until validation scores don't improve for 30 rounds.
[20]	valid_0's auc: 0.789749
[40]	valid_0's auc: 0.808376
[60]	valid_0's auc: 0.81554
[80]	valid_0's auc: 0.81949
[100]	valid_0's auc: 0.821305
[120]	valid_0's auc: 0.822107
[140]	valid_0's auc: 0.824345
[160]	valid_0's auc: 0.824478
[180]	valid_0's auc: 0.825238
[200]	valid_0's auc: 0.825254
[220]	valid_0's auc: 0.826403
[240]	valid_0's auc: 0.826401
[260]	valid_0's auc: 0.826467
[280]	valid_0's auc: 0.826176
Early stopping, best iteration is:
[261]	valid_0's auc: 0.826517
Train AUC:  0.8085354190344088
Val AUC:  0.8265170096820063

20171106 20180402 [20180409]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training until validation scores don't improve for 30 rounds.
[20]	valid_0's auc: 0.820672
[40]	valid_0's auc: 0.833682
[60]	valid_0's auc: 0.83898
[80]	valid_0's auc: 0.842039
[100]	valid_0's auc: 0.844371
[120]	valid_0's auc: 0.846007
[140]	valid_0's auc: 0.846573
[160]	valid_0's auc: 0.847067
[180]	valid_0's auc: 0.847863
[200]	valid_0's auc: 0.848119
[220]	valid_0's auc: 0.848677
[240]	valid_0's auc: 0.848639
[260]	valid_0's auc: 0.849087
[280]	valid_0's auc: 0.849199
[300]	valid_0's auc: 0.84955
[320]	valid_0's auc: 0.84987
[340]	valid_0's auc: 0.84982
Early stopping, best iteration is:
[328]	valid_0's auc: 0.850013
Train AUC:  0.8146025107778859
Val AUC:  0.8500130873334503

20171106 20180409 [20180416]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Training until validation scores don't improve for 30 rounds.
[20]	valid_0's auc: 0.843182
[40]	valid_0's auc: 0.854489
[60]	valid_0's auc: 0.859722
[80]	valid_0's auc: 0.864106
[100]	valid_0's auc: 0.86584
[120]	valid_0's auc: 0.867652
[140]	valid_0's auc: 0.868589
[160]	valid_0's auc: 0.869524
[180]	valid_0's auc: 0.869834
[200]	valid_0's auc: 0.870547
[220]	valid_0's auc: 0.871157
[240]	valid_0's auc: 0.871417
[260]	valid_0's auc: 0.871892
[280]	valid_0's auc: 0.872228
[300]	valid_0's auc: 0.872479
[320]	valid_0's auc: 0.87269
[340]	valid_0's auc: 0.873121
[360]	valid_0's auc: 0.873409
[380]	valid_0's auc: 0.873617
[400]	valid_0's auc: 0.873756
Did not meet early stopping. Best iteration is:
[400]	valid_0's auc: 0.873756
Train AUC:  0.8211784976806455
Val AUC:  0.8737564989284256



CPU times: user 3h 9min 4s, sys: 3min 30s, total: 3h 12min 34s
Wall time: 17min 14s


In [23]:
# One row per validation fold of the loop above.
# The iteration counts are copied by hand from the "best iteration" lines of
# the training log and must be updated whenever the loop is re-run.
results = pd.DataFrame({
    'val_set': val_set,
    'train_auc': train_auc,
    'val_auc': val_auc,
    'iterations': [397,261,328,400],
})

In [21]:
# Hard-coded per-fold metrics, presumably recorded from an earlier run of the
# validation loop (the model settings of that run are not recorded here).
# They are kept under their own name: assigning them to `results` would
# overwrite the table built from the current run when the notebook is executed
# top to bottom, and the later summary would then show these numbers instead.
baseline_results = pd.DataFrame({
    'val_set': [20180326, 20180402, 20180409, 20180416],
    'train_auc': [0.7980312197668828, 0.8062227442047145, 0.8161902543359266, 0.8038221068804319],
    'val_auc': [0.8273520913252462, 0.824267377403749, 0.8516418834039483, 0.8693741344750291],
    'iterations': [163, 231, 376, 137],
})

# Fold-averaged train AUC, validation AUC and iteration count, then the table.
print(baseline_results.train_auc.mean(), baseline_results.val_auc.mean(),
      baseline_results.iterations.mean())
baseline_results

0.8060665812969889 0.8431588716519931 226.75


Unnamed: 0,val_set,train_auc,val_auc,iterations
0,20180326,0.798031,0.827352,163
1,20180402,0.806223,0.824267,231
2,20180409,0.81619,0.851642,376
3,20180416,0.803822,0.869374,137


In [24]:
# Fold-averaged metrics for the run whose model was configured with
# (n_estimators=400, max_depth=30, random_state=42,
#  reg_alpha=1, reg_lambda=1, colsample_by_tree=0.8)
# NOTE(review): `colsample_by_tree` is not LightGBM's `colsample_bytree`
# spelling, so column subsampling was presumably not in effect for the
# numbers recorded below - confirm.
mean_train_auc = results.train_auc.mean()
mean_val_auc = results.val_auc.mean()
mean_iterations = results.iterations.mean()
print(mean_train_auc, mean_val_auc, mean_iterations)
results

0.8146004447596509 0.8452860551504069 346.5


Unnamed: 0,val_set,train_auc,val_auc,iterations
0,20180326,0.814085,0.830858,397
1,20180402,0.808535,0.826517,261
2,20180409,0.814603,0.850013,328
3,20180416,0.821178,0.873756,400
