In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')
from src import utils

In [2]:
DATA        = Path('data')
RAW         = DATA/'raw'
INTERIM     = DATA/'interim'
PROCESSED   = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

In [18]:
from src.utils import get_weeks
# week_labels = get_weeks(day_from=20160104, num_weeks=121)[52:]
week_labels = get_weeks(day_from=20160104, num_weeks=121)[96:]

In [19]:
print(week_labels)

[20171106, 20171113, 20171120, 20171127, 20171204, 20171211, 20171218, 20171225, 20180101, 20180108, 20180115, 20180122, 20180129, 20180205, 20180212, 20180219, 20180226, 20180305, 20180312, 20180319, 20180326, 20180402, 20180409, 20180416, 20180423]


In [15]:
%%time
weeks = []
for name in week_labels:
    weeks.append(pd.read_feather(PROCESSED/f'all_weeks/week_{name}_diffscount.feather'))

CPU times: user 4.39 s, sys: 5.08 s, total: 9.47 s
Wall time: 21.4 s


## SVD features

In [19]:
from surprise import dump
_, svd = dump.load(PROCESSED/'2017_2018_IsinBuySell_SVD.dump')

In [20]:
uid = svd.trainset._raw2inner_id_users
iid = svd.trainset._raw2inner_id_items

In [21]:
%%time
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
for w in weeks:
    w['SVD_CustomerBias'] = [svd.bu[uid[x]] for x in w.CustomerIdx]
    w['SVD_IsinBuySellBias'] = [svd.bi[iid[f'{iIdx}-{bIdx}']] \
                                for iIdx, bIdx in zip(w.IsinIdx, w.BuySell)]
    w['SVD_Recommend'] = [svd.predict(cIdx, f'{iIdx}-{bIdx}').est \
                          for cIdx, iIdx, bIdx in zip(w.CustomerIdx, w.IsinIdx, w.BuySell)]

CPU times: user 3min 47s, sys: 2.29 s, total: 3min 49s
Wall time: 3min 49s


In [33]:
%%time
for n, w in zip(week_labels, weeks):
    customer_factors = np.array([svd.pu[uid[cIdx]] for cIdx in w.CustomerIdx])
    isin_factors = np.array([svd.qi[iid[f'{iIdx}-{bIdx}']] \
                             for iIdx, bIdx in zip(w.IsinIdx, w.BuySell)])
    for i in range(customer_factors.shape[1]):
        w[f'SVD_CustomerFactor{i:02}'] = customer_factors[:,i]
    for i in range(isin_factors.shape[1]):
        w[f'SVD_IsinBuySellFactor{i:02}'] = isin_factors[:,i]

CPU times: user 1min 7s, sys: 5.03 s, total: 1min 12s
Wall time: 1min 12s


## Extra features

In [36]:
%%time
for w in weeks:
    w['Year'] = w.TradeDateKey.apply(lambda x: x // 10**4)
    w['Month'] = w.TradeDateKey.apply(lambda x: x // 10**4 % 10**2)
    w['Day'] = w.TradeDateKey.apply(lambda x: x % 10**2)

CPU times: user 26.4 s, sys: 616 ms, total: 27 s
Wall time: 27 s


In [44]:
%%time
from src.utils import date_diff
for n, w in zip(week_labels, weeks):
    w['BondDuration']  = [date_diff(x, y) for x, y in zip(w.ActualMaturityDateKey, w.IssueDateKey)]
    w['BondRemaining'] = [date_diff(x, y) for x, y in zip(w.ActualMaturityDateKey, w.TradeDateKey)]
    w['BondLife']      = [date_diff(x, y) for x, y in zip(w.TradeDateKey, w.IssueDateKey)]

CPU times: user 1min 42s, sys: 480 ms, total: 1min 43s
Wall time: 1min 43s


In [38]:
customer   = pd.read_csv(RAW/'Customer.csv', low_memory=False)
isin       = pd.read_csv(RAW/'Isin.csv', low_memory=False)
trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)

In [39]:
%%time
from src.utils import preprocessing_pipeline
for i, w in enumerate(weeks):
    weeks[i] = preprocessing_pipeline(w, customer, isin, trade)

CPU times: user 30.4 s, sys: 4.22 s, total: 34.6 s
Wall time: 34.6 s


## Preprocessing

In [40]:
from functools import cmp_to_key
from src.utils import composite_rating_cmp
ratings = list(isin.CompositeRating.value_counts().index)
ratings = sorted(ratings, key=cmp_to_key(composite_rating_cmp), reverse=True)
rank = {k: i for i, k in enumerate(ratings)}

In [41]:
%%time
for w in weeks:
    w['CompositeRating']  = w.CompositeRating.apply(lambda x: rank[x])

CPU times: user 9.92 s, sys: 872 ms, total: 10.8 s
Wall time: 10.8 s


In [52]:
weeks[0].columns

Index(['TradeDateKey', 'CustomerIdx', 'IsinIdx', 'BuySell', 'CustomerInterest',
       'DaysSinceBuySell', 'DaysSinceTransaction', 'DaysSinceCustomerActivity',
       'DaysSinceBondActivity', 'DaysCountBuySell', 'DaysCountTransaction',
       'DaysCountCustomerActivity', 'DaysCountBondActivity',
       'SVD_CustomerBias', 'SVD_IsinBuySellBias', 'SVD_Recommend',
       'SVD_CustomerFactor00', 'SVD_CustomerFactor01', 'SVD_CustomerFactor02',
       'SVD_CustomerFactor03', 'SVD_CustomerFactor04', 'SVD_CustomerFactor05',
       'SVD_CustomerFactor06', 'SVD_CustomerFactor07', 'SVD_CustomerFactor08',
       'SVD_CustomerFactor09', 'SVD_CustomerFactor10', 'SVD_CustomerFactor11',
       'SVD_CustomerFactor12', 'SVD_CustomerFactor13', 'SVD_CustomerFactor14',
       'SVD_IsinBuySellFactor00', 'SVD_IsinBuySellFactor01',
       'SVD_IsinBuySellFactor02', 'SVD_IsinBuySellFactor03',
       'SVD_IsinBuySellFactor04', 'SVD_IsinBuySellFactor05',
       'SVD_IsinBuySellFactor06', 'SVD_IsinBuySellFactor07

In [57]:
%%time
for w, n in zip(weeks, week_labels):
    w.reset_index(drop=True).to_feather(PROCESSED/f'SVD_17-18_72f/week_{n}_SVD_diffscount.feather')

CPU times: user 1min 43s, sys: 37.8 s, total: 2min 20s
Wall time: 2min 37s


In [20]:
%%time
### Checkpoint
weeks = []
for name in week_labels:
    weeks.append(pd.read_feather(PROCESSED/f'SVD_17-18_72f/week_{name}_SVD_diffscount.feather'))

CPU times: user 14 s, sys: 13.3 s, total: 27.3 s
Wall time: 49.1 s


In [21]:
len(weeks), len(week_labels)

(25, 25)

## Fill nan

In [9]:
%%time
nan_cols = []
for n, w in zip(week_labels, weeks):
    print(n)
    x = w.apply(lambda x: sum(x.isnull()), axis=0)
    nan_cols.extend(x[x>0].index)
nan_cols = [x for x in set(nan_cols) if x != target_col]

20170102
20170109
20170116
20170123
20170130
20170206
20170213
20170220
20170227
20170306
20170313
20170320
20170327
20170403
20170410
20170417
20170424
20170501
20170508
20170515
20170522
20170529
20170605
20170612
20170619
20170626
20170703
20170710
20170717
20170724
20170731
20170807
20170814
20170821
20170828
20170904
20170911
20170918
20170925
20171002
20171009
20171016
20171023
20171030
20171106
20171113
20171120
20171127
20171204
20171211
20171218
20171225
20180101
20180108
20180115
20180122
20180129
20180205
20180212
20180219
20180226
20180305
20180312
20180319
20180326
20180402
20180409
20180416
20180423


NameError: name 'target_col' is not defined

In [22]:
nan_cols = ['Subsector', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue']

In [23]:
%%time
# fill nulls
for w in weeks:
    w.fillna({x: '-999' for x in nan_cols}, inplace=True)

CPU times: user 2.04 s, sys: 8 ms, total: 2.05 s
Wall time: 2.05 s


## Model

In [24]:
cat_cols = ['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 
            'TickerIdx', 'Seniority', 'Currency', 'ActivityGroup', 
            'Region_y', 'Activity', 'RiskCaptain', 'Owner', 
            'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType']

In [25]:
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']
target_col = 'CustomerInterest'
pred_col = 'PredictionIdx'

In [26]:
columns = list(weeks[0].drop(id_cols + [target_col], axis=1))
cat_indices = [i for i, col in enumerate(columns) if col in cat_cols]

In [27]:
cat_indices

[0, 45, 46, 47, 48, 49, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 64]

In [28]:
from src.utils import run_model, alert
from catboost import CatBoostClassifier
metric_names = ['auc']

In [29]:
len(weeks)

25

In [None]:
%%time
results = None
output = []
for i in range(-8,-1): 
    train, val = pd.concat(weeks[:i]), weeks[i]
    print(train['TradeDateKey'].min(), train['TradeDateKey'].max(),
          val['TradeDateKey'].unique())
    X_train, y_train = train.drop(id_cols + [target_col], axis=1), \
                       train[target_col]
    X_val, y_val = val.drop(id_cols + [target_col], axis=1), \
                       val[target_col]
            
    _, _, results, model = run_model(
            CatBoostClassifier(iterations=1000, od_pval=0.00001, eval_metric='AUC'),
            X_train, y_train, X_val, y_val, None,
            metric_names, results, 
            params_desc='iterations=1000, od_pval=1e-5',
            dataset_desc=f'{week_labels[i]}_NMF_diffcounts', 
            early_stopping_rounds=None, cat_indices=cat_indices)
    alert()
    output.append(model)

20171106 20180226 [20180305]
CatBoostClassifier 

0:	learn: 0.6473608	test: 0.5855733	best: 0.5855733 (0)	total: 11.5s	remaining: 3h 12m 18s
1:	learn: 0.6500935	test: 0.5880465	best: 0.5880465 (1)	total: 23.2s	remaining: 3h 12m 35s
2:	learn: 0.6524292	test: 0.5896734	best: 0.5896734 (2)	total: 34s	remaining: 3h 8m 30s
3:	learn: 0.6528084	test: 0.5896937	best: 0.5896937 (3)	total: 44.9s	remaining: 3h 6m 18s
4:	learn: 0.6769598	test: 0.6571048	best: 0.6571048 (4)	total: 55.6s	remaining: 3h 4m 22s
5:	learn: 0.6940773	test: 0.6607069	best: 0.6607069 (5)	total: 1m 6s	remaining: 3h 3m 40s
6:	learn: 0.7060912	test: 0.6975377	best: 0.6975377 (6)	total: 1m 18s	remaining: 3h 4m 31s
7:	learn: 0.7133414	test: 0.7050381	best: 0.7050381 (7)	total: 1m 28s	remaining: 3h 3m 51s
8:	learn: 0.7088785	test: 0.7027688	best: 0.7050381 (7)	total: 1m 40s	remaining: 3h 4m 3s
9:	learn: 0.7075831	test: 0.6986017	best: 0.7050381 (7)	total: 1m 51s	remaining: 3h 4m 5s
10:	learn: 0.7127212	test: 0.7088552	best: 0.708

20171106 20180305 [20180312]
CatBoostClassifier 

0:	learn: 0.6639480	test: 0.7014345	best: 0.7014345 (0)	total: 12s	remaining: 3h 20m 5s
1:	learn: 0.6635463	test: 0.6759853	best: 0.7014345 (0)	total: 24.5s	remaining: 3h 23m 24s
2:	learn: 0.6741286	test: 0.6774013	best: 0.7014345 (0)	total: 36.8s	remaining: 3h 23m 40s
3:	learn: 0.6887452	test: 0.7125584	best: 0.7125584 (3)	total: 48.5s	remaining: 3h 21m 21s
4:	learn: 0.6885201	test: 0.7105030	best: 0.7125584 (3)	total: 1m	remaining: 3h 19m 29s
5:	learn: 0.7058440	test: 0.7116732	best: 0.7125584 (3)	total: 1m 12s	remaining: 3h 18m 54s
6:	learn: 0.7161970	test: 0.7292235	best: 0.7292235 (6)	total: 1m 23s	remaining: 3h 17m 42s
7:	learn: 0.7170275	test: 0.7261859	best: 0.7292235 (6)	total: 1m 35s	remaining: 3h 16m 43s
8:	learn: 0.7166586	test: 0.7243798	best: 0.7292235 (6)	total: 1m 46s	remaining: 3h 15m 56s
9:	learn: 0.7257211	test: 0.7329293	best: 0.7329293 (9)	total: 1m 58s	remaining: 3h 15m 2s
10:	learn: 0.7282713	test: 0.7392306	best:

20171106 20180312 [20180319]
CatBoostClassifier 

0:	learn: 0.6692737	test: 0.7063825	best: 0.7063825 (0)	total: 13.1s	remaining: 3h 38m 4s
1:	learn: 0.6734008	test: 0.6963021	best: 0.7063825 (0)	total: 25.6s	remaining: 3h 32m 46s
2:	learn: 0.6835937	test: 0.7214292	best: 0.7214292 (2)	total: 39s	remaining: 3h 35m 51s
3:	learn: 0.6942431	test: 0.7398386	best: 0.7398386 (3)	total: 51.4s	remaining: 3h 33m 8s
4:	learn: 0.7133765	test: 0.7374517	best: 0.7398386 (3)	total: 1m 3s	remaining: 3h 30m 46s
5:	learn: 0.7303198	test: 0.7553720	best: 0.7553720 (5)	total: 1m 16s	remaining: 3h 30m 28s
6:	learn: 0.7288749	test: 0.7535332	best: 0.7553720 (5)	total: 1m 28s	remaining: 3h 29m 18s
7:	learn: 0.7296259	test: 0.7554230	best: 0.7554230 (7)	total: 1m 41s	remaining: 3h 29m 34s
8:	learn: 0.7289986	test: 0.7596439	best: 0.7596439 (8)	total: 1m 53s	remaining: 3h 28m 55s
9:	learn: 0.7333569	test: 0.7636150	best: 0.7636150 (9)	total: 2m 6s	remaining: 3h 28m 57s
10:	learn: 0.7325815	test: 0.7618889	bes

20171106 20180319 [20180326]
CatBoostClassifier 

0:	learn: 0.6584321	test: 0.6510376	best: 0.6510376 (0)	total: 14.4s	remaining: 3h 59m 2s
1:	learn: 0.6817893	test: 0.6969759	best: 0.6969759 (1)	total: 28.2s	remaining: 3h 54m 40s
2:	learn: 0.6934557	test: 0.6820993	best: 0.6969759 (1)	total: 42.1s	remaining: 3h 53m 4s
3:	learn: 0.6922865	test: 0.6775897	best: 0.6969759 (1)	total: 55.5s	remaining: 3h 50m 27s
4:	learn: 0.7171814	test: 0.6769237	best: 0.6969759 (1)	total: 1m 9s	remaining: 3h 49m
5:	learn: 0.7169817	test: 0.6783844	best: 0.6969759 (1)	total: 1m 22s	remaining: 3h 48m 13s
6:	learn: 0.7284501	test: 0.7249692	best: 0.7249692 (6)	total: 1m 36s	remaining: 3h 47m 43s
7:	learn: 0.7279158	test: 0.7260024	best: 0.7260024 (7)	total: 1m 50s	remaining: 3h 47m 40s
8:	learn: 0.7263309	test: 0.7155277	best: 0.7260024 (7)	total: 2m 3s	remaining: 3h 46m 52s
9:	learn: 0.7258080	test: 0.7127757	best: 0.7260024 (7)	total: 2m 16s	remaining: 3h 45m 35s
10:	learn: 0.7251432	test: 0.7123817	best:

In [55]:
len(weeks), len(week_labels)

(69, 69)