In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Show wide / long frames in full when they are displayed below.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move to the project root so that `src` is importable and the relative
# `data/...` paths resolve. Guarded so that re-running this cell in a live
# kernel does not climb two more directories.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [2]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / stage for stage in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
from src.utils import get_weeks, week_num
# Build the full label list first, then keep only the entries from index 104 on.
# NOTE(review): presumably these are week-boundary date keys consumed by
# `week_num` — confirm in src.utils.
all_week_labels = get_weeks(day_from=20160104, num_weeks=121)
week_labels = all_week_labels[104:]

In [4]:
# Load the train / validation metadata frames from the interim directory.
train = pd.read_feather(INTERIM/'metadata_train.feather')
val = pd.read_feather(INTERIM/'metadata_val.feather')
# NOTE(review): `test` is read from the *validation* file, so it is a copy of
# `val`. If a dedicated test split exists this path should point at it — confirm.
test = pd.read_feather(INTERIM/'metadata_val.feather')

In [5]:
# Columns treated as categorical features.
cat_cols = [
    'Sector', 'Subsector', 'Region_x', 'Country', 'TickerIdx', 'Seniority',
    'Currency', 'ActivityGroup', 'Region_y', 'Activity', 'RiskCaptain',
    'Owner', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType',
]
# Columns treated as numeric features.
num_cols = [
    'ActualMaturityDateKey', 'IssueDateKey', 'CompositeRating',
    'IssuedAmount', 'BondDuration',
]
# Columns that identify one (customer, bond, direction) sample.
id_cols = ['CustomerIdx', 'IsinIdx', 'BuySell']

### Weekly interest

In [6]:
import pickle

# Load the cached interest sequences.
# NOTE(review): unpickling runs arbitrary code — only open files produced by
# this project.
with (INTERIM/'interest_sequences.pkl').open('rb') as fh:
    seq_dict = pickle.load(fh)

### Weekly number of transactions (customer, isin, buysell)

In [8]:
%%time
trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)
weekly_trades = trade[trade.TradeDateKey > 20180000].copy()
weekly_trades['week'] = weekly_trades.TradeDateKey.apply(
                            lambda x: week_num(week_labels, x))
weekly_trades = weekly_trades.groupby(['CustomerIdx', 'IsinIdx', 'BuySell', 'week'],
                                      as_index=False)['CustomerInterest'].agg('sum')

CPU times: user 4.28 s, sys: 860 ms, total: 5.14 s
Wall time: 5.67 s


In [14]:
# Sequence length = number of distinct week indices present in the data.
# NOTE(review): this assumes the week indices are contiguous from 0; a week
# with no trades at all would make the sequences too short — confirm.
n_weeks = weekly_trades['week'].nunique()
weekly_trades.shape

(762667, 5)

In [22]:
# Distribution of the weekly summed interest per (customer, bond, direction).
weekly_trades['CustomerInterest'].describe()

count    762667.000000
mean          0.459075
std           0.799465
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          35.000000
Name: CustomerInterest, dtype: float64

In [16]:
%%time
challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)
transactions = {} # 5 GB of RAM
df = weekly_trades.drop_duplicates(['CustomerIdx', 'IsinIdx'])
for c, i in zip(df.CustomerIdx, df.IsinIdx):
    for b in ['Buy', 'Sell']:
        transactions[(c, i, b)] = [0] * n_weeks
df = challenge.drop_duplicates(['CustomerIdx', 'IsinIdx'])
for c, i in zip(df.CustomerIdx, df.IsinIdx):
    for b in ['Buy', 'Sell']:
        transactions[(c, i, b)] = [0] * n_weeks

CPU times: user 2.84 s, sys: 124 ms, total: 2.96 s
Wall time: 3.15 s


In [23]:
%%time
df = weekly_trades
for c, i, b, w, q in zip(df.CustomerIdx, df.IsinIdx, df.BuySell, \
                         df.week, df.CustomerInterest):
    transactions[(c, i, b)][w] = q

CPU times: user 624 ms, sys: 20 ms, total: 644 ms
Wall time: 643 ms


In [26]:
# Spot-check one (customer, bond, direction) sequence.
transactions[2223, 19020, 'Buy']

[0, 0, 0, 0, 0, 0, 0, 0, 3.0, 1.0, 0, 0, 0, 0, 0, 0]

In [27]:
import pickle

# Persist the per-(customer, bond, direction) sequences to the interim directory.
with (INTERIM/'transactions_sequences.pkl').open('wb') as fh:
    pickle.dump(transactions, fh, protocol=pickle.HIGHEST_PROTOCOL)

### Weekly transactions, Buy and Sell combined (customer, isin)

In [40]:
%%time
df = trade[trade.TradeDateKey > 20180000].copy()
df['week'] = df.TradeDateKey.apply(lambda x: week_num(week_labels, x))
df = df.groupby(['CustomerIdx', 'IsinIdx', 'week'],
                 as_index=False)['CustomerInterest'].agg('sum')

CPU times: user 856 ms, sys: 24 ms, total: 880 ms
Wall time: 878 ms


In [41]:
# Sequence length = number of distinct week indices in the aggregated frame.
n_weeks = df['week'].nunique()
df.shape

(750217, 4)

In [42]:
# Distribution of the weekly summed interest per (customer, bond).
df['CustomerInterest'].describe()

count    750217.000000
mean          0.466693
std           0.871653
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          61.000000
Name: CustomerInterest, dtype: float64

In [43]:
%%time
challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)
buysells = {} # 5 GB of RAM
df_unique = df.drop_duplicates(['CustomerIdx', 'IsinIdx'])
for c, i in zip(df_unique.CustomerIdx, df_unique.IsinIdx):
    buysells[(c, i)] = [0] * n_weeks
df_unique = challenge.drop_duplicates(['CustomerIdx', 'IsinIdx'])
for c, i in zip(df_unique.CustomerIdx, df_unique.IsinIdx):
    buysells[(c, i)] = [0] * n_weeks

CPU times: user 1.76 s, sys: 72 ms, total: 1.83 s
Wall time: 1.82 s


In [44]:
%%time
for c, i, w, q in zip(df.CustomerIdx, df.IsinIdx, \
                         df.week, df.CustomerInterest):
    buysells[(c, i)][w] = q

CPU times: user 428 ms, sys: 12 ms, total: 440 ms
Wall time: 439 ms


In [75]:
import pickle

# Persist the per-(customer, bond) sequences to the interim directory.
with (INTERIM/'buysells_sequences.pkl').open('wb') as fh:
    pickle.dump(buysells, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [148]:
# Spot-check one (customer, bond) sequence.
buysells[2223, 19020]

[0, 0, 0, 0, 0, 0, 0, 0, 3.0, 1.0, 0, 0, 0, 0, 0, 0]

### Customer transactions (customer)

In [53]:
%%time
df = trade[trade.TradeDateKey > 20180000].copy()
df['week'] = df.TradeDateKey.apply(lambda x: week_num(week_labels, x))
df = df.groupby(['CustomerIdx', 'week'],
                 as_index=False)['CustomerInterest'].agg('sum')

CPU times: user 616 ms, sys: 0 ns, total: 616 ms
Wall time: 614 ms


In [54]:
# Sequence length = number of distinct week indices in the per-customer frame.
n_weeks = df['week'].nunique()
df.shape

(19396, 3)

In [55]:
# Distribution of the weekly summed interest per customer.
df['CustomerInterest'].describe()

count    19396.000000
mean        18.051196
std         59.020797
min          0.000000
25%          1.000000
50%          4.000000
75%         13.000000
max       1986.000000
Name: CustomerInterest, dtype: float64

In [56]:
%%time
customers = {} # 5 GB of RAM
df_unique = df.drop_duplicates(['CustomerIdx'])
for c in df_unique.CustomerIdx:
    customers[c] = [0] * n_weeks
df_unique = challenge.drop_duplicates(['CustomerIdx'])
for c in df_unique.CustomerIdx:
    customers[c] = [0] * n_weeks

CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 19.9 ms


In [57]:
%%time
for c, w, q in zip(df.CustomerIdx, df.week, df.CustomerInterest):
    customers[c][w] = q

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 7.37 ms


In [73]:
import pickle

# Persist the per-customer sequences to the interim directory.
with (INTERIM/'customers_sequences.pkl').open('wb') as fh:
    pickle.dump(customers, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [150]:
# Spot-check the weekly sequence of a single customer.
sample_customer = 2223
print(customers[sample_customer])

[206.0, 258.0, 447.0, 373.0, 375.0, 534.0, 477.0, 436.0, 341.0, 264.0, 377.0, 326.0, 424.0, 397.0, 337.0, 373.0]


### Bond transactions (isin)

In [58]:
%%time
df = trade[trade.TradeDateKey > 20180000].copy()
df['week'] = df.TradeDateKey.apply(lambda x: week_num(week_labels, x))
df = df.groupby(['IsinIdx', 'week'],
                 as_index=False)['CustomerInterest'].agg('sum')

CPU times: user 692 ms, sys: 0 ns, total: 692 ms
Wall time: 688 ms


In [59]:
# Sequence length, frame size and distribution of the per-bond weekly totals.
n_weeks = df['week'].nunique()
print(df.shape)
df['CustomerInterest'].describe()

(175364, 3)


count    175364.000000
mean          1.996539
std           3.676236
min           0.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         308.000000
Name: CustomerInterest, dtype: float64

In [60]:
%%time
isins = {} # 5 GB of RAM
df_unique = df.drop_duplicates(['IsinIdx'])
for i in df_unique.IsinIdx:
    isins[i] = [0] * n_weeks
df_unique = challenge.drop_duplicates(['IsinIdx'])
for i in df_unique.IsinIdx:
    isins[i] = [0] * n_weeks

CPU times: user 56 ms, sys: 0 ns, total: 56 ms
Wall time: 54.6 ms


In [61]:
%%time
for i, w, q in zip(df.IsinIdx, df.week, df.CustomerInterest):
    isins[i][w] = q

CPU times: user 60 ms, sys: 0 ns, total: 60 ms
Wall time: 57.1 ms


In [74]:
import pickle

# Persist the per-bond sequences to the interim directory.
with (INTERIM/'isins_sequences.pkl').open('wb') as fh:
    pickle.dump(isins, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [63]:
# Number of keys in each sequence dictionary, in the order:
# seq_dict | transactions | buysells | customers | isins
tuple(len(d) for d in (seq_dict, transactions, buysells, customers, isins))

(985972, 985972, 492986, 2596, 23087)

In [151]:
# Spot-check the weekly sequence of a single bond.
sample_isin = 19020
print(isins[sample_isin])

[1.0, 0, 0, 0, 3.0, 1.0, 0, 1.0, 4.0, 2.0, 0, 0, 2.0, 0.0, 1.0, 0]


## Generate Features

In [77]:
# Reload the metadata frames, this time from the `neuralnet` sub-directory.
train = pd.read_feather(INTERIM/'neuralnet/metadata_train.feather')
val = pd.read_feather(INTERIM/'neuralnet/metadata_val.feather')
# NOTE(review): `test` is read from the *validation* file, so it is a copy of
# `val` (the lengths printed below are identical). If a dedicated test split
# exists this path should point at it — confirm.
test = pd.read_feather(INTERIM/'neuralnet/metadata_val.feather')

In [78]:
# Columns treated as categorical features.
cat_cols = [
    'Sector', 'Subsector', 'Region_x', 'Country', 'TickerIdx', 'Seniority',
    'Currency', 'ActivityGroup', 'Region_y', 'Activity', 'RiskCaptain',
    'Owner', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType',
]
# Columns treated as numeric features.
num_cols = [
    'ActualMaturityDateKey', 'IssueDateKey', 'CompositeRating',
    'IssuedAmount', 'BondDuration',
]
# Columns that identify one (customer, bond, direction) sample.
id_cols = ['CustomerIdx', 'IsinIdx', 'BuySell']

In [152]:
# Row counts of the three frames (train, val, test).
tuple(map(len, (train, val, test)))

(983172, 493590, 493590)

In [86]:
%%time
from src.structured_lstm import preprocess
scaler, train_seqs, val_seqs, test_seqs = preprocess(train, val, test, 
                                    cat_cols, num_cols, seq_dict, 
                                    transactions, buysells, customers, isins)

Encoding cats...
Scaling conts...
Extracting seqs...
CPU times: user 23.1 s, sys: 1.43 s, total: 24.5 s
Wall time: 24.5 s


## Model

In [135]:
from torch.utils.data import DataLoader
from torch import optim
import torch.nn as nn
from src.structured_lstm import MultimodalDataset, MultimodalNet, train_model

In [96]:
# Shape of the training sequence array.
# NOTE(review): presumably (samples, sequence channels, weeks) — confirm against `preprocess`.
train_seqs.shape

(983172, 5, 16)

In [97]:
# Training set: inputs drop the last two positions of the final axis; the
# target is channel 0 at position -2 (the second-to-last step, not the last).
# NOTE(review): channel 0 is presumably the interest sequence — confirm.
train_ds = MultimodalDataset(
    train[cat_cols], train[num_cols],
    train_seqs[:, :, :-2], train_seqs[:, 0, -2])
# Validation set: shifted one step later — inputs drop only the last position
# and the target is channel 0 at position -1.
val_ds = MultimodalDataset(
    val[cat_cols], val[num_cols],
    val_seqs[:, :, :-1], val_seqs[:, 0, -1])

train_dl = DataLoader(train_ds, batch_size=128, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=128)

In [98]:
# Per categorical column: (cardinality, embedding width), where cardinality is
# the largest code in `train` plus one and the width is half of that, capped at 50.
# NOTE(review): cardinalities come from `train` only — assumes val/test codes
# never exceed the train maximum; confirm against `preprocess`.
cat_szs = []
emb_szs = []
for col in cat_cols:
    cardinality = int(train[col].max() + 1)
    cat_szs.append(cardinality)
    emb_szs.append((cardinality, min(50, (cardinality + 1) // 2)))

In [129]:
import torch

# Use the GPU when one is present; otherwise fall back to CPU instead of
# failing later at `model.cuda()`.
USE_CUDA = torch.cuda.is_available()

In [136]:
# Build the network from the embedding sizes and fixed hyper-parameters.
# NOTE(review): rnn_input_sz=5 presumably matches the 5 sequence channels of
# train_seqs — confirm against MultimodalNet.
model = MultimodalNet(
    emb_szs,
    n_cont=len(num_cols),
    emb_drop=0.2,
    szs=[1000, 500],
    drops=[0.5, 0.5],
    rnn_hidden_sz=64,
    rnn_input_sz=5,
    rnn_n_layers=2,
    rnn_drop=0.5,
)

if USE_CUDA:
    model = model.cuda()

In [137]:
# Binary cross-entropy on raw logits, optimised with Adam.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [138]:
%%time
model, train_losses, val_losses, val_auc_scores = train_model(
                model, train_dl, val_dl, optimizer, criterion,
                n_epochs=3, USE_CUDA=USE_CUDA, val_every=10)

ROC AUC Score: 0.589870
Validation Loss: 0.151739
ROC AUC Score: 0.597133
Validation Loss: 0.156596
ROC AUC Score: 0.597824
Validation Loss: 0.153773
Epoch Results:
Train ROC AUC Score: 0.780595
Train Loss: 0.079714
Validation ROC AUC Score: 0.603661
Validation Loss: 0.150835

ROC AUC Score: 0.603932
Validation Loss: 0.156942
ROC AUC Score: 0.612969
Validation Loss: 0.155154
ROC AUC Score: 0.610491
Validation Loss: 0.158040
Epoch Results:
Train ROC AUC Score: 0.813835
Train Loss: 0.078620
Validation ROC AUC Score: 0.633618
Validation Loss: 0.153691

ROC AUC Score: 0.627619
Validation Loss: 0.157228
ROC AUC Score: 0.637556
Validation Loss: 0.169749
ROC AUC Score: 0.644390
Validation Loss: 0.160749
Epoch Results:
Train ROC AUC Score: 0.830626
Train Loss: 0.081968
Validation ROC AUC Score: 0.649747
Validation Loss: 0.161984

CPU times: user 7min 20s, sys: 9.45 s, total: 7min 30s
Wall time: 7min 29s


In [144]:
# Number of batches in each loader.
tuple(map(len, (train_dl, val_dl)))

(7682, 3857)

In [139]:
from src.structured_lstm import get_predictions

In [140]:
# Collect targets and model outputs over the validation loader.
val_outputs = get_predictions(model, val_dl, USE_CUDA=USE_CUDA)
targets, preds = val_outputs



In [141]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the predictions collected in the previous cell.
roc_auc_score(y_true=targets, y_score=preds)

0.6497469317299748

In [142]:
# Collect targets and model outputs over the training loader
# (this overwrites the validation `targets` / `preds`).
train_outputs = get_predictions(model, train_dl, USE_CUDA=USE_CUDA)
targets, preds = train_outputs



In [143]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the predictions collected in the previous cell.
roc_auc_score(y_true=targets, y_score=preds)

0.8306261998393005