In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Notebook display limits: show up to 1000 columns / 400 rows before pandas
# truncates a rendered frame, then apply seaborn's default plot styling.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 400
sns.set()

# Move to the project root (two levels up from this notebook) so that the
# relative `data/...` paths and the `src` package resolve.
# The guard makes the cell idempotent: an unconditional relative chdir would
# climb two more directories every time the cell is re-run in the same kernel.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [3]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA.joinpath(stage)
    for stage in ('raw', 'interim', 'processed', 'submissions')
)

In [4]:
from src.utils import get_weeks, week_num

# Ask get_weeks for 121 weeks starting at 20160104 and keep everything from
# position 91 onward. The printed result is 30 labels running from 20171002
# through 20180423.
week_labels = get_weeks(num_weeks=121, day_from=20160104)[91:]

In [5]:
# Sanity check: show which week labels were kept by the slice above.
print(week_labels)

[20171002, 20171009, 20171016, 20171023, 20171030, 20171106, 20171113, 20171120, 20171127, 20171204, 20171211, 20171218, 20171225, 20180101, 20180108, 20180115, 20180122, 20180129, 20180205, 20180212, 20180219, 20180226, 20180305, 20180312, 20180319, 20180326, 20180402, 20180409, 20180416, 20180423]


In [6]:
from src.structurednet import get_seqs, shift_right

In [7]:
# Load the raw trade history and the challenge file from data/raw.
# low_memory=False makes pandas parse each file in one pass rather than in
# chunks, avoiding mixed-dtype columns.
trade, challenge = (
    pd.read_csv(RAW / csv_name, low_memory=False)
    for csv_name in ('Trade.csv', 'Challenge_20180423.csv')
)

In [9]:
# Folder holding the preprocessed frames and sequence pickles read below.
NEURALNET = INTERIM.joinpath('neuralnet')

In [12]:
%%time
# Read the preprocessed train / validation / test frames (feather format).
train, val, test = (
    pd.read_feather(NEURALNET / feather_name)
    for feather_name in ('train_preproc.feather',
                         'val_preproc.feather',
                         'test_preproc.feather')
)

CPU times: user 1.04 s, sys: 1.44 s, total: 2.48 s
Wall time: 2.76 s


In [13]:
%%time
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle.load can execute arbitrary code: only use on files produced by
    # this project's own preprocessing.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Sequence inputs for each split, stored next to the preprocessed frames.
train_seqs = _load_pickle(NEURALNET/'train_seqs.pkl')
val_seqs = _load_pickle(NEURALNET/'val_seqs.pkl')
test_seqs = _load_pickle(NEURALNET/'test_seqs.pkl')

CPU times: user 1.32 s, sys: 5.34 s, total: 6.66 s
Wall time: 6.66 s


In [14]:
# Columns fed to the embedding layers (one embedding per column).
cat_cols = [
    'Sector', 'Subsector', 'Region_x', 'Country', 'TickerIdx',
    'Seniority', 'Currency', 'ActivityGroup', 'Region_y', 'Activity',
    'RiskCaptain', 'Owner', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue',
    'CouponType', 'CompositeRatingCat', 'CustomerIdxCat', 'IsinIdxCat', 'BuySellCat',
]

# Continuous inputs, kept in the same order as before: base columns first,
# then the DaysSince*/DaysCount* columns, then the SVD_* columns.
num_cols = (
    ['ActualMaturityDateKey', 'IssueDateKey', 'IssuedAmount',
     'BondDuration', 'BondRemaining', 'BondLife',
     'Day', 'CompositeRating', 'BuySellCont']
    + ['DaysSinceBuySell', 'DaysSinceTransaction',
       'DaysSinceCustomerActivity', 'DaysSinceBondActivity',
       'DaysCountBuySell', 'DaysCountTransaction',
       'DaysCountCustomerActivity', 'DaysCountBondActivity']
    + ['SVD_CustomerBias', 'SVD_IsinBuySellBias', 'SVD_Recommend']
    # SVD_CustomerFactor00 ... SVD_CustomerFactor14
    + ['SVD_CustomerFactor{:02d}'.format(k) for k in range(15)]
)

# Identifier columns and the prediction target.
id_cols = ['CustomerIdx', 'IsinIdx', 'BuySell']
target_col = 'CustomerInterest'

## Model

In [16]:
from torch.utils.data import DataLoader
from torch import optim
import torch.nn as nn
from src.structurednet import MultimodalDataset, StructuredNet, train_model

In [17]:
%%time
# Each dataset bundles the categorical columns, the numeric columns, the
# sequences and the target for one split. Only the training loader shuffles.
train_ds = MultimodalDataset(train[cat_cols], train[num_cols],
                             train_seqs, train[target_col])
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True)

val_ds = MultimodalDataset(val[cat_cols], val[num_cols],
                           val_seqs, val[target_col])
val_dl = DataLoader(val_ds, batch_size=128)

CPU times: user 7.11 s, sys: 5.38 s, total: 12.5 s
Wall time: 4.98 s


In [18]:
# Cardinality per categorical column, taken as (largest value in train) + 1.
# NOTE(review): assumes val/test never contain a larger code than train —
# confirm against the preprocessing step.
cat_szs = [int(train[name].max() + 1) for name in cat_cols]

# (cardinality, embedding width) pairs: width is half the cardinality,
# rounded up, capped at 50.
emb_szs = list(zip(cat_szs, (min(50, (n_levels + 1) // 2) for n_levels in cat_szs)))

In [34]:
# Use the GPU only when one is actually available, so the notebook still runs
# (on CPU) on machines without CUDA instead of failing at `model.cuda()`.
# Set to False by hand to force CPU.
import torch

USE_CUDA = torch.cuda.is_available()

In [44]:
# Build the network. The keyword arguments are passed straight to
# src.structurednet.StructuredNet; see that module for their exact meaning.
model = StructuredNet(
    emb_szs,
    n_cont=len(num_cols),   # one continuous input per numeric column
    emb_drop=0.2,
    szs=[1000, 500],
    drops=[0.5, 0.5],
    rnn_hidden_sz=64,
    rnn_input_sz=4,
    rnn_n_layers=2,
    rnn_drop=0.5,
)

# Move the parameters to the GPU when CUDA use is enabled.
if USE_CUDA:
    model = model.cuda()

In [45]:
# Binary cross-entropy that applies the sigmoid internally.
# NOTE(review): this presumes the model emits raw logits — confirm in StructuredNet.
criterion = nn.BCEWithLogitsLoss()

# Adam over all model parameters, with light L2 regularisation via weight_decay.
optimizer = optim.Adam(model.parameters(), weight_decay=1e-5, lr=1e-3)

In [46]:
%%time
# Train for two epochs. train_model returns the model together with the loss
# and validation-AUC histories.
# print_every / val_every are forwarded unchanged (values used: 800 and 10).
model, train_losses, val_losses, val_auc_scores = train_model(
    model,
    train_dl,
    val_dl,
    optimizer,
    criterion,
    n_epochs=2,
    USE_CUDA=USE_CUDA,
    print_every=800,
    val_every=10,
)

ROC AUC Score: 0.827829
Validation Loss: 0.132263
ROC AUC Score: 0.848718
Validation Loss: 0.129016
ROC AUC Score: 0.858230
Validation Loss: 0.120400
ROC AUC Score: 0.861445
Validation Loss: 0.125223
ROC AUC Score: 0.871437
Validation Loss: 0.118462
ROC AUC Score: 0.874396
Validation Loss: 0.115728
ROC AUC Score: 0.874636
Validation Loss: 0.117647
Epoch Results:
Train ROC AUC Score: 0.792860
Train Loss: 0.121061
Validation ROC AUC Score: 0.872865
Validation Loss: 0.117293

ROC AUC Score: 0.876888
Validation Loss: 0.115896
ROC AUC Score: 0.876255
Validation Loss: 0.119268
ROC AUC Score: 0.874388
Validation Loss: 0.116122
ROC AUC Score: 0.875302
Validation Loss: 0.118602
ROC AUC Score: 0.874489
Validation Loss: 0.117147
ROC AUC Score: 0.877895
Validation Loss: 0.111209
ROC AUC Score: 0.876081
Validation Loss: 0.117523
Epoch Results:
Train ROC AUC Score: 0.794502
Train Loss: 0.119734
Validation ROC AUC Score: 0.875042
Validation Loss: 0.112174

CPU times: user 32min 25s, sys: 36.4 s, tota