In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move to the project root so that `src` is importable and the relative
# `data/` paths resolve. Guarded so that re-running this cell does not climb
# two more directories (the bare chdir was not idempotent).
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [2]:
# Data directory layout: one sub-folder of `data/` per pipeline stage.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / stage for stage in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
from src.utils import get_weeks, week_num

# Build the full list of 121 week labels starting at 2016-01-04, then keep
# only the entries from position 104 onward.
all_weeks = get_weeks(day_from=20160104, num_weeks=121)
week_labels = all_weeks[104:]

In [4]:
# Metadata splits, stored as feather files under data/interim.
train = pd.read_feather(INTERIM/'metadata_train.feather')
val = pd.read_feather(INTERIM/'metadata_val.feather')
# NOTE(review): `test` is read from the *validation* file, so it is an exact
# copy of `val`. This looks like a copy-paste slip (a dedicated test file such
# as 'metadata_test.feather' was probably intended) -- TODO confirm the
# correct filename before relying on `test` / `test_seqs`.
test = pd.read_feather(INTERIM/'metadata_val.feather')

In [5]:
# Column groups used by the model: categorical features (embedded),
# numerical features (scaled) and the identifier columns of a sample.
cat_cols = (
    'Sector Subsector Region_x Country '
    'TickerIdx Seniority Currency ActivityGroup '
    'Region_y Activity RiskCaptain Owner '
    'IndustrySector IndustrySubgroup MarketIssue CouponType'
).split()
num_cols = (
    'ActualMaturityDateKey IssueDateKey CompositeRating '
    'IssuedAmount BondDuration'
).split()
id_cols = 'CustomerIdx IsinIdx BuySell'.split()

In [6]:
import pickle

# Read the pickled sequence dictionary in one go.
# NOTE: unpickling runs arbitrary code -- only load files produced by this project.
seq_dict = pickle.loads((INTERIM/'interest_sequences.pkl').read_bytes())

In [7]:
%%time
from src.structured_lstm import preprocess
scaler, train_seqs, val_seqs, test_seqs = preprocess(train, val, test, 
                                    cat_cols, num_cols, seq_dict)

Encoding cats...
Scaling conts...
Extracting seqs...
CPU times: user 9.4 s, sys: 1.5 s, total: 10.9 s
Wall time: 10.9 s


## Sandbox

In [15]:
from src.structured_lstm import MultimodalDataset
from torch.utils.data import DataLoader

In [16]:
# Training set: the second-to-last sequence column is the target and
# everything before it is the input sequence.
train_ds = MultimodalDataset(train[cat_cols], train[num_cols],
                             train_seqs[:, :-2], train_seqs[:, -2])
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True)

# Validation set: the last sequence column is the target.
val_ds = MultimodalDataset(val[cat_cols], val[num_cols],
                           val_seqs[:, :-1], val_seqs[:, -1])
val_dl = DataLoader(val_ds, batch_size=128)

In [41]:
import torch.nn as nn
import torch
from src.neuralnet import NeuralNet

In [83]:
class MultimodalNet(nn.Module):
    """Combine a structured (categorical + continuous) network with an LSTM.

    The structured network maps the tabular features to a vector of size
    ``rnn_hidden_sz``. That vector is replicated for every LSTM layer and used
    as the initial *cell* state, while the initial hidden state is supplied by
    the caller (see ``init_hidden``). A linear layer maps the LSTM output of
    the last time step to ``out_sz`` raw scores (no activation is applied).

    Args:
        emb_szs, n_cont, emb_drop, szs, drops: forwarded to ``NeuralNet``.
        rnn_hidden_sz: LSTM hidden size; also the structured network's output size.
        rnn_input_sz: number of features per time step. ``forward`` feeds one
            scalar per step, so this is expected to be 1.
        rnn_n_layers: number of stacked LSTM layers.
        rnn_drop: dropout between LSTM layers.
        out_sz: size of the final output.
    """

    def __init__(self, emb_szs, n_cont, emb_drop, szs, drops, 
                 rnn_hidden_sz, rnn_input_sz, rnn_n_layers, rnn_drop,
                 out_sz=1):
        super().__init__()
        self.structured_net = NeuralNet(emb_szs, n_cont=n_cont, 
                        emb_drop=emb_drop, szs=szs, drops=drops, 
                        out_sz=rnn_hidden_sz)
        
        self.lstm = nn.LSTM(rnn_input_sz, rnn_hidden_sz, rnn_n_layers, 
                            dropout=rnn_drop)
        self.out = nn.Linear(rnn_hidden_sz, out_sz)
        
        self.rnn_n_layers = rnn_n_layers
        self.rnn_hidden_sz = rnn_hidden_sz
        
    def forward(self, cats, conts, seqs, hidden):
        """Return raw scores of shape [bs, out_sz].

        Args:
            cats, conts: inputs of the structured network.
            seqs: sequences of shape [bs, sqlen], one scalar per time step.
            hidden: initial hidden state of shape [nlay, bs, hs].
        """
        x = self.structured_net(cats, conts) # [bs, hs]
        # Same structured representation seeds the cell state of every layer.
        cell = x.unsqueeze(0).expand(self.rnn_n_layers, *x.size()) # [nlay, bs, hs]
        seqs = seqs.transpose(1,0).unsqueeze(2) # [sqlen, bs, 1] 1<=rnn_inp_sz
        # The final LSTM state is not needed; only the per-step outputs are used.
        outputs, _ = self.lstm(seqs, (hidden, cell.contiguous()))
        out = self.out(outputs[-1]) # != if bidirectional
        return out
        
    def init_hidden(self, batch_sz):
        """Return a zero hidden state of shape [nlay, batch_sz, hs].

        The tensor is allocated on the same device as the model's parameters,
        so it can be fed to ``forward`` without an explicit ``.cuda()`` call.
        Falls back to the CPU when the module has no parameters.
        """
        first_param = next(self.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
        return torch.zeros(self.rnn_n_layers, batch_sz, self.rnn_hidden_sz,
                           device=device)

In [17]:
# Pull a single batch from the training loader for interactive inspection.
x = next(iter(train_dl))

In [23]:
# Shapes of the four batch components, in the order the loops below unpack
# them: cats, conts, seqs, target.
x[0].shape, x[1].shape, x[2].shape, x[3].shape

(torch.Size([128, 16]),
 torch.Size([128, 6]),
 torch.Size([128, 14]),
 torch.Size([128]))

In [20]:
def _embedding_width(n_categories):
    """Half the cardinality (rounded up), capped at 50."""
    return min(50, (n_categories + 1) // 2)

# Cardinality of each categorical column: largest value in `train` plus one.
cat_szs = [int(train[name].max() + 1) for name in cat_cols]
# One (cardinality, embedding width) pair per categorical column.
emb_szs = [(n, _embedding_width(n)) for n in cat_szs]

In [48]:
# Hyper-parameters gathered in one place, then expanded into the constructor.
model_kwargs = dict(
    n_cont=len(num_cols),
    emb_drop=0.04,
    szs=[1000, 500],
    drops=[0.001, 0.01],
    rnn_hidden_sz=64,
    rnn_input_sz=1,
    rnn_n_layers=2,
    rnn_drop=0.04,
)
model = MultimodalNet(emb_szs, **model_kwargs)

In [56]:
# Smoke test: one forward pass over the sampled batch (target is not needed).
cats, conts, seqs, _ = x
hidden = model.init_hidden(len(cats))
output = model(cats, conts, seqs, hidden)

In [58]:
def train_step(model, cats, conts, seqs, hidden,
               targets, optimizer, criterion):
    """Run one optimisation step on a single batch and return its loss.

    The model is switched to training mode, gradients are reset, the
    flattened predictions are scored against ``targets`` with ``criterion``,
    and the optimizer is stepped once.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    flat_preds = model(cats, conts, seqs, hidden).view(-1)
    batch_loss = criterion(flat_preds, targets)

    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()

In [75]:
def get_predictions(model, data_loader, print_every=800, USE_CUDA=False):
    """Run the model over ``data_loader`` and collect targets and probabilities.

    Args:
        model: module exposing ``init_hidden(batch_size)`` and called as
            ``model(cats, conts, seqs, hidden)``, returning raw logits.
        data_loader: yields ``(cats, conts, seqs, target)`` batches.
        print_every: print a progress line every this many batches.
        USE_CUDA: move every model input to the GPU before the forward pass.

    Returns:
        ``(targets, preds)``: two equally long lists of Python numbers, where
        ``preds`` holds the sigmoid of the model's logits.
    """
    targets = []
    preds = []
    model.eval()
    with torch.no_grad():
        for batch_idx, (cats, conts, seqs, target) in enumerate(data_loader):
            hidden = model.init_hidden(len(cats))
            if USE_CUDA:
                # Every tensor fed to the model must be on the GPU, including
                # `seqs`; targets are only collected, so they stay on the CPU.
                cats, conts, seqs, hidden = (cats.cuda(), conts.cuda(),
                                             seqs.cuda(), hidden.cuda())
            logits = model(cats, conts, seqs, hidden)
            # Convert per batch instead of per element.
            targets.extend(target.reshape(-1).cpu().tolist())
            preds.extend(torch.sigmoid(logits).reshape(-1).cpu().tolist())
            assert len(targets) == len(preds)
            if batch_idx % print_every == 0:
                print('[{}/{} ({:.0f}%)]'.format(
                        batch_idx * len(cats), len(data_loader.dataset),
                        100. * batch_idx / len(data_loader)))
    return targets, preds

In [76]:
def train_model(model, train_loader, val_loader, optimizer, criterion,
                n_epochs, print_every=200, val_every=5, USE_CUDA=False):
    """Train ``model`` and periodically report training/validation metrics.

    Args:
        model: module exposing ``init_hidden(batch_size)`` and called as
            ``model(cats, conts, seqs, hidden)``.
        train_loader, val_loader: yield ``(cats, conts, seqs, target)`` batches.
        optimizer, criterion: passed to ``train_step`` for every batch.
        n_epochs: number of passes over ``train_loader``.
        print_every: report the mean training loss every this many batches.
        val_every: run validation every ``val_every * print_every`` batches.
        USE_CUDA: move the model and every batch to the GPU.

    Returns:
        ``(model, train_losses, val_losses, val_auc_scores)``, where the three
        lists hold one entry per report, in chronological order.
    """
    if USE_CUDA:
        model = model.cuda()
    train_losses = []
    val_losses = []
    val_auc_scores = []
    val_every *= print_every  # expressed in batches from here on
    for epoch in range(n_epochs):
        train_loss = 0
        n_accumulated = 0  # batches summed into train_loss since the last report
        for batch_idx, (cats, conts, seqs, target) in enumerate(train_loader):
            hidden = model.init_hidden(len(cats))
            if USE_CUDA:
                # The initial hidden state must live on the same device as
                # the model, just like the rest of the batch.
                cats, conts, seqs, target, hidden = (
                    cats.cuda(), conts.cuda(), seqs.cuda(),
                    target.cuda(), hidden.cuda())
            train_loss += train_step(model, cats, conts, seqs, hidden, 
                                     target, optimizer, criterion)
            n_accumulated += 1
            
            if batch_idx > 0 and batch_idx % print_every == 0:
                # Average over the batches actually accumulated: the first
                # window of an epoch holds print_every + 1 batches.
                train_loss /= n_accumulated
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch + 1, batch_idx * len(seqs), len(train_loader.dataset),
                        100. * batch_idx / len(train_loader), train_loss))
                train_losses.append(train_loss)
                train_loss = 0
                n_accumulated = 0
            
            if batch_idx > 0 and batch_idx % val_every == 0:
                # get_predictions switches the model to eval mode; train_step
                # puts it back into training mode on the next batch.
                targets, preds = get_predictions(model, val_loader, USE_CUDA=USE_CUDA)
                # preds are probabilities, hence BCELoss and not the logits variant.
                val_loss = nn.BCELoss()(torch.Tensor(preds),
                                        torch.Tensor(targets)).item()
                val_losses.append(val_loss)
                val_auc = roc_auc_score(targets, preds)
                val_auc_scores.append(val_auc)
                print(f'ROC AUC Score: {val_auc:.6f}') 
                print(f'Validation Loss: {val_loss:.6f}')
        print()
    return model, train_losses, val_losses, val_auc_scores   

In [77]:
from sklearn.metrics import roc_auc_score
from torch import optim
import torch.nn.functional as F

In [78]:
# Use the GPU only when one is actually available, so that the notebook also
# runs on CPU-only machines instead of failing at `model.cuda()`.
USE_CUDA = torch.cuda.is_available()

In [84]:
# Fresh model for the training run; hyper-parameters gathered in one mapping.
model_kwargs = dict(
    n_cont=len(num_cols),
    emb_drop=0.04,
    szs=[1000, 500],
    drops=[0.001, 0.01],
    rnn_hidden_sz=64,
    rnn_input_sz=1,
    rnn_n_layers=2,
    rnn_drop=0.04,
)
model = MultimodalNet(emb_szs, **model_kwargs)

# Move the parameters to the GPU when requested.
if USE_CUDA:
    model = model.cuda()

In [85]:
# Adam over all model parameters; the loss works on raw logits.
LEARNING_RATE = 1e-3
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

In [86]:
# Training set: the second-to-last sequence column is the target and
# everything before it is the input sequence.
train_ds = MultimodalDataset(train[cat_cols], train[num_cols],
                             train_seqs[:, :-2], train_seqs[:, -2])
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True)

# Validation set: the last sequence column is the target.
val_ds = MultimodalDataset(val[cat_cols], val[num_cols],
                           val_seqs[:, :-1], val_seqs[:, -1])
val_dl = DataLoader(val_ds, batch_size=128)

In [1]:
%%time
model, train_losses, val_losses, val_auc_scores = train_model(
                model, train_dl, val_dl, optimizer, criterion,
                n_epochs=2, USE_CUDA=USE_CUDA, val_every=5)

## Checkpoint

In [20]:
from torch.utils.data import DataLoader
from torch import optim
import torch.nn as nn
from src.structured_lstm import MultimodalDataset, MultimodalNet, train_model

In [14]:
# Training set: the second-to-last sequence column is the target and
# everything before it is the input sequence.
train_ds = MultimodalDataset(train[cat_cols], train[num_cols],
                             train_seqs[:, :-2], train_seqs[:, -2])
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True)

# Validation set: the last sequence column is the target.
val_ds = MultimodalDataset(val[cat_cols], val[num_cols],
                           val_seqs[:, :-1], val_seqs[:, -1])
val_dl = DataLoader(val_ds, batch_size=128)

In [15]:
# Cardinality of each categorical column: largest value in `train` plus one.
cat_szs = [int(train[name].max() + 1) for name in cat_cols]
# Pair each cardinality with its embedding width: half of it, rounded up, capped at 50.
emb_szs = list(zip(cat_szs, (min(50, (n + 1) // 2) for n in cat_szs)))

In [28]:
import torch

# Use the GPU only when one is actually available, so that the notebook also
# runs on CPU-only machines instead of failing at `model.cuda()`.
USE_CUDA = torch.cuda.is_available()

In [29]:
# Same architecture as in the sandbox, with heavier dropout throughout.
model_kwargs = dict(
    n_cont=len(num_cols),
    emb_drop=0.2,
    szs=[1000, 500],
    drops=[0.5, 0.5],
    rnn_hidden_sz=64,
    rnn_input_sz=1,
    rnn_n_layers=2,
    rnn_drop=0.5,
)
model = MultimodalNet(emb_szs, **model_kwargs)

# Move the parameters to the GPU when requested.
if USE_CUDA:
    model = model.cuda()

In [30]:
# Adam over all model parameters; the loss works on raw logits.
LEARNING_RATE = 1e-3
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

In [31]:
%%time
model, train_losses, val_losses, val_auc_scores = train_model(
                model, train_dl, val_dl, optimizer, criterion,
                n_epochs=2, USE_CUDA=USE_CUDA, val_every=10)

ROC AUC Score: 0.541212
Validation Loss: 0.157820
ROC AUC Score: 0.562847
Validation Loss: 0.160305
ROC AUC Score: 0.574533
Validation Loss: 0.156474

ROC AUC Score: 0.585368
Validation Loss: 0.157241
ROC AUC Score: 0.582974
Validation Loss: 0.153366
ROC AUC Score: 0.589472
Validation Loss: 0.158835

CPU times: user 3min 22s, sys: 3.89 s, total: 3min 26s
Wall time: 3min 26s
