In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')
from src import utils

In [2]:
DATA        = Path('data')
RAW         = DATA/'raw'
INTERIM     = DATA/'interim'
PROCESSED   = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

In [3]:
challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)
customer   = pd.read_csv(RAW/'Customer.csv', low_memory=False)
isin       = pd.read_csv(RAW/'Isin.csv', low_memory=False)
# submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)
# market     = pd.read_csv(RAW/'Market.csv', low_memory=False)

In [4]:
from src.utils import get_weeks
week_labels = get_weeks(day_from=20160104, num_weeks=121)[104:]

In [5]:
print(week_labels)

[20180101, 20180108, 20180115, 20180122, 20180129, 20180205, 20180212, 20180219, 20180226, 20180305, 20180312, 20180319, 20180326, 20180402, 20180409, 20180416, 20180423]


In [6]:
trade.head()

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest
0,20161207,2789,8478,Sell,653168.0,0.0,Unknown,1.0
1,20170329,2574,14562,Buy,1656487.0,0.0,Unknown,1.0
2,20170418,2574,4747,Buy,939673.0,0.0,Unknown,1.0
3,20170310,2574,9885,Sell,708082.0,0.0,Unknown,1.0
4,20161116,2574,8885,Buy,1147709.0,0.0,Unknown,1.0


In [7]:
weekly_trades = trade[trade.TradeDateKey > 20180000].copy()

In [8]:
from src.utils import week_num

In [9]:
weekly_trades['week'] = weekly_trades.TradeDateKey.apply(
                            lambda x: week_num(week_labels, x))

In [10]:
weekly_trades.head()

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest,week
1527,20180201,2447,19665,Sell,748160.0,102.65,Done,1.0,4
1528,20180220,2447,18972,Sell,2959167.0,102.093,NotTraded,1.0,7
1529,20180108,2554,24873,Buy,2815003.0,103.877,Done,1.0,1
1530,20180108,2554,19072,Sell,2815003.0,121.963,Done,1.0,1
1538,20180116,1922,25986,Buy,601586.0,97.984,Done,1.0,2


In [11]:
weekly_trades = weekly_trades.groupby(['CustomerIdx', 'IsinIdx', 'BuySell', 'week'],
                                      as_index=False)['CustomerInterest'].agg('max')

In [98]:
weekly_trades.sample(5)

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,week,CustomerInterest
748409,3247,27283,Sell,15,1.0
330738,1964,2615,Buy,8,0.0
712287,3032,9330,Buy,4,0.0
242915,1622,15261,Sell,6,1.0
163641,958,25333,Buy,4,0.0


In [13]:
weekly_trades[(weekly_trades.CustomerIdx==0) & (weekly_trades.IsinIdx==24944)]

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,week,CustomerInterest
0,0,24944,Sell,10,1.0


In [14]:
weekly_trades.week.max()

15

In [15]:
n_weeks = weekly_trades.week.nunique()

In [17]:
%%time
interests = {} # 5 GB of RAM
for idx, row in weekly_trades.drop_duplicates(
                        ['CustomerIdx', 'IsinIdx']).iterrows():
    for b in ['Buy', 'Sell']:
        interests[(row.CustomerIdx, row.IsinIdx, b)] = [0] * n_weeks

CPU times: user 31.7 s, sys: 104 ms, total: 31.8 s
Wall time: 31.8 s


In [18]:
%%time
for idx, row in challenge.drop_duplicates(
                        ['CustomerIdx', 'IsinIdx']).iterrows():
    for b in ['Buy', 'Sell']:
        interests[(row.CustomerIdx, row.IsinIdx, b)] = [0] * n_weeks

CPU times: user 18 s, sys: 28 ms, total: 18 s
Wall time: 18 s


In [19]:
len(interests) # vs 985,972 110,182,700

985972

In [16]:
# %%time
# interests = {} # 5 GB of RAM
# for cIdx in weekly_trades.CustomerIdx.unique():
#     for iIdx in weekly_trades.IsinIdx.unique():
#         for b in ['Buy', 'Sell']:
#             interests[(cIdx, iIdx, b)] = [0] * n_weeks

CPU times: user 2min 18s, sys: 9.22 s, total: 2min 27s
Wall time: 2min 27s


In [25]:
%%time
from tqdm import tqdm_notebook
for idx, row in tqdm_notebook(weekly_trades.iterrows(), total=len(weekly_trades)):
    interests[(row.CustomerIdx, row.IsinIdx, row.BuySell)][row.week] = row.CustomerInterest


CPU times: user 1min, sys: 480 ms, total: 1min
Wall time: 1min


In [26]:
import pickle
with open(INTERIM/'interest_sequences.pkl', 'wb') as f:
    pickle.dump(interests, f, pickle.HIGHEST_PROTOCOL)

In [152]:
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim

In [121]:
# params: (input_size, hidden_size, num_layers, bias, 
#          batch_first, dropout, bidirectional)
#   input (seq_len, batch, input_size)
#   (h_0, c_0) (num_layers * num_directions, batch, hidden_size)
#   output (seq_len, batch, hidden_size * num_directions)

class LSTMClassifier(nn.Module):
    def __init__(self, input_sz, hidden_sz, n_layers, drop=0.1):
        super().__init__()
        self.input_sz = input_sz
        self.hidden_sz = hidden_sz
        self.n_layers = n_layers
        
        self.lstm = nn.LSTM(input_sz, hidden_sz, n_layers, 
                            batch_first=False, dropout=drop)
        self.out = nn.Linear(hidden_sz, 1) # output_sz 1
        
    def forward(self, sequence, hidden):
        # or transpose if batch_first = False
        # inp = sequence.view(batch_sz, -1, self.input_sz) 
        # h0 = self.init_hidden()
        # seq = (seq_len=1, batch, input_sz=1)
        out, hidden = self.lstm(sequence, hidden)
        out = self.out(out[-1])
        return out, hidden
        
    def init_hidden(self, batch_sz):
        h0 = torch.zeros(self.n_layers, batch_sz, self.hidden_sz)
        c0 = torch.zeros(self.n_layers, batch_sz, self.hidden_sz)
        return (h0, c0)

In [290]:
def train_step(model, seqs, targets, optimizer, criterion):
    '''
    seqs: (batch_sz, seq_len)
    targets: (batch_sz, seq_len)
    '''
    model.train()
    optimizer.zero_grad()
    hidden = model.init_hidden(len(seqs)) # len(seqs) == batch_sz
    seqs = seqs.transpose(0,1) # [seq_len, batch_sz]
    targets = targets.transpose(0,1)
    loss = 0
    for i in range(len(seqs)): # for each timestep
        output, hidden = model(seqs[i].unsqueeze(0).unsqueeze(2), hidden)
        loss += criterion(output, targets[i].unsqueeze(1))
    loss.backward()
    optimizer.step()
    return loss.item() / len(seqs)

In [417]:
def evaluate(model, seqs, targets=None):
    '''
    seqs: (batch_sz, seq_len)
    targets: (batch_sz, seq_len)
    '''
    with torch.no_grad():
        model.eval()
        hidden = model.init_hidden(len(seqs)) # len(seqs) == batch_sz
        seqs = seqs.transpose(0,1) # [seq_len, batch_sz]
        if targets is not None:
            targets = targets.transpose(0,1)
            loss = 0
        output = None
        for i in range(len(seqs)): # for each timestep
            output, hidden = model(seqs[i].unsqueeze(0).unsqueeze(2), hidden)
            if targets is not None:
                loss += criterion(output, targets[i].unsqueeze(1))
        print(output.size())
        return loss.item() / len(seqs) if targets is not None else None, \
               F.sigmoid(output).view(-1).numpy()

In [418]:
input_sz = 1
hidden_sz = 32
n_layers = 2
seq_len = 16
model = LSTMClassifier(input_sz=1, hidden_sz=32, n_layers=2, drop=0.1)

In [419]:
seqs = torch.Tensor([interests[(2429, 24845, 'Sell')][:-1],
                     interests[(1622, 15261, 'Sell')][:-1]])
targets = torch.Tensor([interests[(2429, 24845, 'Sell')][1:],
                         interests[(1622, 15261, 'Sell')][1:]])

In [420]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [421]:
loss = train_step(model, seqs, targets, optimizer, criterion)
loss

0.6906216939290365

In [422]:
loss, output = evaluate(model, seqs, targets)
loss, output

torch.Size([2, 1])


(0.6889921824137369, array([0.50356585, 0.50683796], dtype=float32))

In [337]:
class SequentialDataset(torch.utils.data.Dataset):
    def __init__(self, seqs, targets):
        self.seqs = np.array(seqs).astype(np.float32)
        self.targets = np.array(targets).astype(np.float32)
    
    def __len__(self):
        return len(self.seqs)
    
    def __getitem__(self, idx):
        return [self.seqs[idx], self.targets[idx]]        

In [339]:
len(interests), len(challenge)

(985972, 484758)

In [352]:
challenge.head()

Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest
0,a1e0d80784,20180423,1856,13323,Buy,
1,c2cc6cc2a8,20180423,1856,9230,Buy,
2,a8e94f6344,20180423,1780,9157,Buy,
3,758bae1e35,20180423,2129,9131,Buy,
4,02ab378ee8,20180423,1758,7151,Buy,


In [356]:
weekly_trades.head()

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,week,CustomerInterest
0,0,24944,Sell,10,1.0
1,0,25992,Buy,8,1.0
2,0,26726,Sell,10,1.0
3,0,26793,Sell,3,1.0
4,0,26793,Sell,4,1.0


In [None]:
for cIdx, iIdx, b in zip(challenge.)

In [358]:
interests[(0, 24944, 'Sell')]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]

In [360]:
list(interests.values())[26226]

[1.0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0, 0, 1.0]

In [361]:
%%time
sequences = np.array(list(interests.values()))

CPU times: user 1.28 s, sys: 36 ms, total: 1.32 s
Wall time: 1.31 s


In [371]:
## 78.14% 
sequences.max(axis=1).sum(), sequences.shape, sequences.sum()

(208426.0, (985972, 16), 277081.0)

In [369]:
from torch.utils.data import DataLoader

In [437]:
train_ds = DataLoader(SequentialDataset(sequences[:,:-1], sequences[:,1:]),
                      batch_size=128, shuffle=True)

In [438]:
model = LSTMClassifier(input_sz=1, hidden_sz=512, n_layers=3, drop=0)

In [439]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [440]:
def train_model(model, train_loader, optimizer, criterion, n_epochs, 
                print_every=100):
    train_losses = []
    for epoch in range(n_epochs):
        train_loss = 0
        for batch_idx, (seqs, targets) in enumerate(train_loader):
            seqs, targets = Variable(seqs), Variable(targets)
            train_loss += train_step(model, seqs, targets, optimizer, 
                                     criterion)
            
            if batch_idx > 0 and batch_idx % print_every == 0:
                train_loss /= print_every
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch + 1, batch_idx * len(seqs), len(train_loader.dataset),
                        100. * batch_idx / len(train_loader), train_loss))
                train_losses.append(train_loss)
                train_loss = 0            
        print()
    return model, train_losses

In [None]:
model, train_losses = train_model(model, train_ds, optimizer, criterion,
                                  n_epochs=2)
# Train Epoch: 1 [12800/985972 (1%)]	Loss: 0.125625
# Train Epoch: 1 [25600/985972 (3%)]	Loss: 0.090907
# Train Epoch: 1 [38400/985972 (4%)]	Loss: 0.086232
# Train Epoch: 1 [51200/985972 (5%)]	Loss: 0.090720
# Train Epoch: 1 [64000/985972 (6%)]	Loss: 0.092088
# Train Epoch: 1 [76800/985972 (8%)]	Loss: 0.088952
# Train Epoch: 1 [89600/985972 (9%)]	Loss: 0.091166
# Train Epoch: 1 [102400/985972 (10%)]	Loss: 0.089416
# Train Epoch: 1 [115200/985972 (12%)]	Loss: 0.088074

## Validation

In [7]:
from src.lstm import LSTMClassifier, train_model, SequentialDataset
from torch.utils.data import DataLoader

In [452]:
train_ds = DataLoader(SequentialDataset(sequences[:,:-2], sequences[:,1:-1]),
                      batch_size=128, shuffle=True)

In [453]:
%%time
val = pd.read_feather(PROCESSED/f'SVD_17-18_72f/week_{week_labels[-2]}_SVD_diffscount.feather')
val_seqs = []
val_targets = []
for cIdx, iIdx, b, interest in zip(val.CustomerIdx, val.IsinIdx, \
                                   val.BuySell, val.CustomerInterest):
    val_seqs.append(interests[(cIdx, iIdx, b)])
    val_targets.append(interest)
val_seqs, val_targets = np.array(val_seqs), np.array(val_targets)

CPU times: user 1.56 s, sys: 424 ms, total: 1.98 s
Wall time: 2.65 s


In [454]:
val_seqs.shape, val_targets.shape

((493590, 16), (493590,))

In [459]:
val_ds = DataLoader(SequentialDataset(val_seqs[:,:-1], val_seqs[:,1:]))

In [473]:
%%time
import pickle
with open(INTERIM/'seq_train_ds.pkl', 'wb') as f:
    pickle.dump(train_ds, f, pickle.HIGHEST_PROTOCOL)
with open(INTERIM/'seq_val_ds.pkl', 'wb') as f:
    pickle.dump(val_ds, f, pickle.HIGHEST_PROTOCOL)

CPU times: user 124 ms, sys: 268 ms, total: 392 ms
Wall time: 408 ms


In [None]:
%%time
import pickle
with open(INTERIM/'seq_train_ds.pkl', 'rb') as f:
    train_ds = pickle.load(f)
with open(INTERIM/'seq_val_ds.pkl', 'rb') as f:
    val_ds = pickle.load(f)

In [469]:
USE_CUDA = False

In [470]:
model = LSTMClassifier(input_sz=1, hidden_sz=128, n_layers=2, drop=0, 
                       USE_CUDA=USE_CUDA)

In [471]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [451]:
model, train_losses, val_losses, val_auc_scores = train_model(
                model, train_ds, val_ds, optimizer, criterion,
                n_epochs=2, USE_CUDA=USE_CUDA)




KeyboardInterrupt: 