In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')
from src import utils

In [2]:
DATA        = Path('data')
RAW         = DATA/'raw'
INTERIM     = DATA/'interim'
PROCESSED   = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

In [3]:
challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)
customer   = pd.read_csv(RAW/'Customer.csv', low_memory=False)
isin       = pd.read_csv(RAW/'Isin.csv', low_memory=False)
# submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)
# market     = pd.read_csv(RAW/'Market.csv', low_memory=False)

In [4]:
from src.utils import get_weeks
week_labels = get_weeks(day_from=20160104, num_weeks=121)[104:]

In [5]:
print(week_labels)

[20180101, 20180108, 20180115, 20180122, 20180129, 20180205, 20180212, 20180219, 20180226, 20180305, 20180312, 20180319, 20180326, 20180402, 20180409, 20180416, 20180423]


In [6]:
trade.head()

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest
0,20161207,2789,8478,Sell,653168.0,0.0,Unknown,1.0
1,20170329,2574,14562,Buy,1656487.0,0.0,Unknown,1.0
2,20170418,2574,4747,Buy,939673.0,0.0,Unknown,1.0
3,20170310,2574,9885,Sell,708082.0,0.0,Unknown,1.0
4,20161116,2574,8885,Buy,1147709.0,0.0,Unknown,1.0


In [7]:
weekly_trades = trade[trade.TradeDateKey > 20180000].copy()

In [8]:
from src.utils import week_num

In [9]:
weekly_trades['week'] = weekly_trades.TradeDateKey.apply(
                            lambda x: week_num(week_labels, x))

In [10]:
weekly_trades.head()

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest,week
1527,20180201,2447,19665,Sell,748160.0,102.65,Done,1.0,4
1528,20180220,2447,18972,Sell,2959167.0,102.093,NotTraded,1.0,7
1529,20180108,2554,24873,Buy,2815003.0,103.877,Done,1.0,1
1530,20180108,2554,19072,Sell,2815003.0,121.963,Done,1.0,1
1538,20180116,1922,25986,Buy,601586.0,97.984,Done,1.0,2


In [11]:
weekly_trades = weekly_trades.groupby(['CustomerIdx', 'IsinIdx', 'BuySell', 'week'],
                                      as_index=False)['CustomerInterest'].agg('max')

In [12]:
weekly_trades.sample(5)

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,week,CustomerInterest
681800,2821,20118,Sell,4,0.0
364981,2050,19520,Buy,8,0.0
506818,2429,24845,Sell,10,1.0
414607,2164,1906,Sell,12,0.0
398125,2106,8295,Buy,13,0.0


In [13]:
weekly_trades[(weekly_trades.CustomerIdx==0) & (weekly_trades.IsinIdx==24944)]

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,week,CustomerInterest
0,0,24944,Sell,10,1.0


In [14]:
weekly_trades.week.max()

15

In [15]:
n_weeks = weekly_trades.week.nunique()

In [17]:
%%time
interests = {} # 5 GB of RAM
for idx, row in weekly_trades.drop_duplicates(
                        ['CustomerIdx', 'IsinIdx']).iterrows():
    for b in ['Buy', 'Sell']:
        interests[(row.CustomerIdx, row.IsinIdx, b)] = [0] * n_weeks

CPU times: user 31.7 s, sys: 104 ms, total: 31.8 s
Wall time: 31.8 s


In [18]:
%%time
for idx, row in challenge.drop_duplicates(
                        ['CustomerIdx', 'IsinIdx']).iterrows():
    for b in ['Buy', 'Sell']:
        interests[(row.CustomerIdx, row.IsinIdx, b)] = [0] * n_weeks

CPU times: user 18 s, sys: 28 ms, total: 18 s
Wall time: 18 s


In [19]:
len(interests) # vs 985,972 110,182,700

985972

In [16]:
# %%time
# interests = {} # 5 GB of RAM
# for cIdx in weekly_trades.CustomerIdx.unique():
#     for iIdx in weekly_trades.IsinIdx.unique():
#         for b in ['Buy', 'Sell']:
#             interests[(cIdx, iIdx, b)] = [0] * n_weeks

CPU times: user 2min 18s, sys: 9.22 s, total: 2min 27s
Wall time: 2min 27s


In [25]:
%%time
from tqdm import tqdm_notebook
for idx, row in tqdm_notebook(weekly_trades.iterrows(), total=len(weekly_trades)):
    interests[(row.CustomerIdx, row.IsinIdx, row.BuySell)][row.week] = row.CustomerInterest


CPU times: user 1min, sys: 480 ms, total: 1min
Wall time: 1min


In [26]:
import pickle
with open(INTERIM/'interest_sequences.pkl', 'wb') as f:
    pickle.dump(interests, f, pickle.HIGHEST_PROTOCOL)

In [27]:
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [83]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_sz, hidden_sz, n_layers, batch_sz):
        super().__init__()
        self.input_sz = input_sz
        self.hidden_sz = hidden_sz
        self.batch_sz = batch_sz    
        self.n_layers = n_layers
        
        self.lstm = nn.LSTM(input_sz, hidden_sz, n_layers, 
                            batch_first=True, dropout=0.1)
        self.out = nn.Linear(hidden_sz, 1) # output_sz 1
        
    def forward(self, sequence):
        # or transpose if batch_first = False
        inp = sequence.view(self.batch_sz, -1, self.input_sz) 
        h0 = self.init_hidden()
        out, hn = self.lstm(inp, h0)
        out = self.out(out[-1])
        return out
        
    def init_hidden(self):
        h0 = torch.zeros(self.n_layers, 1, self.hidden_sz)
        c0 = torch.zeros(self.n_layers, 1, self.hidden_sz)
        return (h0, c0)

In [84]:
model = LSTMClassifier(input_sz=1, hidden_sz=32, n_layers=2, batch_sz=1)

In [85]:
inp = torch.Tensor(interests[(2429, 24845, 'Sell')])

In [86]:
out = model(inp)

In [87]:
inp.size(), out.size()

(torch.Size([16]), torch.Size([16, 1]))

In [89]:
F.sigmoid(out)

tensor([[ 0.5016],
        [ 0.5013],
        [ 0.5012],
        [ 0.5007],
        [ 0.5012],
        [ 0.5001],
        [ 0.5002],
        [ 0.5005],
        [ 0.5007],
        [ 0.5004],
        [ 0.5003],
        [ 0.5004],
        [ 0.5000],
        [ 0.4995],
        [ 0.4998],
        [ 0.5002]])

In [52]:
# params: (input_size, hidden_size, num_layers, bias, 
#          batch_first, dropout, bidirectional)
#   input (seq_len, batch, input_size)
#   (h_0, c_0) (num_layers * num_directions, batch, hidden_size)
#   output (seq_len, batch, hidden_size * num_directions)

In [59]:
interests[(2429, 24845, 'Sell')]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0]

In [69]:
input_sz = 1
hidden_sz = 32
n_layers = 2
seq_len = 16
rnn = nn.LSTM(input_sz, hidden_sz, n_layers, batch_first=True)

In [70]:
inp = torch.Tensor(interests[(2429, 24845, 'Sell')])
h0 = torch.randn(n_layers, 1, hidden_sz)
c0 = torch.randn(n_layers, 1, hidden_sz)

In [71]:
inp.view(-1, seq_len, input_sz).size(), h0.size(), c0.size()

(torch.Size([1, 16, 1]), torch.Size([2, 1, 32]), torch.Size([2, 1, 32]))

In [72]:
output, hn = rnn(inp.view(-1, seq_len, input_sz), (h0, c0))

In [74]:
output.size(), hn[0].size(), hn[1].size()

(torch.Size([1, 16, 32]), torch.Size([2, 1, 32]), torch.Size([2, 1, 32]))

In [76]:
output.transpose(0,1).size()

torch.Size([16, 1, 32])

In [75]:
output[0][0]

tensor([-0.0221, -0.2872,  0.4475,  0.0206,  0.0537,  0.2258, -0.2488,
        -0.1234, -0.4918,  0.1862,  0.1821, -0.0464, -0.0417,  0.1354,
         0.2397, -0.1834, -0.1053,  0.1626,  0.4373, -0.1913,  0.1289,
         0.0450, -0.0621,  0.1743, -0.5609,  0.2412,  0.4210, -0.2229,
        -0.0502, -0.0410, -0.0127, -0.2277])

In [53]:
rnn = nn.LSTM(10, 20, 2)
inp = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
c0 = torch.randn(2, 3, 20)
output, hn = rnn(inp, (h0, c0))

In [57]:
model = nn.Module()