In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Widen pandas' display limits so wide/long frames are not truncated in output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
# Apply seaborn's default plot styling.
sns.set()

# Step two directories up from the notebook's working directory before the
# `src` import below and the relative `data/...` paths are used.
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves up two more levels each time.
os.chdir('../..')
from src import utils

In [2]:
# Data directory layout, expressed relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / subdir
    for subdir in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
from src.utils import get_weeks, week_num
# Build 121 week labels starting at 20160104 and keep those from index 91 on.
# NOTE(review): 121 and 91 are unexplained magic numbers — presumably they
# select the most recent weeks used for the sequences; confirm.
week_labels = get_weeks(num_weeks=121, day_from=20160104)[91:]

In [5]:
# Folder holding the neural-net inputs/outputs read and written below.
NEURALNET = INTERIM.joinpath('neuralnet')

In [6]:
%%time
# Load the three preprocessed splits from the neuralnet folder.
train, val, test = (
    pd.read_feather(NEURALNET / fname)
    for fname in ('train_preproc.feather',
                  'val_preproc.feather',
                  'test_preproc.feather')
)

CPU times: user 1.18 s, sys: 1.25 s, total: 2.43 s
Wall time: 2.44 s


In [9]:
def _read_raw(filename):
    """Read one CSV from the RAW directory with `low_memory=False`."""
    return pd.read_csv(RAW/filename, low_memory=False)

# Only the tables used in this notebook are loaded. The original cell kept
# Customer.csv, Isin.csv, sample_submission.csv and Trade.csv commented out.
challenge = _read_raw('Challenge_20180423.csv')
market = _read_raw('Market.csv')
macro = _read_raw('MarketData_Macro.csv')

In [14]:
# Keep only quotes dated on/after the first retained week label, then tag
# each remaining row with the value week_num returns for its DateKey.
is_recent = market.DateKey >= week_labels[0]
market = market[is_recent].copy()
market['Week'] = market.DateKey.apply(
    lambda day_key: week_num(week_labels, day_key))

In [16]:
# Preview the filtered market quotes, including the new Week column.
market.head()

Unnamed: 0,IsinIdx,DateKey,Price,Yield,ZSpread,Week
5109025,1,20171002,116.5,6.606,4.227,0
5109026,7,20171002,119.25,5.255,3.17,0
5109027,15,20171002,114.5,-4.447,-6.303,0
5109028,19,20171002,118.0,-2.032,-15.532,0
5109029,21,20171002,104.5,16.614,-5.168,0


In [17]:
# Center prices by subtracting 100 (presumably par value — confirm).
# The untouched quote is kept in `PriceRaw` and the centered value is always
# derived from it, so re-running this cell cannot subtract 100 a second time.
if 'PriceRaw' not in market.columns:
    market['PriceRaw'] = market['Price']
market['Price'] = market['PriceRaw'] - 100

In [34]:
# Weekly mean and standard deviation of Price / Yield / ZSpread per bond.
# The columns are selected with a *list* (double brackets): the tuple-style
# `gb['Price', 'Yield', 'ZSpread']` form was deprecated and then removed in
# pandas 2.0.
# NOTE(review): the std of a (bond, week) group with a single quote is NaN;
# those NaNs are carried into the std lookup tables built further down.
weeks_mean = market.groupby(['IsinIdx', 'Week'], as_index=False) \
                    [['Price', 'Yield', 'ZSpread']].agg('mean')
weeks_std = market.groupby(['IsinIdx', 'Week'], as_index=False) \
                    [['Price', 'Yield', 'ZSpread']].agg('std')

In [21]:
# Number of distinct week indices that have market data; used as the length
# of every per-bond series built below.
# NOTE(review): this equals "largest week index + 1" only if no week is
# missing from the market data — confirm, since Week is used as a list index.
n_weeks = weeks_mean['Week'].nunique()

In [54]:
# Gather every bond id seen in the market aggregates, the challenge file and
# the three splits, so each of them gets an entry in the lookup tables.
known_isins = set()
for frame in (weeks_mean, challenge, train, val, test):
    known_isins.update(frame.IsinIdx)

# One zero-filled series of length n_weeks per bond; weeks with no market
# data keep the 0 placeholder.
price_dict = {isin: [0] * n_weeks for isin in known_isins}
yield_dict = {isin: [0] * n_weeks for isin in known_isins}
zspread_dict = {isin: [0] * n_weeks for isin in known_isins}

# Write the weekly means into their (bond, week) slots.
for isin, week, price, yld, zspr in zip(weeks_mean.IsinIdx, weeks_mean.Week,
                                        weeks_mean.Price, weeks_mean.Yield,
                                        weeks_mean.ZSpread):
    price_dict[isin][week] = price
    yield_dict[isin][week] = yld
    zspread_dict[isin][week] = zspr

In [55]:
# Per-bond weekly standard-deviation series, built like the mean tables.
#
# Bug fix: the original copy-pasted initialisation loop for `test` assigned
# `zspread_dict[i]` instead of `zspread_dict_std[i]`. That reset the *mean*
# Z-spread series of every test bond to zeros and left bonds that appear only
# in `test` without a `zspread_dict_std` entry. All three std tables are now
# initialised from one shared id set, so no table can be missed.
std_isins = set()
for frame in (weeks_mean, challenge, train, val, test):
    std_isins.update(frame.IsinIdx)

# One zero-filled series of length n_weeks per bond; weeks with no market
# data keep the 0 placeholder.
price_dict_std = {isin: [0] * n_weeks for isin in std_isins}
yield_dict_std = {isin: [0] * n_weeks for isin in std_isins}
zspread_dict_std = {isin: [0] * n_weeks for isin in std_isins}

# Write the weekly standard deviations into their (bond, week) slots.
for i, w, p, y, z in zip(*[weeks_std[c] for c in
                           ['IsinIdx', 'Week', 'Price', 'Yield', 'ZSpread']]):
    price_dict_std[i][w] = p
    yield_dict_std[i][w] = y
    zspread_dict_std[i][w] = z

## Assign

In [39]:
from src.structurednet import shift_right

In [40]:
def roll_sequences(prices, yields, zspreads, 
                   prices_std, yields_std, zspreads_std,
                   i, w, n_weeks):
    """Apply `shift_right` to each of the six market series of bond `i`.

    Each lookup is indexed by `i`, and the selected series is passed to
    `shift_right` together with `w` and `n_weeks`.

    Returns a list of six results, ordered as: price mean, price std,
    yield mean, yield std, z-spread mean, z-spread std.
    """
    # The mean/std interleaving below fixes the order of the output list.
    ordered_lookups = (prices, prices_std,
                       yields, yields_std,
                       zspreads, zspreads_std)
    return [shift_right(lookup[i], w, n_weeks) for lookup in ordered_lookups]

In [47]:
def extract_seqs(df, prices, yields, zspreads, 
                 prices_std, yields_std, zspreads_std, n_weeks,
                 dtype=np.float64):
    """Build the market sequences for every row of `df`.

    For each (IsinIdx, Week) pair in `df`, `roll_sequences` is called and its
    result is stored as one row of the output array.

    The original built a nested Python list for all rows and only then
    converted it with `np.array`; on the full training set that ended in a
    MemoryError. The result is now written row by row into one preallocated
    array, so no nested list is kept for the whole frame.

    Parameters
    ----------
    df : frame with `IsinIdx` and `Week` columns.
    prices, yields, zspreads, prices_std, yields_std, zspreads_std :
        per-bond lookups forwarded unchanged to `roll_sequences`.
    n_weeks : forwarded unchanged to `roll_sequences`.
    dtype : element type of the result (default float64); pass `np.float32`
        to halve the memory footprint.

    Returns
    -------
    Array of shape `(len(df),) + s`, where `s` is the shape of one
    `roll_sequences` result, or an empty array of shape `(0,)` when `df` has
    no rows.
    """
    n_rows = len(df)
    seqs = None
    pairs = tqdm_notebook(zip(df.IsinIdx, df.Week), total=n_rows)
    for row, (i, w) in enumerate(pairs):
        rolled = np.asarray(
            roll_sequences(prices, yields, zspreads,
                           prices_std, yields_std, zspreads_std,
                           i, w, n_weeks),
            dtype=dtype)
        if seqs is None:
            # The per-row shape is only known once the first row is rolled.
            seqs = np.empty((n_rows,) + rolled.shape, dtype=dtype)
        seqs[row] = rolled
    if seqs is None:
        return np.empty((0,), dtype=dtype)
    return seqs

In [57]:
%%time
# From here on n_weeks is the number of retained week labels.
n_weeks = len(week_labels)
# Lookup tables in the positional order extract_seqs expects:
# the three mean tables first, then the three std tables.
market_lookups = (price_dict, yield_dict, zspread_dict,
                  price_dict_std, yield_dict_std, zspread_dict_std)
# The recorded run of this cell ended in a MemoryError on the 8,102,750
# training rows.
train_seqs = extract_seqs(train, *market_lookups, n_weeks)

HBox(children=(IntProgress(value=0, max=8102750), HTML(value='')))

MemoryError: 

In [None]:
%%time
# Build the market sequences for the validation and test splits with the same
# lookup tables used for the training split.
# Bug fix: the original call passed `transactions, buysells, customers, isins`.
# None of those names is defined in this notebook, and that is six arguments
# where extract_seqs takes eight parameters.
val_seqs = extract_seqs(val, price_dict, yield_dict, zspread_dict,
                        price_dict_std, yield_dict_std, zspread_dict_std,
                        n_weeks)
test_seqs = extract_seqs(test, price_dict, yield_dict, zspread_dict,
                         price_dict_std, yield_dict_std, zspread_dict_std,
                         n_weeks)

In [None]:
%%time
import pickle

# Persist the training market sequences with the highest pickle protocol.
with (NEURALNET/'market_train_seqs.pkl').open('wb') as out_file:
    pickle.dump(train_seqs, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:

# Persist the validation and test market sequences, one pickle per split.
for split_seqs, fname in ((val_seqs, 'market_val_seqs.pkl'),
                          (test_seqs, 'market_test_seqs.pkl')):
    with open(NEURALNET/fname, 'wb') as f:
        pickle.dump(split_seqs, f, pickle.HIGHEST_PROTOCOL)

## Model

In [None]:
from torch.utils.data import DataLoader
from torch import optim
import torch.nn as nn
from src.structured_lstm import MultimodalDataset, MultimodalNet, train_model

In [None]:
%%time
import pickle

def _load_seqs(fname):
    """Unpickle one sequence file from the NEURALNET directory."""
    with open(NEURALNET/fname, 'rb') as f:
        return pickle.load(f)

# Previously saved sequences for each split. These are read from
# train_seqs.pkl / val_seqs.pkl / test_seqs.pkl, not from the market_*_seqs
# files written above.
orig_train_seqs = _load_seqs('train_seqs.pkl')
orig_val_seqs = _load_seqs('val_seqs.pkl')
orig_test_seqs = _load_seqs('test_seqs.pkl')

In [None]:
# Compare the shapes of the previously saved and the market training sequences.
orig_train_seqs.shape, train_seqs.shape

In [None]:
# Preview the shape of the combined array. The two arrays are joined along
# axis 1 so each sample keeps one row and gains the extra market series.
# The default axis=0 would stack the market sequences as additional samples.
# NOTE(review): assumes both arrays are laid out as (samples, series, weeks) —
# confirm against the shapes printed above.
np.concatenate([orig_train_seqs, train_seqs], axis=1).shape

In [None]:
# Append the market series to the previously saved series of every sample.
# Bug fix: np.concatenate defaults to axis=0, which stacks the market
# sequences as extra samples, so the array no longer lines up row-for-row with
# the train/val/test frames. Joining on axis=1 adds them as extra series.
# NOTE(review): assumes both arrays are laid out as (samples, series, weeks) —
# confirm.
# NOTE: this cell overwrites its own inputs, so run it once per kernel
# session; a second run would append the original series again.
train_seqs = np.concatenate([orig_train_seqs, train_seqs], axis=1)
val_seqs = np.concatenate([orig_val_seqs, val_seqs], axis=1)
test_seqs = np.concatenate([orig_test_seqs, test_seqs], axis=1)

In [None]:
%%time
train_ds = MultimodalDataset(train[cat_cols], train[num_cols],
                             train_seqs, train[target_col])
val_ds = MultimodalDataset(val[cat_cols], val[num_cols],
                             val_seqs, val[target_col])
test_ds = MultimodalDataset(test[cat_cols], test[num_cols],
                             test_seqs, test[target_col])

In [None]:
cat_cols = ['Sector', 'Subsector', 'Region_x', 'Country', 
            'TickerIdx', 'Seniority', 'Currency', 'ActivityGroup', 
            'Region_y', 'Activity', 'RiskCaptain', 'Owner', 
            'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType',
            'CompositeRatingCat', 'CustomerIdxCat', 'IsinIdxCat', 'BuySellCat']
num_cols = ['ActualMaturityDateKey', 'IssueDateKey', 'IssuedAmount', 
            'BondDuration', 'BondRemaining', 'BondLife', 
            'Day', 'CompositeRating', 'BuySellCont',
            
            'DaysSinceBuySell', 'DaysSinceTransaction', 'DaysSinceCustomerActivity',
            'DaysSinceBondActivity', 'DaysCountBuySell', 'DaysCountTransaction',
            'DaysCountCustomerActivity', 'DaysCountBondActivity', 'SVD_CustomerBias',
            'SVD_IsinBuySellBias', 'SVD_Recommend', 'SVD_CustomerFactor00',
            'SVD_CustomerFactor01', 'SVD_CustomerFactor02', 'SVD_CustomerFactor03',
            'SVD_CustomerFactor04', 'SVD_CustomerFactor05', 'SVD_CustomerFactor06',
            'SVD_CustomerFactor07', 'SVD_CustomerFactor08', 'SVD_CustomerFactor09',
            'SVD_CustomerFactor10', 'SVD_CustomerFactor11', 'SVD_CustomerFactor12',
            'SVD_CustomerFactor13', 'SVD_CustomerFactor14']
id_cols = ['CustomerIdx', 'IsinIdx', 'BuySell']
target_col = 'CustomerInterest'

## Model

In [None]:
from torch.utils.data import DataLoader
from torch import optim
import torch.nn as nn
from src.structured_lstm import MultimodalDataset, MultimodalNet, train_model

In [None]:
%%time
# Wrap each split as a dataset of (categorical columns, numeric columns,
# sequences, target column). Requires cat_cols, num_cols, target_col and the
# *_seqs arrays to be defined already.
train_ds = MultimodalDataset(train[cat_cols], train[num_cols],
                             train_seqs, train[target_col])
val_ds = MultimodalDataset(val[cat_cols], val[num_cols],
                             val_seqs, val[target_col])
test_ds = MultimodalDataset(test[cat_cols], test[num_cols],
                             test_seqs, test[target_col])

In [None]:
# Merge the training and validation datasets into one dataset for the final
# fit. The notebook only ever does `from torch... import ...`, so the bare
# name `torch` is unbound and `torch.utils.data.ConcatDataset` raised
# NameError. The class is imported explicitly instead.
from torch.utils.data import ConcatDataset

all_train_ds = ConcatDataset([train_ds, val_ds])

In [None]:
%%time
# Batches of 128 for both loaders. Only the training loader shuffles; the
# test loader keeps the dataset order.
all_train_dl = DataLoader(dataset=all_train_ds, shuffle=True, batch_size=128)
test_dl = DataLoader(dataset=test_ds, batch_size=128)

In [None]:
# Switch for moving the model to the GPU and for the USE_CUDA argument of
# train_model below. Set to False on a machine without CUDA.
USE_CUDA = True

In [None]:
# Build the multimodal network.
# NOTE(review): `emb_szs` is not assigned anywhere in the visible notebook —
# define it before running this cell.
# NOTE(review): rnn_input_sz=10 presumably has to match the number of series
# per sample in the *_seqs arrays — confirm.
model = MultimodalNet(emb_szs, n_cont=len(num_cols), emb_drop=0.2,
                      szs=[1000,500], drops=[0.5, 0.5],
                      rnn_hidden_sz=64, rnn_input_sz=10, rnn_n_layers=2,
                      rnn_drop=0.5)

# Move the model to the GPU when CUDA is enabled.
if USE_CUDA: model = model.cuda()

In [None]:
# Adam over all model parameters with a 1e-3 learning rate, paired with a
# binary cross-entropy loss that takes raw logits.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [None]:
%%time
# Fit for one epoch on the combined train+validation loader. None is passed
# in the third positional slot (presumably the validation loader — confirm
# against train_model). Only the first two of the four returned values are
# kept.
model, train_losses, _, _ = train_model(
                model, all_train_dl, None, optimizer, criterion,
                n_epochs=1, USE_CUDA=USE_CUDA, print_every=800)