In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move to the project root so that `src` is importable and the relative
# `data/` paths resolve. Guarded so that re-running this cell does not climb
# two more directories each time.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [2]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / sub for sub in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
from src.utils import get_weeks, week_num
# Ask get_weeks for 121 labels from 20160104 and keep those from index 91 on.
# Per the printed output below this leaves 30 labels, 20171002 .. 20180423.
week_labels = get_weeks(day_from=20160104, num_weeks=121)[91:]

In [4]:
print(week_labels)

[20171002, 20171009, 20171016, 20171023, 20171030, 20171106, 20171113, 20171120, 20171127, 20171204, 20171211, 20171218, 20171225, 20180101, 20180108, 20180115, 20180122, 20180129, 20180205, 20180212, 20180219, 20180226, 20180305, 20180312, 20180319, 20180326, 20180402, 20180409, 20180416, 20180423]


In [5]:
%%time
# Training weeks are week_labels[13:-2]; the last two labels are loaded
# separately as validation and test.
# Read every weekly file first and concatenate once: calling pd.concat inside
# the loop re-copies all previously accumulated rows on each iteration.
weekly_frames = []
for name in week_labels[13:-2]:
    weekly_frames.append(
        pd.read_feather(PROCESSED/f'SVD_17-18_72f/week_{name}_SVD_diffscount.feather'))
# Keep the original result (an empty frame) when no week is selected.
train = pd.concat(weekly_frames) if weekly_frames else pd.DataFrame()
del weekly_frames  # drop the per-week copies now that `train` owns the data

CPU times: user 1min 26s, sys: 46.6 s, total: 2min 12s
Wall time: 2min 23s


In [6]:
%%time
# Second-to-last week is the validation set, last week is the test set.
val, test = (
    pd.read_feather(PROCESSED/f'SVD_17-18_72f/week_{label}_SVD_diffscount.feather')
    for label in (week_labels[-2], week_labels[-1])
)

CPU times: user 572 ms, sys: 380 ms, total: 952 ms
Wall time: 951 ms


## End checkpoint

In [15]:
%%time
# Distinct (CustomerIdx, IsinIdx, BuySell) triples present in train+val and in
# test, used below to check that every test triple was seen before.
train_ids = set(zip(*(pd.concat([train[col], val[col]])
                      for col in ('CustomerIdx', 'IsinIdx', 'BuySell'))))
test_ids = set(zip(test.CustomerIdx, test.IsinIdx, test.BuySell))

CPU times: user 4.24 s, sys: 504 ms, total: 4.74 s
Wall time: 4.73 s


In [17]:
train.shape, val.shape, test.shape

((8102750, 72), (493590, 72), (484758, 73))

In [16]:
len(train_ids), len(test_ids), len(test_ids.difference(train_ids))

(985972, 484758, 0)

In [17]:
len(train_ids), len(test_ids), len(test_ids.difference(train_ids))

(1314014, 484758, 0)

In [30]:
def get_seqs(trade, challenge, week_labels, keys, agg='sum'):
    """Build per-key weekly sequences of aggregated ``CustomerInterest``.

    Trades dated before ``week_labels[0]`` are dropped. The remaining rows are
    grouped by ``keys + ['Week']`` and ``CustomerInterest`` is aggregated with
    ``agg``. Every key combination found in the filtered ``trade`` or in
    ``challenge`` gets one list with a slot per week, zero where there is no
    aggregated value.

    Parameters
    ----------
    trade : pandas.DataFrame
        Must contain ``TradeDateKey``, ``CustomerInterest`` and the ``keys``
        columns. An existing ``Week`` column is used as-is; otherwise it is
        derived with ``week_num(week_labels, TradeDateKey)``. The caller's
        frame is never modified.
    challenge : pandas.DataFrame
        Must contain the ``keys`` columns; supplies extra key combinations.
    week_labels : sequence
        Week labels; the first entry is the lower bound on ``TradeDateKey``.
    keys : list of str
        Columns identifying one sequence.
    agg : str or callable, default 'sum'
        Aggregation applied to ``CustomerInterest`` per key and week.

    Returns
    -------
    dict
        Maps each key (a scalar for a single key column, else a tuple) to a
        list whose length is the highest observed week index plus one.
    """
    # Explicit copy: adding `Week` to a filtered slice is what raised the
    # SettingWithCopyWarning, and the copy keeps the caller's frame untouched.
    trade = trade[trade.TradeDateKey >= week_labels[0]].copy()
    if 'Week' not in trade.columns:
        trade['Week'] = trade.TradeDateKey.apply(
                            lambda x: week_num(week_labels, x))
    weeks = trade.groupby(keys + ['Week'], as_index=False) \
                            ['CustomerInterest'].agg(agg)
    # Size by the highest week index, not the number of distinct weeks: with a
    # week that has no trades, nunique() is too small and the indexed write
    # below raises IndexError. Both agree when the weeks are contiguous from 0.
    n_weeks = int(weeks.Week.max()) + 1 if len(weeks) else 0

    def _as_key(tup):
        # Single-column keys are stored as scalars rather than 1-tuples.
        return tup[0] if len(tup) == 1 else tup

    seq_dict = {}
    # Seed every key from the trades, then from the challenge set, with zeros.
    for frame in (weeks, challenge):
        unique_keys = frame.drop_duplicates(keys)
        for tup in zip(*[unique_keys[c] for c in keys]):
            seq_dict[_as_key(tup)] = [0] * n_weeks
    # Write the aggregated values into their week slots.
    for row in zip(*[weeks[c] for c in keys + ['Week', 'CustomerInterest']]):
        tup, week, q = row[:-2], row[-2], row[-1]
        seq_dict[_as_key(tup)][week] = q
    return seq_dict

In [18]:
from src.structurednet import get_seqs

In [19]:
# Raw inputs: the trade table and the challenge rows for week 20180423.
# low_memory=False makes pandas parse each file in one pass.
trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)
challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)

In [21]:
%%time
# Weekly CustomerInterest sequences at four key granularities:
# (customer, bond, side), (customer, bond), customer only, bond only.
transactions, buysells, customers, isins = (
    get_seqs(trade, challenge, week_labels, key_cols)
    for key_cols in (['CustomerIdx', 'IsinIdx', 'BuySell'],
                     ['CustomerIdx', 'IsinIdx'],
                     ['CustomerIdx'],
                     ['IsinIdx'])
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  lambda x: week_num(week_labels, x))


CPU times: user 15.8 s, sys: 308 ms, total: 16.1 s
Wall time: 16.1 s


In [50]:
%%time
# Tag every row of each split with its week index relative to week_labels.
for split in (train, val, test):
    split['Week'] = split.TradeDateKey.apply(
        lambda date_key: week_num(week_labels, date_key))

CPU times: user 7.67 s, sys: 260 ms, total: 7.93 s
Wall time: 7.93 s


In [51]:
train.Week.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27])

In [52]:
val.Week.unique()

array([28])

In [48]:
# Scratch check of the shifting idea on one bond's sequence: rotate it two
# places to the right, then zero the two values that wrapped to the front.
x = np.roll(isins[19020], 2)
x[:2] = 0

In [54]:
len(week_labels)

30

In [60]:
def shift_right(seq, week, week_labels):
    """Return ``seq`` rotated right and zero-padded at the front.

    The rotation distance is ``len(week_labels) - week - 1``. After rotating
    with ``np.roll``, the leading entries up to that distance are set to zero.
    ``np.roll`` returns a new array, so ``seq`` itself is left unchanged.
    """
    offset = len(week_labels) - week - 1
    shifted = np.roll(seq, offset)
    shifted[:offset] = 0  # blank out the values that wrapped around
    return shifted

In [63]:
# Reload a previously pickled scaler from INTERIM/'scaler.pkl'.
# NOTE(review): the dump cell further down writes to NEURALNET/'scaler.pkl',
# which is a different path - confirm which file is the intended one.
# pickle.load runs arbitrary code from the file; only load trusted artefacts.
import pickle
with open(INTERIM/'scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

In [None]:
train.describe().transpose()

In [62]:
shift_right(isins[19020], 27, week_labels)

array([0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 2., 0., 0., 1., 0.,
       0., 0., 3., 1., 0., 1., 4., 2., 0., 0., 2., 0.])

In [56]:
print(isins[19020])

[0, 1.0, 0, 0, 1.0, 0, 0, 0, 1.0, 0, 2.0, 0, 0.0, 1.0, 0, 0, 0, 3.0, 1.0, 0, 1.0, 4.0, 2.0, 0, 0, 2.0, 0.0, 1.0, 0]


In [41]:
len(train), len(val), len(test)

(15387370, 493590, 484758)

In [None]:
(9, 7259, 'Buy')


In [53]:
len(train_ids)

985972

In [74]:
(9,7259, 'Buy') in transactions

True

In [65]:
(9,7259, 'Buy') in train_ids

True

In [63]:
train[(train.CustomerIdx==9) & (train.IsinIdx==7259)]

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest,DaysSinceBuySell,DaysSinceTransaction,DaysSinceCustomerActivity,DaysSinceBondActivity,DaysCountBuySell,DaysCountTransaction,DaysCountCustomerActivity,DaysCountBondActivity,SVD_CustomerBias,SVD_IsinBuySellBias,SVD_Recommend,SVD_CustomerFactor00,SVD_CustomerFactor01,SVD_CustomerFactor02,SVD_CustomerFactor03,SVD_CustomerFactor04,SVD_CustomerFactor05,SVD_CustomerFactor06,SVD_CustomerFactor07,SVD_CustomerFactor08,SVD_CustomerFactor09,SVD_CustomerFactor10,SVD_CustomerFactor11,SVD_CustomerFactor12,SVD_CustomerFactor13,SVD_CustomerFactor14,SVD_IsinBuySellFactor00,SVD_IsinBuySellFactor01,SVD_IsinBuySellFactor02,SVD_IsinBuySellFactor03,SVD_IsinBuySellFactor04,SVD_IsinBuySellFactor05,SVD_IsinBuySellFactor06,SVD_IsinBuySellFactor07,SVD_IsinBuySellFactor08,SVD_IsinBuySellFactor09,SVD_IsinBuySellFactor10,SVD_IsinBuySellFactor11,SVD_IsinBuySellFactor12,SVD_IsinBuySellFactor13,SVD_IsinBuySellFactor14,Year,Month,Day,Sector,Subsector,Region_x,Country,TickerIdx,ActualMaturityDateKey,IssueDateKey,Seniority,Currency,ActivityGroup,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType,BondDuration,BondRemaining,BondLife,CompositeRatingCat,Week,BuySellCat,CustomerIdxCat,IsinIdxCat
136,20180101,9,7259,Buy,0.0,915,915,96,3,0,0,14,123,0.168586,0.113142,0.642453,0.021119,0.045339,-0.075194,0.04243,0.04705,0.168435,0.045051,0.080392,-0.175639,-0.03347,-0.085924,-0.057379,-0.006258,0.001205,0.036391,0.233954,-0.026064,0.0284,0.11674,-0.036194,0.230744,0.00106,0.134995,-0.178426,-0.071588,-0.112515,-0.054956,0.277415,-0.389541,0.005499,2018,18,1,Asset Owners,Insurance,Asia Pacific,TAIWAN,834,20400301,20100301,SEN,USD,FLOW G10,AMERICAS,HG CASH,HG CASH NONFIN,US IG TMT CASH,22,Communications,Cable/Satellite TV,US domestic,1000000000.0,FIXED,10958,8095,2863,22,13,Buy,9,7259
137,20180101,9,7259,Sell,1.0,915,915,96,3,0,0,14,123,0.168586,0.046835,0.549887,0.021119,0.045339,-0.075194,0.04243,0.04705,0.168435,0.045051,0.080392,-0.175639,-0.03347,-0.085924,-0.057379,-0.006258,0.001205,0.036391,-0.084943,0.125975,0.029886,0.086831,-0.006835,0.240718,-0.053067,0.076811,-0.083388,-0.039372,-0.042091,-0.13568,0.101862,-0.036478,-0.085612,2018,18,1,Asset Owners,Insurance,Asia Pacific,TAIWAN,834,20400301,20100301,SEN,USD,FLOW G10,AMERICAS,HG CASH,HG CASH NONFIN,US IG TMT CASH,22,Communications,Cable/Satellite TV,US domestic,1000000000.0,FIXED,10958,8095,2863,22,13,Sell,9,7259


In [64]:
trade[(trade.CustomerIdx==9) & (trade.IsinIdx==7259)]

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest
598819,20180104,9,7259,Sell,8321198.0,131.5956,Unknown,1.0


## Checkpoint

In [7]:
from src.structurednet import get_seqs, shift_right

In [8]:
# Checkpoint restart: load the raw trade table and the challenge rows again.
trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)
challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)

In [9]:
%%time
# Weekly sequences keyed at four granularities, most specific first:
# (CustomerIdx, IsinIdx, BuySell), (CustomerIdx, IsinIdx), CustomerIdx, IsinIdx.
# get_seqs here is the version imported from src.structurednet; presumably it
# matches the in-notebook definition above - TODO confirm.
transactions = get_seqs(trade, challenge, week_labels, 
                        ['CustomerIdx', 'IsinIdx', 'BuySell'])
buysells = get_seqs(trade, challenge, week_labels, 
                        ['CustomerIdx', 'IsinIdx'])
customers = get_seqs(trade, challenge, week_labels, ['CustomerIdx'])
isins = get_seqs(trade, challenge, week_labels, ['IsinIdx'])

CPU times: user 13.2 s, sys: 636 ms, total: 13.8 s
Wall time: 13.8 s


In [10]:
# Duplicate the id-like columns under a `Cat` suffix in every split, so the
# copies can be listed as categorical features while the originals stay as-is.
for split in (train, val, test):
    for c in ['CustomerIdx', 'IsinIdx', 'BuySell', 'CompositeRating']:
        split[f'{c}Cat'] = split[c]

In [11]:
# Numeric trade-side flag: 1 where BuySell == 'Buy', 0 otherwise.
# Vectorised comparison instead of a per-row Python lambda; same values, and
# int64 matches the dtype the apply-based version produced.
train['BuySellCont'] = (train.BuySell == 'Buy').astype('int64')
val['BuySellCont'] = (val.BuySell == 'Buy').astype('int64')
test['BuySellCont'] = (test.BuySell == 'Buy').astype('int64')

In [12]:
# Week index of every row, computed by week_num against week_labels.
# In the earlier run shown above, the validation split mapped to week 28.
train['Week'] = train.TradeDateKey.apply(lambda x: week_num(week_labels, x))
val['Week'] = val.TradeDateKey.apply(lambda x: week_num(week_labels, x))
test['Week'] = test.TradeDateKey.apply(lambda x: week_num(week_labels, x))

In [13]:
# Feature groups used by the rest of the notebook.
# cat_cols: categorical features (one embedding size is computed per entry).
cat_cols = ['Sector', 'Subsector', 'Region_x', 'Country', 
            'TickerIdx', 'Seniority', 'Currency', 'ActivityGroup', 
            'Region_y', 'Activity', 'RiskCaptain', 'Owner', 
            'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType',
            'CompositeRatingCat', 'CustomerIdxCat', 'IsinIdxCat', 'BuySellCat']
# num_cols: continuous features; these are the columns the StandardScaler is
# fitted on. CompositeRating and BuySell each appear in both groups (as the
# `Cat` copy above and as a numeric column here).
# NOTE(review): SVD_IsinBuySellFactor00-14, Year and Month exist in the train
# frame displayed earlier but are not listed here - confirm this is intended.
num_cols = ['ActualMaturityDateKey', 'IssueDateKey', 'IssuedAmount', 
            'BondDuration', 'BondRemaining', 'BondLife', 
            'Day', 'CompositeRating', 'BuySellCont',
            
            'DaysSinceBuySell', 'DaysSinceTransaction', 'DaysSinceCustomerActivity',
            'DaysSinceBondActivity', 'DaysCountBuySell', 'DaysCountTransaction',
            'DaysCountCustomerActivity', 'DaysCountBondActivity', 'SVD_CustomerBias',
            'SVD_IsinBuySellBias', 'SVD_Recommend', 'SVD_CustomerFactor00',
            'SVD_CustomerFactor01', 'SVD_CustomerFactor02', 'SVD_CustomerFactor03',
            'SVD_CustomerFactor04', 'SVD_CustomerFactor05', 'SVD_CustomerFactor06',
            'SVD_CustomerFactor07', 'SVD_CustomerFactor08', 'SVD_CustomerFactor09',
            'SVD_CustomerFactor10', 'SVD_CustomerFactor11', 'SVD_CustomerFactor12',
            'SVD_CustomerFactor13', 'SVD_CustomerFactor14']
# id_cols: the triple identifying one (customer, bond, side) sample.
id_cols = ['CustomerIdx', 'IsinIdx', 'BuySell']
# target_col: the label column passed to the datasets as the prediction target.
target_col = 'CustomerInterest'

In [14]:
%%time
# Fit one StandardScaler on the continuous columns of all three splits stacked
# together.
# NOTE(review): the statistics therefore include validation and test rows;
# fit on train only if leakage-free scaling is required.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(pd.concat([train[num_cols], 
                            val[num_cols], test[num_cols]]))

CPU times: user 10 s, sys: 8.88 s, total: 18.9 s
Wall time: 18.9 s


In [15]:
# Output directory for everything the neural-net pipeline writes.
NEURALNET = INTERIM/'neuralnet'
# The feather and pickle writers below do not create missing parent
# directories, so make sure the folder exists (no-op when it already does).
NEURALNET.mkdir(parents=True, exist_ok=True)

In [16]:
%%time
# Persist each split under NEURALNET; the index is reset before writing.
for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    split_df.reset_index(drop=True).to_feather(NEURALNET/f'{split_name}.feather')

In [20]:
%%time
# Build the sequence input for every training row from the four sequence
# dictionaries (slowest step of the notebook: about 11 minutes wall time).
# NOTE(review): n_weeks is len(week_labels), which printed as 30 above, while
# train_seqs.shape printed further down ends in 29 - confirm how extract_seqs
# uses n_weeks.
from src.structurednet import extract_seqs
n_weeks = len(week_labels)
train_seqs = extract_seqs(train, transactions, buysells, 
                              customers, isins, n_weeks)


CPU times: user 10min 56s, sys: 2min 44s, total: 13min 40s
Wall time: 10min 49s


In [21]:
%%time
# Same sequence extraction for the validation and test splits.
val_seqs, test_seqs = (
    extract_seqs(split, transactions, buysells, customers, isins, n_weeks)
    for split in (val, test)
)





CPU times: user 1min 15s, sys: 19 s, total: 1min 34s
Wall time: 1min 13s


In [22]:
%%time
import pickle
# Pickle each split's sequences to its own file under NEURALNET.
for stem, seqs in (('train_seqs', train_seqs),
                   ('val_seqs', val_seqs),
                   ('test_seqs', test_seqs)):
    with open(NEURALNET/f'{stem}.pkl', 'wb') as f:
        pickle.dump(seqs, f, pickle.HIGHEST_PROTOCOL)

In [23]:
# Store the fitted scaler next to the sequence pickles.
with (NEURALNET/'scaler.pkl').open('wb') as f:
    pickle.dump(scaler, file=f, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
%%time
# Encode the categorical columns and scale the continuous ones (the call
# prints "Encoding cats..." and "Scaling conts...").
# The return value is discarded and later cells read train/val/test directly,
# so this presumably modifies the three frames in place - TODO confirm.
from src.structurednet import preprocess_catsconts
preprocess_catsconts(train, val, test, cat_cols, num_cols, scaler)

Encoding cats...
Scaling conts...
CPU times: user 34.5 s, sys: 964 ms, total: 35.5 s
Wall time: 35.6 s


In [27]:
%%time
# Save the splits again after preprocessing, under *_preproc names so the
# earlier train/val/test feather files are not overwritten.
train.reset_index(drop=True).to_feather(NEURALNET/'train_preproc.feather')
val.reset_index(drop=True).to_feather(NEURALNET/'val_preproc.feather')
test.reset_index(drop=True).to_feather(NEURALNET/'test_preproc.feather')

CPU times: user 5.36 s, sys: 6 s, total: 11.4 s
Wall time: 9.53 s


## Model

In [28]:
from torch.utils.data import DataLoader
from torch import optim
import torch.nn as nn
from src.structured_lstm import MultimodalDataset, MultimodalNet, train_model

In [29]:
train_seqs.shape

(8102750, 4, 29)

In [55]:
%%time
# Batches of 128 samples. Only the training loader shuffles; the validation
# loader keeps the row order.
train_dl = DataLoader(
    dataset=MultimodalDataset(train[cat_cols], train[num_cols],
                              train_seqs, train[target_col]),
    batch_size=128,
    shuffle=True,
)
val_dl = DataLoader(
    dataset=MultimodalDataset(val[cat_cols], val[num_cols],
                              val_seqs, val[target_col]),
    batch_size=128,
)

CPU times: user 3.08 s, sys: 2.93 s, total: 6.01 s
Wall time: 6.01 s


In [31]:
# Cardinality of each categorical column, taken over all three splits: a code
# that occurs only in val/test would otherwise index past the end of an
# embedding table sized from train alone. Identical to the train-only value
# whenever train already holds the largest code.
# Assumes the categorical columns are integer-encoded at this point, as the
# train-only computation already required.
cat_szs = [int(max(split[col].max() for split in (train, val, test)) + 1)
           for col in cat_cols]
# Embedding width: half the cardinality (rounded up), capped at 50.
emb_szs = [(c, min(50, (c+1)//2)) for c in cat_szs]

In [33]:
emb_szs

[(5, 3),
 (36, 18),
 (3, 2),
 (86, 43),
 (3240, 50),
 (9, 5),
 (21, 11),
 (3, 2),
 (8, 4),
 (15, 8),
 (37, 19),
 (101, 50),
 (14, 7),
 (330, 50),
 (15, 8),
 (6, 3),
 (29, 15),
 (2594, 50),
 (22989, 50),
 (2, 1)]

In [41]:
import torch
# Use the GPU only when one is actually available, so the notebook also runs
# on CPU-only machines instead of failing at model.cuda(). Set this to False
# by hand to force CPU.
USE_CUDA = torch.cuda.is_available()

In [56]:
# Network taking embedded categoricals, continuous features and the
# 4-channel weekly sequences (rnn_input_sz=4).
model = MultimodalNet(
    emb_szs,
    n_cont=len(num_cols),
    emb_drop=0.2,
    szs=[1000,500],
    drops=[0.5, 0.5],
    rnn_hidden_sz=64,
    rnn_input_sz=4,
    rnn_n_layers=2,
    rnn_drop=0.5,
)

# Move the parameters to the GPU when CUDA is enabled.
if USE_CUDA:
    model = model.cuda()

In [57]:
# Binary target, so the loss is sigmoid + binary cross-entropy on raw logits.
criterion = nn.BCEWithLogitsLoss()
# Adam over all model parameters with a 1e-3 learning rate.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
%%time
# Train for two epochs; returns the model plus the training-loss,
# validation-loss and validation-AUC histories.
# val_every=20 presumably sets how often validation runs (the log below shows
# periodic "ROC AUC Score" / "Validation Loss" pairs) - TODO confirm the unit.
model, train_losses, val_losses, val_auc_scores = train_model(
                model, train_dl, val_dl, optimizer, criterion,
                n_epochs=2, USE_CUDA=USE_CUDA, val_every=20)

ROC AUC Score: 0.783245
Validation Loss: 0.136874
ROC AUC Score: 0.836314
Validation Loss: 0.129311
ROC AUC Score: 0.860724
Validation Loss: 0.122721
ROC AUC Score: 0.864471
Validation Loss: 0.116261
ROC AUC Score: 0.871583
Validation Loss: 0.120534
ROC AUC Score: 0.870721
Validation Loss: 0.113148
ROC AUC Score: 0.876722
Validation Loss: 0.114800
ROC AUC Score: 0.878927
Validation Loss: 0.111803
ROC AUC Score: 0.881537
Validation Loss: 0.113562
ROC AUC Score: 0.879608
Validation Loss: 0.110745
ROC AUC Score: 0.883358
Validation Loss: 0.106027
ROC AUC Score: 0.882786
Validation Loss: 0.111879
ROC AUC Score: 0.883002
Validation Loss: 0.108723
ROC AUC Score: 0.885339
Validation Loss: 0.109204
ROC AUC Score: 0.884779
Validation Loss: 0.110941


In [51]:
from src.structured_lstm import get_predictions

In [54]:
len(train_dl), len(val_dl)

(63303, 3857)

In [52]:
targets, preds = get_predictions(model, val_dl, USE_CUDA=USE_CUDA)



In [53]:
from sklearn.metrics import roc_auc_score
roc_auc_score(targets, preds)

0.8714340905872601

In [None]:
targets, preds = get_predictions(model, train_dl, USE_CUDA=USE_CUDA)

In [None]:
roc_auc_score(targets, preds)