In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Raise pandas' display limits so wide / long frames are not truncated in cell output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move two directory levels up so that the `src` package imported below and the
# relative `data/...` paths resolve from the working directory.
# Guarded: an unconditional chdir climbs two MORE levels every time this cell is
# re-run, which silently breaks every relative path afterwards.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [2]:
# Data directories, all relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / stage for stage in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
def _read_raw(filename):
    """Read one CSV file from the RAW directory with `low_memory=False`."""
    return pd.read_csv(RAW/filename, low_memory=False)

# Only the challenge and trade tables are loaded in this notebook.
challenge = _read_raw('Challenge_20180423.csv')
trade     = _read_raw('Trade.csv')

# Remaining raw tables, currently not loaded:
# customer   = _read_raw('Customer.csv')
# isin       = _read_raw('Isin.csv')
# submission = _read_raw('sample_submission.csv')
# market     = _read_raw('Market.csv')

In [6]:
from src.utils import get_weeks, week_num

# Request 121 week labels starting at 20160104, then drop the first 104 so that
# only the trailing labels (17 of them, 20180101 onward in the recorded run) remain.
all_week_labels = get_weeks(day_from=20160104, num_weeks=121)
week_labels = all_week_labels[104:]

In [5]:
# Sanity check: show the week labels that survived the slice.
print(week_labels)

[20180101, 20180108, 20180115, 20180122, 20180129, 20180205, 20180212, 20180219, 20180226, 20180305, 20180312, 20180319, 20180326, 20180402, 20180409, 20180416, 20180423]


In [7]:
# Weekly interest table for 2018:
#   1. keep trades whose TradeDateKey is after 20180000,
#   2. tag each trade with week_num(week_labels, TradeDateKey)
#      (presumably the index of its week within week_labels; confirm in src.utils),
#   3. keep the maximum CustomerInterest per (customer, ISIN, side, week).
weekly_trades = (
    trade.loc[trade['TradeDateKey'] > 20180000]
         .assign(week=lambda df: df['TradeDateKey'].apply(
                     lambda day: week_num(week_labels, day)))
         .groupby(['CustomerIdx', 'IsinIdx', 'BuySell', 'week'], as_index=False)
         ['CustomerInterest']
         .max()
)

In [8]:
import pickle

# Load the precomputed interest sequences from the interim data directory.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
interests_path = INTERIM/'interest_sequences.pkl'
with interests_path.open('rb') as f:
    interests = pickle.load(f)

## Get sequences

In [10]:
# Stack every stored sequence into one array (one row per key of `interests`).
train_seqs = np.array([*interests.values()])

In [11]:
%%time
# Load the feature frame for the second-to-last entry of week_labels; its rows
# define which (CustomerIdx, IsinIdx, BuySell) keys make up the validation set.
val = pd.read_feather(PROCESSED/f'SVD_17-18_72f/week_{week_labels[-2]}_SVD_diffscount.feather')

# Look up the stored sequence for each row of `val`, preserving row order.
# Only the three key columns are iterated (CustomerInterest was unpacked but never
# used). A key missing from `interests` raises KeyError.
val_seqs = np.array([
    interests[key]
    for key in zip(val.CustomerIdx, val.IsinIdx, val.BuySell)
])

CPU times: user 1.48 s, sys: 256 ms, total: 1.73 s
Wall time: 1.79 s


In [22]:
from torch import optim
import torch.nn as nn
from src.lstm import LSTMClassifier, train_model, SequentialDataset
from torch.utils.data import DataLoader

In [15]:
# Inputs and targets are the same sequences offset by one step: the target at
# position t is the input value at position t+1.
# The training slices stop one step earlier than the validation slices, so the
# last column of each sequence is never used as a training target.
train_inputs, train_targets = train_seqs[:, :-2], train_seqs[:, 1:-1]
val_inputs, val_targets = val_seqs[:, :-1], val_seqs[:, 1:]

# Note: despite the `_ds` suffix these are DataLoader objects wrapping the datasets.
train_ds = DataLoader(
    SequentialDataset(train_inputs, train_targets),
    batch_size=128,
    shuffle=True,
)
val_ds = DataLoader(
    SequentialDataset(val_inputs, val_targets),
    batch_size=128,
)

In [16]:
%%time
import pickle

# Persist both loaders to the interim directory using the highest pickle protocol.
for filename, loader in (('seq_train_ds.pkl', train_ds),
                         ('seq_val_ds.pkl', val_ds)):
    with open(INTERIM/filename, 'wb') as f:
        pickle.dump(loader, f, pickle.HIGHEST_PROTOCOL)

CPU times: user 100 ms, sys: 244 ms, total: 344 ms
Wall time: 353 ms


In [61]:
# Enable CUDA only when a usable GPU is present, rather than hard-coding True,
# so the notebook also runs on CPU-only machines. On a GPU machine this is
# still True, matching the previous setting.
import torch
USE_CUDA = torch.cuda.is_available()

In [70]:
# LSTM classifier: input size 1, hidden size 128, two layers, dropout 0.
model = LSTMClassifier(
    input_sz=1,
    hidden_sz=128,
    n_layers=2,
    drop=0,
    USE_CUDA=USE_CUDA,
)

In [71]:
# Binary cross-entropy computed on raw logits, optimised with Adam (lr = 1e-3).
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [72]:
%%time
# Train for 2 epochs. `val_every=10` is forwarded to train_model
# (presumably the validation interval; confirm in src.lstm).
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt. When that
# happens the assignment below never completes, so the loss / AUC histories are
# not bound and `model` is whatever state the interrupted run left it in.
(model, train_losses, val_losses, val_auc_scores) = train_model(
    model,
    train_ds,
    val_ds,
    optimizer,
    criterion,
    n_epochs=2,
    USE_CUDA=USE_CUDA,
    val_every=10,
)

ROC AUC Score: 0.516366
Validation Loss: 0.152716
ROC AUC Score: 0.516664
Validation Loss: 0.155678
ROC AUC Score: 0.516414
Validation Loss: 0.154768

ROC AUC Score: 0.516485
Validation Loss: 0.156397


KeyboardInterrupt: 

In [74]:
from src.lstm import evaluate, get_predictions

In [77]:
# Run the model over the validation loader; the call returns the targets and
# the corresponding predictions.
targets, preds = get_predictions(
    model,
    val_ds,
    criterion=criterion,
    USE_CUDA=USE_CUDA,
)



In [78]:
# Sanity check: targets and predictions should have the same length.
len(targets), len(preds)

(493590, 493590)

In [85]:
# Wrap both result collections in pandas Series so they can be summarised below.
targets, preds = pd.Series(targets), pd.Series(preds)

In [86]:
# Summary of the validation targets; for 0/1 labels the mean is the positive rate.
targets.describe()

count    493590.000000
mean          0.035007
std           0.183797
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
dtype: float64

In [87]:
# Summary of the predicted scores, to compare their spread with the target rate.
preds.describe()

count    493590.000000
mean          0.018124
std           0.018668
min           0.013976
25%           0.013976
50%           0.013976
75%           0.014300
max           0.546746
dtype: float64

In [93]:
# Positions of all predictions scoring above 0.4.
idxs = [position for position, score in enumerate(preds) if score > 0.4]

In [94]:
# Number of predictions above the 0.4 threshold.
len(idxs)

114

In [110]:
# Inspect the stored sequence at one of the high-scoring positions.
# NOTE(review): this assumes positions in `preds` line up one-to-one with rows of
# `val_seqs`; confirm against what get_predictions returns.
val_seqs[idxs[12]]

array([1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])