In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move two levels up so that the `src` package imported right after this and
# the relative `data/...` paths resolve against the working directory.
# Guarded so that re-running this cell does not climb two more levels each time.
# NOTE(review): assumes `src` is a directory that only exists at the project
# root (not next to this notebook) — confirm.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [2]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA.joinpath(sub) for sub in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
from src.utils import get_weeks
# Ask for 121 weekly labels starting at 20160104 and drop the first 104, so the
# kept labels start at 20180101 (see the printed list in the next cell).
# Earlier experiments kept more history by slicing from offset 52 or 96 instead.
all_week_labels = get_weeks(day_from=20160104, num_weeks=121)
week_labels = all_week_labels[104:]

In [4]:
# Show the selected weeks: all but the last two are loaded as training data,
# the second-to-last as validation and the last as test.
print(week_labels)

[20180101, 20180108, 20180115, 20180122, 20180129, 20180205, 20180212, 20180219, 20180226, 20180305, 20180312, 20180319, 20180326, 20180402, 20180409, 20180416, 20180423]


In [5]:
%%time
# Read every training week (all labels except the last two) and concatenate
# once. Concatenating inside the loop re-copies the accumulated frame on every
# iteration, which makes loading quadratic in the number of weeks.
weekly_frames = [
    pd.read_feather(PROCESSED/f'SVD_17-18_72f/week_{name}_SVD_diffscount.feather')
    for name in week_labels[:-2]
]
# pd.concat rejects an empty list, so keep the empty-frame result in that case.
train = pd.concat(weekly_frames) if weekly_frames else pd.DataFrame()
# Drop the per-week references so only the combined frame stays alive.
del weekly_frames

CPU times: user 1min 25s, sys: 45.8 s, total: 2min 11s
Wall time: 2min 19s


In [6]:
%%time
# Second-to-last week -> validation frame, last week -> test frame.
val, test = (
    pd.read_feather(PROCESSED/f'SVD_17-18_72f/week_{week_labels[pos]}_SVD_diffscount.feather')
    for pos in (-2, -1)
)

CPU times: user 636 ms, sys: 348 ms, total: 984 ms
Wall time: 980 ms


## Fill missing values (NaN) in categorical columns

In [7]:
# Columns whose missing values are replaced by a sentinel string in the next cell.
nan_cols = [
    'Subsector',
    'IndustrySector',
    'IndustrySubgroup',
    'MarketIssue',
]

In [8]:
%%time
# Use the same '-999' sentinel for every column in nan_cols and update each
# split in place.
sentinel_by_col = dict.fromkeys(nan_cols, '-999')
for frame in (train, val, test):
    frame.fillna(sentinel_by_col, inplace=True)

CPU times: user 6.25 s, sys: 2.59 s, total: 8.84 s
Wall time: 8.83 s


# Preprocessing

In [9]:
# Columns handled as categorical: they are integer-encoded below and each one
# gets its own embedding table in the model.
cat_cols = [
    'BuySell',
    'Sector',
    'Subsector',
    'Region_x',
    'Country',
    'TickerIdx',
    'Seniority',
    'Currency',
    'ActivityGroup',
    'Region_y',
    'Activity',
    'RiskCaptain',
    'Owner',
    'IndustrySector',
    'IndustrySubgroup',
    'MarketIssue',
    'CouponType',
]

In [10]:
# Identifier columns; they are excluded from the numeric feature list.
id_cols = [
    'TradeDateKey',
    'CustomerIdx',
    'IsinIdx',
]
# Label column; the model is trained against it with BCEWithLogitsLoss.
target_col = 'CustomerInterest'
# NOTE(review): not referenced in the cells visible here — presumably the row
# id of the submission file; confirm.
pred_col = 'PredictionIdx'

In [11]:
# Everything that is not an id, a categorical or the target is treated as a
# continuous feature; train's column order is kept.
# NOTE(review): pred_col is not excluded here — confirm train has no such column.
excluded_cols = set(id_cols) | set(cat_cols) | {target_col}
num_cols = [col for col in train.columns if col not in excluded_cols]

In [12]:
%%time
# Integer-encode the categorical columns of all three splits.
from src.utils import to_cat_codes, apply_cats

# presumably turns train's cat_cols into pandas categoricals in place, and
# apply_cats then reuses train's categories for the other splits — confirm in
# src.utils.
to_cat_codes(train, cat_cols)
for other in (val, test):
    apply_cats(other, train)

# Replace every categorical column by its integer codes.
# NOTE(review): pandas uses code -1 for missing values; if any split contains
# a value without a category, the -1 cannot index an nn.Embedding — verify.
for col in cat_cols:
    for frame in (train, val, test):
        frame[col] = frame[col].cat.codes

CPU times: user 23.8 s, sys: 5.41 s, total: 29.2 s
Wall time: 29.2 s


In [13]:
# TODO: scale continuous features (placeholder — this cell contains no scaling code)

# Model

In [21]:
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim

In [15]:
# Number of distinct encoded values per categorical column in train, in
# cat_cols order; used as the row count of each embedding table.
cat_szs = []
for col in cat_cols:
    cat_szs.append(train[col].nunique())

In [16]:
# Display the per-column cardinalities (same order as cat_cols).
cat_szs

[2, 5, 36, 3, 86, 3239, 9, 21, 3, 8, 15, 37, 101, 14, 330, 15, 6]

In [17]:
# One (cardinality, embedding width) pair per categorical column; the width is
# half the cardinality rounded up, capped at 50.
emb_szs = []
for n_levels in cat_szs:
    emb_szs.append((n_levels, min(50, (n_levels + 1) // 2)))

In [18]:
# Display the (cardinality, embedding width) pairs handed to NeuralNet.
emb_szs

[(2, 1),
 (5, 3),
 (36, 18),
 (3, 2),
 (86, 43),
 (3239, 50),
 (9, 5),
 (21, 11),
 (3, 2),
 (8, 4),
 (15, 8),
 (37, 19),
 (101, 50),
 (14, 7),
 (330, 50),
 (15, 8),
 (6, 3)]

In [69]:
class NeuralNet(nn.Module):
    """Fully connected network over embedded categorical and continuous inputs.

    Each categorical column is looked up in its own embedding table; the
    concatenated embeddings go through dropout, are joined with the
    batch-normalized continuous features and fed through a stack of
    Linear -> ReLU -> (optional BatchNorm) -> Dropout layers. The final linear
    layer returns raw scores (no activation).

    Args:
        emb_szs: iterable of ``(num_embeddings, embedding_dim)`` pairs, one per
            categorical column, in the column order of ``x_cat``.
        n_cont: number of continuous input features.
        emb_drop: dropout probability applied to the concatenated embeddings.
        out_sz: number of output units.
        szs: list with the width of each hidden layer.
        drops: dropout probability for each hidden layer.
        use_bn: if True, batch-normalize the activation of every hidden layer.
    """

    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops,
                 use_bn=False):
        super().__init__()

        # One embedding table per categorical column, re-initialized below.
        tables = []
        for n_levels, width in emb_szs:
            tables.append(nn.Embedding(n_levels, width))
        self.embs = nn.ModuleList(tables)
        for table in self.embs:
            self.emb_init(table)

        self.n_emb = sum(table.embedding_dim for table in self.embs)
        self.n_cont = n_cont

        # Layer widths: the input is all embeddings plus the continuous features.
        widths = [self.n_emb + n_cont] + szs
        hidden = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            hidden.append(nn.Linear(fan_in, fan_out))
        self.lins = nn.ModuleList(hidden)
        for layer in self.lins:
            nn.init.kaiming_normal_(layer.weight.data)

        self.bns = nn.ModuleList([nn.BatchNorm1d(width) for width in widths[1:]])

        self.outp = nn.Linear(widths[-1], out_sz)
        nn.init.kaiming_normal_(self.outp.weight.data)

        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(p) for p in drops])
        # Normalizes the raw continuous inputs before they enter the first layer.
        self.bn = nn.BatchNorm1d(n_cont)

        self.use_bn = use_bn

    def forward(self, x_cat, x_cont):
        """Return raw scores for a batch.

        ``x_cat`` is indexed as ``x_cat[:, i]`` for the i-th embedding table;
        ``x_cont`` is passed to a ``BatchNorm1d(n_cont)`` layer.
        """
        has_emb = self.n_emb != 0
        if has_emb:
            looked_up = []
            for i, table in enumerate(self.embs):
                looked_up.append(table(x_cat[:, i]))
            x = self.emb_drop(torch.cat(looked_up, 1))
        if self.n_cont != 0:
            normed = self.bn(x_cont)
            if has_emb:
                x = torch.cat([x, normed], 1)
            else:
                x = normed
        # zip stops at the shortest of the three lists.
        for lin, drop, bn in zip(self.lins, self.drops, self.bns):
            x = F.relu(lin(x))
            if self.use_bn:
                x = bn(x)
            x = drop(x)
        # No sigmoid here: the output is meant to be paired with BCEWithLogitsLoss.
        return self.outp(x)

    def emb_init(self, x):
        """Re-initialize an embedding's weights uniformly in ``[-b, b]``, ``b = 2 / (dim + 1)``."""
        # The bound shrinks as the embedding gets wider, so low-dimensional
        # embeddings start from a larger range.
        weights = x.weight.data
        bound = 2 / (weights.size(1) + 1)
        weights.uniform_(-bound, bound)

In [85]:
class TabularDataset(torch.utils.data.Dataset):
    """Dataset view of a DataFrame split into categorical, continuous and target arrays.

    Args:
        df: source frame.
        cat_cols: columns stored as an ``int64`` array (``self.cats``).
        num_cols: columns stored as a ``float32`` array (``self.conts``).
        target_col: label column stored as ``float32``; when it is falsy a
            zero placeholder of shape ``(len(df), 1)`` is used instead.
    """

    def __init__(self, df, cat_cols, num_cols, target_col=None):
        self.cats = df[cat_cols].values.astype(np.int64)
        self.conts = df[num_cols].values.astype(np.float32)
        if target_col:
            labels = df[target_col].values
        else:
            # No label available: keep a dummy target so items keep three parts.
            labels = np.zeros((len(df), 1))
        self.target = labels.astype(np.float32)

    def __len__(self):
        """Number of rows."""
        return self.target.shape[0]

    def __getitem__(self, idx):
        """Return ``[cats, conts, target]`` for row ``idx``."""
        return [arr[idx] for arr in (self.cats, self.conts, self.target)]

In [86]:
from torch.utils.data import DataLoader

In [87]:
%%time
# Wrap each split in a DataLoader yielding [cats, conts, target] batches of 128 rows.
# NOTE(review): the training loader iterates in frame order (no shuffle=True);
# consider shuffling, since train is a concatenation of consecutive weeks.
train_ds = DataLoader(TabularDataset(train, cat_cols, num_cols, target_col), batch_size=128)
val_ds = DataLoader(TabularDataset(val, cat_cols, num_cols, target_col), batch_size=128)
# Built from `test` (the original passed `val` here, so the test loader was
# just an unlabeled copy of validation). No target column is passed, so
# TabularDataset fills in a zero placeholder target.
test_ds = DataLoader(TabularDataset(test, cat_cols, num_cols), batch_size=128)

CPU times: user 1.72 s, sys: 2.71 s, total: 4.43 s
Wall time: 4.43 s


In [88]:
# Two hidden layers (1000 and 500 units) with batch norm, light dropout and a
# single output unit.
model = NeuralNet(
    emb_szs,
    n_cont=len(num_cols),
    emb_drop=0.04,
    out_sz=1,
    szs=[1000, 500],
    drops=[0.001, 0.01],
    use_bn=True,
)

In [89]:
# Binary cross-entropy computed on raw scores; the network's output layer
# applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [100]:
def _mean_loss(model, loader, loss_fn):
    """Return the sample-weighted mean of ``loss_fn`` over ``loader`` without updating the model."""
    was_training = model.training
    model.eval()
    total, n_seen = 0.0, 0
    with torch.no_grad():
        for cats, conts, target in loader:
            pred = model(cats, conts)
            total += loss_fn(pred.view(-1), target).item() * len(cats)
            n_seen += len(cats)
    # Leave the model in whatever mode the caller had it in.
    model.train(was_training)
    return total / n_seen if n_seen else float('nan')


def train_model(model, optimizer, train_loader, val_loader, print_every=100,
                n_epochs=2, loss_fn=None):
    """Train ``model`` and report training / validation loss after each epoch.

    Args:
        model: module called as ``model(cats, conts)``; its output is flattened
            with ``.view(-1)`` before the loss is computed.
        optimizer: optimizer already bound to ``model``'s parameters.
        train_loader: iterable of ``(cats, conts, target)`` batches exposing
            ``len()`` and a ``.dataset`` attribute (used for progress output).
        val_loader: iterable of ``(cats, conts, target)`` batches scored after
            every epoch, or ``None`` to skip validation.
        print_every: print the current batch loss every this many batches.
        n_epochs: number of passes over ``train_loader`` (default 2, the
            previously hard-coded value).
        loss_fn: loss callable ``loss_fn(pred, target)``; when ``None`` the
            module-level ``criterion`` is used, as before.

    Returns:
        The same ``model`` object, trained in place.
    """
    if loss_fn is None:
        loss_fn = criterion
    for epoch in range(n_epochs):
        model.train()
        train_loss, n_seen = 0.0, 0
        for batch_idx, (cats, conts, target) in enumerate(train_loader):
            # Gradients accumulate across backward() calls, so they must be
            # cleared before every step.
            optimizer.zero_grad()
            pred = model(cats, conts)
            loss = loss_fn(pred.view(-1), target)
            loss.backward()
            optimizer.step()
            # .item() extracts the Python float from the 0-dim loss tensor
            # (indexing it with [0] is an error in current PyTorch).
            batch_loss = loss.item()
            train_loss += batch_loss * len(cats)
            n_seen += len(cats)
            if batch_idx % print_every == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch, batch_idx * len(cats), len(train_loader.dataset),
                        100. * batch_idx / len(train_loader), batch_loss))
        train_loss = train_loss / n_seen if n_seen else float('nan')
        if val_loader is not None:
            val_loss = _mean_loss(model, val_loader, loss_fn)
            print('Epoch: {}\tTrain loss: {:.6f}\tVal loss: {:.6f}'.format(
                    epoch, train_loss, val_loss))
        else:
            print('Epoch: {}\tTrain loss: {:.6f}'.format(epoch, train_loss))
    return model

In [101]:
# Train on the training loader, passing the validation loader along.
# The run recorded below was interrupted (KeyboardInterrupt) before finishing.
model = train_model(model, optimizer, train_ds, val_ds)

  




KeyboardInterrupt: 