This notebook is an update of the previous version.
1- I changed the custom dataset so that it also supports other types of data, including embeddings.
2- Since the fastai dataloader is too slow, I took a small part of the code from here: https://muellerzr.github.io/fastblog/2020/04/22/TabularNumpy.html
and built a new dataloader. Many thanks to Zachary Mueller for his notebook.
3- The model does not need an additional batch normalization at the beginning, as one already exists in LinBnDrop.


In [None]:
import os, gc
import numpy as np
#from numba import njit
import datatable as dtable
import pandas as pd


from sklearn.model_selection import GroupKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import torch.optim as optim
import fastai
from fastai.tabular.all import *

In [None]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Preprocessing

In [None]:
%%time

# Read the competition training CSV with datatable and hand it over to pandas.
print('Loading...')
train = dtable.fread('../input/jane-street-market-prediction/train.csv').to_pandas()
# Feature columns are the ones whose name contains the substring 'feature'.
features = [name for name in train.columns if 'feature' in name]

print('Filling...')
# Keep only the rows with a strictly positive weight and renumber the index.
train = train[train['weight'] > 0].reset_index(drop=True)
# Forward-fill imputation, left disabled:
#train[features] = train[features].fillna(method = 'ffill').fillna(0)

print('Finish.')

In [None]:
# Column-wise mean of every feature column except the first entry of `features`.
f_mean = train[features[1:]].mean()

In [None]:
# Fill missing values in every feature column except the first with the values in `f_mean`.
train[features[1:]] = train[features[1:]].fillna(f_mean)

In [None]:
# Downcast the feature columns to float32. `astype` returns a new frame, so the
# result has to be assigned back; calling it on its own leaves `train` unchanged.
train[features] = train[features].astype('float32')
# Binary target: 1 when the response `resp` is positive, 0 otherwise.
train['action'] = (train['resp'] > 0).astype('int')

In [None]:
# Count the NaN entries left anywhere in `train` (displayed as the cell output).
np.isnan(train.values).sum()

In [None]:
# Label column(s) selected as the training target further down.
target_column = ['action']

In [None]:
# Number of feature columns (displayed as the cell output).
len(features)

Dividing the data:

The easiest way is to use the sklearn tools or even the fastai splitter.
Actually, I got better results dividing the data with the fastai splitters, where the date is not considered.

In [None]:
#gkf = GroupKFold(n_splits = 5)
#for fold, (tr, te) in enumerate(gkf.split(train['action'].values, train['action'].values, train['date'].values)):
    
    #X_tr, X_val = train.loc[tr, features], train.loc[te, features]
    #y_tr, y_val = train.loc[tr, target_column], train.loc[te, target_column]

In [None]:
# Split the row positions of `train` with fastai's RandomSplitter, using
# valid_pct=0.2; the `date` column plays no role in this split.
splits = RandomSplitter(valid_pct=0.2)(range_of(train))

In [None]:
# Size of the first split, which is the one used for the training frames below.
len(splits[0])

In [None]:
# Feature frames for the training (first) and validation (second) split.
X_tr = train.loc[splits[0], features]
X_val = train.loc[splits[1], features]
# Matching target frames, selected with the same row indices.
y_tr = train.loc[splits[0], target_column]
y_val = train.loc[splits[1], target_column]

Using the fastai DataLoader and DataLoaders is useful if you would like to reduce the iteration time. The idea behind the custom dataset class is simple, and it generally looks like the one written for a plain PyTorch application here: https://towardsdatascience.com/deep-learning-using-pytorch-for-tabular-data-c68017d8b480.

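The only difference is that some value must be returned for the categories (cats) even when there are none. In the older version, np.zeros(len(df),).astype(np.long) did this for us; the new version below builds an array with zero columns from an empty DataFrame instead. Now we can return all three values: cats, conts and y.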

In [None]:
"""
Older one
class Fastai_Cust_Ds():
    def __init__(self, df, cats=None, y=None):
        df = df.copy()
                        
        if cats is not None: 
            self.dfcats = df[cats] #type: pandas.core.frame.DataFrame
            self.cats = np.stack([c.values for n, c in self.dfcats.items()], axis=1).astype(np.long)
            self.dfconts = df.drop(cats, axis=1)
            self.conts = np.stack([c.values for n, c in self.dfconts.items()], axis=1).astype(np.float32)
        else:

            self.dfconts = df.copy()
            self.conts = np.stack([c.values for n, c in self.dfconts.items()], axis=1).astype(np.float32)
            self.cats = np.zeros(len(df),).astype(np.long)
        self.y = y.values
        
        
    def __len__(self): return len(self.y)

    def __getitem__(self, idx):

        return [self.cats[idx], self.conts[idx], self.y[idx]]"""

New custom_ds

In [None]:
class Fastai_Cust_Ds():
    """In-memory tabular dataset whose items are ``[cats, conts, ys]``.

    Args:
        df: DataFrame holding the input columns.
        cats: Optional list of column names in ``df`` to treat as categorical.
            All remaining columns are treated as continuous. When ``None``,
            every column is continuous and ``cats`` becomes an array with
            zero columns.
        ys: pandas object holding the targets; its ``.values`` are stored.

    Attributes:
        dfcats / dfconts: the categorical / continuous sub-frames.
        cats: ``int64`` array built from ``dfcats``.
        conts: ``float32`` array built from ``dfconts``.
        ys: target array; its length defines ``len(dataset)``.

    Raises:
        ValueError: if ``ys`` is not supplied.
    """

    def __init__(self, df, cats=None, ys=None):
        if ys is None:
            raise ValueError("ys (the target values) must be provided")

        # One private copy is enough; the sub-frames below are derived from it.
        df = df.copy()

        if cats is not None:
            self.dfcats = df[cats]  # type: pandas.core.frame.DataFrame
            self.dfconts = df.drop(cats, axis=1)
        else:
            # No categorical columns: keep an empty frame on the same index so
            # that `cats` still has one (empty) row per sample.
            self.dfcats = pd.DataFrame(index=df.index)
            self.dfconts = df

        # np.int64 replaces the np.long alias, which NumPy 1.24 removed.
        self.cats = self.dfcats.to_numpy().astype(np.int64)
        self.conts = self.dfconts.to_numpy().astype(np.float32)
        self.ys = ys.values

    def __len__(self):
        """Return the number of samples (length of the target array)."""
        return len(self.ys)

    def __getitem__(self, idx):
        """Return ``[cats, conts, ys]`` indexed by ``idx`` (NumPy indexing rules)."""
        return [self.cats[idx], self.conts[idx], self.ys[idx]]

In [None]:
# The constructor's target parameter is named `ys`; passing `y=` raises a TypeError.
train_ds = Fastai_Cust_Ds(df=X_tr, ys=y_tr)
valid_ds = Fastai_Cust_Ds(df=X_val, ys=y_val)

Now we create the data loader for the train and validation sets after importing the fastai DataLoader which is similar to the pytorch one.

In [None]:
from fastai.data.core import DataLoader

The fastai DataLoader is too slow here, so I commented it out.

In [None]:
#train_dl = DataLoader(train_ds, batch_size = 4096, drop_last=True, shuffle=False)
#valid_dl = DataLoader(valid_ds, batch_size = 2048, drop_last=True, shuffle=False)

Instead, I applied the custom TabDataLoader from Zachary Mueller's notebook, which makes a huge difference to the speed. To create the dls, the fastai **DataLoaders** class is fine.

In [None]:
class TabDataLoader(DataLoader):
    """A fastai `DataLoader` over a dataset that exposes `cats`, `conts` and `ys` arrays.

    Items are plain row indices; a whole batch is fetched with one indexing
    call on the dataset and then moved to `self.device`.
    """

    def __init__(self, dataset, bs=1, num_workers=0, device='cuda', shuffle=False, **kwargs):
        "Forward the settings to `DataLoader`; `drop_last` follows `shuffle`."
        super().__init__(dataset, bs=bs, num_workers=num_workers, shuffle=shuffle,
                         device=device, drop_last=shuffle, **kwargs)
        self.dataset.bs = bs

    def create_item(self, s):
        "Pass the sampled index through unchanged; rows are fetched per batch."
        return s

    def create_batch(self, b):
        "Index the dataset with the list of indices `b` and move each part to the device."
        cat, cont, y = self.dataset[b]
        return tensor(cat).to(self.device), tensor(cont).to(self.device), tensor(y).to(self.device)

    def get_idxs(self):
        "Return the indices to iterate over, shuffling the dataset first when requested."
        idxs = Inf.count if self.indexed else Inf.nones
        if self.n is not None:
            # This override replaces the base `get_idxs`, so nothing else would
            # trigger the shuffle; do it here so `shuffle=True` has an effect.
            if self.shuffle: self.shuffle_fn()
            idxs = list(range(len(self.dataset)))
        return idxs

    def shuffle_fn(self, idxs=None):
        """Permute the dataset arrays in place with one shared random order.

        `idxs` is accepted (and returned untouched) only so the method can also
        be called with an index list; the shuffle acts on the data itself.
        """
        rng = np.random.permutation(len(self.dataset))
        self.dataset.cats = self.dataset.cats[rng]
        self.dataset.conts = self.dataset.conts[rng]
        self.dataset.ys = self.dataset.ys[rng]
        return idxs

In [None]:
# Use the `device` chosen at the top of the notebook instead of a hard-coded
# 'cuda', so the loaders also work on a machine without a GPU.
train_dl = TabDataLoader(train_ds, device=device, shuffle=True, bs=5000)
valid_dl = TabDataLoader(valid_ds, device=device, shuffle=False, bs=5000)

The fastai DataLoaders class is the next step, so it must also be imported.

In [None]:
from fastai.data.core import DataLoaders
# Bundle the train and validation loaders; `device` is the one chosen at the
# top of the notebook rather than a hard-coded 'cuda'.
dls = DataLoaders(train_dl, valid_dl, device=device)

In [None]:
# Shapes of the arrays held by the training dataset. The dataset stores its
# targets as `ys`, so that is the attribute to look up (there is no `y`).
dls.cats.shape, dls.conts.shape, dls.ys.shape

As we see, cats is returned as a placeholder that holds no values (it has zero columns). This is important for the learner to accept the dls created.

There are also many useful options in both the DataLoader and the DataLoaders classes of fastai that can be used here; drop_last and device are the most important.


The only problem with creating such a custom dataset in fastai is that inference cannot be run with learn.get_preds. Instead, we call learn.model.eval() and then infer using preds = learn.model(0,test_df). I suppose that a small modification of the custom dataset may solve this problem.


As we will see in the custom model created here, its forward function needs two arguments: one for the cats values and one for x. The same must be done at inference time, using preds = learn.model(0,test_df).

In [None]:
# Inspect the categorical array held by the training dataset (cell output).
train_ds.cats

I just removed the first layer (BatchNorm1d), since LinBnDrop already begins with batch normalization.
If bn is True, the bias of the linear layer is set to False automatically.

In [None]:
class JanStr(nn.Module):
    """Fully connected classifier built from four fastai `LinBnDrop` blocks.

    Args:
        n_in: number of continuous input features (default 130).
        n_out: number of output classes (default 2).

    `forward` returns raw logits. The loss used with this model
    (`CrossEntropyLossFlat`) and the `RocAucBinary` metric both apply softmax
    themselves, so applying it here as well would squash the scores twice.
    Use `F.softmax(model(cat, x), dim=1)` when probabilities are needed;
    `argmax` over the logits gives the same class as over the probabilities.
    """

    def __init__(self, n_in=130, n_out=2):
        super().__init__()

        self.layers = nn.Sequential(
            LinBnDrop(n_in, 400, bn=True, p=0, act=Mish(), lin_first=False),
            LinBnDrop(400, 800, bn=True, p=0.2289, act=Mish(), lin_first=False),
            LinBnDrop(800, 400, bn=True, p=0.2289, act=Mish(), lin_first=False),
            # Output block: no batch norm and no activation.
            LinBnDrop(400, n_out, bn=False, act=None, lin_first=False),
        )

    def forward(self, cat, x):
        """Return class logits for `x`; `cat` is accepted but not used."""
        return self.layers(x)

LinBnDrop is a compact fastai building block that bundles three layers: BatchNorm1d, Dropout and nn.Linear. The forward function takes two arguments: cat for the categories and x for the features.


In [None]:
# Build the network and place it on the selected device in one step.
model_nn = JanStr().to(device)

In [None]:
# Loss function handed to the Learner below.
loss_func = CrossEntropyLossFlat()

In [None]:
# Metric handed to the Learner below.
roc_auc = RocAucBinary()

In [None]:
# Tie the data loaders, the model, the loss and the metric together.
learn = Learner(dls, model_nn, loss_func = loss_func, metrics=roc_auc)

In [None]:
# Training callbacks passed to fit_one_cycle; all three monitor 'valid_loss'.
callbacks = [
    # Early stopping with min_delta=1e-5 and a patience of 7.
    EarlyStoppingCallback(monitor='valid_loss', min_delta=1e-5, patience=7),    
    # Learning-rate reduction with a patience of 1 and a floor of min_lr=1e-8.
    ReduceLROnPlateau(monitor='valid_loss', min_delta=0.00001, patience=1, min_lr=1e-8),
    #SaveModelCallback(mode='min'),
    # Model checkpointing driven by the same monitored value.
    SaveModelCallback(monitor='valid_loss'),
]

In [None]:
# Run fastai's learning-rate finder on the learner.
learn.lr_find()

In [None]:
# Learning rate passed to fit_one_cycle below.
lr=0.00005

In [None]:
# Train with fit_one_cycle: 20 epochs, learning rate `lr`, weight decay 1e-4,
# and the callbacks defined above.
learn.fit_one_cycle(20, lr, wd = 0.0001, cbs=callbacks)

In [None]:
def fill_nan(test_df, feature_cols=None, fill_values=None):
    """Select the feature columns, impute missing values, return a float32 tensor.

    Missing values are filled the same way as in the training data: with the
    per-column values in `fill_values`. Any NaN that is still left (a column
    without an entry in `fill_values`) is replaced by 0. Non-missing values
    are passed through unchanged.

    Args:
        test_df: DataFrame containing at least the feature columns.
        feature_cols: columns to extract; defaults to the notebook-level
            `features` list.
        fill_values: Series or dict mapping column name to fill value;
            defaults to the notebook-level `f_mean`.

    Returns:
        A CPU `torch.float32` tensor of shape (rows, number of feature columns).
    """
    cols = features if feature_cols is None else feature_cols
    means = f_mean if fill_values is None else fill_values
    filled = test_df[cols].fillna(means).fillna(0)
    return torch.tensor(filled.to_numpy(dtype=np.float32), dtype=torch.float32)

In [None]:
# Put the trained model into evaluation mode before running inference.
learn.model.eval()
# Create the competition environment and the iterator consumed by the
# prediction loop.
import janestreet
env = janestreet.make_env()
env_iter = env.iter_test()

In [None]:
# Submission loop: the environment yields one test row at a time together with
# the frame in which the action has to be written.
# Inputs are sent to whatever device the model's parameters live on, so this
# also runs without a GPU.
model_device = next(learn.model.parameters()).device

# No gradients are needed at inference time.
with torch.no_grad():
    for (test_df, pred_df) in tqdm(env_iter):
        if test_df['weight'].item() == 0:
            # Zero-weight rows always get action 0, so the model call is skipped.
            action = 0
        else:
            X_test = fill_nan(test_df).to(model_device)
            # The first model argument is the unused `cat` placeholder.
            action = int(learn.model(0, X_test).argmax(dim=1).item())
        pred_df.action = action
        env.predict(pred_df)