## Making datasets and Dataloaders

In [4]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import pickle
import pandas as pd
import numpy as np

In [5]:
TRAIN_DATA = "data/train_encoded.pickle"
VALID_DATA = "data/valid_encoded.pickle"
TEST_DATA = "data/test_encoded.pickle"

In [6]:
with open(TRAIN_DATA, "rb") as file:
    train_df = pickle.load(file)
    train_df.sort_values(by='Date', inplace=True)
    
    
with open(VALID_DATA, "rb") as file:
    valid_df = pickle.load(file)
    valid_df.sort_values(by='Date', inplace=True)

In [7]:
train_df.head()

Unnamed: 0,User,Card,Amount,Use Chip,MCC,Errors,IsFraud_target,Date,Outcome,Time_diff,...,Month_sin,Month_cos,Day_sin,Day_cos,Dow_sin,Dow_cos,Hour_sin,Hour_cos,Minute_sin,Minute_cos
0,791,0,68.0,9,12,121,1,1991-01-02 07:10:00,129,0,...,0.5,0.866025,0.394356,0.918958,0.974928,-0.222521,0.965926,-0.258819,0.8660254,0.5
1,791,0,-68.0,9,12,121,1,1991-01-02 07:17:00,130,420,...,0.5,0.866025,0.394356,0.918958,0.974928,-0.222521,0.965926,-0.258819,0.9781476,-0.207912
2,791,0,113.620003,9,12,121,1,1991-01-02 07:21:00,129,240,...,0.5,0.866025,0.394356,0.918958,0.974928,-0.222521,0.965926,-0.258819,0.809017,-0.587785
3,791,0,114.730003,9,13,121,1,1991-01-02 17:30:00,129,36540,...,0.5,0.866025,0.394356,0.918958,0.974928,-0.222521,-0.965926,-0.258819,5.665539e-16,-1.0
4,791,0,251.710007,9,14,121,1,1991-01-03 09:03:00,129,55980,...,0.5,0.866025,0.571268,0.820763,0.433884,-0.900969,0.707107,-0.707107,0.309017,0.951057


In [8]:
cat_columns = ["Card", "Use Chip", "MCC", "Errors", "Outcome", "is_diff_merchant",
               "is_diff_merchant_city", "is_diff_merchant_state"]
target_columns = ["IsFraud_target"]
drop_columns = ["User", "Date"]
num_columns = np.setdiff1d(train_df.columns.tolist(), cat_columns+target_columns+drop_columns).tolist()

assert len(train_df.columns) == len(cat_columns+target_columns+drop_columns+num_columns)

In [9]:
class CustomDataset(Dataset):
    def __init__(self, df, cat_columns, num_columns, target_columns, drop_columns, max_latest_seq=None):
        self.df = df
        self.cat_columns = cat_columns
        self.num_columns = num_columns
        self.target_columns = target_columns
        self.drop_columns = drop_columns
        self.max_latest_seq = max_latest_seq
        self.indx_to_user = {i: user for i, user in enumerate(self.df.User.unique())}
    
    def __len__(self):
        return len(self.indx_to_user)
    
    def __getitem__(self, indx):
        user_id = self.indx_to_user[indx]
        user_data = (self.df.loc[self.df.loc[:, 'User']==user_id, :]
                     .drop(columns=self.drop_columns)
                     .reset_index(drop=True))
        
        if self.max_latest_seq:
            if len(user_data)>self.max_latest_seq:
                user_data = user_data[-self.max_latest_seq:]
                
        cat_data = user_data[self.cat_columns].to_numpy()
        num_data = user_data[self.num_columns].to_numpy()
        target = np.unique(user_data[self.target_columns].to_numpy())[0]
        return cat_data, num_data, target

In [10]:
def collate_fn(batch):
    category_data = []
    numerical_data = []
    target_data = []
    for category, numerical, target in batch:
        category_data.append(torch.LongTensor(category))
        numerical_data.append(torch.Tensor(numerical))
        target_data.append(target)
        
    category_data = torch.nn.utils.rnn.pad_sequence(category_data, batch_first=True, padding_value=137)
#     numerical_data = torch.nn.utils.rnn.pad_sequence(numerical_data, batch_first=True, padding_value=0)

    target_data = torch.LongTensor(target_data)
    
    return category_data, numerical_data, target_data

## LSTM settings

In [11]:
n_numerical_col = len(num_columns)  # Constant
NUM_UNIQ_EMBEDDINGS = 138  # Constant

MAX_LATEST_SEQ = 32
BATCH_SIZE = 50
EMBEDDING_DIM = 5
feature_dim = n_numerical_col+len(cat_columns)*EMBEDDING_DIM
N_LSTM_LAYER = 2
HIDDEN_DIM = 128
DROPOUT = 0.2
BIDIRECTIONAL = True

## Data preprocessing

In [12]:
train_dataset = CustomDataset(train_df, cat_columns=cat_columns,
                              num_columns=num_columns,
                              target_columns=target_columns,
                              drop_columns=drop_columns,
                              max_latest_seq=MAX_LATEST_SEQ)

valid_dataset = CustomDataset(valid_df, cat_columns=cat_columns,
                              num_columns=num_columns,
                              target_columns=target_columns,
                              drop_columns=drop_columns,
                              max_latest_seq=MAX_LATEST_SEQ)

train_loader = DataLoader(train_dataset, 
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          num_workers=2,
                          collate_fn=collate_fn)

valid_loader = DataLoader(valid_dataset, 
                          batch_size=BATCH_SIZE,
                          shuffle=False,
                          num_workers=2,
                          collate_fn=collate_fn)

### Default predictions

In [13]:
y_true = torch.cat([y for cat, num, y in train_loader])
y_hat = torch.zeros_like(y_true)
constant_accuracy = (y_true==y_hat).sum()/len(y_true)
print(f"Constant prediction accuracy: {constant_accuracy}")

Constant prediction accuracy: 0.602142870426178


In [14]:
y_hat = torch.randint(0, 1, (1400,))
random_accuracy = (y_true==y_hat).sum()/len(y_true)
print(f"Random prediction accuracy: {random_accuracy}")

Random prediction accuracy: 0.602142870426178


### RNN Model definition

In [15]:
class RNN_network(nn.Module):
    def __init__(self,
                num_uniq_embeddings,
                embedding_dim,
                feature_dim,
                n_lstm_layer,
                hidden_dim,
                dropout,
                bidirectional,
                n_numerical_col):
        super(RNN_network, self).__init__()
        
        self.num_uniq_embeddings=num_uniq_embeddings
        self.embedding_dim=embedding_dim
        self.feature_dim=feature_dim
        self.n_lstm_layer=n_lstm_layer
        self.hidden_dim=hidden_dim
        self.dropout=dropout if n_lstm_layer>1 else 0
        self.bidirectional=bidirectional
        self.n_numerical_col=n_numerical_col
        
        self.embedding = nn.Embedding(self.num_uniq_embeddings, self.embedding_dim)
        self.batch_norm = nn.BatchNorm1d(self.n_numerical_col)
        self.rnn = nn.LSTM(input_size=self.feature_dim,
                          num_layers=self.n_lstm_layer,
                          hidden_size=self.hidden_dim,
                          batch_first=True,
                          dropout=self.dropout,
                          bidirectional=self.bidirectional)
        
        self.linear_one = nn.Linear(self.hidden_dim*(self.bidirectional+1), self.hidden_dim)
        self.batchnorm_linear = nn.BatchNorm1d(self.hidden_dim)
        self.linear_two = nn.Linear(self.hidden_dim, 2)
        
    def forward(self, batch):
        cat, num = batch
        
        # Concat embeddings
        cat = self.embedding(cat)
        all_but_last_two_dims = cat.size()[:-2]
        cat = cat.view(*all_but_last_two_dims, -1)
        
        # Batchnorm across numeric features and pack it to tensor
        num = [self.batch_norm(numeric) for numeric in num]
        num = torch.nn.utils.rnn.pad_sequence(num, batch_first=True, padding_value=0)
        
        # Concat all features
        batch = torch.cat((num, cat), dim=-1)
        
        out, (hidden, cell) = self.rnn(batch)
        
        if self.bidirectional:
            to_classifier = torch.cat((hidden[-2], hidden[-1]), dim=-1)
        else:
            to_classifier = hidden[-1]
            
        x = F.relu(self.linear_one(to_classifier))
        x = self.batchnorm_linear(x)
        output = self.linear_two(x)
        return output

In [16]:
model = RNN_network(num_uniq_embeddings=NUM_UNIQ_EMBEDDINGS,
               embedding_dim=EMBEDDING_DIM,
               feature_dim=feature_dim,
               n_lstm_layer=N_LSTM_LAYER,
               hidden_dim=HIDDEN_DIM,
               dropout=DROPOUT,
               bidirectional=BIDIRECTIONAL,
               n_numerical_col=n_numerical_col)

In [17]:
for cat, num, y in valid_loader:
    output=model((cat, num))

In [38]:
import torch
import pytorch_lightning as pl
import torchmetrics



class LightningWrapper(pl.LightningModule):
    def __init__(self, model, cfg=None):
        super(LightningWrapper, self).__init__()
        self.cfg = cfg
        self.model = model
        self.criterion = torch.nn.CrossEntropyLoss()
        
    def forward(self, x):
        x = self.model(x)
        return x
    
    def training_step(self, batch, batch_idx):
        x_cat, x_num, y = batch
        pred = self.model((x_cat, x_num))
        loss = self.criterion(pred, y)
        y_softmax = torch.softmax(pred, dim=-1).detach()
        y_pred = torch.argmax(y_softmax, dim=-1)
        y_proba = y_softmax[:,1]
        return {"loss": loss, "y_pred": y_pred, "y_true": y, "y_proba": y_proba}

    def validation_step(self, batch, batch_idx):
        x_cat, x_num, y = batch
        pred = self.model((x_cat, x_num))
        loss = self.criterion(pred, y)
        y_softmax = torch.softmax(pred, dim=-1).detach()
        y_pred = torch.argmax(y_softmax, dim=-1)
        y_proba = y_softmax[:,1]
        return {"val_loss": loss, "y_pred": y_pred, "y_true": y, "y_proba": y_proba}
    
    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=3e-4)
        lr_schedulers = {'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience=5),
                         'monitor': 'val_loss'}
        return [optimizer], [lr_schedulers]
    
    def training_epoch_end(self, outputs):
        y_hat = torch.cat([x['y_pred'].view(-1) for x in outputs])
        y_true = torch.cat([x['y_true'].view(-1) for x in outputs])
        y_proba = torch.cat([x['y_proba'].view(-1) for x in outputs])
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        
        roc_auc = float(torchmetrics.functional.auroc(preds=y_proba, target=y_true, pos_label=1))
        f1_score = float(torchmetrics.functional.f1(preds=y_proba, target=y_true))
        accuracy = float(torchmetrics.functional.accuracy(preds=y_hat, target=y_true))
        print(f'Train: \n   Loss: {avg_loss}, F1 score: {f1_score}, ROC_AUC: {roc_auc}, Accuracy: {accuracy}')

    def validation_epoch_end(self, outputs):
        y_hat = torch.cat([x['y_pred'].view(-1) for x in outputs])
        y_true = torch.cat([x['y_true'].view(-1) for x in outputs])
        y_proba = torch.cat([x['y_proba'].view(-1) for x in outputs])
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()

        
        roc_auc = float(torchmetrics.functional.auroc(preds=y_proba, target=y_true, pos_label=1))
        f1_score = float(torchmetrics.functional.f1(preds=y_proba, target=y_true))
        accuracy = float(torchmetrics.functional.accuracy(preds=y_hat, target=y_true))
        
        self.log('val_loss', avg_loss)
        
        print(f'Valid {self.current_epoch}: \n   Loss: {avg_loss}, F1 score: {f1_score}, ROC_AUC: {roc_auc}, Accuracy: {accuracy}')

In [39]:
model_wrapper = LightningWrapper(model=model)

In [41]:
trainer = pl.Trainer(max_epochs=3)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [42]:
trainer.fit(model_wrapper, train_loader, valid_loader)


  | Name      | Type             | Params
-----------------------------------------------
0 | model     | RNN_network      | 615 K 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
615 K     Trainable params
0         Non-trainable params
615 K     Total params
2.463     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]



Valid 0: 
   Loss: 0.6857054233551025, F1 score: 0.7261146903038025, ROC_AUC: 0.5528355836868286, Accuracy: 0.5699999928474426




Training: 0it [00:00, ?it/s]

0.0003
Train: 
   Loss: 0.6603348851203918, F1 score: 0.5870569944381714, ROC_AUC: 0.6845752596855164, Accuracy: 0.6171428561210632


Validating: 0it [00:00, ?it/s]

Valid 0: 
   Loss: 0.684273898601532, F1 score: 0.18309859931468964, ROC_AUC: 0.5498862266540527, Accuracy: 0.6133333444595337


