## Making Datasets and DataLoaders

In [1]:
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

import pickle
import pandas as pd
import numpy as np

In [2]:
# Pickle files holding the encoded train / validation / test splits.
TRAIN_DATA, VALID_DATA, TEST_DATA = (
    "data/train_encoded.pickle",
    "data/valid_encoded.pickle",
    "data/test_encoded.pickle",
)

In [3]:
def _load_sorted_by_date(path):
    """Unpickle the frame stored at ``path`` and return it sorted by its 'Date' column.

    Parameters
    ----------
    path : str
        Location of a pickle file containing a pandas DataFrame with a 'Date' column.

    Returns
    -------
    The loaded frame, ordered by 'Date' (original index labels are kept).

    NOTE(security): ``pickle.load`` can execute arbitrary code embedded in the
    file. Only load pickles that come from a trusted source.
    """
    with open(path, "rb") as file:
        frame = pickle.load(file)
    # The file handle is released before sorting, and sort_values returns a new
    # frame instead of mutating the freshly loaded object in place.
    return frame.sort_values(by='Date')


train_df = _load_sorted_by_date(TRAIN_DATA)
valid_df = _load_sorted_by_date(VALID_DATA)

In [4]:
# Preview the first rows of the date-sorted training frame.
train_df.head()

Unnamed: 0,User,Card,Amount,Use Chip,MCC,Errors,IsFraud_target,Date,Outcome,Time_diff,...,Month_sin,Month_cos,Day_sin,Day_cos,Dow_sin,Dow_cos,Hour_sin,Hour_cos,Minute_sin,Minute_cos
0,791,0,68.0,9,12,121,1,1991-01-02 07:10:00,129,0,...,0.5,0.866025,0.394356,0.918958,0.974928,-0.222521,0.965926,-0.258819,0.8660254,0.5
1,791,0,-68.0,9,12,121,1,1991-01-02 07:17:00,130,420,...,0.5,0.866025,0.394356,0.918958,0.974928,-0.222521,0.965926,-0.258819,0.9781476,-0.207912
2,791,0,113.620003,9,12,121,1,1991-01-02 07:21:00,129,240,...,0.5,0.866025,0.394356,0.918958,0.974928,-0.222521,0.965926,-0.258819,0.809017,-0.587785
3,791,0,114.730003,9,13,121,1,1991-01-02 17:30:00,129,36540,...,0.5,0.866025,0.394356,0.918958,0.974928,-0.222521,-0.965926,-0.258819,5.665539e-16,-1.0
4,791,0,251.710007,9,14,121,1,1991-01-03 09:03:00,129,55980,...,0.5,0.866025,0.571268,0.820763,0.433884,-0.900969,0.707107,-0.707107,0.309017,0.951057


In [5]:
# Columns fed to the model as categorical codes (they are cast to integer
# tensors and looked up in an embedding table further down).
cat_columns = [
    "Card",
    "Use Chip",
    "MCC",
    "Errors",
    "Outcome",
    "is_diff_merchant",
    "is_diff_merchant_city",
    "is_diff_merchant_state",
]
# Label column.
target_columns = ["IsFraud_target"]
# Columns removed before features are extracted.
drop_columns = ["User", "Date"]

# Every remaining column is treated as a numerical feature.
# np.setdiff1d returns the leftover names sorted and de-duplicated.
num_columns = np.setdiff1d(
    train_df.columns.tolist(),
    [*cat_columns, *target_columns, *drop_columns],
).tolist()

# Sanity check: the four groups together account for every column of train_df.
assert (
    len(cat_columns) + len(target_columns) + len(drop_columns) + len(num_columns)
    == len(train_df.columns)
)

In [6]:
class CustomDataset(Dataset):
    """Map-style dataset that yields one user's complete row sequence per item.

    Item ``i`` corresponds to the ``i``-th distinct value of the 'User' column
    (in order of first appearance) and contains all of that user's rows, in the
    order they appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'User' column plus every column named in the lists below.
        It must not be reordered or have rows added/removed after the dataset
        is built, because row positions are indexed once at construction time.
    cat_columns : list of str
        Columns returned as the categorical feature matrix.
    num_columns : list of str
        Columns returned as the numerical feature matrix.
    target_columns : list of str
        Columns from which the per-user label is derived.
    drop_columns : list of str
        Columns removed from the user's rows before features are selected.
    """

    def __init__(self, df, cat_columns, num_columns, target_columns, drop_columns):
        self.df = df
        self.cat_columns = cat_columns
        self.num_columns = num_columns
        self.target_columns = target_columns
        self.drop_columns = drop_columns
        self.indx_to_user = {i: user for i, user in enumerate(self.df.User.unique())}
        # Positional row indices of every user, computed once here so that
        # __getitem__ does not have to scan the whole frame on every access.
        self._user_rows = self.df.groupby("User").indices

    def __len__(self):
        """Return the number of distinct users."""
        return len(self.indx_to_user)

    def __getitem__(self, indx):
        """Return ``(cat_data, num_data, target)`` for the user at position ``indx``.

        ``cat_data`` and ``num_data`` are 2-D NumPy arrays of shape
        (number of rows for the user, number of selected columns).
        ``target`` is the smallest distinct value found in the target columns
        of the user's rows (presumably the label is constant per user --
        TODO confirm against the encoding step).

        Raises
        ------
        IndexError
            If ``indx`` is not a valid item position. Raising IndexError rather
            than KeyError lets plain iteration over the dataset terminate.
        """
        try:
            user_id = self.indx_to_user[indx]
        except KeyError:
            raise IndexError(
                f"dataset index {indx} is out of range for {len(self)} users"
            ) from None
        user_data = self.df.iloc[self._user_rows[user_id]].drop(columns=self.drop_columns)
        cat_data = user_data[self.cat_columns].to_numpy()
        num_data = user_data[self.num_columns].to_numpy()
        target = np.unique(user_data[self.target_columns].to_numpy())[0]
        return cat_data, num_data, target

In [7]:
def collate_fn(batch, cat_padding_value=137, num_padding_value=0):
    """Collate variable-length per-user samples into padded batch tensors.

    Parameters
    ----------
    batch : sequence of (category, numerical, target)
        ``category`` and ``numerical`` are 2-D arrays whose first dimension is
        the sequence length; ``target`` is a scalar label.
    cat_padding_value : int, default 137
        Value used to pad the categorical sequences up to the longest sequence
        in the batch. It is later used as an embedding index, so it must be a
        valid row of the embedding table.
    num_padding_value : number, default 0
        Value used to pad the numerical sequences.

    Returns
    -------
    category_data : torch.LongTensor, shape (batch, max_len, n_cat_columns)
    numerical_data : torch.FloatTensor, shape (batch, max_len, n_num_columns)
    target_data : torch.LongTensor, shape (batch,)

    To use non-default padding values with a DataLoader, wrap this function
    with ``functools.partial``.
    """
    category_data = []
    numerical_data = []
    target_data = []
    for category, numerical, target in batch:
        category_data.append(torch.LongTensor(category))
        numerical_data.append(torch.Tensor(numerical))
        target_data.append(target)

    # batch_first=True puts the batch dimension first: (batch, max_len, features).
    category_data = torch.nn.utils.rnn.pad_sequence(
        category_data, batch_first=True, padding_value=cat_padding_value)
    numerical_data = torch.nn.utils.rnn.pad_sequence(
        numerical_data, batch_first=True, padding_value=num_padding_value)

    target_data = torch.LongTensor(target_data)

    return category_data, numerical_data, target_data

## LSTM settings

In [8]:
# --- Batching ---
BATCH_SIZE = 50

# --- Categorical embeddings ---
# Number of rows in the embedding table shared by all categorical columns.
# collate_fn pads categorical sequences with 137, i.e. the last row of a
# 138-row table.
NUM_UNIQ_EMBEDDINGS = 138
EMBEDDING_DIM = 5

# --- LSTM ---
N_LSTM_LAYER = 1
HIDDEN_DIM = 10
# NOTE: nn.LSTM applies dropout only between stacked layers, so this value has
# no effect while N_LSTM_LAYER == 1.
DROPOUT = 0.2
BIDIRECTIONAL = False

# Width of one timestep fed to the LSTM: one EMBEDDING_DIM-sized vector per
# categorical column plus one scalar per numerical column.
feature_dim = EMBEDDING_DIM * len(cat_columns) + len(num_columns)

## Building the datasets and data loaders

In [9]:
# Wrap both splits with the same column configuration.
train_dataset, valid_dataset = (
    CustomDataset(
        frame,
        cat_columns=cat_columns,
        num_columns=num_columns,
        target_columns=target_columns,
        drop_columns=drop_columns,
    )
    for frame in (train_df, valid_df)
)

# Only the training loader reshuffles users; validation keeps a fixed order.
train_loader, valid_loader = (
    DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=reshuffle,
        num_workers=2,
        collate_fn=collate_fn,
    )
    for dataset, reshuffle in ((train_dataset, True), (valid_dataset, False))
)

## Model definition

In [10]:
# A single embedding table is applied to every categorical column.
# NOTE(review): collate_fn pads categorical sequences with index 137; if that
# index is reserved for padding, consider passing padding_idx=137 here -- confirm
# that no real category code uses it first.
emb = nn.Embedding(NUM_UNIQ_EMBEDDINGS, EMBEDDING_DIM)
# nn.LSTM applies dropout only between stacked layers and emits a warning when a
# non-zero value is combined with a single layer, so DROPOUT is forwarded only
# when more than one layer is configured.
rnn = nn.LSTM(input_size=feature_dim,
              num_layers=N_LSTM_LAYER,
              hidden_size=HIDDEN_DIM,
              batch_first=True,
              dropout=DROPOUT if N_LSTM_LAYER > 1 else 0.0,
              bidirectional=BIDIRECTIONAL)

# Smoke test: push every validation batch through the untrained embedding and
# LSTM and print the shape of the final hidden state.
# Only shapes are inspected, so autograd tracking is switched off to avoid
# building a graph for each batch.
# NOTE(review): the padded batch is fed to the LSTM as-is (no packing), so
# `hidden` for shorter sequences also reflects the padded timesteps.
with torch.no_grad():
    for cat, num, y in valid_loader:
        # (batch, seq, n_cat) integer codes -> (batch, seq, n_cat, EMBEDDING_DIM)
        cat = emb(cat)
        # Merge the last two dims so each timestep carries one flat vector of
        # all categorical embeddings: (batch, seq, n_cat * EMBEDDING_DIM).
        all_but_last_two_dims = cat.size()[:-2]
        cat = cat.view(*all_but_last_two_dims, -1)
        # Numerical features first, then the flattened embeddings.
        batch = torch.cat((num, cat), dim=-1)
        out, (hidden, cell) = rnn(batch)
        print(hidden.shape)



torch.Size([1, 50, 10])
torch.Size([1, 50, 10])
torch.Size([1, 50, 10])
torch.Size([1, 50, 10])
torch.Size([1, 50, 10])
torch.Size([1, 50, 10])
