In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import matplotlib
from matplotlib.ticker import MaxNLocator

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import matplotlib.pyplot as plt
from pathlib import Path
import os
from datetime import datetime, timedelta
import pandas as pd
import json
import pytorch_lightning as pl
import torch.nn as nn
import torch.optim as optim
from multiprocessing import cpu_count


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


project_dir = Path('__main__').resolve().parents[1]

  from .autonotebook import tqdm as notebook_tqdm


# DataLoader

In [121]:
class LSTMDataSet(Dataset):
    def __init__(self, features_file, target_file, mapping_file):
        self.features_table = pd.read_parquet(os.path.join(project_dir, 'data', 'interim', features_file))
        # TODO: Tirar isso daqui
        self.features_table['ano_mes_cruzamento'] = pd.to_datetime(self.features_table['ano_mes_cruzamento'], format='%Y-%m')

        self.target_table = pd.read_parquet(os.path.join(project_dir, 'data', 'interim', target_file))
        self.mapping = json.load(open(os.path.join(project_dir, 'data', 'interim', mapping_file), 'r'), 
                                    object_hook=lambda x: {int(k) : v for k,v in x.items()})

    def __len__(self):
        return self.target_table.shape[0]


    def _get_data(self, idx):
        df = self.target_table.iloc[idx : idx+1].reset_index(drop=True)

        cc_num = df['cc_num'][0]
        ano_mes_sup = pd.to_datetime(df['ano_mes'][0])
        ano_mes_inf = pd.to_datetime(df['ano_mes'][0]) - timedelta(days=366)
        range_dates = pd.date_range(ano_mes_inf, ano_mes_sup, inclusive='left', freq='MS')

        df_default = pd.DataFrame({
            'cc_num' : [cc_num] * 12,
            'ano_mes_cruzamento' : range_dates
        })

        df = df.merge(df_default, how='left', on='cc_num')

        df_features = self.features_table.iloc[min(self.mapping[cc_num]): max(self.mapping[cc_num])]
        
        df = df.merge(df_features, how='left', on=['cc_num', 'ano_mes_cruzamento']).fillna(0)

        return df

    def __getitem__(self, idx):

        df = self._get_data(idx)

        return torch.tensor(df.drop(columns=['ano_mes', 'ano_mes_cruzamento', 'is_fraud', 'cc_num', 
                                'categorytop1', 'categorytop2', 'categorytop3']).to_numpy().reshape(12,44)).float(), torch.tensor(df['is_fraud'].iloc[0]).float()



In [122]:
training_data = LSTMDataSet(features_file='features.parquet.gzip', target_file='abt_train.parquet.gzip', mapping_file='dict_cpf_noup.json')
test_data = LSTMDataSet(features_file='features.parquet.gzip', target_file='abt_test.parquet.gzip', mapping_file='dict_cpf_noup.json')

In [123]:
%%time
training_data.__getitem__(10)

CPU times: user 32.4 ms, sys: 212 µs, total: 32.6 ms
Wall time: 30 ms


(tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.000

In [124]:
from torch.utils.data import DataLoader

batch_size = 16

train_loader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [125]:
for _, batch in enumerate(train_loader):
    x_batch, y_batch = batch[0], batch[1]
    print(x_batch.shape, y_batch.shape)
    break

torch.Size([16, 12, 44]) torch.Size([16])


In [126]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_stacked_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_stacked_layers = num_stacked_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers,
                            batch_first=True)

        self.fc = nn.Linear(hidden_size, 2)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size)
        c0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return self.sig(out)

model = LSTM(44, 256, 3)
model
model

LSTM(
  (lstm): LSTM(44, 256, num_layers=3, batch_first=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
  (sig): Sigmoid()
)

In [133]:
def train_one_epoch():
    model.train(True)
    print(f'Epoch: {epoch + 1}')
    running_loss = 0.0

    for batch_index, batch in enumerate(train_loader):
        x_batch, y_batch = batch[0], batch[1]

        output = model(x_batch)[:,1]
        loss = loss_function(output, y_batch)
        running_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_index % 10 == 9:  # print every 10 batches
            avg_loss_across_batches = running_loss / 10
            print('Batch {0}, Loss: {1:.3f}'.format(batch_index+1,
                                                    avg_loss_across_batches))
            running_loss = 0.0
    print()

In [134]:
import warnings
warnings.filterwarnings('ignore')

In [136]:
learning_rate = 0.001
num_epochs = 10
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    train_one_epoch()

Epoch: 1
Batch 10, Loss: 0.144
Batch 20, Loss: 0.205
Batch 30, Loss: 0.220
Batch 40, Loss: 0.181
Batch 50, Loss: 0.103
Batch 60, Loss: 0.245
Batch 70, Loss: 0.202
Batch 80, Loss: 0.218
Batch 90, Loss: 0.127
Batch 100, Loss: 0.223
Batch 110, Loss: 0.142
Batch 120, Loss: 0.240
Batch 130, Loss: 0.162
Batch 140, Loss: 0.161
Batch 150, Loss: 0.149
Batch 160, Loss: 0.310
Batch 170, Loss: 0.135
Batch 180, Loss: 0.127
Batch 190, Loss: 0.244
Batch 200, Loss: 0.142
Batch 210, Loss: 0.119

Epoch: 2
Batch 10, Loss: 0.331
Batch 20, Loss: 0.111
Batch 30, Loss: 0.181
Batch 40, Loss: 0.121
Batch 50, Loss: 0.159
Batch 60, Loss: 0.205
Batch 70, Loss: 0.232
Batch 80, Loss: 0.147
Batch 90, Loss: 0.103
Batch 100, Loss: 0.181
Batch 110, Loss: 0.205
Batch 120, Loss: 0.204
Batch 130, Loss: 0.165
Batch 140, Loss: 0.162
Batch 150, Loss: 0.200
Batch 160, Loss: 0.239
Batch 170, Loss: 0.144
Batch 180, Loss: 0.144
Batch 190, Loss: 0.140
Batch 200, Loss: 0.182
Batch 210, Loss: 0.098

Epoch: 3
Batch 10, Loss: 0.253
B

In [169]:
from sklearn.metrics import roc_auc_score, roc_curve

In [137]:
len(test_data)

700

In [173]:
model.train(False)
predict = []
y_true = []
for batch_index, batch in enumerate(test_loader):
    x_batch, y_batch = batch[0], batch[1]
    with torch.no_grad():
        predict += model(x_batch)[:,1].tolist()
        y_true += y_batch.tolist()

In [174]:
roc_auc_score([int(i) for i in y_true], predict, max_fpr=0.01)

0.49748743718592964

In [176]:
roc_curve([int(i) for i in y_true], predict).plot()

AttributeError: 'tuple' object has no attribute 'plot'