In [1]:
import torch
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from pathlib import Path
import os
from datetime import datetime, timedelta
import pandas as pd
import json


# Project root = parent of the directory the notebook kernel is running in.
# (The path is derived from the current working directory, so it changes if
# the kernel is started from somewhere else.)
project_dir = Path.cwd().resolve().parents[0]

# DataLoader

In [2]:
class LSTMDataSet(Dataset):
    """Dataset yielding, for each target row, a monthly feature window for its card.

    One item corresponds to one row of the target table. The item is built by
    expanding that row into ``HISTORY_MONTHS`` monthly steps that precede the
    row's ``ano_mes`` and left-joining the card's rows from the features table
    on ``(cc_num, ano_mes_cruzamento)``. Months without a matching feature row
    are filled with 0.

    Attributes:
        features_table: DataFrame read from ``features_file``; its
            ``ano_mes_cruzamento`` column is parsed to datetime on load.
        target_table: DataFrame read from ``target_file``; must provide the
            ``cc_num``, ``ano_mes`` and ``is_fraud`` columns.
        mapping: dict (integer keys) from ``cc_num`` to a collection of integer
            row positions in ``features_table``; only the smallest and largest
            position are used, as bounds of a positional slice.
    """

    # Number of monthly steps placed before the reference month in every item.
    HISTORY_MONTHS = 12

    def __init__(self, features_file, target_file, mapping_file):
        """Load the features table, the target table and the card mapping.

        Args:
            features_file: parquet file name inside ``<project_dir>/data/interim``.
            target_file: parquet file name inside ``<project_dir>/data/interim``.
            mapping_file: JSON file name inside ``<project_dir>/data/interim``;
                every JSON object key is converted to ``int`` while loading.
        """
        interim_dir = os.path.join(project_dir, 'data', 'interim')

        self.features_table = pd.read_parquet(os.path.join(interim_dir, features_file))
        # TODO: move this out of here
        self.features_table['ano_mes_cruzamento'] = pd.to_datetime(
            self.features_table['ano_mes_cruzamento'], format='%Y-%m')

        self.target_table = pd.read_parquet(os.path.join(interim_dir, target_file))

        # Context manager so the file handle is closed as soon as the JSON is parsed.
        with open(os.path.join(interim_dir, mapping_file), 'r') as mapping_fh:
            self.mapping = json.load(
                mapping_fh,
                object_hook=lambda x: {int(k): v for k, v in x.items()})

    def __len__(self):
        """Return the number of rows in the target table (one item per row)."""
        return self.target_table.shape[0]

    def _get_data(self, idx):
        """Build the monthly window for the target row at position ``idx``.

        Args:
            idx: integer position in ``target_table`` (negative values count
                from the end).

        Returns:
            DataFrame with one row per month start in
            ``[ano_mes - HISTORY_MONTHS months, ano_mes)``, carrying the target
            columns, ``ano_mes_cruzamento`` and the feature columns (0 where no
            feature row matched).

        Raises:
            IndexError: if ``idx`` is outside the target table.
            KeyError: if the row's ``cc_num`` is missing from ``mapping``.
        """
        # A list indexer raises IndexError when out of bounds (a plain slice
        # would silently return an empty frame) and keeps a one-row DataFrame.
        df = self.target_table.iloc[[idx]].reset_index(drop=True)

        cc_num = df['cc_num'][0]
        ano_mes_sup = pd.to_datetime(df['ano_mes'][0])
        # Calendar offset rather than a fixed number of days, so the number of
        # month starts in the window does not depend on the day of the month.
        ano_mes_inf = ano_mes_sup - pd.DateOffset(months=self.HISTORY_MONTHS)
        range_dates = pd.date_range(ano_mes_inf, ano_mes_sup, inclusive='left', freq='MS')

        df_default = pd.DataFrame({
            'cc_num': [cc_num] * len(range_dates),
            'ano_mes_cruzamento': range_dates,
        })

        # The single target row is repeated once per month of the window.
        df = df.merge(df_default, how='left', on='cc_num')

        # iloc slices are end-exclusive: +1 keeps the row at the highest mapped
        # position. Rows of other cards that may fall inside the slice are
        # dropped by the merge below, which also matches on cc_num.
        positions = self.mapping[cc_num]
        df_features = self.features_table.iloc[min(positions): max(positions) + 1]

        df = df.merge(df_features, how='left', on=['cc_num', 'ano_mes_cruzamento']).fillna(0)

        return df

    def __getitem__(self, idx):
        """Return ``(features, target)`` for the target row at position ``idx``.

        Returns:
            A tuple of the window DataFrame without the ``ano_mes``,
            ``ano_mes_cruzamento``, ``is_fraud`` and ``cc_num`` columns, and the
            ``is_fraud`` Series of that window (the row's label repeated once
            per month).
        """
        # NOTE(review): both elements are pandas objects; DataLoader's default
        # collate function presumably cannot batch them -- confirm, and convert
        # to tensors (or pass a collate_fn) before iterating a DataLoader.
        df = self._get_data(idx)
        return df.drop(columns=['ano_mes', 'ano_mes_cruzamento', 'is_fraud', 'cc_num']), df['is_fraud']


In [3]:
# Training split: feature table, training targets and the cc_num -> row-position mapping.
training_data = LSTMDataSet(
    features_file='features.parquet.gzip',
    target_file='abt_train.parquet.gzip',
    mapping_file='dict_cpf_noup.json',
)

In [12]:
%%time
# Time the construction of a single item (subscripting calls __getitem__).
training_data[10]

CPU times: user 31.9 ms, sys: 334 µs, total: 32.2 ms
Wall time: 28.6 ms


(    count_cpf  sum_cpf  max_cpf  min_cpf   p99_cpf  p90_cpf  p75_cpf  \
 0         0.0     0.00     0.00     0.00    0.0000    0.000   0.0000   
 1         0.0     0.00     0.00     0.00    0.0000    0.000   0.0000   
 2        56.0  2410.54   204.15     1.84  202.2910  104.935  64.4900   
 3        59.0  2861.48   224.75     1.24  210.8764  118.436  74.6100   
 4        86.0  6672.74   852.81     1.07  833.4725  133.555  82.4025   
 5       102.0  6430.91   698.09     1.16  470.5620  117.461  73.3950   
 6        81.0  3543.45   225.48     1.22  223.9360  112.830  63.2600   
 7       109.0  8993.78  3075.09     1.02  196.1304  121.272  83.5800   
 8       102.0  6355.26  1212.58     1.12  395.3417  103.914  74.4250   
 9       100.0  4604.33   152.83     1.12  151.3846  110.343  74.5575   
 10       86.0  4585.79   458.09     1.27  262.7600  113.055  68.0325   
 11       79.0  3164.89   218.93     1.06  217.6742  104.296  62.9850   
 
     median_cpf  p25_cpf  p10_cpf  ...  sum_mcc_t

In [108]:
# Both loaders share the same batching options.
loader_options = dict(batch_size=128, shuffle=True)

train_dataloader = DataLoader(training_data, **loader_options)
# NOTE(review): `test_data` is not defined in the cells shown here -- confirm it
# is created (e.g. from the test split) before this cell on a fresh kernel run.
test_dataloader = DataLoader(test_data, **loader_options)