In [102]:
import torch
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from pathlib import Path
import os
from datetime import datetime, timedelta
import pandas as pd


project_dir = Path('__main__').resolve().parents[1]

# DataLoader

In [105]:
class LSTMDataSet(Dataset):
    def __init__(self, features_file, target_file):
        self.features_table = pd.read_parquet(os.path.join(project_dir, 'data', 'interim', features_file))
        self.target_table = pd.read_parquet(os.path.join(project_dir, 'data', 'interim', target_file))


    def __len__(self):
        return self.target_table.shape[0]


    def _get_data(self, idx):
        df = self.target_table.iloc[idx : idx+1].reset_index(drop=True)

        cc_num = df['cc_num'][0]
        ano_mes_sup = pd.to_datetime(df['ano_mes'][0])
        ano_mes_inf = pd.to_datetime(df['ano_mes'][0]) - timedelta(days=366)
        range_dates = pd.date_range(ano_mes_inf, ano_mes_sup, inclusive='left', freq='MS')

        df_default = pd.DataFrame({
            'cc_num' : [cc_num] * 12,
            'ano_mes_cruzamento' : range_dates
        })

        df = df.merge(df_default, how='left', on='cc_num')

        filter = (self.features_table['cc_num'] == 'cc_num') & (pd.to_datetime(self.features_table['ano_mes']) >= ano_mes_inf) & (pd.to_datetime(self.features_table['ano_mes']) < ano_mes_sup)

        df = df.merge(self.features_table[filter].rename(columns={'ano_mes' : 'ano_mes_cruzamento'}), how='left', on=['cc_num', 'ano_mes_cruzamento'])

        return df

    def __getitem__(self, idx):

        df = self._get_data(idx)
        return df.drop(columns=['ano_mes', 'ano_mes_cruzamento', 'is_fraud', 'cc_num']), df['is_fraud']


In [106]:
training_data = LSTMDataSet(features_file='processed.parquet.gzip', target_file='abt.parquet.gzip')
test_data = LSTMDataSet(features_file='processed.parquet.gzip', target_file='abt.parquet.gzip')

In [108]:
train_dataloader = DataLoader(training_data, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=128, shuffle=True)