In [1]:
import functools
import operator
import sys
from collections import defaultdict
from typing import Dict

import dask.dataframe as da
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Extend sys.path so the project packages (`src`, `configs`) resolve;
# this must run before the project imports below.
sys.path.append('../../../')

from src.data_load.dataloader import create_data_loaders, create_test_loader

from src.data_load import split_strategy
from src.data_load.data_utils import prepare_data, prepare_test_data
from src.data_load.splitting_dataset import (
    ConvertingTrxDataset,
    DropoutTrxDataset,
    SplittingDataset,
    SberSplittingDataset,
    TargetEnumeratorDataset,
)
from src.data_load.parquet_ds import TxnParquetDataset
from src.data_load.dataloader import collate_splitted_rows
from configs.data_configs.rosbank import data_configs


In [3]:
# Build the Rosbank data configuration (from configs.data_configs.rosbank) used by the cells below.
conf = data_configs()

In [4]:
# Load the parquet file referenced by conf.train_path into a DataFrame for inspection.
df1 = pd.read_parquet(conf.train_path)

In [5]:
# Display the frame; each row holds per-client list-valued columns (amount, event_time, mcc, ...).
df1

Unnamed: 0,cl_id,amount,event_time,mcc,channel_type,currency,trx_category,trx_count,target_target_flag,target_target_sum,partition_idx
0,10084,"[10.308985993422082, 9.615672137861335, 9.5465...","[17307.86204861111, 17308.0, 17308.0, 17308.0,...","[2, 24, 24, 24, 24, 24, 2, 13, 24, 24, 24, 24,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 1, 1, 1, 1, 1, 2, 5, 1, 1, 1, 1, 1, 1, 2, ...",30,0,0.0,0
1,10093,"[6.957497370876951, 6.255750041753367, 7.26194...","[17141.0, 17141.0, 17142.0, 17142.0, 17142.0, ...","[5, 3, 1, 6, 6, 5, 1, 1, 47, 5, 8, 9, 8, 1, 1,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, ...",52,1,8993.28,0
2,10186,"[9.210440366976517, 4.709530201312334, 6.33859...","[17353.0, 17357.0, 17357.0, 17357.0, 17357.0, ...","[9, 1, 91, 7, 7, 1, 1, 55, 6, 7, 1, 1, 35, 3, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",169,1,9.99,0
3,10206,"[9.90353755128617, 10.596659732783579, 10.1658...","[17275.734791666666, 17276.778819444444, 17276...","[2, 2, 2, 1, 1, 2, 2, 6, 6, 8, 8, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, ...",40,0,0.0,0
4,1066,"[8.517393171418904, 11.918397239722838, 6.3985...","[17287.237060185184, 17296.0, 17298.5056018518...","[2, 2, 29, 29, 45, 45, 31, 28, 29, 2, 35, 3, 6...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 4, 9, 9, 9, 9, 1, 1, 9, 3, 1, 1, 1, 3, 9, ...",127,1,221439.83,0
...,...,...,...,...,...,...,...,...,...,...,...
9712,980,"[8.486940148245216, 7.400620577371135, 6.70869...","[17165.0, 17165.0, 17178.0, 17179.0, 17179.0, ...","[1, 1, 1, 1, 51, 1, 3, 3, 3, 3, 3, 3, 3, 34, 5...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",163,,,9
9713,9837,"[4.499809670330265, 5.720311776607412, 9.28739...","[17130.0, 17130.0, 17130.78539351852, 17130.80...","[1, 3, 2, 2, 1, 1, 3, 1, 4, 69, 35, 35, 4, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",175,,,9
9714,986,"[8.160803920954665, 5.993961427306569, 8.36660...","[17123.0, 17135.58596064815, 17135.58660879629...","[2, 2, 2, 2, 2, 46, 1, 2, 2, 29, 15, 28, 11, 1...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[6, 3, 3, 3, 6, 1, 1, 3, 3, 9, 1, 1, 1, 1, 1, ...",44,,,9
9715,9871,"[9.210440366976517, 9.615805480084347, 9.30574...","[17239.0, 17239.0, 17239.0, 17241.0, 17242.0, ...","[24, 24, 24, 5, 24, 24, 14, 8, 1, 24, 24, 56, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",53,,,9


In [3]:
# Build two loaders from the config via the project helper (presumably train and validation — confirm).
train, val = create_data_loaders(conf)

In [4]:
# Pull only the first batch from `train`; `batch` keeps that value after the loop exits.
for batch in train:
    break

In [9]:
# Build the test loader from the same config via the project helper.
test = create_test_loader(conf)

0it [00:00, ?it/s]

500it [00:00, 6565.52it/s]


In [10]:
# Pull only the first batch from `test`; this overwrites the `batch` taken from `train` earlier.
for batch in test:
    break

In [11]:
# Show the first test batch; per the recorded output it is a (PaddedBatch, tensor) pair.
batch

(<src.data_load.dataloader.PaddedBatch at 0x7fcc4eba2830>,
 tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
         [ 0,  1,  1,  1,  0,  1,  1,  1,  0,  0,  1,  1,  1,  0,  0,  1]]))

In [6]:
# Reload `df1` from a hard-coded relative path; this replaces the frame read from conf.train_path earlier.
path = '../data/train_trx.parquet'
df1 = pd.read_parquet(path)

In [7]:
# Keep only the rows whose target flag is present (drops rows where it is NaN).
df1 = df1.loc[df1['target_target_flag'].notna()]

In [8]:
# Display the filtered frame (rows without a target flag were removed in the previous cell).
df1

Unnamed: 0,cl_id,amount,event_time,mcc,channel_type,currency,trx_category,trx_count,target_target_flag,target_target_sum
0,10018,"[10.609081944147828, 10.596659732783579, 10.81...","[17120.38773148148, 17133.667800925927, 17134....","[13, 2, 13, 2, 1, 18, 13, 2, 13, 2, 5, 13, 9, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[5, 3, 5, 3, 1, 1, 5, 3, 5, 3, 1, 5, 5, 5, 5]",15,0,0.0
1,10030,"[4.61512051684126, 6.90875477931522, 10.598857...","[17141.0, 17141.0, 17145.0, 17147.0, 17147.0, ...","[9, 9, 21, 1, 25, 6, 14, 14, 3, 3, 3, 13, 1, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 1, 1, 3, ...",42,1,59.51
2,10038,"[7.4127640174265625, 7.370230641807081, 7.8180...","[17301.0, 17301.0, 17301.0, 17301.774780092594...","[1, 1, 1, 2, 2, 4, 2, 8, 1, 22, 8, 1, 8, 4, 2,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 2, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2, ...",111,0,0.0
3,10057,"[7.494708263135679, 7.736394428979239, 10.7789...","[17151.0, 17151.0, 17153.0, 17154.0, 17155.0, ...","[6, 21, 2, 6, 2, 4, 2, 22, 15, 2, 1, 35, 4, 2,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 4, 1, 4, 1, 3, 1, 1, 3, 1, 1, 1, 4, 1, ...",61,1,62961.31
4,10062,"[8.31898612539206, 8.824824939175638, 6.509067...","[17143.0, 17143.0, 17143.0, 17144.0, 17144.0, ...","[80, 15, 37, 38, 11, 11, 2, 24, 7, 5, 5, 11, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, ...",82,1,107126.35
...,...,...,...,...,...,...,...,...,...,...
4495,9964,"[4.394449154672439, 5.4449685668737295, 7.6014...","[17304.0, 17304.0, 17304.368136574074, 17305.0...","[1, 5, 2, 1, 1, 1, 23, 16, 1, 5, 1, 1, 5, 40, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",73,0,0.0
4496,9969,"[6.91283236872242, 9.210440366976517, 6.996132...","[17353.0, 17353.67679398148, 17354.0, 17354.0,...","[1, 13, 1, 4, 1, 10, 1, 1, 24, 4, 4, 4, 54, 3,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, ...",299,1,12757.18
4497,997,"[5.303304908059076, 5.442417710521793, 7.02197...","[17366.0, 17366.0, 17366.0, 17367.0, 17367.0, ...","[2, 4, 3, 2, 78, 4, 4, 2, 17, 4, 4, 3, 3, 1, 7...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[6, 1, 1, 6, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, ...",131,0,0.0
4498,9973,"[5.153291594497779, 9.210440366976517, 7.75747...","[17323.0, 17323.598599537036, 17324.0, 17325.0...","[20, 2, 16, 9, 5, 3, 6, 1, 6, 7, 11, 4, 5, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 2, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",180,0,0.0


In [9]:
import pyarrow as pa
import numpy as np

# Assign every row of `df1` to one of `n_partition` buckets, then convert the
# frame to an Arrow table so it can be written as a partitioned parquet dataset.
n_partition = 10
# Fixed seed so the row -> partition assignment is reproducible across re-runs.
partition_rng = np.random.default_rng(0)
# `assign` returns a new frame, so the column is never written into a filtered
# slice of another DataFrame (the pattern that triggers SettingWithCopyWarning).
df1 = df1.assign(
    partition_idx=partition_rng.integers(0, n_partition, size=len(df1))
)
table = pa.Table.from_pandas(df1, preserve_index=False)


In [None]:
# Write the Arrow table as a parquet dataset partitioned on `partition_idx`.
# Goes through the explicitly imported `pyarrow.parquet` module (`pq`) rather than
# `pa.parquet`, which only resolves if some other import already loaded the submodule.
pq.write_to_dataset(
    table,
    root_path="../data/train.parquet/",
    partition_cols=["partition_idx"],
)

In [None]:
# Partitioned parquet write of `table`, keyed on the `partition_idx` column.
# NOTE(review): this is the same write as the preceding cell; running both may add
# a second set of part files under the same root — confirm only one is intended.
pq.write_to_dataset(
    table,
    partition_cols=["partition_idx"],
    root_path="../data/train.parquet/",
)

In [None]:
import pyarrow as pa

In [None]:
# Bare reference to the `from_pandas` callable: displays its repr only, no conversion is performed.
pa.Table.from_pandas

In [None]:
# Scratch check: casting an integer array to float returns a new float64 array
# and leaves `arr` itself unchanged.
arr = np.array((1, 2, 3))
arr.astype(np.float64)

In [None]:
# Pull only the first batch from `train`; `batch` keeps that value after the loop exits.
for batch in train:
    break

In [None]:
# One-element string array whose single entry holds two space-separated numbers
# (this rebinds `arr` from the earlier scratch cell).
arr = np.array(['2.83321334 3.27865303'])

In [None]:
# Read the parquet file referenced by conf.train_path into `df`.
df = pd.read_parquet(conf.train_path)

In [None]:
# Inspect the `amount` column.
df['amount']

In [None]:
# df = pd.read_parquet(conf.train_path)
# ddf = da.from_pandas(df, chunksize=100)
# save_dir = '../data/train.parquet'
# ddf.to_parquet(save_dir)

In [None]:
# Construct the project's parquet transaction dataset from the config with test=False.
data = TxnParquetDataset(conf, test=False)

In [None]:
# Show conf.valid_size, the value later passed to data.get_train_val_samplers.
conf.valid_size

In [None]:
# Length of the dataset's private `_rg_lens` attribute —
# presumably one entry per parquet row group; TODO confirm against TxnParquetDataset.
len(data._rg_lens)

In [None]:
def _pump_my_dataset(dataset, conf, split):
    """Stack the splitting, target-enumeration, conversion and dropout wrappers on a dataset.

    Args:
        dataset: base dataset to wrap.
        conf: config object; read for the optional ``sber`` flag,
            ``features.target_col`` and the per-split section ``conf[split]``.
        split: key of the per-split config section (the notebook passes ``"train"``).

    Returns:
        The outermost wrapper, a ``DropoutTrxDataset``.
    """
    split_conf = conf[split]

    # `== True` is kept deliberately: only a value that compares equal to True
    # selects the Sber variant; a missing attribute falls back to the default.
    use_sber = getattr(conf, "sber", False) == True  # noqa: E712
    splitter_cls = SberSplittingDataset if use_sber else SplittingDataset

    splitted = splitter_cls(
        dataset,
        split_strategy.create(**split_conf.split_strategy),
        conf.features.target_col,
    )
    converted = ConvertingTrxDataset(TargetEnumeratorDataset(splitted))

    # A split section without a `dropout` attribute means a dropout of 0.0.
    return DropoutTrxDataset(
        converted,
        trx_dropout=getattr(split_conf, "dropout", 0.0),
        seq_len=split_conf.max_seq_len,
    )

# Get the two samplers from the base dataset, parameterised by conf.valid_size.
# This is called on `data` before it is wrapped by _pump_my_dataset.
train_sampler, val_sampler = data.get_train_val_samplers(conf.valid_size)


In [None]:
# Force the train-split dropout to zero and verify it before wrapping the dataset.
conf.train.dropout = 0
assert conf.train.dropout == 0.0
# NOTE(review): `data` is rebound to the wrapped dataset, so re-running this cell
# wraps the already wrapped object again — rebuild `data` first when re-executing.
data = _pump_my_dataset(data, conf, "train")


In [None]:
# Re-check conf.valid_size (same value that was passed to get_train_val_samplers).
conf.valid_size

In [None]:
# Both loaders read from the same wrapped `data` object with the same collate
# function; they differ only in sampler and per-split worker/batch settings.
train_loader = DataLoader(
    data,
    batch_size=conf.train.batch_size,
    sampler=train_sampler,
    num_workers=conf.train.num_workers,
    collate_fn=collate_splitted_rows,
)
valid_loader = DataLoader(
    data,
    batch_size=conf.val.batch_size,
    sampler=val_sampler,
    num_workers=conf.val.num_workers,
    collate_fn=collate_splitted_rows,
)

In [None]:
# Number of batches in the validation and train loaders, in that order.
len(valid_loader), len(train_loader)

In [None]:
# Self-contained rebuild of the train/validation loaders.
#
# Start from a fresh base dataset instead of reusing `data`: an earlier cell
# rebinds `data` to the wrapped dataset, so reusing it here would wrap it a second
# time and would call `get_train_val_samplers` on the wrapper rather than on the
# parquet dataset it was originally called on.
base_dataset = TxnParquetDataset(conf, test=False)
train_sampler, val_sampler = base_dataset.get_train_val_samplers(conf.valid_size)
assert conf.train.dropout == 0.0
data = _pump_my_dataset(base_dataset, conf, "train")

train_loader = DataLoader(
    dataset=data,
    sampler=train_sampler,
    collate_fn=collate_splitted_rows,
    num_workers=conf.train.num_workers,
    batch_size=conf.train.batch_size,
)
valid_loader = DataLoader(
    dataset=data,
    sampler=val_sampler,
    collate_fn=collate_splitted_rows,
    num_workers=conf.val.num_workers,
    batch_size=conf.val.batch_size,
)
# `return` is only legal inside a function (a SyntaxError at cell level);
# end the cell with the expression so both loaders are displayed instead.
train_loader, valid_loader