In [1]:
import functools
import operator
from collections import defaultdict
from typing import Dict
import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import sys

sys.path.append('../../../')

from src.data_load.dataloader import create_data_loaders, create_test_loader

from src.data_load import split_strategy
from src.data_load.data_utils import prepare_data, prepare_test_data
from src.data_load.splitting_dataset import (
    ConvertingTrxDataset,
    DropoutTrxDataset,
    SplittingDataset,
    SberSplittingDataset,
    TargetEnumeratorDataset,
)
from src.data_load.parquet_ds import TxnParquetDataset
from src.data_load.dataloader import collate_splitted_rows
from configs.data_configs.rosbank import data_configs

import dask.dataframe as da


In [4]:
class A:
    """First toy base class used to demonstrate method resolution order."""

    def f(self) -> None:
        """Print ``1`` so a call that resolves to ``A.f`` is recognisable."""
        print(1)

class B:
    """Second toy base class used to demonstrate method resolution order."""

    def f(self) -> None:
        """Print ``2`` so a call that resolves to ``B.f`` is recognisable."""
        print(2)

class C(B, A):
    """Inherits from ``B`` then ``A``; defines nothing itself.

    Because ``B`` is listed first, it precedes ``A`` in the method
    resolution order, so ``C().f()`` resolves to ``B.f``.
    """

In [5]:
# C lists B before A in its bases, so the lookup of `f` reaches B.f first.
c = C()
c.f()  # prints 2

2


In [2]:
# Build the config object from configs.data_configs.rosbank; later cells read
# file paths and loader settings from it.
conf = data_configs()

In [3]:
# Load the parquet file at conf.test_path into a pandas DataFrame.
df1 = pd.read_parquet(conf.test_path)

In [4]:
# Display the frame as the cell output (pandas truncates the middle rows).
df1

Unnamed: 0,cl_id,amount,event_time,mcc,channel_type,currency,trx_category,trx_count,target_target_flag,target_target_sum
0,10096,"[5.209486152841421, 11.289794413577894, 6.5381...","[17310.0, 17310.44556712963, 17312.0, 17312.0,...","[3, 2, 1, 8, 16, 32, 1, 5, 16, 1, 3, 3, 8, 3, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",186,0,0.0
1,1718,"[10.463131911491967, 9.200391041122515, 8.4765...","[17113.0, 17113.0, 17113.065914351853, 17117.0...","[2, 80, 2, 2, 1, 6, 6, 1, 2, 100, 2, 2, 2, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[6, 1, 3, 6, 1, 1, 1, 1, 3, 1, 3, 3, 6, 1, 1, ...",42,1,95755.67
2,3260,"[5.995207533386816, 4.0943445622221, 6.0776422...","[17444.0, 17445.0, 17445.0, 17445.0, 17445.617...","[1, 1, 4, 1, 2, 1, 47, 1, 3, 17, 1, 1, 1, 8, 8...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",169,1,36862.75
3,3529,"[7.941626359306345, 7.692113339595466, 6.04263...","[17214.0, 17214.0, 17214.0, 17214.0, 17216.0, ...","[1, 1, 86, 26, 23, 1, 1, 7, 3, 11, 1, 1, 4, 1,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",69,1,9553.28
4,5335,"[8.006700845440367, 7.484930283289661, 7.10414...","[17203.0, 17203.0, 17203.0, 17203.668090277777...","[25, 16, 31, 2, 2, 2, 52, 3, 3, 17, 17, 1, 28,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",155,0,0.0
...,...,...,...,...,...,...,...,...,...,...
495,5712,"[5.471850417308912, 10.986361642914657, 5.2522...","[17146.0, 17149.0, 17150.0, 17150.0, 17151.0, ...","[20, 58, 10, 1, 96, 10, 1, 20, 13, 13, 1, 1, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 7, 5, 1, 1, 3, 5, 7, ...",120,0,0.0
496,7975,"[10.308985993422082, 8.006700845440367, 8.9873...","[17099.56758101852, 17099.568668981483, 17101....","[2, 2, 2, 18, 3, 2, 46, 2, 94, 2, 8, 8, 3, 8, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, ...",52,0,0.0
497,8507,"[7.333676395657684, 6.786716950605081, 7.04056...","[17144.0, 17144.0, 17146.0, 17146.0, 17146.0, ...","[10, 14, 4, 1, 7, 14, 6, 1, 2, 6, 19, 1, 14, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, ...",206,1,351105.52
498,8928,"[6.908614909394019, 4.804021044733257, 10.1058...","[17348.0, 17349.0, 17349.0, 17349.0, 17349.0, ...","[6, 16, 32, 19, 16, 27, 36, 26, 2, 16, 16, 16,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, ...",84,1,1378.0


In [3]:
# Unpack the pair returned by create_data_loaders into the train and
# validation loaders.
train, val = create_data_loaders(conf)

In [4]:
# Fetch the first batch for inspection. next(iter(...)) raises StopIteration
# on an empty loader, whereas `for ...: break` would silently leave `batch`
# undefined (or still bound to a value from an earlier cell).
batch = next(iter(train))

In [9]:
# Build the test loader from the same config object.
test = create_test_loader(conf)

0it [00:00, ?it/s]

500it [00:00, 6565.52it/s]


In [10]:
# Fetch the first test batch. next(iter(...)) raises StopIteration on an
# empty loader; `for ...: break` would instead leave `batch` bound to
# whatever an earlier cell stored in it.
batch = next(iter(test))

In [11]:
# Show the fetched batch; the saved output is a 2-tuple of a PaddedBatch and
# a 2 x 16 integer tensor.
batch

(<src.data_load.dataloader.PaddedBatch at 0x7fcc4eba2830>,
 tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
         [ 0,  1,  1,  1,  0,  1,  1,  1,  0,  0,  1,  1,  1,  0,  0,  1]]))

In [2]:
# Load a different parquet file.
# NOTE: this rebinds `df1`, which an earlier cell used for the frame read
# from conf.test_path.
path = '../../alpha/data/test_trx.parquet'
df1 = pd.read_parquet(path)

In [7]:
# Keep only the rows whose target flag is present.
df1 = df1[df1['target_target_flag'].notna()]

In [3]:
# Down-sample to at most SAMPLE_SIZE rows.
# - random_state makes the subset reproducible across kernel restarts.
# - min(...) avoids the ValueError that DataFrame.sample raises when asked for
#   more rows than the frame holds (replace=False).
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42
df1 = df1.sample(n=min(SAMPLE_SIZE, len(df1)), random_state=SAMPLE_SEED)

In [4]:
import pyarrow as pa
import numpy as np

# Assign every row of df1 to one of `n_partition` buckets; the bucket id is
# stored in the `partition_idx` column.
n_partition = 10
# Seeded generator so the row-to-bucket assignment is reproducible.
rng = np.random.default_rng(42)
# .assign builds a new frame instead of writing a column into df1 in place;
# df1 may be a filtered or sampled slice, where in-place writes trigger
# SettingWithCopyWarning. Re-running the cell simply overwrites the column.
df1 = df1.assign(partition_idx=rng.integers(0, n_partition, size=len(df1)))
table = pa.Table.from_pandas(df1, preserve_index=False)


In [5]:
# `import pyarrow as pa` alone does not guarantee that the `parquet` submodule
# is loaded; `pa.parquet` only resolves if something else imported it first.
# Import it explicitly so the cell works on a fresh kernel.
import pyarrow.parquet as pq

# Write the table as a directory dataset with one sub-directory per
# partition_idx value.
pq.write_to_dataset(
    table,
    root_path="../../alpha/data/test_subset.parquet/",
    partition_cols=["partition_idx"],
)

In [None]:
# `pq` was never imported anywhere in the notebook, so the call raised
# NameError; bring the parquet submodule into scope here.
import pyarrow.parquet as pq

# Write the table as a directory dataset partitioned by partition_idx.
pq.write_to_dataset(
    table,
    root_path="../data/train.parquet/",
    partition_cols=["partition_idx"],
)

In [None]:
import pyarrow as pa

In [None]:
# Bare expression: evaluates to the from_pandas method object without
# calling it (useful only to inspect its repr).
pa.Table.from_pandas

In [None]:
# Integer array first; astype returns a float64 copy and leaves `arr` itself
# unchanged.
arr = np.asarray([1, 2, 3])
arr.astype(np.float64)

In [None]:
# Fetch the first training batch. next(iter(...)) raises StopIteration on an
# empty loader, whereas `for ...: break` would silently keep whatever `batch`
# an earlier cell left behind.
batch = next(iter(train))

In [None]:
# One-element string array: both numbers sit inside a single space-separated
# string, so this has shape (1,) and is not an array of two floats.
arr = np.array(['2.83321334 3.27865303'])

In [None]:
# Load the parquet file at conf.train_path into a pandas DataFrame.
df = pd.read_parquet(conf.train_path)

In [None]:
# Display the `amount` column of the training frame.
df['amount']

In [None]:
# df = pd.read_parquet(conf.train_path)
# ddf = da.from_pandas(df, chunksize=100)
# save_dir = '../data/train.parquet'
# ddf.to_parquet(save_dir)

In [None]:
# Instantiate TxnParquetDataset (from src.data_load.parquet_ds) with
# test=False; later cells request train/val samplers from it.
data = TxnParquetDataset(conf, test=False)

In [None]:
# Inspect the value that is later passed to data.get_train_val_samplers.
conf.valid_size

In [None]:
# Number of entries in the dataset's private `_rg_lens` attribute
# (presumably one per parquet row group — TODO confirm against parquet_ds).
len(data._rg_lens)

In [None]:
def _pump_my_dataset(dataset, conf, split):
    """Wrap ``dataset`` in the chain of dataset adapters configured for ``split``.

    Wrapping order, innermost first: ``SplittingDataset`` (or
    ``SberSplittingDataset`` when ``conf.sber`` exists and equals ``True``),
    ``TargetEnumeratorDataset``, ``ConvertingTrxDataset``,
    ``DropoutTrxDataset``.

    Args:
        dataset: Base dataset to wrap.
        conf: Config indexable by split name. ``conf[split]`` supplies
            ``split_strategy`` (keyword arguments for
            ``split_strategy.create``), ``max_seq_len`` and, optionally,
            ``dropout``. ``conf.features.target_col`` is forwarded to the
            splitting dataset.
        split: Key into ``conf``, e.g. ``"train"``.

    Returns:
        The outermost ``DropoutTrxDataset``.
    """
    # The explicit ``== True`` is kept on purpose: only a value equal to True
    # selects the Sber variant; a missing attribute falls back to the default.
    sber_enabled = getattr(conf, "sber", False) == True  # noqa: E712
    splitter_cls = SberSplittingDataset if sber_enabled else SplittingDataset

    split_conf = conf[split]
    wrapped = splitter_cls(
        dataset,
        split_strategy.create(**split_conf.split_strategy),
        conf.features.target_col,
    )
    wrapped = ConvertingTrxDataset(TargetEnumeratorDataset(wrapped))

    # A split without a ``dropout`` setting gets a dropout of 0.0.
    trx_dropout = getattr(split_conf, "dropout", 0.0)
    return DropoutTrxDataset(
        wrapped, trx_dropout=trx_dropout, seq_len=split_conf.max_seq_len
    )

# Ask the dataset for its train/validation samplers; conf.valid_size is the
# only argument (presumably the validation share — TODO confirm).
train_sampler, val_sampler = data.get_train_val_samplers(conf.valid_size)


In [None]:
# Force dropout off for the train split and check that the value took effect.
conf.train.dropout = 0
assert conf.train.dropout == 0.0
# NOTE(review): this rebinds `data` to the wrapped dataset, so re-running the
# cell wraps it a second time, and any later `data.get_train_val_samplers(...)`
# call acts on the wrapper rather than the original dataset — confirm intended.
data = _pump_my_dataset(data, conf, "train")


In [None]:
# Re-inspect the value passed to data.get_train_val_samplers.
conf.valid_size

In [None]:
# Both loaders iterate the same `data` object; they differ in the sampler and
# in the per-split worker count and batch size taken from the config.
train_loader = DataLoader(
    data,
    batch_size=conf.train.batch_size,
    sampler=train_sampler,
    num_workers=conf.train.num_workers,
    collate_fn=collate_splitted_rows,
)
valid_loader = DataLoader(
    data,
    batch_size=conf.val.batch_size,
    sampler=val_sampler,
    num_workers=conf.val.num_workers,
    collate_fn=collate_splitted_rows,
)

In [None]:
# Number of batches in each loader, validation first.
len(valid_loader), len(train_loader)

In [None]:
def make_train_val_loaders(data, conf):
    """Build the train and validation ``DataLoader`` objects from ``data``.

    These statements used to sit at cell level and ended in a ``return``,
    which is a SyntaxError outside a function. As a function, the wrapped
    dataset is bound to a local name, so the caller's ``data`` is not
    replaced by its wrapper and the call can be repeated safely.

    Args:
        data: Dataset exposing ``get_train_val_samplers``. It is wrapped via
            ``_pump_my_dataset`` using the ``"train"`` split settings.
        conf: Config providing ``valid_size``, ``train.dropout`` and the
            ``train`` / ``val`` loader settings (``num_workers``,
            ``batch_size``).

    Returns:
        Tuple ``(train_loader, valid_loader)``.

    Raises:
        AssertionError: If ``conf.train.dropout`` is not ``0.0``.
    """
    # Samplers are requested from the dataset before it is wrapped.
    train_sampler, val_sampler = data.get_train_val_samplers(conf.valid_size)
    assert conf.train.dropout == 0.0
    pumped = _pump_my_dataset(data, conf, "train")

    # Both loaders share the wrapped dataset; the samplers decide which items
    # each one draws.
    train_loader = DataLoader(
        dataset=pumped,
        sampler=train_sampler,
        collate_fn=collate_splitted_rows,
        num_workers=conf.train.num_workers,
        batch_size=conf.train.batch_size,
    )
    valid_loader = DataLoader(
        dataset=pumped,
        sampler=val_sampler,
        collate_fn=collate_splitted_rows,
        num_workers=conf.val.num_workers,
        batch_size=conf.val.batch_size,
    )
    return train_loader, valid_loader