In [1]:
import gc
from pathlib import Path

import cupy
import cudf
import cudf.core.buffer.spill_manager as sm

In [2]:
# Enable cuDF's 'spill' option, then install a global SpillManager whose
# device-memory limit is 10 GiB (10 * 1024**3 == 10737418240 bytes).
spill_limit_bytes = 10 * 1024 ** 3
cudf.set_option('spill', True)
manager = sm.SpillManager(device_memory_limit=spill_limit_bytes)
sm.set_global_manager(manager)

In [3]:
# Root of all datasets: the 'data' folder under the current working directory.
data_path = Path.cwd().joinpath('data')

## LB

In [4]:
# lb_in is the directory the parquet files are read from; lb_out is where the
# per-day splits are written.
lb_in = data_path / 'lb/processed_nvt'
lb_out = data_path / 'lb/sessions_by_day'
# parents=True creates any missing intermediate directories instead of raising
# FileNotFoundError; exist_ok=True keeps the cell safe to re-run (it still
# raises FileExistsError if the path exists but is not a directory).
lb_out.mkdir(parents=True, exist_ok=True)

In [5]:
# Read every '*.parquet' file directly under lb_in, in sorted path order, and
# concatenate the resulting frames into a single cuDF DataFrame.
parquet_paths = sorted(lb_in.glob('*.parquet'))
data = cudf.concat(
    [cudf.read_parquet(parquet_path.as_posix()) for parquet_path in parquet_paths]
)

In [6]:
# Split each day's rows at random into train / valid / test parquet files
# under lb_out/<day>/.

# Fixed seed so the random assignment below is reproducible across runs.
cupy.random.seed(1)
val_size = 0.1
test_size = 0.1

# Loop-invariant cut points on the uniform [0, 1) draw:
#   train: r <= train_size
#   valid: train_size < r <= valid_upper
#   test:  r > valid_upper
# valid and test share ONE boundary value so that every row lands in exactly
# one set; computing the boundary twice with different float expressions
# (train_size + val_size vs. 1 - test_size) can disagree by a rounding error.
train_size = 1 - val_size - test_size
valid_upper = 1 - test_size

# NOTE(review): the loop assumes day_index takes exactly the values
# 1..n_splits (contiguous, 1-based) -- confirm against the preprocessing step.
n_splits = data['day_index'].unique().count()

for i in range(1, n_splits + 1):
    split = data.loc[data['day_index'] == i]

    split_dir = lb_out / str(i)
    # Idempotent: a bare mkdir() raises FileExistsError when the cell is re-run.
    split_dir.mkdir(parents=True, exist_ok=True)

    # One uniform draw per row decides which set the row belongs to.
    random_values = cupy.random.rand(len(split))

    train_set = split[random_values <= train_size]
    train_set.to_parquet((split_dir / 'train.parquet').as_posix())

    valid_set = split[
        (random_values > train_size) & (random_values <= valid_upper)
    ]
    valid_set.to_parquet((split_dir / 'valid.parquet').as_posix())

    test_set = split[random_values > valid_upper]
    test_set.to_parquet((split_dir / 'test.parquet').as_posix())

    # Drop the per-day subsets before the next iteration.
    del train_set, valid_set, test_set
    gc.collect()

# The full concatenated frame is no longer needed once every day is written.
del data