### Initialization

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from tqdm import tqdm
from collections import Counter
from joblib import Parallel, delayed

### Data

In [2]:
# Root folder that holds the raw input files.
DATA_PATH = Path('../input/')

# Raw session logs; both are read as line-delimited JSON further down.
TRAIN_PATH = DATA_PATH.joinpath('train.jsonl')
TEST_PATH = DATA_PATH.joinpath('test.jsonl')

# Event types. An event's type is encoded as its position in this list.
CLASSES = ['clicks', 'carts', 'orders']

In [3]:
# Folder that receives one .npy file per train session.
OUT_DIR = "../input/processed/"
os.makedirs(OUT_DIR, exist_ok=True)

# Number of sessions (JSON lines) loaded per chunk.
sample_size = 10000

# presumably the number of lines in train.jsonl — TODO confirm against the file
n_train_sessions = 12899779

# Expected number of chunks, passed to tqdm as the progress-bar total.
# Ceiling division: `n // size + 1` would report one chunk too many whenever
# the session count is an exact multiple of sample_size.
n_chunks = (n_train_sessions + sample_size - 1) // sample_size

# With `chunksize` set, read_json returns an iterator of DataFrames instead of
# loading the whole file. It is single-pass: re-create it before iterating again.
chunks = pd.read_json(TRAIN_PATH, lines=True, chunksize=sample_size)

In [4]:
def process_data(chunk, save_folder="", classes=None):
    """
    Convert the events of every session in a chunk into an integer array.

    Each session becomes an array of shape (n_events, 3) whose columns are
    the event's `aid`, its `ts`, and the position of its `type` in `classes`.

    Args:
        chunk (pd.DataFrame): Frame with a `session` column and an `events`
            column. Each `events` entry is an iterable of dicts with the
            keys `aid`, `ts` and `type`.
        save_folder (str or path-like, optional): Folder in which one
            `session_<session>.npy` file is written per session. Nothing is
            saved when it is empty. Defaults to "".
        classes (sequence of str, optional): Event type names; a type is
            encoded as its index in this sequence. Defaults to None, in which
            case the module-level `CLASSES` is used.

    Returns:
        pd.Series: One int64 array per session, indexed like `chunk`.

    Raises:
        ValueError: If an event type is not present in `classes`.
    """
    if classes is None:
        classes = CLASSES

    def events_to_array(events):
        rows = [
            [event['aid'], event['ts'], classes.index(event['type'])]
            for event in events
        ]
        # int64 regardless of platform, so the timestamps cannot overflow a
        # 32-bit default int. The reshape keeps a (0, 3) shape for a session
        # without events, so that column slicing still works on the result.
        return np.array(rows).astype(np.int64).reshape(-1, 3)

    # Read from the `chunk` argument rather than from a global loop variable.
    arrays = chunk['events'].apply(events_to_array)

    if save_folder:
        for session, array in zip(chunk['session'], arrays):
            # os.path.join works whether or not save_folder ends with a separator.
            np.save(os.path.join(save_folder, f"session_{session}.npy"), array)

    return arrays

In [5]:
# %%time

# ids = set()
# for c in chunks:
#     arrays = process_data(c, "")
    
#     ids_ = np.concatenate([a[:, 0] for a in arrays])
#     ids.update(ids_.tolist())
#     break

In [8]:
# Walk through the train file chunk by chunk: every session is converted and
# saved under OUT_DIR, and the distinct aids seen are gathered in `ids`.
ids = set()

for c in tqdm(chunks, total=n_chunks):
    arrays = process_data(c, OUT_DIR)

    # Column 0 of each per-session array holds the aid.
    ids_ = np.concatenate([session_events[:, 0] for session_events in arrays])
    ids |= set(ids_.tolist())

100%|█████████▉| 1289/1290 [1:02:42<00:02,  2.92s/it]


### Test

In [11]:
# Folder that receives one .npy file per test session.
OUT_DIR = "../input/processed_test/"
os.makedirs(OUT_DIR, exist_ok=True)

# Number of sessions (JSON lines) loaded per chunk.
sample_size = 10000

# presumably the number of lines in test.jsonl — TODO confirm against the file
n_test_sessions = 1671803

# Expected number of chunks, passed to tqdm as the progress-bar total.
# Ceiling division: `n // size + 1` would report one chunk too many whenever
# the session count is an exact multiple of sample_size.
n_chunks = (n_test_sessions + sample_size - 1) // sample_size

# With `chunksize` set, read_json returns an iterator of DataFrames instead of
# loading the whole file. It is single-pass: re-create it before iterating again.
chunks = pd.read_json(TEST_PATH, lines=True, chunksize=sample_size)

In [13]:
# Walk through the test file chunk by chunk: every session is converted and
# saved under OUT_DIR, and the distinct aids seen are gathered in `ids`.
ids = set()

for c in tqdm(chunks, total=n_chunks):
    arrays = process_data(c, OUT_DIR)

    # Column 0 of each per-session array holds the aid.
    ids_ = np.concatenate([session_events[:, 0] for session_events in arrays])
    ids |= set(ids_.tolist())

100%|██████████| 168/168 [06:26<00:00,  2.30s/it]


In [7]:
# chunks = pd.read_json(TRAIN_PATH, lines=True, chunksize=sample_size)

# _ = Parallel(n_jobs=os.cpu_count())(
#     delayed(process_data)(c, OUT_DIR)
#     for c in tqdm(chunks, total=n_chunks)
# )