In [1]:
%load_ext autoreload 
%autoreload 2

In [2]:
cd ../src

/tmp/kaggle/kaggle_otto_rs/src


### Initialization

In [3]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from tqdm import tqdm
from collections import Counter
from joblib import Parallel, delayed

### Val Split

In [4]:
from recsys.testset import *

@beartype
def val_split(train_set: Path, output_path: Path, days: int, seed: int):
    random.seed(seed)
    max_ts = get_max_ts(train_set)

    session_chunks = pd.read_json(train_set, lines=True, chunksize=100000)
    train_file = output_path / 'train_sessions.jsonl'
    test_file_full = output_path / 'val_sessions.jsonl'
    train_test_split(session_chunks, train_file, test_file_full, max_ts, days)

In [5]:
# val_split(
#     Path('../input/train.jsonl'),
#     Path("../output/"),
#     7,
#     42
# )

### Data

In [6]:
DATA_PATH = Path('../input/')
OUT_DIR = Path("../output/")

In [7]:
TRAIN_PATH = DATA_PATH / 'train.jsonl'
TEST_PATH = DATA_PATH / 'test.jsonl'

TRAIN_PATH = DATA_PATH / 'train_sessions.jsonl'
VAL_PATH = DATA_PATH / 'val_sessions.jsonl'
# train_sessions

CLASSES = ['clicks', 'carts', 'orders']

## NPY

In [8]:
def process_data(chunk, save_folder=""):
    arrays = c['events'].apply(lambda x: np.array([[c['aid'], c['ts'], CLASSES.index(c['type'])] for c in x]).astype(int)).values
    
    paths = []
    for session, array in zip(chunk['session'], arrays):
        paths.append(save_folder + f"session_{session}.npy")
        if save_folder:
            np.save(save_folder + f"session_{session}.npy", array)

    start_times = [a[0, 1] for a in arrays]
    end_times = [a[-1, 1] for a in arrays]
    df = pd.DataFrame({"path": paths, "start_time": start_times, "end_time": end_times})
    
    return arrays, df

In [9]:
# SAVE_DIR = "../input/processed/"
# os.makedirs(SAVE_DIR, exist_ok=True)

# sample_size = 10000
# n_chunks = 12899779 // sample_size + 1

# chunks = pd.read_json(TRAIN_PATH, lines=True, chunksize=sample_size)

In [10]:
# %%time

# ids = set()
# for c in chunks:
#     arrays = process_data(c, "")

#     ids_ = np.concatenate([a[:, 0] for a in arrays])
#     ids.update(ids_.tolist())
#     break

In [11]:
# chunks = pd.read_json(TRAIN_PATH, lines=True, chunksize=sample_size)

# ids = set()
# dfs = []

# for c in tqdm(chunks, total=1110):
#     arrays, df = process_data(c, SAVE_DIR)
#     dfs.append(df)

# df = pd.concat(dfs, ignore_index=True)
# df.to_csv(OUT_DIR / "train.csv", index=False)

### Val

In [12]:
# SAVE_DIR = "../input/processed_val/"
# os.makedirs(SAVE_DIR, exist_ok=True)

# sample_size = 10000
# n_chunks = 12899779 // sample_size + 1

# chunks = pd.read_json(TRAIN_PATH, lines=True, chunksize=sample_size)

In [13]:
# ids = set()
# dfs = []

# for c in tqdm(chunks, total=180):
#     arrays, df = process_data(c, SAVE_DIR)
#     dfs.append(df)

# df = pd.concat(dfs, ignore_index=True)
# df.to_csv(OUT_DIR / "val.csv", index=False)

### Test

In [14]:
# SAVE_DIR = "../input/processed_test/"
# os.makedirs(SAVE_DIR, exist_ok=True)

# sample_size = 10000
# n_chunks = 1671803 // sample_size + 1

# chunks = pd.read_json(TEST_PATH, lines=True, chunksize=sample_size)

In [15]:
# ids = set()
# dfs = []

# for c in tqdm(chunks, total=n_chunks):
#     arrays, df = process_data(c, SAVE_DIR)
#     dfs.append(df)
# #     ids_ = np.concatenate([a[:, 0] for a in arrays])
# #     ids.update(ids_.tolist())
# #     break

# df = pd.concat(dfs, ignore_index=True)
# df.to_csv(OUT_DIR / "test.csv", index=False)

In [16]:
# chunks = pd.read_json(TRAIN_PATH, lines=True, chunksize=sample_size)

# _ = Parallel(n_jobs=os.cpu_count())(
#     delayed(process_data)(c, OUT_DIR)
#     for c in tqdm(chunks, total=n_chunks)
# )

## Parquet

In [17]:
from recsys.labels import ground_truth

In [18]:
CLASSES = ['', 'clicks', 'carts', 'orders']

In [19]:
def crop_pad(x, max_len=20):
    if len(x) > max_len:
        return x[:max_len]
    else:
        return list(x) + [0 for i in range(max_len - len(x))]

In [20]:
def create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders):
    df = pd.DataFrame(data={
        'session': sessions,
        'aid': aids,
        'ts': tss,
        'type': types,
        "labels_clicks": labels_clicks,
        "labels_carts": labels_carts,
        "labels_orders": labels_orders,
    })

    # Adapt to NVT compatible format
    df['type'] = df['type'].apply(lambda x: [CLASSES.index(c) for c in x])

    df['labels_carts'] = df['labels_carts'].apply(lambda x_: np.array([crop_pad(x) for x in x_]).astype(float))
    df['labels_orders'] = df['labels_orders'].apply(lambda x_: np.array([crop_pad(x) for x in x_]).astype(float))
    for i in range(20):
        df[f'labels_carts_{i}'] = df['labels_carts'].apply(lambda x: x[:, i])
        df[f'labels_orders_{i}'] = df['labels_orders'].apply(lambda x: x[:, i])

    df.drop(['labels_carts', 'labels_orders'], axis=1, inplace=True)
    
    return df

In [21]:
def jsonl_to_df(filename, total=1290, name="train", max_chunks=100, save=True):
    chunks = pd.read_json(filename, lines=True, chunksize=10000)
    idx = 0

    sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders = [], [], [], [], [], [], []
    for i, chunk in tqdm(enumerate(chunks), total=total):
        for row_idx, session_data in chunk.iterrows():
            aids_, tss_, types_, labels_clicks_, labels_carts_, labels_orders_ = [], [], [], [], [], []

            events = ground_truth(session_data.events)  if len(session_data['events']) > 1 else session_data.events
            for event in events:
                aids_.append(event['aid'])
                tss_.append(event['ts'])
                types_.append(event['type'])
                labels_clicks_.append(event.get('labels', {}).get("clicks", 0))
                labels_carts_.append(list(event.get('labels', {}).get("carts", [])))
                labels_orders_.append(list(event.get('labels', {}).get("orders", [])))

            sessions.append(session_data.session)
            aids.append(aids_)
            tss.append(tss_)
            types.append(types_)
            labels_clicks.append(labels_clicks_)
            labels_carts.append(labels_carts_)
            labels_orders.append(labels_orders_)
            
#         break
        
        if ((i > 0) and (i % max_chunks) == 0):
            df = create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders)
            sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders = [], [], [], [], [], [], []
            if save:
                file = f'../input/parquets/{name}_{idx}.parquet'
                print(f'Saving data to {file}')
                df.to_parquet(file, index=False)
            idx += 1

    df = create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders)
    
    if save:
        file = f'../output/{name}_{idx}.parquet' if idx > 0 else f'../output/{name}.parquet'
        print(f'Saving data to {file}')
        df.to_parquet(file, index=False)
    
    else:
        return df

In [None]:
_ = jsonl_to_df(TRAIN_PATH, total=1200, name="train", save=True)

 17%|█▋        | 200/1200 [16:42<46:13,  2.77s/it]  

In [30]:
# df_val = jsonl_to_df(VAL_PATH, total=181, name="val", save=True)

In [31]:
# df_test = jsonl_to_df(TEST_PATH, total=168, name="test", save=True)

### Parquet nogroup

In [39]:
def jsonl_to_df_nogroup(filename, total=1290, name="train", max_chunks=200, save=True):
    
    chunks = pd.read_json(filename, lines=True, chunksize=10000)
    idx = 0

    sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders = [], [], [], [], [], [], []
    for i, chunk in tqdm(enumerate(chunks), total=total):
        for row_idx, session_data in chunk.iterrows():
            aids_, tss_, types_, labels_clicks_, labels_carts_, labels_orders_ = [], [], [], [], [], []

            events = ground_truth(session_data.events)  if len(session_data['events']) > 1 else session_data.events
            for event in events:
                aids.append(event['aid'])
                tss.append(event['ts'])
                types_.append(event['type'])
                labels_clicks.append(event.get('labels', {}).get("clicks", 0))
                labels_carts.append(list(event.get('labels', {}).get("carts", [])))
                labels_orders.append(list(event.get('labels', {}).get("orders", [])))

                sessions.append(session_data.session)
#         break
        
        if ((i > 0) and (i % max_chunks) == 0):
            df = create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders)
            sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders = [], [], [], [], [], [], []
            if save:
                file = f'../input/parquets/{name}_{idx}.parquet'
                print(f'Saving data to {file}')
                df.to_parquet(file, index=False)
            idx += 1

    df = create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders)
    
    if save:
        file = f'../output/ng_{name}_{idx}.parquet' if idx > 0 else f'../output/{name}.parquet'
        print(f'Saving data to {file}')
        df.to_parquet(file, index=False)
    else:
        return df

In [40]:
# _ = jsonl_to_df_train(TRAIN_PATH, total=1200, name="train", save=True)

In [42]:
df_val = jsonl_to_df_nogroup(VAL_PATH, total=181, name="val", save=True)

100%|██████████| 181/181 [09:05<00:00,  3.01s/it]


CPU times: user 10min 13s, sys: 19.4 s, total: 10min 33s
Wall time: 10min 33s


In [44]:
df_test = jsonl_to_df_nogroup(TEST_PATH, total=168, name="test", save=True)

100%|██████████| 168/168 [04:13<00:00,  1.51s/it]


ValueError: All arrays must be of the same length

In [27]:
df_val.head()

Unnamed: 0,session,aid,ts,type,labels_clicks,labels_carts,labels_orders,target
0,11098528,"[11830, 1679529, 92401, 1055218, 1561739, 1679...","[1661119200060, 1661119417928, 1661119474036, ...","[clicks, clicks, clicks, clicks, clicks, click...","[1679529, 92401, 1055218, 1561739, 1679529, 10...","[[1199737], [1199737], [1199737], [1199737], [...","[[990658, 950341, 1462506, 1561739, 907564, 36...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, ..."
1,11098529,"[1105029, 1105029]","[1661119200259, 1661148152798]","[clicks, clicks]","[1105029, 1298277]","[[], []]","[[], []]","[1, 1]"
2,11098530,"[264500, 264500, 409236, 409236, 409236, 409236]","[1661119200974, 1661119288407, 1661119369986, ...","[clicks, clicks, clicks, clicks, clicks, carts]","[264500, 409236, 409236, 409236, None, None]","[[409236], [409236], [409236], [409236], [4092...","[[409236], [409236], [409236], [409236], [4092...","[1, 1, 1, 1, 1, 2]"
3,11098531,"[452188, 1239060, 1557766, 452188, 396199, 130...","[1661119200997, 1661119227353, 1661119243209, ...","[clicks, clicks, clicks, clicks, clicks, click...","[1239060, 1557766, 452188, 396199, 1309633, 14...","[[], [], [], [], [], [], [], [], [], [], [], [...","[[1365569, 1728212, 396199, 452188, 1271998], ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,11098532,"[7651, 876469, 1596491, 1550739, 1308930, 4611...","[1661119201137, 1661119996249, 1661120147735, ...","[clicks, clicks, clicks, clicks, clicks, click...","[876469, 1596491, 1550739, 1308930, 461190, 46...","[[], [], [], [], [], [], [], [], [], [], [], [...","[[], [], [], [], [], [], [], [], [], [], [], [...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
