In [None]:
cd ../src

In [None]:
%load_ext autoreload
%autoreload 2

### Initialization

In [None]:
import os
import gc
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from tqdm import tqdm
from collections import Counter
from joblib import Parallel, delayed

In [None]:
from otto_src.labels import ground_truth
from otto_src.my_split import train_val_split, create_labels

from data.preparation import json_to_pq, json_to_pq_y

### Val Split

In [None]:
# train_val_split(
#     Path('../input/jsons/train.jsonl'),
#     Path("../output/jsons/"),
#     days=7,
#     train_only=True,
# )

In [None]:
# create_labels(
#     Path("../output/jsons/val_sessions.jsonl"),
#     Path("../output/jsons/"),
# )

In [None]:
# create_labels(
#     Path("../output/jsons/val_sessions_c.jsonl"),
#     Path("../output/jsons/"),
# )

In [None]:
# create_labels(
#     Path("../output/train_sessions.jsonl"),
#     Path("../output/"),
# )

### Convert to parquet

#### Full train history

In [None]:
# json_to_pq(
#     Path("../output/jsons/sessions.jsonl"),
#     Path("../output/"),
#     name="full_train"
# )

#### Train

In [None]:
# json_to_pq(
#     Path("../output/jsons/train_sessions_c.jsonl"),
#     Path("../output/"),
# )

In [None]:
# json_to_pq_y(
#     Path("../output/jsons/train_labels.jsonl"),
#     Path("../output/")
# )

#### Val

In [None]:
# json_to_pq(
#     Path("../output/jsons/val_sessions_c.jsonl"),
#     Path("../output/"),
# )

In [None]:
# json_to_pq_y(
#     Path("../output/jsons/val_labels.jsonl"),
#     Path("../output/"),
# )

#### Retruncate val

In [None]:
# json_to_pq(
#     Path("../output/jsons/val_c_sessions_c.jsonl"),
#     Path("../output/"),
# )

In [None]:
# json_to_pq_y(
#     Path("../output/jsons/val_c_labels.jsonl"),
#     Path("../output/"),
# )

## Deep Preprocessing

In [None]:
# Folder the jsonl input paths are resolved against
DATA_PATH = Path('../input/')
# Output folder
OUT_DIR = Path("../output/")

In [None]:
TRAIN_PATH = DATA_PATH / 'train.jsonl'
TEST_PATH = DATA_PATH / 'test.jsonl'

# NOTE: TRAIN_PATH is rebound here, so the 'train.jsonl' assignment above has
# no effect once the whole cell has run
TRAIN_PATH = DATA_PATH / 'train_sessions.jsonl'
VAL_PATH = DATA_PATH / 'val_sessions.jsonl'
# train_sessions

# Event type names. create_df replaces each name by its position in this list.
# Position 0 is the empty string, presumably reserved for padding - TODO confirm
CLASSES = ['', 'clicks', 'carts', 'orders']

## Parquet

In [None]:
def crop_pad(x, max_len=20):
    """
    Crops or zero-pads a sequence so that it has exactly max_len elements.

    The result is always a new list, whatever the type of the input, so both
    the cropped and the padded case have the same return type.

    Args:
        x (iterable): Values to crop or pad.
        max_len (int, optional): Length of the output. Defaults to 20.

    Returns:
        list: The first max_len elements of x, followed by zeros if x is shorter.
    """
    items = list(x)[:max_len]
    return items + [0] * (max_len - len(items))

In [None]:
def create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders, max_len=20):
    """
    Builds a DataFrame with one row per entry of the input lists.

    Event type names are replaced by their index in CLASSES. The cart and order
    labels of every event are cropped / zero-padded to max_len values, cast to
    float, and split into max_len columns per target: column
    `labels_carts_{i}` holds, for each row, the i-th label of every event.
    The original `labels_carts` and `labels_orders` columns are not kept.

    Args:
        sessions (list): Session identifiers.
        aids (list): Per-row sequences of event aids.
        tss (list): Per-row sequences of event timestamps.
        types (list): Per-row sequences of event type names, each one in CLASSES.
        labels_clicks (list): Per-row sequences of click labels, stored unchanged.
        labels_carts (list): Per-row sequences holding the cart labels of each event.
        labels_orders (list): Per-row sequences holding the order labels of each event.
        max_len (int, optional): Number of label slots kept per event. Defaults to 20.

    Returns:
        pandas DataFrame: Columns session, aid, ts, type, labels_clicks, then
            labels_carts_{i} and labels_orders_{i} interleaved for i < max_len.
    """
    df = pd.DataFrame(data={
        'session': sessions,
        'aid': aids,
        'ts': tss,
        'type': types,
        "labels_clicks": labels_clicks,
        "labels_carts": labels_carts,
        "labels_orders": labels_orders,
    })

    # Adapt to NVT compatible format
    df['type'] = df['type'].apply(lambda x: [CLASSES.index(c) for c in x])

    def _pad_labels(event_labels):
        # One row per event, max_len label slots per row
        return np.array([crop_pad(x, max_len=max_len) for x in event_labels]).astype(float)

    carts = df['labels_carts'].apply(_pad_labels)
    orders = df['labels_orders'].apply(_pad_labels)

    # One column per label slot, carts and orders interleaved
    for i in range(max_len):
        df[f'labels_carts_{i}'] = carts.apply(lambda x: x[:, i])
        df[f'labels_orders_{i}'] = orders.apply(lambda x: x[:, i])

    return df.drop(['labels_carts', 'labels_orders'], axis=1)

In [None]:
def jsonl_to_df(filename, total=1290, name="train", max_chunks=200, save=True, out_dir="../output/"):
    """
    Converts a jsonl file of sessions into a DataFrame with one row per session.

    The file is read in chunks of 10000 lines. When saving, the accumulated
    sessions are written to a numbered parquet shard each time the chunk index
    is a positive multiple of max_chunks, and what remains is written at the
    end. When not saving, nothing is flushed and a single DataFrame holding
    every session is returned.

    Args:
        filename (str or Path): Path to the jsonl file.
        total (int, optional): Expected number of chunks, only used by the progress bar. Defaults to 1290.
        name (str, optional): Prefix of the parquet file names. Defaults to "train".
        max_chunks (int, optional): Number of chunks between two shard flushes. Defaults to 200.
        save (bool, optional): Whether to write parquet files instead of returning the data. Defaults to True.
        out_dir (str or Path, optional): Folder the parquet files are written to. Defaults to "../output/".

    Returns:
        pandas DataFrame or None: The DataFrame if save is False, None otherwise.
    """
    out_dir = Path(out_dir)
    if save:
        out_dir.mkdir(parents=True, exist_ok=True)

    chunks = pd.read_json(filename, lines=True, chunksize=10000)
    idx = 0  # Number of shards written so far

    sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders = [], [], [], [], [], [], []
    for i, chunk in tqdm(enumerate(chunks), total=total):
        for _, session_data in chunk.iterrows():
            aids_, tss_, types_, labels_clicks_, labels_carts_, labels_orders_ = [], [], [], [], [], []

            # Single-event sessions bypass ground_truth, so their events may have
            # no 'labels' key and fall back to the defaults used below
            events = ground_truth(session_data.events) if len(session_data['events']) > 1 else session_data.events
            for event in events:
                labels = event.get('labels', {})
                aids_.append(event['aid'])
                tss_.append(event['ts'])
                types_.append(event['type'])
                labels_clicks_.append(labels.get("clicks", 0))
                labels_carts_.append(list(labels.get("carts", [])))
                labels_orders_.append(list(labels.get("orders", [])))

            sessions.append(session_data.session)
            aids.append(aids_)
            tss.append(tss_)
            types.append(types_)
            labels_clicks.append(labels_clicks_)
            labels_carts.append(labels_carts_)
            labels_orders.append(labels_orders_)

        # Shard only when saving: a flushed frame is released from memory, so
        # flushing without writing would silently lose those sessions
        if save and i > 0 and i % max_chunks == 0:
            df = create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders)
            sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders = [], [], [], [], [], [], []
            df.to_parquet(out_dir / f'{name}_{idx}.parquet', index=False)
            idx += 1

            del df
            gc.collect()

    if not save:
        return create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders)

    # The last chunk may itself have triggered a flush, in which case there is
    # nothing left and writing would only produce an empty trailing shard
    if sessions or idx == 0:
        df = create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders)
        file = out_dir / (f'{name}_{idx}.parquet' if idx > 0 else f'{name}.parquet')
        print(f'Saving data to {file}')
        df.to_parquet(file, index=False)

In [None]:
# Convert the train sessions to parquet; jsonl_to_df reads 10000 lines per
# chunk and max_chunks=25 sets how often a shard is flushed.
# `total` only sizes the progress bar.
_ = jsonl_to_df(TRAIN_PATH, total=1101, name="train", save=True, max_chunks=25)

In [None]:
# df_val = jsonl_to_df(VAL_PATH, total=181, name="val", save=True)

In [None]:
# df_test = jsonl_to_df(TEST_PATH, total=168, name="test", save=True)

### Parquet nogroup

In [None]:
def jsonl_to_df_nogroup(filename, total=1290, name="train", max_chunks=200, save=True, out_dir="../input/parquets_2/"):
    """
    Converts a jsonl file of sessions into a DataFrame with one row per event.

    The file is read in chunks of 10000 lines. When saving, the accumulated
    rows are written to a numbered parquet shard each time the chunk index is a
    positive multiple of max_chunks, and what remains is written at the end.
    When not saving, nothing is flushed.

    Args:
        filename (str or Path): Path to the jsonl file.
        total (int, optional): Expected number of chunks, only used by the progress bar. Defaults to 1290.
        name (str, optional): Prefix of the parquet file names. Defaults to "train".
        max_chunks (int, optional): Number of chunks between two shard flushes. Defaults to 200.
        save (bool, optional): Whether to write parquet files. Defaults to True.
        out_dir (str or Path, optional): Folder the parquet files are written to. Defaults to "../input/parquets_2/".

    Returns:
        pandas DataFrame: The rows accumulated since the last flush, i.e. all
            rows when no shard was flushed.
    """
    out_dir = Path(out_dir)
    if save:
        out_dir.mkdir(parents=True, exist_ok=True)

    chunks = pd.read_json(filename, lines=True, chunksize=10000)
    idx = 0  # Number of shards written so far

    sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders = [], [], [], [], [], [], []
    for i, chunk in tqdm(enumerate(chunks), total=total):
        for _, session_data in chunk.iterrows():
            # Single-event sessions bypass ground_truth, so their events may have
            # no 'labels' key and fall back to the defaults used below
            events = ground_truth(session_data.events) if len(session_data['events']) > 1 else session_data.events
            for event in events:
                labels = event.get('labels', {})
                aids.append(event['aid'])
                tss.append(event['ts'])
                # Must go to the shared list: every column needs one value per event
                types.append(event['type'])
                labels_clicks.append(labels.get("clicks", 0))
                labels_carts.append(list(labels.get("carts", [])))
                labels_orders.append(list(labels.get("orders", [])))

                sessions.append(session_data.session)

        # NOTE(review): create_df indexes CLASSES for every element of a `type`
        # value and crop-pads every element of a label value, i.e. it looks
        # written for per-session sequences. With the flat per-event values
        # built here that step probably fails - confirm and adapt.

        # Shard only when saving: flushing without writing would lose the rows
        if save and i > 0 and i % max_chunks == 0:
            df = create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders)
            sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders = [], [], [], [], [], [], []
            file = out_dir / f'{name}_{idx}.parquet'
            print(f'Saving data to {file}')
            df.to_parquet(file, index=False)
            idx += 1

    df = create_df(sessions, aids, tss, types, labels_clicks, labels_carts, labels_orders)

    # Skip the write when the last chunk already triggered a flush, otherwise
    # an empty trailing shard would be produced
    if save and (len(df) > 0 or idx == 0):
        file = out_dir / (f'{name}_{idx}.parquet' if idx > 0 else f'{name}.parquet')
        print(f'Saving data to {file}')
        df.to_parquet(file, index=False)

    return df

In [None]:
# _ = jsonl_to_df_train(TRAIN_PATH, total=1200, name="train", save=True)

In [None]:
# Flatten the validation sessions with jsonl_to_df_nogroup, which appends one
# entry per event; with save=True the result is written as parquet
df_val = jsonl_to_df_nogroup(VAL_PATH, total=181, name="val", save=True)

In [None]:
# Same conversion for the test sessions
df_test = jsonl_to_df_nogroup(TEST_PATH, total=168, name="test", save=True)

In [None]:
# Peek at the first rows; this needs df_val to be a DataFrame (not None)
df_val.head()