In [1]:
import numpy as np
import pandas as pd


In [2]:
# Event types known to occur in the data (established by prior analysis of it).
# A type's position in this list doubles as its integer id.
id2type = ['clicks', 'carts', 'orders']
# Reverse lookup: event-type name -> integer id.
type2id = dict(zip(id2type, range(len(id2type))))

id2type, type2id

(['clicks', 'carts', 'orders'], {'clicks': 0, 'carts': 1, 'orders': 2})

In [3]:
# Persist both lookup tables as pickles in the current working directory.
for lookup, pkl_path in ((id2type, 'id2type.pkl'), (type2id, 'type2id.pkl')):
    pd.to_pickle(lookup, pkl_path)

In [4]:
def jsonl_to_df(fn, type_map=None):
    """Flatten a JSON-lines session file into a one-row-per-event DataFrame.

    Every line of the file is read as one record holding a ``session`` field
    and an ``events`` field; ``events`` is a sequence of dicts with the keys
    ``aid``, ``ts`` and ``type``.

    Parameters
    ----------
    fn : str or path-like
        Location of the JSON-lines file; handed to ``pd.read_json``.
    type_map : mapping, optional
        Maps an event-type name to its integer id. When omitted, the
        module-level ``type2id`` table is used.

    Returns
    -------
    pd.DataFrame
        Columns ``session``, ``aid``, ``ts`` and ``type`` (the mapped integer
        id), one row per event in file order, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If an event is missing one of its keys or its type is not present in
        the mapping.
    """
    if type_map is None:
        type_map = type2id

    frames = []
    # The reader is a context manager, so the file handle is released even
    # when parsing fails part-way through.
    with pd.read_json(fn, lines=True, chunksize=100_000) as reader:
        for chunk in reader:
            sessions, aids, tss, types = [], [], [], []
            # Walk the two columns directly; iterrows() would build a Series
            # object for every single session.
            for session, events in zip(chunk['session'], chunk['events']):
                sessions.extend([session] * len(events))
                for event in events:
                    aids.append(event['aid'])
                    tss.append(event['ts'])
                    types.append(type_map[event['type']])
            # Convert each chunk to typed columns right away so that only one
            # chunk's worth of Python objects is alive at any time. Chunks
            # without events are skipped so they cannot affect dtype inference.
            if sessions:
                frames.append(pd.DataFrame(
                    data={'session': sessions, 'aid': aids, 'ts': tss, 'type': types}
                ))

    if not frames:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return pd.DataFrame(data={'session': [], 'aid': [], 'ts': [], 'type': []})
    return pd.concat(frames, ignore_index=True)

In [5]:
%%time

# Flatten the train sessions and store them as both parquet and CSV.
# The frame is called train_df because it holds the train split.
train_df = jsonl_to_df('../data/train.jsonl')
# The ids in type2id are 0-2, so a single unsigned byte per value is enough.
# Bracket indexing is used for the assignment: attribute-style assignment
# only works for a column that already exists.
train_df['type'] = train_df['type'].astype(np.uint8)
train_df.to_parquet('../data/train.parquet', index=False)
train_df.to_csv('../data/train.csv', index=False)

CPU times: user 38min, sys: 2h 43min 50s, total: 3h 21min 51s
Wall time: 15h 27min 32s


In [6]:
%%time

# Flatten the test sessions, shrink the event-type column to one unsigned
# byte per value, then store the result as both parquet and CSV.
test_df = jsonl_to_df('../data/test.jsonl').astype({'type': np.uint8})
test_df.to_parquet('../data/test.parquet', index=False)
test_df.to_csv('../data/test.csv', index=False)

CPU times: user 1min 2s, sys: 1.1 s, total: 1min 3s
Wall time: 1min 3s
