In [1]:
import gc
import jsonlines
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
from pathlib import Path

In [2]:
# Inputs are read from <cwd>/../data; converted files are written under <cwd>/data.
working_dir = Path.cwd()
data_path = working_dir.parent / "data"
temp_path = working_dir / "data"
# exist_ok=True makes this a no-op when the output directory is already there.
temp_path.mkdir(exist_ok=True)

## LB

In [3]:
# Input and output directories for the LB split.
lb_in, lb_out = data_path / "lb", temp_path / "lb"
# exist_ok=True makes this a no-op when the directory is already there.
lb_out.mkdir(exist_ok=True)

In [4]:
# Flatten lb/test.jsonl into one row per event (the session id is repeated on
# every row of that session) and store the result as lb/test.parquet.
data_dict = {'session': [], 'aid': [], 'ts': [], 'type': []}

with jsonlines.open((lb_in / "test.jsonl").as_posix()) as reader:
    for result in tqdm(reader):
        # Look the session id up once per record instead of once per event.
        session = result['session']
        for event in result['events']:
            data_dict['session'].append(session)
            data_dict['aid'].append(event['aid'])
            data_dict['ts'].append(event['ts'])
            data_dict['type'].append(event['type'])

data = pd.DataFrame.from_dict(data_dict)
# Drop the intermediate column lists as well: otherwise they stay referenced
# after the DataFrame is deleted below and gc.collect() cannot reclaim them.
del data_dict

# Sanity check: total number of events and number of distinct sessions.
print(data.shape, data['session'].drop_duplicates().shape)

data.to_parquet((lb_out / "test.parquet").as_posix())

del data
gc.collect();

1671803it [00:08, 186319.29it/s]




(13851293, 4) (1671803,)


In [5]:
# Convert lb/train.jsonl to lb/train.parquet chunk by chunk: each chunk of
# sessions is flattened to one row per event and appended to a single parquet
# file through one shared ParquetWriter.
out = lb_out / "train.parquet"
# Remove any previous output before writing.
if out.exists():
    out.unlink()

chunksize = 100_000
chunks = pd.read_json((lb_in / "train.jsonl").as_posix(), lines=True, chunksize=chunksize)
pqwriter = None

try:
    for chunk in tqdm(chunks):
        data_dict = {
            'session': [],
            'aid': [],
            'ts': [],
            'type': [],
        }

        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                data_dict['session'].append(session)
                data_dict['aid'].append(event['aid'])
                data_dict['ts'].append(event['ts'])
                data_dict['type'].append(event['type'])

        df = pd.DataFrame(data_dict)
        table = pa.Table.from_pandas(df)
        if pqwriter is None:
            # The writer is created lazily because it needs the schema of the
            # first converted chunk.
            pqwriter = pq.ParquetWriter(out.as_posix(), table.schema)
        pqwriter.write_table(table)

        # Release every per-chunk intermediate before the next chunk is built.
        del df, table, data_dict
        gc.collect()
finally:
    # Close the writer even when a chunk fails, so the file handle is not
    # leaked.
    if pqwriter is not None:
        pqwriter.close()

129it [07:54,  3.68s/it]




## CV

In [6]:
# Input and output directories for the CV split.
cv_in, cv_out = data_path / "cv", temp_path / "cv"
# exist_ok=True makes this a no-op when the directory is already there.
cv_out.mkdir(exist_ok=True)

In [7]:
# Flatten cv/test_sessions.jsonl into one row per event (the session id is
# repeated on every row of that session) and store it as
# cv/test_sessions.parquet.
data_dict = {'session': [], 'aid': [], 'ts': [], 'type': []}

with jsonlines.open((cv_in / "test_sessions.jsonl").as_posix()) as reader:
    for result in tqdm(reader):
        # Look the session id up once per record instead of once per event.
        session = result['session']
        for event in result['events']:
            data_dict['session'].append(session)
            data_dict['aid'].append(event['aid'])
            data_dict['ts'].append(event['ts'])
            data_dict['type'].append(event['type'])

data = pd.DataFrame.from_dict(data_dict)
# Drop the intermediate column lists as well: otherwise they stay referenced
# after the DataFrame is deleted below and gc.collect() cannot reclaim them.
del data_dict

# Sanity check: total number of events and number of distinct sessions.
print(data.shape, data['session'].drop_duplicates().shape)

data.to_parquet((cv_out / "test_sessions.parquet").as_posix())

del data
gc.collect();

1783737it [00:08, 221621.92it/s]




(7580968, 4) (1783737,)


In [8]:
# Flatten cv/test_labels.jsonl into one row per (session, label type, aid) and
# store it as cv/test_labels.parquet.
data_dict = {'session': [], 'aid': [], 'type': []}

with jsonlines.open((cv_in / 'test_labels.jsonl').as_posix()) as reader:
    for result in tqdm(reader):
        session = result['session']
        for label_type, aids in result['labels'].items():
            # A label value may be a single aid or a list of aids; normalise
            # to a list so both cases produce one row per aid.
            if not isinstance(aids, list):
                aids = [aids]
            for aid in aids:
                data_dict['session'].append(session)
                data_dict['aid'].append(aid)
                data_dict['type'].append(label_type)

data = pd.DataFrame.from_dict(data_dict)
# Drop the intermediate column lists as well: otherwise they stay referenced
# after the DataFrame is deleted below and gc.collect() cannot reclaim them.
del data_dict

# Sanity check: total number of label rows and number of distinct sessions.
print(data.shape, data['session'].drop_duplicates().shape)

data.to_parquet((cv_out / "test_labels.parquet").as_posix())

del data
gc.collect();

1783737it [00:04, 425579.79it/s]




(2619314, 3) (1783737,)


In [9]:
# Flatten cv/test_sessions_full.jsonl into one row per event (the session id
# is repeated on every row of that session) and store it as
# cv/test_sessions_full.parquet.
data_dict = {'session': [], 'aid': [], 'ts': [], 'type': []}

with jsonlines.open((cv_in / "test_sessions_full.jsonl").as_posix()) as reader:
    for result in tqdm(reader):
        # Look the session id up once per record instead of once per event.
        session = result['session']
        for event in result['events']:
            data_dict['session'].append(session)
            data_dict['aid'].append(event['aid'])
            data_dict['ts'].append(event['ts'])
            data_dict['type'].append(event['type'])

data = pd.DataFrame.from_dict(data_dict)
# Drop the intermediate column lists as well: otherwise they stay referenced
# after the DataFrame is deleted below and gc.collect() cannot reclaim them.
del data_dict

# Sanity check: total number of events and number of distinct sessions.
print(data.shape, data['session'].drop_duplicates().shape)

data.to_parquet((cv_out / "test_sessions_full.parquet").as_posix())

del data
gc.collect();

1783737it [00:12, 141619.46it/s]




(15160611, 4) (1783737,)


In [10]:
# Convert cv/train_sessions.jsonl to cv/train_sessions.parquet chunk by chunk:
# each chunk of sessions is flattened to one row per event and appended to a
# single parquet file through one shared ParquetWriter.
out = cv_out / "train_sessions.parquet"
# Remove any previous output before writing.
if out.exists():
    out.unlink()

chunksize = 100_000
chunks = pd.read_json((cv_in / "train_sessions.jsonl").as_posix(), lines=True, chunksize=chunksize)
pqwriter = None

try:
    for chunk in tqdm(chunks):
        data_dict = {
            'session': [],
            'aid': [],
            'ts': [],
            'type': [],
        }

        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                data_dict['session'].append(session)
                data_dict['aid'].append(event['aid'])
                data_dict['ts'].append(event['ts'])
                data_dict['type'].append(event['type'])

        df = pd.DataFrame(data_dict)
        table = pa.Table.from_pandas(df)
        if pqwriter is None:
            # The writer is created lazily because it needs the schema of the
            # first converted chunk.
            pqwriter = pq.ParquetWriter(out.as_posix(), table.schema)
        pqwriter.write_table(table)

        # Release every per-chunk intermediate before the next chunk is built.
        del df, table, data_dict
        gc.collect()
finally:
    # Close the writer even when a chunk fails, so the file handle is not
    # leaked.
    if pqwriter is not None:
        pqwriter.close()

106it [05:39,  3.20s/it]


