In [1]:
import pandas as pd, numpy as np, polars as pl
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
from pandarallel import pandarallel

# Configure pandarallel: 28 workers, progress bars switched off.
pandarallel.initialize(progress_bar=False, nb_workers=28)

# Version number of the co-visitation matrices.
VER = 1
# True selects the validation paths below; False selects the full-dataset paths.
is_validation = True
# Integer code assigned to each event type name.
type_labels = dict(clicks=0, carts=1, orders=2)

# Pick (co-visitation matrix directory, parquet glob pattern) for the chosen mode.
co_visitation_matrix_path, data_path = (
    ('./val_co_visitation_matrix/', './val_data/*_parquet/*')
    if is_validation
    else ('./co_visitation_matrix/', './data/*_parquet/*')
)

def read_file_into_mem(f):
    """Read one parquet file and shrink its `ts` and `type` columns.

    Args:
        f: Path of the parquet file to read.

    Returns:
        A polars DataFrame where `ts` has been divided by 1000 and cast to
        Int32, and `type` has been looked up in the module-level
        `type_labels` mapping and cast to Int8.
    """
    # ts / 1000 -- presumably milliseconds -> seconds; TODO confirm against the data source.
    ts_expr = (pl.col('ts') / 1000).cast(pl.Int32).alias('ts')
    # The lookup runs as a Python callback on each value of the column.
    type_expr = (
        pl.col('type')
        .apply(lambda label: type_labels[label])
        .cast(pl.Int8)
        .alias('type')
    )
    frame = pl.read_parquet(f)
    return frame.with_columns([ts_expr, type_expr])

INFO: Pandarallel will run on 28 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### 1 - loading data

In [2]:
%%time
# Collect every file matching the configured glob pattern. Sorting makes the
# row order of the combined frame independent of filesystem listing order.
files = sorted(glob.glob(data_path))
# log
print(f"Total loaded {len(files)} files, start to load into memory")
# Fail early with a clear message rather than leaving `df` undefined/None and
# hitting an unrelated error in a later cell.
if not files:
    raise FileNotFoundError(f"No files matched data_path={data_path!r}")
# Load each file into memory and concatenate all of them in a single call.
df = pl.concat([read_file_into_mem(f) for f in files])
df

Total loaded 120 files, start to load into memory
CPU times: user 22.4 s, sys: 8.79 s, total: 31.2 s
Wall time: 35.8 s


session,aid,ts,type
i32,i32,i32,i8
11098528,11830,1661119200,0
11098529,1105029,1661119200,0
11098530,264500,1661119200,0
11098530,264500,1661119288,0
11098530,409236,1661119369,0
11098530,409236,1661119441,0
11098530,409236,1661120165,0
11098530,409236,1661120532,1
11098531,452188,1661119200,0
11098531,1239060,1661119227,0


### 2 - training data and label generation
- This rerank model focuses on reranking the aids that are likely to be ordered
- Hence, if type = 2 (order) we give the row a positive label (gt = 1); otherwise we give it a negative label (gt = 0)

In [7]:
%%time
# Label rows for the reranker: gt = 1 for order events, 0 for every other type.
# Uses `with_columns` (the same method read_file_into_mem uses) instead of the
# deprecated singular `with_column`, and the named code for 'orders' instead of
# a bare 2.
df = df.with_columns([
    pl.when(pl.col('type') == type_labels['orders'])
    .then(1)
    .otherwise(0)
    .alias('gt')
])
df.head(3)

CPU times: user 160 ms, sys: 243 ms, total: 403 ms
Wall time: 402 ms


session,aid,ts,type,gt
i32,i32,i32,i8,i32
11098528,11830,1661119200,0,0
11098529,1105029,1661119200,0,0
11098530,264500,1661119200,0,0


### 3 - feature engineering


#### session features
- session count


#### item(aid) features
- item count

### 4 - model building and training

### 5 - make predictions and evaluate on the validation dataset