In [1]:
import pandas as pd, numpy as np, polars as pl
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
from pandarallel import pandarallel
from lightgbm import LGBMRanker
import lightgbm as lgb

# start the pandarallel worker pool (28 workers, progress bars off)
pandarallel.initialize(nb_workers=28, progress_bar=False)

# co visitation matrix version
VER = 1
# True -> use the validation paths, False -> use the full-dataset paths
is_validation = True
# integer code assigned to each event type
type_labels = {'clicks':0, 'carts':1, 'orders':2}

# Every path below has a validation variant and a full-dataset variant,
# selected by `is_validation`.

# co visitation matrix path
co_visitation_matrix_path = './val_co_visitation_matrix/' if is_validation else './co_visitation_matrix/'
# data path (glob pattern)
data_path = './val_data/*_parquet/*' if is_validation else './data/*_parquet/*'
# label path - there is no label file for the whole dataset, hence the empty string
test_label = './val_data/test_labels.parquet' if is_validation else ''
# candidates file (read later with pl.read_parquet)
co_candidates = './val_data/cv_candidates.pgt' if is_validation else './data/cv_candidates.pgt'
# model path
model_path = './model/val_model.txt' if is_validation else './model/full_model.txt'

def read_file_into_mem(f):
    """Read one parquet file and shrink its `ts` and `type` columns.

    Parameters
    ----------
    f : path of the parquet file, passed straight to `pl.read_parquet`.

    Returns
    -------
    pl.DataFrame with
      * `ts`   divided by 1000 and cast to Int32
               (presumably milliseconds -> seconds; later cells read it with unit="s"),
      * `type` mapped through the module-level `type_labels` dict and cast to Int8.
    All other columns are returned unchanged.
    """
    # ts / 1000, stored as a 32-bit integer
    ts_compact = (pl.col('ts') / 1000).cast(pl.Int32).alias('ts')
    # event type name -> small integer code (raises KeyError on an unknown name)
    type_code = pl.col('type').apply(lambda x: type_labels[x]).cast(pl.Int8).alias('type')

    frame = pl.read_parquet(f)
    return frame.with_columns([ts_compact, type_code])

INFO: Pandarallel will run on 28 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 1 - loading data

In [3]:
%%time
# init variables
type_labels = {'clicks':0, 'carts':1, 'orders':2}
df = None
# grab files
files = glob.glob(data_path)
# log
print(f"Total loaded {len(files)} files, start to load into memory")
# loading file into memroy
for f in files:
    if df is None:
        df = read_file_into_mem(f)
    else:
        df = df.vstack(read_file_into_mem(f))
df

Total loaded 120 files, start to load into memory
CPU times: user 18.1 s, sys: 6.75 s, total: 24.9 s
Wall time: 21.3 s


session,aid,ts,type
i32,i32,i32,i8
11098528,11830,1661119200,0
11098529,1105029,1661119200,0
11098530,264500,1661119200,0
11098530,264500,1661119288,0
11098530,409236,1661119369,0
11098530,409236,1661119441,0
11098530,409236,1661120165,0
11098530,409236,1661120532,1
11098531,452188,1661119200,0
11098531,1239060,1661119227,0


## 2 - feature engineering


### Generate user(session) features
- number of clicks/carts/orders and their ratios for each session - indicates whether a customer browses a lot but buys little, or browses a lot and buys a lot
- transaction hours - indicates what kind of user this is (AM or PM user), which may help the model cluster customers


### Generate item(aid) features
- global aid counts for click/cart/order - can show popularity of certain aids
- global aid transaction hour

In [4]:
%%time
# user related features - number of clicks/carts/orders and its ratio for each session
user_type_count = (
    df.sort(
        ['session', 'aid']
    )
    .groupby(
        ['session', 'type'],
        maintain_order=True
    )
    .count()
    .pivot(
        values = 'count', index = 'session', columns='type', aggregate_fn='sum'
    )
    .with_column(
        pl.col("*").fill_null(pl.lit(0))
    )
    .rename(
        {
            '0':'u_clicks', # user(session) clicks
            '1':'u_carts', # user(session) carts
            '2':'u_orders' # user(session) orders
        }
    )
    .with_columns([
        (pl.col('u_clicks') + pl.col('u_carts') + pl.col('u_orders')).alias('u_sess_len'), # user session length
        pl.col("u_clicks").rank().alias('ur_clicks'), # user ranked clicks
        pl.col("u_carts").rank().alias('ur_carts'), # user ranked carts
        pl.col("u_orders").rank().alias('ur_orders'), # user ranked orders
        pl.when(pl.col('u_clicks') == 0).then(0).otherwise(pl.col('u_carts') / pl.col('u_clicks')).cast(pl.Float32).alias('u_click_cart_ratio'),
        pl.when(pl.col('u_clicks') == 0).then(0).otherwise(pl.col('u_orders') / pl.col('u_clicks')).cast(pl.Float32).alias('u_order_click_ratio'),
        pl.when(pl.col('u_carts') == 0).then(0).otherwise(pl.col('u_orders') / pl.col('u_carts')).cast(pl.Float32).alias('u_order_cart_ratio')
    ])
    .with_columns([
        ( (pl.col('ur_clicks') - pl.col('ur_clicks').min())/(pl.col('ur_clicks').max() - pl.col('ur_clicks').min()) ).alias('ur_clicks'), # normalization
        ( (pl.col('ur_carts') - pl.col('ur_carts').min())/(pl.col('ur_carts').max() - pl.col('ur_carts').min()) ).alias('ur_carts'), # normalization
        ( (pl.col('ur_orders') - pl.col('ur_orders').min())/(pl.col('ur_orders').max() - pl.col('ur_orders').min()) ).alias('ur_orders') # normalization
    ])
)

user_type_count.head(3)

CPU times: user 1min 38s, sys: 11.4 s, total: 1min 49s
Wall time: 9.69 s


session,u_clicks,u_orders,u_carts,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio
i32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32
0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667
1,19,0,8,27,0.852088,0.95117,0.0,0.421053,0.0,0.0
2,13,0,0,13,0.787064,0.0,0.0,0.0,0.0,0.0


In [6]:
%%time
# user related features - user interaction hour - mean, min, max, std, rank across dataset
user_transaction_hour = (
    df.select(
        pl.col(['session', 'ts'])
    )
    .with_column(
        pl.from_epoch("ts", unit="s").alias('ts'),
    )
    .with_column(
        pl.col('ts').dt.hour().alias('hod') # hour of day
    )
    .select(
        pl.col(['session', 'hod'])
    )
    .groupby(
        'session'
    )
    .agg([
        pl.col('hod').mean().cast(pl.Float32).alias('u_mean_txn_hod'),
        pl.col('hod').median().cast(pl.UInt8).alias('u_median_txn_hod'),
        pl.col('hod').min().cast(pl.UInt8).alias('u_min_txn_hod'),
        pl.col('hod').max().cast(pl.UInt8).alias('u_max_txn_hod'),
        pl.col('hod').std().cast(pl.Float32).alias('u_std_txn_hod')
    ])
    .with_columns([
        pl.col("u_mean_txn_hod").rank().alias('u_rmean_txn_hod'), # ranked mean transaction hour
        pl.col("u_median_txn_hod").rank().alias('u_rmedian_txn_hod'), # ranked median transaction hour
        pl.col("u_min_txn_hod").rank().alias('u_rmin_txn_hod'), # ranked min transaction hour
        pl.col("u_max_txn_hod").rank().alias('u_rmax_txn_hod') # ranked max transaction hour
    ])
    .with_columns([
        ( (pl.col('u_rmean_txn_hod') - pl.col('u_rmean_txn_hod').min())/(pl.col('u_rmean_txn_hod').max() - pl.col('u_rmean_txn_hod').min()) ).alias('u_rmean_txn_hod'), # normalization
        ( (pl.col('u_rmedian_txn_hod') - pl.col('u_rmedian_txn_hod').min())/(pl.col('u_rmedian_txn_hod').max() - pl.col('u_rmedian_txn_hod').min()) ).alias('u_rmedian_txn_hod'), # normalization
        ( (pl.col('u_rmin_txn_hod') - pl.col('u_rmin_txn_hod').min())/(pl.col('u_rmin_txn_hod').max() - pl.col('u_rmin_txn_hod').min()) ).alias('u_rmin_txn_hod'), # normalization
        ( (pl.col('u_rmax_txn_hod') - pl.col('u_rmax_txn_hod').min())/(pl.col('u_rmax_txn_hod').max() - pl.col('u_rmax_txn_hod').min()) ).alias('u_rmax_txn_hod') # normalization
    ])
)

user_transaction_hour.head(3)

CPU times: user 1min 18s, sys: 3.76 s, total: 1min 22s
Wall time: 5.67 s


session,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod
i32,f32,u8,u8,u8,f32,f32,f32,f32,f32
2692128,14.25,15,12,15,1.5,0.531808,0.567714,0.561416,0.38081
7491200,18.0,18,18,18,0.0,0.817975,0.778399,0.863875,0.593982
11948096,6.0,6,6,6,0.0,0.042347,0.056508,0.161098,0.033973


In [7]:
%%time
# item related features - number of clicks/carts/orders and its ratio for each aid
item_type_count = (
    df.sort(
        ['aid', 'type']
    )
    .groupby(
        ['aid', 'type'],
        maintain_order=True
    )
    .count()
    .pivot(
        values = 'count', index = 'aid', columns='type', aggregate_fn='sum'
    )
    .with_column(
        pl.col("*").fill_null(pl.lit(0))
    )
    .rename(
        {
            '0':'i_clicks', # item clicks
            '1':'i_carts', # item carts
            '2':'i_orders' # item orders
        }
    )
    .with_columns([
        (pl.col('i_clicks') + pl.col('i_carts') + pl.col('i_orders')).alias('i_inter_len'), # item number of interaction in total
        pl.col("i_clicks").rank().alias('ir_clicks'), # item ranked clicks
        pl.col("i_carts").rank().alias('ir_carts'), # item ranked carts
        pl.col("i_orders").rank().alias('ir_orders'), # item ranked orders
        pl.when(pl.col('i_clicks') == 0).then(0).otherwise(pl.col('i_carts') / pl.col('i_clicks')).cast(pl.Float32).alias('i_cart_click_ratio'),
        pl.when(pl.col('i_clicks') == 0).then(0).otherwise(pl.col('i_orders') / pl.col('i_clicks')).cast(pl.Float32).alias('i_order_click_ratio'),
        pl.when(pl.col('i_carts') == 0).then(0).otherwise(pl.col('i_orders') / pl.col('i_carts')).cast(pl.Float32).alias('i_order_cart_ratio')
    ])
    .with_columns([
        ( (pl.col('ir_clicks') - pl.col('ir_clicks').min())/(pl.col('ir_clicks').max() - pl.col('ir_clicks').min()) ).alias('ir_clicks'),
        ( (pl.col('ir_carts') - pl.col('ir_carts').min())/(pl.col('ir_carts').max() - pl.col('ir_carts').min()) ).alias('ir_carts'),
        ( (pl.col('ir_orders') - pl.col('ir_orders').min())/(pl.col('ir_orders').max() - pl.col('ir_orders').min()) ).alias('ir_orders')
    ])
)
item_type_count.head(3)

CPU times: user 2min 30s, sys: 4.71 s, total: 2min 35s
Wall time: 8.93 s


aid,i_clicks,i_carts,i_orders,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio
i32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32
0,38,0,0,38,0.743345,0.0,0.0,0.0,0.0,0.0
1,31,1,0,32,0.703778,0.361061,0.0,0.032258,0.0,0.0
2,16,0,0,16,0.540791,0.0,0.0,0.0,0.0,0.0


In [9]:
%%time
# item time related features - item interaction hour - mean, min, max, std across dataset
item_transaction_hour = (
    df.select(
        pl.col(['aid', 'ts'])
    )
    .with_column(
        pl.from_epoch("ts", unit="s").alias('ts'),
    )
    .with_column(
        pl.col('ts').dt.hour().alias('hod') # hour of day
    )
    .select(
        pl.col(['aid', 'hod'])
    )
    .groupby(
        'aid'
    )
    .agg([
        pl.col('hod').mean().cast(pl.Float32).alias('i_mean_txn_hod'),
        pl.col('hod').median().cast(pl.Float32).alias('i_median_txn_hod'),
        pl.col('hod').min().cast(pl.UInt8).alias('i_min_txn_hod'),
        pl.col('hod').max().cast(pl.UInt8).alias('i_max_txn_hod'),
        pl.col('hod').std().cast(pl.Float32).alias('i_std_txn_hod')
    ])
    .with_columns([
        pl.col("i_mean_txn_hod").rank().alias('i_rmean_txn_hod'), # ranked mean transaction hour
        pl.col("i_median_txn_hod").rank().alias('i_rmedian_txn_hod'), # ranked median transaction hour
        pl.col("i_min_txn_hod").rank().alias('i_rmin_txn_hod'), # ranked min transaction hour
        pl.col("i_max_txn_hod").rank().alias('i_rmax_txn_hod') # ranked max transaction hour
    ])
    .with_columns([
        ( (pl.col('i_rmean_txn_hod') - pl.col('i_rmean_txn_hod').min())/(pl.col('i_rmean_txn_hod').max() - pl.col('i_rmean_txn_hod').min()) ).alias('i_rmean_txn_hod'), # normalization
        ( (pl.col('i_rmedian_txn_hod') - pl.col('i_rmedian_txn_hod').min())/(pl.col('i_rmedian_txn_hod').max() - pl.col('i_rmedian_txn_hod').min()) ).alias('i_rmedian_txn_hod'), # normalization
        ( (pl.col('i_rmin_txn_hod') - pl.col('i_rmin_txn_hod').min())/(pl.col('i_rmin_txn_hod').max() - pl.col('i_rmin_txn_hod').min()) ).alias('i_rmin_txn_hod'), # normalization
        ( (pl.col('i_rmax_txn_hod') - pl.col('i_rmax_txn_hod').min())/(pl.col('i_rmax_txn_hod').max() - pl.col('i_rmax_txn_hod').min()) ).alias('i_rmax_txn_hod') # normalization
    ])
)

item_transaction_hour.head(3)

CPU times: user 1min 8s, sys: 1.61 s, total: 1min 10s
Wall time: 4.46 s


aid,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod
i32,f32,f32,u8,u8,f32,f32,f32,f32,f32
1410944,20.4,21.0,15,23,3.130495,0.997134,0.994537,0.972186,1.0
1483136,15.205882,15.5,0,23,5.958337,0.788635,0.697842,0.0,1.0
750848,10.166667,10.0,6,14,2.786874,0.044008,0.07293,0.565466,0.033231


In [10]:
%%time
# item feature - is ordered item has been clicked or put into carts before
# one obersavtion is that clicked or carted items are more likely to be purchased
df_order = (
    df.filter(
        pl.col('type') == 2
    )
    .select(
        pl.col(['session', 'aid'])
    )
)

df_cart = (
    df.filter(
        pl.col('type') == 1
    )
    .with_column(
        pl.lit(1).cast(pl.Int8).alias('is_carted')
    )
    .select(
        pl.col(['session', 'aid', 'is_carted'])
    )
)

df_click = (
    df.filter(
        pl.col('type') == 0
    )
    .with_column(
        pl.lit(1).cast(pl.Int8).alias('is_clicked')
    )
    .select(
        pl.col(['session', 'aid', 'is_clicked'])
    )
)

# left join order dataframe with cart dataframe
item_order_interactions = (
    df_order.join(
        df_cart,
        on=['session', 'aid'],
        how='left'
    )
    .join(
        df_click,
        on=['session', 'aid'],
        how='left'
    )
    .unique(
        subset=['session', 'aid']
    )
    .with_columns([
        pl.col('is_clicked').fill_null(pl.lit(0)),
        pl.col('is_carted').fill_null(pl.lit(0))
    ])
)

item_order_interactions

CPU times: user 1min 38s, sys: 10.6 s, total: 1min 49s
Wall time: 6.43 s


session,aid,is_carted,is_clicked
i32,i32,i8,i8
11098531,1728212,0,1
11098531,452188,0,1
11098531,1271998,0,1
11098531,396199,0,1
11098537,1409748,1,0
11098538,1711586,1,1
11098545,52798,1,1
11098565,1741607,1,1
11098565,363037,0,0
11098565,1072244,0,0


In [11]:
%%time
# join with session features
df = df.join(user_type_count, on=['session'], how='left')
df = df.join(user_transaction_hour, on=['session'], how='left')

# join with item features
df = df.join(item_transaction_hour, on=['aid'], how='left')
df = df.join(item_type_count, on=['aid'], how='left')

# join with item interaction
df = df.join(item_order_interactions, on=['session', 'aid'], how='left')
# fill na
df = (
    df.with_columns([
        pl.col('is_clicked').fill_null(pl.lit(0)),
        pl.col('is_carted').fill_null(pl.lit(0))
    ])
)

df

CPU times: user 1min 29s, sys: 17 s, total: 1min 46s
Wall time: 12.3 s


session,aid,ts,type,u_clicks,u_orders,u_carts,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod,i_clicks,i_carts,i_orders,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio,is_carted,is_clicked
i32,i32,i32,i8,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,i8,i8
11098528,11830,1661119200,0,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.348798,14.0,0,23,4.964787,0.356143,0.444778,0.0,1.0,28997,3682,1097,33776,0.999956,0.999971,0.999955,0.126979,0.037832,0.297936,0,0
11098529,1105029,1661119200,0,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,14.742719,15.0,0,23,5.257916,0.708933,0.617036,0.0,1.0,200,5,1,206,0.931524,0.757182,0.630161,0.025,0.005,0.2,0,0
11098530,264500,1661119200,0,5,0,1,6,0.546797,0.665784,0.0,0.2,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.429879,14.0,0,23,5.199224,0.376136,0.444778,0.0,1.0,3253,100,34,3387,0.997408,0.986634,0.984236,0.030741,0.010452,0.34,0,0
11098530,264500,1661119288,0,5,0,1,6,0.546797,0.665784,0.0,0.2,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.429879,14.0,0,23,5.199224,0.376136,0.444778,0.0,1.0,3253,100,34,3387,0.997408,0.986634,0.984236,0.030741,0.010452,0.34,0,0
11098530,409236,1661119369,0,5,0,1,6,0.546797,0.665784,0.0,0.2,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.266547,14.0,0,23,5.195176,0.336701,0.444778,0.0,1.0,13047,1102,400,14549,0.999739,0.999656,0.999618,0.084464,0.030658,0.362976,0,0
11098530,409236,1661119441,0,5,0,1,6,0.546797,0.665784,0.0,0.2,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.266547,14.0,0,23,5.195176,0.336701,0.444778,0.0,1.0,13047,1102,400,14549,0.999739,0.999656,0.999618,0.084464,0.030658,0.362976,0,0
11098530,409236,1661120165,0,5,0,1,6,0.546797,0.665784,0.0,0.2,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.266547,14.0,0,23,5.195176,0.336701,0.444778,0.0,1.0,13047,1102,400,14549,0.999739,0.999656,0.999618,0.084464,0.030658,0.362976,0,0
11098530,409236,1661120532,1,5,0,1,6,0.546797,0.665784,0.0,0.2,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.266547,14.0,0,23,5.195176,0.336701,0.444778,0.0,1.0,13047,1102,400,14549,0.999739,0.999656,0.999618,0.084464,0.030658,0.362976,0,0
11098531,452188,1661119200,0,20,4,0,24,0.859643,0.0,0.961455,0.0,0.2,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.55814,14.0,0,22,5.021377,0.408533,0.444778,0.0,0.744619,75,8,3,86,0.845528,0.830789,0.838933,0.106667,0.04,0.375,0,1
11098531,1239060,1661119227,0,20,4,0,24,0.859643,0.0,0.961455,0.0,0.2,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,16.117647,16.0,5,22,4.910223,0.886938,0.754839,0.448751,0.744619,16,1,0,17,0.540791,0.361061,0.0,0.0625,0.0,0.0,0,0


## 3 - training data and label generation
- This rerank model focuses on reranking the aids that are likely to be ordered
- Hence if type = 2 (order) we give the row a positive label, otherwise we give it a negative label

In [12]:
%%time
df = (
    df.with_column(
        pl.when(pl.col('type') == 2).then(1).otherwise(0).alias('gt')
    )
    .sort(
        by=['session', 'aid', 'type'], reverse=[False,False,True]  # sort by session, aid and type
    )
    .unique(
        subset=['session', 'aid'], keep='first' # only keep the type == order as positive label
    )
)
df.head(3)

CPU times: user 4min 23s, sys: 36.9 s, total: 5min
Wall time: 20 s


session,aid,ts,type,u_clicks,u_orders,u_carts,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod,i_clicks,i_carts,i_orders,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio,is_carted,is_clicked,gt
i32,i32,i32,i8,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,i8,i8,i32
0,16246,1659367719,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,13.888357,14.0,0,23,5.362842,0.499344,0.444778,0.0,1.0,1095,120,39,1254,0.988595,0.9894,0.986613,0.109589,0.035616,0.325,0,0,0
0,30373,1661103687,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,13.951766,15.0,0,23,5.297382,0.515674,0.617036,0.0,1.0,1340,114,18,1472,0.991186,0.988699,0.967845,0.085075,0.013433,0.157895,0,0,0
0,97836,1660895309,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,15.169675,16.0,0,23,6.014784,0.783097,0.754839,0.0,1.0,263,12,2,277,0.94672,0.878527,0.777515,0.045627,0.007605,0.166667,0,0,0


In [13]:
# log the (rows, columns) shape of the training frame after the
# one-row-per-(session, aid) de-duplication done in the previous cell
print(f"After remove duplicate records still have train {df.shape} samples and features.")

After remove duplicate records still have train (107680559, 45) samples and features.


In [14]:
# release memory
# del user_type_count, user_transaction_hour
# del item_type_count, item_transaction_hour
# del item_interaction, item_cart_interactions, item_order_interactions
# del df_click, df_cart, df_order
# _ = gc.collect()

## 04 - data down sampling
-  only keep the sessions that have an order event

In [15]:
# down sampling: keep only the rows of sessions with at least one order (u_orders > 0)
# (wider alternative, not used: (pl.col('u_orders') > 0) | (pl.col('u_carts') > 0))
df_down_sampled = df.filter(pl.col('u_orders') > 0)
df_down_sampled

session,aid,ts,type,u_clicks,u_orders,u_carts,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod,i_clicks,i_carts,i_orders,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio,is_carted,is_clicked,gt
i32,i32,i32,i8,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,i8,i8,i32
0,16246,1659367719,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,13.888357,14.0,0,23,5.362842,0.499344,0.444778,0.0,1.0,1095,120,39,1254,0.988595,0.9894,0.986613,0.109589,0.035616,0.325,0,0,0
0,30373,1661103687,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,13.951766,15.0,0,23,5.297382,0.515674,0.617036,0.0,1.0,1340,114,18,1472,0.991186,0.988699,0.967845,0.085075,0.013433,0.157895,0,0,0
0,97836,1660895309,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,15.169675,16.0,0,23,6.014784,0.783097,0.754839,0.0,1.0,263,12,2,277,0.94672,0.878527,0.777515,0.045627,0.007605,0.166667,0,0,0
0,102416,1661019639,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,13.974304,15.0,0,23,5.521988,0.520175,0.617036,0.0,1.0,6420,1524,501,8445,0.999123,0.999808,0.999746,0.237383,0.078037,0.32874,0,0,0
0,154930,1660546209,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,14.428572,15.0,3,23,5.232388,0.642708,0.617036,0.260132,1.0,63,0,0,63,0.82328,0.0,0.0,0.0,0.0,0.0,0,0,0
0,166547,1661017956,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,14.387979,15.0,0,23,5.508874,0.632413,0.617036,0.0,1.0,327,33,6,366,0.956811,0.952423,0.909236,0.100917,0.018349,0.181818,0,0,0
0,173702,1659775934,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,14.383648,15.0,0,23,5.403468,0.6314,0.617036,0.0,1.0,603,30,3,636,0.977184,0.947537,0.838933,0.049751,0.004975,0.1,0,0,0
0,218130,1661017911,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,14.081937,14.0,0,23,5.254113,0.55444,0.444778,0.0,1.0,1545,55,11,1611,0.992703,0.972644,0.947402,0.035599,0.00712,0.2,0,0,0
0,240346,1660157783,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,14.092422,15.0,0,23,5.404967,0.557393,0.617036,0.0,1.0,500,33,8,541,0.972061,0.952423,0.929535,0.066,0.016,0.242424,0,0,0
0,275288,1660801729,0,142,2,3,147,0.992504,0.846928,0.915017,0.021127,0.014085,0.666667,12.489796,14,0,22,5.307487,0.377781,0.496303,0.0,0.958687,13.576164,14.0,0,23,5.476217,0.413366,0.444778,0.0,1.0,6392,1286,377,8055,0.999117,0.999744,0.999574,0.201189,0.05898,0.293157,0,0,0


### 4 - model building and training

In [16]:
%%time
# sort training data and generate group information
df_down_sampled = df_down_sampled.sort('session')

# generate group information for LGB-ranker
group = df_down_sampled.groupby('session').agg([ pl.col('session').count().alias('session_length')])['session_length'].to_numpy()

group[:10]

CPU times: user 8.74 s, sys: 3.43 s, total: 12.2 s
Wall time: 782 ms


array([112,  11, 228, 122,   9,  17,   5,  15,  13,  27], dtype=uint32)

In [17]:
### model build
# LGBMRanker configured with the lambdarank objective and the ndcg metric,
# 20 estimators of the "dart" boosting type, gain-based feature importances,
# 30 parallel jobs.
# NOTE(review): the variable name `lgb` rebinds the `lightgbm` module alias
# imported in the first cell; the fit cell calls `lgb.fit(...)`, so the name
# is kept here.
lgb = LGBMRanker(
    n_estimators=20,
    boosting_type="dart",
    objective="lambdarank",
    metric="ndcg",
    importance_type='gain',
    n_jobs=30,
)

lgb

In [28]:
# define feature columns and target columns
aid_column = 'aid'
# features are selected by column-name prefix: 'u...' for session features,
# 'i...' for item features (this prefix also matches is_carted / is_clicked)
user_feature_columns = list(filter(lambda c: c.startswith('u'), df_down_sampled.columns))
item_feature_columns = list(filter(lambda c: c.startswith('i'), df_down_sampled.columns))
# target
target = 'gt'

# all features, minus the ones listed in `non_important_features` when that
# name is already defined in the kernel (it is created by the cell that
# collects the zero-importance features)
feature_columns = [
    c
    for c in [aid_column, *user_feature_columns, *item_feature_columns]
    if c not in globals().get('non_important_features', ())
]

print(f"All feature columns are {feature_columns} and target column is [{target}]")

All feature columns are ['u_clicks', 'u_orders', 'u_carts', 'u_sess_len', 'u_click_cart_ratio', 'u_order_click_ratio', 'u_order_cart_ratio', 'u_std_txn_hod', 'i_clicks', 'i_carts', 'i_orders', 'i_cart_click_ratio', 'i_order_click_ratio', 'i_order_cart_ratio', 'is_carted', 'is_clicked'] and target column is [gt]


In [29]:
%%time
# fit model
ranker = lgb.fit(
    df_down_sampled[feature_columns].to_pandas(),
    df_down_sampled[target].to_pandas(),
    group=group
)

CPU times: user 4min 13s, sys: 2.12 s, total: 4min 15s
Wall time: 11.4 s


In [30]:
# feature name -> importance of the fitted ranker
# (the model was created with importance_type='gain')
feature_importance = pl.DataFrame(
    dict(
        feature_name=ranker.feature_name_,
        importance_gain=ranker.feature_importances_,
    )
)

# most important features first
feature_importance.sort('importance_gain', reverse=True)

feature_name,importance_gain
str,f64
"""is_carted""",27355356.0
"""is_clicked""",3470900.0
"""i_order_click_...",441164.825569
"""u_order_cart_r...",268513.432495
"""u_order_click_...",82324.930405
"""u_click_cart_r...",51926.721893
"""u_carts""",45831.837784
"""u_clicks""",20097.840843
"""i_cart_click_r...",10207.81192
"""i_order_cart_r...",7162.44812


In [25]:
# names of the features whose importance is exactly 0; the feature-definition
# cell drops these names when `non_important_features` exists in the kernel
non_important_features = (
    feature_importance
    .filter(pl.col('importance_gain') == 0)
    .get_column('feature_name')
    .to_list()
)
non_important_features

['aid',
 'ur_clicks',
 'ur_carts',
 'ur_orders',
 'u_mean_txn_hod',
 'u_median_txn_hod',
 'u_min_txn_hod',
 'u_max_txn_hod',
 'u_rmean_txn_hod',
 'u_rmedian_txn_hod',
 'u_rmin_txn_hod',
 'u_rmax_txn_hod',
 'i_mean_txn_hod',
 'i_median_txn_hod',
 'i_min_txn_hod',
 'i_max_txn_hod',
 'i_std_txn_hod',
 'i_rmean_txn_hod',
 'i_rmedian_txn_hod',
 'i_rmin_txn_hod',
 'i_rmax_txn_hod',
 'i_inter_len',
 'ir_clicks',
 'ir_carts',
 'ir_orders']

In [31]:
%%time
# save model
ranker.booster_.save_model(model_path)

CPU times: user 136 ms, sys: 1.57 ms, total: 138 ms
Wall time: 19 ms


<lightgbm.basic.Booster at 0x7f9dbb5e53a0>

### 5 - make prediction and run validation on validation dataset

In [2]:
def compute_metric(preditions, labels, topn=20):
    """Print the recall per event type and the weighted overall recall.

    Parameters
    ----------
    preditions : pandas.DataFrame
        Must contain ``session_type`` (formatted ``"<session>_<type>"``) and
        ``labels``. ``labels`` holds the predicted ids of a row either as one
        space separated string or as a list-like of ids.
    labels : pandas.DataFrame
        Ground truth with the columns ``session``, ``type`` and
        ``ground_truth`` (list-like of ids).
    topn : int, default 20
        Only the first ``topn`` predicted ids of every row are scored.

    Returns
    -------
    None
        The per-type recalls and the weighted sum (clicks 0.10, carts 0.30,
        orders 0.60) are printed.

    Notes
    -----
    Ground-truth rows without a matching prediction count as zero hits, and an
    event type without any ground truth yields a recall of 0.0. The denominator
    caps every ground-truth list at 20 items, independent of ``topn``.
    """
    def _parse_labels(raw):
        # missing prediction (None / NaN) -> nothing predicted
        if raw is None or isinstance(raw, float):
            return []
        # space separated string; split() without argument also copes with
        # empty strings and repeated blanks
        if isinstance(raw, str):
            return [int(tok) for tok in raw.split()[:topn]]
        # already a list-like of ids
        return [int(tok) for tok in list(raw)[:topn]]

    # init variables
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    # compute metric
    for t in ['clicks','carts','orders']:
        # predictions of the current event type only
        type_mask = preditions.session_type.str.contains(t)
        sub = preditions.loc[type_mask, ['session_type', 'labels']].copy()
        # explicit integer dtype keeps the merge key valid even when `sub` is empty
        sub['session'] = sub.session_type.str.split('_').str[0].astype('int64')
        sub['labels'] = sub['labels'].apply(_parse_labels)

        labels_sub = labels.loc[labels['type']==t]
        labels_sub = labels_sub.merge(sub[['session', 'labels']], how='left', on=['session'])

        # after the left merge a session without prediction carries NaN instead
        # of a list; such rows contribute no hits
        hits = sum(
            len(set(truth).intersection(pred)) if isinstance(pred, list) else 0
            for truth, pred in zip(labels_sub['ground_truth'], labels_sub['labels'])
        )
        gt_count = labels_sub.ground_truth.str.len().clip(0,20).sum()
        # guard against 0/0 when the type has no ground truth at all
        recall = hits / gt_count if gt_count > 0 else 0.0

        score += weights[t]*recall
        print(f'{t} recall =',recall)
    print('=============')
    print('overall recall =',score)
    print('=============')

In [33]:
%%time
# loading candidates file generate by co visitation matrix
candidates = pl.read_parquet(co_candidates)
candidates = candidates[['session_type','labels']]

# loading labels
labels = pd.read_parquet('./val_data/test_labels.parquet')
labels

CPU times: user 4.16 s, sys: 1.22 s, total: 5.38 s
Wall time: 10.6 s


Unnamed: 0,session,type,ground_truth
0,11098528,clicks,[1679529]
1,11098528,carts,[1199737]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369..."
3,11098529,clicks,[1105029]
4,11098530,orders,[409236]
...,...,...,...
2212687,12899774,clicks,[1399483]
2212688,12899775,clicks,[1760714]
2212689,12899776,clicks,[1737908]
2212690,12899777,clicks,[384045]


In [34]:
%%time
# run validation for candidates
# Baseline before re-ranking: score the raw co-visitation candidates using the
# first 100 ids of every row. `candidates` is a polars frame at this point, so it
# is converted to pandas for compute_metric.
print("=====validation recall for pure co visitation matrix")
compute_metric(candidates.to_pandas(), labels, topn=100)

=====validation recall for pure co visitation matrix
clicks recall = 0.620077423735456
carts recall = 0.49115323635430075
orders recall = 0.6999135022645809
overall recall = 0.6293018146385844
CPU times: user 50.6 s, sys: 2.42 s, total: 53 s
Wall time: 53 s


In [35]:
%%time
# retrieve order candidates generated by co visitation matrix
orders_candidates = (
    candidates
    .filter(
        pl.col('session_type').str.contains('orders')  # only select orders
    )
    .with_columns([
        pl.col('session_type').str.split('_').alias('session_type'), # split session and type
        pl.col('labels').str.split(' ').alias('aid') # split candidate and rename back to aid
    ])
    .with_columns([
        pl.col('session_type').arr.get(i).alias('session' if i == 0 else 'type') for i in range(2) # expand session and type to new columns
    ])
    .select(
        pl.col(['session', 'aid'])
    )
    .explode('aid')
    .with_columns([
        pl.col('session').cast(pl.Int32),
        pl.col('aid').cast(pl.Int32)
    ])
)
orders_candidates

CPU times: user 5.61 s, sys: 4.04 s, total: 9.65 s
Wall time: 4.41 s


session,aid
i32,i32
11098528,11830
11098528,1732105
11098528,588923
11098528,1157882
11098528,884502
11098528,876129
11098528,571762
11098528,1182614
11098528,231487
11098528,1790438


In [36]:
# Attach every feature table to the candidate rows in a single chain of left joins
orders_candidates = (
    orders_candidates
    # session level features
    .join(user_type_count, on=['session'], how='left')
    .join(user_transaction_hour, on=['session'], how='left')
    # item level features
    .join(item_transaction_hour, on=['aid'], how='left')
    .join(item_type_count, on=['aid'], how='left')
    # session x item interaction flags
    .join(item_order_interactions, on=['session', 'aid'], how='left')
    # a pair without a matching interaction row has null flags -> set them to 0
    .with_columns([
        pl.col('is_clicked').fill_null(pl.lit(0)),
        pl.col('is_carted').fill_null(pl.lit(0))
    ])
)

orders_candidates

session,aid,u_clicks,u_orders,u_carts,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod,i_clicks,i_carts,i_orders,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio,is_carted,is_clicked
i32,i32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,i8,i8
11098528,11830,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.348798,14.0,0,23,4.964787,0.356143,0.444778,0.0,1.0,28997,3682,1097,33776,0.999956,0.999971,0.999955,0.126979,0.037832,0.297936,0,0
11098528,1732105,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.237467,14.0,0,23,5.098406,0.329817,0.444778,0.0,1.0,8049,1877,526,10452,0.9994,0.999875,0.999766,0.233197,0.06535,0.280234,0,0
11098528,588923,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.306938,14.0,0,23,4.999054,0.345474,0.444778,0.0,1.0,22162,1770,656,24588,0.999916,0.999861,0.999843,0.079866,0.0296,0.370621,0,0
11098528,1157882,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.841642,14.0,0,23,4.875664,0.486966,0.444778,0.0,1.0,25238,2324,779,28341,0.999936,0.999913,0.999893,0.092083,0.030866,0.335198,0,0
11098528,884502,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.222987,14.0,0,23,4.975208,0.327466,0.444778,0.0,1.0,28184,2408,930,31522,0.999951,0.999921,0.999925,0.085439,0.032997,0.386213,0,0
11098528,876129,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.834414,14.0,0,23,4.95326,0.485729,0.444778,0.0,1.0,13293,1604,666,15563,0.999751,0.999833,0.999847,0.120665,0.050102,0.415212,0,0
11098528,571762,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.890341,14.0,0,23,5.097111,0.50072,0.444778,0.0,1.0,17021,1257,544,18822,0.99985,0.99973,0.99978,0.07385,0.031961,0.432776,0,0
11098528,1182614,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.901481,14.0,0,23,4.816454,0.503644,0.444778,0.0,1.0,32360,3303,1426,37089,0.999966,0.999962,0.999975,0.10207,0.044067,0.431729,0,0
11098528,231487,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.479405,14.0,0,23,4.923593,0.386731,0.444778,0.0,1.0,56302,9191,3895,69388,0.999995,0.999997,1.0,0.163245,0.06918,0.423784,0,0
11098528,1790438,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.839099,14.0,0,23,4.922565,0.486481,0.444778,0.0,1.0,20478,1314,439,22231,0.999896,0.999756,0.999677,0.064166,0.021438,0.334094,0,0


In [37]:
# quick look at the first three rows of the joined candidate/feature table
orders_candidates.head(3)

session,aid,u_clicks,u_orders,u_carts,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod,i_clicks,i_carts,i_orders,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio,is_carted,is_clicked
i32,i32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,i8,i8
11098528,11830,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.348798,14.0,0,23,4.964787,0.356143,0.444778,0.0,1.0,28997,3682,1097,33776,0.999956,0.999971,0.999955,0.126979,0.037832,0.297936,0,0
11098528,1732105,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.237467,14.0,0,23,5.098406,0.329817,0.444778,0.0,1.0,8049,1877,526,10452,0.9994,0.999875,0.999766,0.233197,0.06535,0.280234,0,0
11098528,588923,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.306938,14.0,0,23,4.999054,0.345474,0.444778,0.0,1.0,22162,1770,656,24588,0.999916,0.999861,0.999843,0.079866,0.0296,0.370621,0,0


In [38]:
%%time
# save order candidates and features
# Written to disk so the inference section can reload the table after the kernel
# has been restarted.
# NOTE(review): the path always points into ./val_data/, independent of
# `is_validation` — confirm this is intended for full-dataset runs.
orders_candidates.write_parquet('./val_data/order_candidates_and_features.pgt')

CPU times: user 44.4 s, sys: 13 s, total: 57.4 s
Wall time: 39.9 s


## 6 - Shut down and restart the kernel to run inference

In [3]:
%%time
# loading order candidates and features
# Reads back the parquet file written by the feature-building section, so this
# cell needs nothing from the previous kernel session except that file.
pred_df_dl = pl.read_parquet('./val_data/order_candidates_and_features.pgt')
pred_df_dl

CPU times: user 51.2 s, sys: 13.3 s, total: 1min 4s
Wall time: 2.64 s


session,aid,u_clicks,u_orders,u_carts,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod,i_clicks,i_carts,i_orders,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio,is_carted,is_clicked
i32,i32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,i8,i8
11098528,11830,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.348798,14.0,0,23,4.964787,0.356143,0.444778,0.0,1.0,28997,3682,1097,33776,0.999956,0.999971,0.999955,0.126979,0.037832,0.297936,0,0
11098528,1732105,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.237467,14.0,0,23,5.098406,0.329817,0.444778,0.0,1.0,8049,1877,526,10452,0.9994,0.999875,0.999766,0.233197,0.06535,0.280234,0,0
11098528,588923,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.306938,14.0,0,23,4.999054,0.345474,0.444778,0.0,1.0,22162,1770,656,24588,0.999916,0.999861,0.999843,0.079866,0.0296,0.370621,0,0
11098528,1157882,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.841642,14.0,0,23,4.875664,0.486966,0.444778,0.0,1.0,25238,2324,779,28341,0.999936,0.999913,0.999893,0.092083,0.030866,0.335198,0,0
11098528,884502,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.222987,14.0,0,23,4.975208,0.327466,0.444778,0.0,1.0,28184,2408,930,31522,0.999951,0.999921,0.999925,0.085439,0.032997,0.386213,0,0
11098528,876129,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.834414,14.0,0,23,4.95326,0.485729,0.444778,0.0,1.0,13293,1604,666,15563,0.999751,0.999833,0.999847,0.120665,0.050102,0.415212,0,0
11098528,571762,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.890341,14.0,0,23,5.097111,0.50072,0.444778,0.0,1.0,17021,1257,544,18822,0.99985,0.99973,0.99978,0.07385,0.031961,0.432776,0,0
11098528,1182614,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.901481,14.0,0,23,4.816454,0.503644,0.444778,0.0,1.0,32360,3303,1426,37089,0.999966,0.999962,0.999975,0.10207,0.044067,0.431729,0,0
11098528,231487,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.479405,14.0,0,23,4.923593,0.386731,0.444778,0.0,1.0,56302,9191,3895,69388,0.999995,0.999997,1.0,0.163245,0.06918,0.423784,0,0
11098528,1790438,1,0,0,1,0.053137,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.990979,0.986949,0.992185,0.958687,13.839099,14.0,0,23,4.922565,0.486481,0.444778,0.0,1.0,20478,1314,439,22231,0.999896,0.999756,0.999677,0.064166,0.021438,0.334094,0,0


In [4]:
%%time
# loading model
# Restore the trained model from `model_path` (set in the first cell) as a raw
# lightgbm Booster; `ranker` is therefore a Booster here, not the estimator
# object used during training.
ranker = lgb.Booster(model_file=model_path)
ranker

CPU times: user 496 ms, sys: 0 ns, total: 496 ms
Wall time: 18.5 ms


<lightgbm.basic.Booster at 0x7fb54c873cd0>

In [6]:
# Columns handed to the model, in this exact order
feature_columns = [
    # columns prefixed with u_
    'u_clicks',
    'u_orders',
    'u_carts',
    'u_sess_len',
    'u_click_cart_ratio',
    'u_order_click_ratio',
    'u_order_cart_ratio',
    'u_std_txn_hod',
    # columns prefixed with i_
    'i_clicks',
    'i_carts',
    'i_orders',
    'i_cart_click_ratio',
    'i_order_click_ratio',
    'i_order_cart_ratio',
    # interaction flags
    'is_carted',
    'is_clicked',
]
# name of the target column
target = 'gt'

print(f"All feature columns are {feature_columns} and target column is [{target}]")

All feature columns are ['u_clicks', 'u_orders', 'u_carts', 'u_sess_len', 'u_click_cart_ratio', 'u_order_click_ratio', 'u_order_cart_ratio', 'u_std_txn_hod', 'i_clicks', 'i_carts', 'i_orders', 'i_cart_click_ratio', 'i_order_click_ratio', 'i_order_cart_ratio', 'is_carted', 'is_clicked'] and target column is [gt]


In [10]:
%%time
# Model inference
scores = ranker.predict(pred_df_dl[feature_columns].to_pandas())

# Appending the model score to the original dataframe
pred_df_dl = pred_df_dl.with_columns(pl.Series(name='rank', values=scores))

# Getting the top 20 candidates from the prediction
pred_df_dl = pred_df_dl.sort(['session', 'rank'], reverse=True).groupby('session').agg([
    pl.col('aid').limit(20).list().alias('labels')
])

# Converting to pandas format and making it align with result format
pred_df = pred_df_dl.with_columns(
    pl.col('session') + '_orders'
).to_pandas()
    
pred_df

CPU times: user 2min 25s, sys: 15.5 s, total: 2min 41s
Wall time: 12.1 s


Unnamed: 0,session,labels
0,12899778_orders,"[1748401, 971566, 1566377, 483042, 1771704, 17..."
1,12899777_orders,"[1148456, 247240, 1493149, 1196256, 1486067, 1..."
2,12899776_orders,"[548599, 1533919, 717801, 360421, 673767, 4478..."
3,12899775_orders,"[493104, 1733943, 823143, 1605870, 670066, 149..."
4,12899774_orders,"[267450, 255895, 669903, 1089446, 1670735, 778..."
...,...,...
1801246,11098532_orders,"[1854872, 406579, 1462420, 634452, 1359971, 18..."
1801247,11098531_orders,"[396199, 1271998, 452188, 1728212, 955181, 284..."
1801248,11098530_orders,"[237586, 29445, 66842, 1145868, 326545, 121218..."
1801249,11098529_orders,"[1022566, 51728, 884785, 205403, 481971, 42858..."


In [11]:
%%time
# converting to reqiured format
pred_df['labels'] = pred_df.labels.parallel_apply(lambda x: " ".join(map(str,x)))
pred_df

CPU times: user 2.38 s, sys: 3.22 s, total: 5.6 s
Wall time: 6.57 s


Unnamed: 0,session,labels
0,12899778_orders,1748401 971566 1566377 483042 1771704 1717432 ...
1,12899777_orders,1148456 247240 1493149 1196256 1486067 1308634...
2,12899776_orders,548599 1533919 717801 360421 673767 447842 748...
3,12899775_orders,493104 1733943 823143 1605870 670066 1498443 1...
4,12899774_orders,267450 255895 669903 1089446 1670735 778604 89...
...,...,...
1801246,11098532_orders,1854872 406579 1462420 634452 1359971 1820294 ...
1801247,11098531_orders,396199 1271998 452188 1728212 955181 284270 93...
1801248,11098530_orders,237586 29445 66842 1145868 326545 1212185 3641...
1801249,11098529_orders,1022566 51728 884785 205403 481971 428581 8805...


In [13]:
%%time
# loading candidates file generate by co visitation matrix
candidates = pd.read_parquet(co_candidates)
candidates = candidates[['session_type','labels']]
# replace order candidates generate by order-reranker model
candidates = pd.concat([
    pred_df.rename(columns={'session':'session_type'}),
    candidates.loc[~candidates.session_type.str.contains('orders')]
])

# loading labels
labels = pd.read_parquet('./val_data/test_labels.parquet')
labels

CPU times: user 5.9 s, sys: 2.62 s, total: 8.52 s
Wall time: 8.22 s


Unnamed: 0,session,type,ground_truth
0,11098528,clicks,[1679529]
1,11098528,carts,[1199737]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369..."
3,11098529,clicks,[1105029]
4,11098530,orders,[409236]
...,...,...,...
2212687,12899774,clicks,[1399483]
2212688,12899775,clicks,[1760714]
2212689,12899776,clicks,[1737908]
2212690,12899777,clicks,[384045]


In [14]:
%%time
# run validation for candidates
# Scores the combined frame (re-ranked orders plus the original click/cart
# candidates) on the first 20 ids per row. The baseline cell earlier used
# topn=100, so the two sets of numbers are not directly comparable.
compute_metric(candidates, labels, topn=20)

clicks recall = 0.5239066859428527
carts recall = 0.4061063485069785
orders recall = 0.2400966476541878
overall recall = 0.3182805617388915
CPU times: user 30.4 s, sys: 808 ms, total: 31.2 s
Wall time: 31.2 s
