In [1]:
import pandas as pd, numpy as np, polars as pl
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
from pandarallel import pandarallel
from lightgbm import LGBMRanker
import lightgbm as lgb

pandarallel.initialize(nb_workers=28, progress_bar=False)

# co visitation matrix version
VER = 1
is_validation = True
type_labels = {'clicks':0, 'carts':1, 'orders':2}

# for validation
if is_validation:
    # co visitation matrix path
    co_visitation_matrix_path = './val_co_visitation_matrix/'
    # data path
    train_data_path = './val_data/train_parquet/*'
    test_data_path = './val_data/test_parquet/*'
    # label path
    test_label = './val_data/test_labels.parquet'
    # co visitation matrix
    co_candidates = './val_data/cv_candidates.pgt'
    # model path
    model_path = './model/val_order_model.txt'
    # root path
    root_path = './val_data/'
# for full dataset
else:
    # co visitation matrix path
    co_visitation_matrix_path = './co_visitation_matrix/'
    # data path
    train_data_path = './data/train_parquet/*parquet'
    test_data_path = './data/test_parquet/*parquet'
    # label path - no label files for whole datset
    test_label = ''
    # co visitation matrix
    co_candidates = './data/candidates.pgt'
    # model path
    model_path = './model/full_order_model.txt'
    # root path
    root_path = './data/'

def read_file_into_mem(f):
    return (
        pl.read_parquet(
            f
        )
        .with_columns([
            (pl.col('ts') / 1000).cast(pl.Int32).alias('ts'),
            pl.col('type').apply(lambda x: type_labels[x]).cast(pl.Int8).alias('type'),
            pl.col('session').cast(pl.Int32),
            pl.col('aid').cast(pl.Int32)
        ])
    )

INFO: Pandarallel will run on 28 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 1 - loading train data

In [2]:
%%time
# init variables
type_labels = {'clicks':0, 'carts':1, 'orders':2}
train_df = None
# grab files
files = glob.glob(train_data_path)
# log
print(f"Total loaded {len(files)} files, start to load into memory")
# loading file into memroy
for f in files:
    if train_df is None:
        train_df = read_file_into_mem(f)
    else:
        train_df = train_df.vstack(read_file_into_mem(f))
train_df

Total loaded 100 files, start to load into memory
CPU times: user 21.4 s, sys: 8.61 s, total: 30 s
Wall time: 33.5 s


session,aid,ts,type
i32,i32,i32,i8
0,1517085,1659304800,0
0,1563459,1659304904,0
0,1309446,1659367439,0
0,16246,1659367719,0
0,1781822,1659367871,0
0,1152674,1659367885,0
0,1649869,1659369893,1
0,461689,1659369898,1
0,305831,1659370027,2
0,461689,1659370027,2


## 2 - loading test data

In [2]:
test_df = None
# grab files
files = glob.glob(test_data_path)
# log
print(f"Total loaded {len(files)} files, start to load into memory")
# loading file into memroy
for f in files:
    if test_df is None:
        test_df = read_file_into_mem(f)
    else:
        test_df = test_df.vstack(read_file_into_mem(f))
test_df

Total loaded 20 files, start to load into memory


session,aid,ts,type
i32,i32,i32,i8
11098528,11830,1661119200,0
11098529,1105029,1661119200,0
11098530,264500,1661119200,0
11098530,264500,1661119288,0
11098530,409236,1661119369,0
11098530,409236,1661119441,0
11098530,409236,1661120165,0
11098530,409236,1661120532,1
11098531,452188,1661119200,0
11098531,1239060,1661119227,0


## 3 - feature engineering


### Generate user(session) features only using - test data
- number of clicks/carts/orders and ratio for each session - can indicate whether this customer browse a lot buy little or browse a lot and buy a lot
- transaction hours - can indicate what kind of this user is(AM or PM user), may help model to cluster customers

In [4]:
%%time
# user related features - number of clicks/carts/orders and its ratio for each session
user_type_count = (
    test_df.sort(
        ['session', 'aid']
    )
    .groupby(
        ['session', 'type'],
        maintain_order=True
    )
    .count()
    .pivot(
        values = 'count', index = 'session', columns='type', aggregate_fn='sum'
    )
    .with_column(
        pl.col("*").fill_null(pl.lit(0))
    )
    .rename(
        {
            '0':'u_clicks', # user(session) clicks
            '1':'u_carts', # user(session) carts
            '2':'u_orders' # user(session) orders
        }
    )
    .with_columns([
        (pl.col('u_clicks') + pl.col('u_carts') + pl.col('u_orders')).alias('u_sess_len'), # user session length
        pl.col("u_clicks").rank().alias('ur_clicks'), # user ranked clicks
        pl.col("u_carts").rank().alias('ur_carts'), # user ranked carts
        pl.col("u_orders").rank().alias('ur_orders'), # user ranked orders
        pl.when(pl.col('u_clicks') == 0).then(0).otherwise(pl.col('u_carts') / pl.col('u_clicks')).cast(pl.Float32).alias('u_click_cart_ratio'),
        pl.when(pl.col('u_clicks') == 0).then(0).otherwise(pl.col('u_orders') / pl.col('u_clicks')).cast(pl.Float32).alias('u_order_click_ratio'),
        pl.when(pl.col('u_carts') == 0).then(0).otherwise(pl.col('u_orders') / pl.col('u_carts')).cast(pl.Float32).alias('u_order_cart_ratio')
    ])
    .with_columns([
        ( (pl.col('ur_clicks') - pl.col('ur_clicks').min())/(pl.col('ur_clicks').max() - pl.col('ur_clicks').min()) ).alias('ur_clicks'), # normalization
        ( (pl.col('ur_carts') - pl.col('ur_carts').min())/(pl.col('ur_carts').max() - pl.col('ur_carts').min()) ).alias('ur_carts'), # normalization
        ( (pl.col('ur_orders') - pl.col('ur_orders').min())/(pl.col('ur_orders').max() - pl.col('ur_orders').min()) ).alias('ur_orders') # normalization
    ])
)

user_type_count.head(3)

CPU times: user 4.52 s, sys: 1.24 s, total: 5.76 s
Wall time: 731 ms


session,u_clicks,u_carts,u_orders,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio
i32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32
11098528,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0
11098529,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0
11098530,5,1,0,6,0.816705,0.808298,0.0,0.2,0.0,0.0


In [5]:
%%time
# user related features - user interaction hour - mean, min, max, std, rank across dataset
user_transaction_hour = (
    test_df.select(
        pl.col(['session', 'ts'])
    )
    .with_column(
        pl.from_epoch("ts", unit="s").alias('ts'),
    )
    .with_column(
        pl.col('ts').dt.hour().alias('hod') # hour of day
    )
    .select(
        pl.col(['session', 'hod'])
    )
    .groupby(
        'session'
    )
    .agg([
        pl.col('hod').mean().cast(pl.Float32).alias('u_mean_txn_hod'),
        pl.col('hod').median().cast(pl.UInt8).alias('u_median_txn_hod'),
        pl.col('hod').min().cast(pl.UInt8).alias('u_min_txn_hod'),
        pl.col('hod').max().cast(pl.UInt8).alias('u_max_txn_hod'),
        pl.col('hod').std().cast(pl.Float32).alias('u_std_txn_hod')
    ])
    .with_columns([
        pl.col("u_mean_txn_hod").rank().alias('u_rmean_txn_hod'), # ranked mean transaction hour
        pl.col("u_median_txn_hod").rank().alias('u_rmedian_txn_hod'), # ranked median transaction hour
        pl.col("u_min_txn_hod").rank().alias('u_rmin_txn_hod'), # ranked min transaction hour
        pl.col("u_max_txn_hod").rank().alias('u_rmax_txn_hod') # ranked max transaction hour
    ])
    .with_columns([
        ( (pl.col('u_rmean_txn_hod') - pl.col('u_rmean_txn_hod').min())/(pl.col('u_rmean_txn_hod').max() - pl.col('u_rmean_txn_hod').min()) ).alias('u_rmean_txn_hod'), # normalization
        ( (pl.col('u_rmedian_txn_hod') - pl.col('u_rmedian_txn_hod').min())/(pl.col('u_rmedian_txn_hod').max() - pl.col('u_rmedian_txn_hod').min()) ).alias('u_rmedian_txn_hod'), # normalization
        ( (pl.col('u_rmin_txn_hod') - pl.col('u_rmin_txn_hod').min())/(pl.col('u_rmin_txn_hod').max() - pl.col('u_rmin_txn_hod').min()) ).alias('u_rmin_txn_hod'), # normalization
        ( (pl.col('u_rmax_txn_hod') - pl.col('u_rmax_txn_hod').min())/(pl.col('u_rmax_txn_hod').max() - pl.col('u_rmax_txn_hod').min()) ).alias('u_rmax_txn_hod') # normalization
    ])
)

user_transaction_hour.head(3)

CPU times: user 8.01 s, sys: 217 ms, total: 8.22 s
Wall time: 429 ms


session,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod
i32,f32,u8,u8,u8,f32,f32,f32,f32,f32
12107168,18.0,18,18,18,0.0,0.774414,0.767042,0.787377,0.734458
11515392,13.0,13,13,13,0.0,0.439588,0.441093,0.47781,0.407779
12119264,19.0,19,19,19,0.0,0.846556,0.839966,0.85494,0.813337


In [6]:
%%time
# save user related feature to disk
user_type_count.write_parquet(f'{root_path}user_type_count.pgt')
user_transaction_hour.write_parquet(f'{root_path}user_transaction_hour.pgt')

CPU times: user 332 ms, sys: 17.4 ms, total: 350 ms
Wall time: 354 ms


### Generate item(aid) features - using train and test data
- global aid counts for click/cart/order - can show popularity of certain aids
- global aid transaction hour

In [7]:
%%time
# concat train and test dataframe
df = train_df.vstack(test_df)
df

CPU times: user 62 µs, sys: 24 µs, total: 86 µs
Wall time: 88.9 µs


session,aid,ts,type
i32,i32,i32,i8
0,1517085,1659304800,0
0,1563459,1659304904,0
0,1309446,1659367439,0
0,16246,1659367719,0
0,1781822,1659367871,0
0,1152674,1659367885,0
0,1649869,1659369893,1
0,461689,1659369898,1
0,305831,1659370027,2
0,461689,1659370027,2


In [8]:
%%time
# item related features - number of clicks/carts/orders and its ratio for each aid
item_type_count = (
    df.sort(
        ['aid', 'type']
    )
    .groupby(
        ['aid', 'type'],
        maintain_order=True
    )
    .count()
    .pivot(
        values = 'count', index = 'aid', columns='type', aggregate_fn='sum'
    )
    .with_column(
        pl.col("*").fill_null(pl.lit(0))
    )
    .rename(
        {
            '0':'i_clicks', # item clicks
            '1':'i_carts', # item carts
            '2':'i_orders' # item orders
        }
    )
    .with_columns([
        (pl.col('i_clicks') + pl.col('i_carts') + pl.col('i_orders')).alias('i_inter_len'), # item number of interaction in total
        pl.col("i_clicks").rank().alias('ir_clicks'), # item ranked clicks
        pl.col("i_carts").rank().alias('ir_carts'), # item ranked carts
        pl.col("i_orders").rank().alias('ir_orders'), # item ranked orders
        pl.when(pl.col('i_clicks') == 0).then(0).otherwise(pl.col('i_carts') / pl.col('i_clicks')).cast(pl.Float32).alias('i_cart_click_ratio'),
        pl.when(pl.col('i_clicks') == 0).then(0).otherwise(pl.col('i_orders') / pl.col('i_clicks')).cast(pl.Float32).alias('i_order_click_ratio'),
        pl.when(pl.col('i_carts') == 0).then(0).otherwise(pl.col('i_orders') / pl.col('i_carts')).cast(pl.Float32).alias('i_order_cart_ratio')
    ])
    .with_columns([
        ( (pl.col('ir_clicks') - pl.col('ir_clicks').min())/(pl.col('ir_clicks').max() - pl.col('ir_clicks').min()) ).alias('ir_clicks'),
        ( (pl.col('ir_carts') - pl.col('ir_carts').min())/(pl.col('ir_carts').max() - pl.col('ir_carts').min()) ).alias('ir_carts'),
        ( (pl.col('ir_orders') - pl.col('ir_orders').min())/(pl.col('ir_orders').max() - pl.col('ir_orders').min()) ).alias('ir_orders')
    ])
)
item_type_count.head(3)

CPU times: user 2min 28s, sys: 4.82 s, total: 2min 33s
Wall time: 8.72 s


aid,i_clicks,i_carts,i_orders,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio
i32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32
0,38,0,0,38,0.743345,0.0,0.0,0.0,0.0,0.0
1,31,1,0,32,0.703778,0.361061,0.0,0.032258,0.0,0.0
2,16,0,0,16,0.540791,0.0,0.0,0.0,0.0,0.0


In [9]:
%%time
# item time related features - item interaction hour - mean, min, max, std across dataset
item_transaction_hour = (
    df.select(
        pl.col(['aid', 'ts'])
    )
    .with_column(
        pl.from_epoch("ts", unit="s").alias('ts'),
    )
    .with_column(
        pl.col('ts').dt.hour().alias('hod') # hour of day
    )
    .select(
        pl.col(['aid', 'hod'])
    )
    .groupby(
        'aid'
    )
    .agg([
        pl.col('hod').mean().cast(pl.Float32).alias('i_mean_txn_hod'),
        pl.col('hod').median().cast(pl.Float32).alias('i_median_txn_hod'),
        pl.col('hod').min().cast(pl.UInt8).alias('i_min_txn_hod'),
        pl.col('hod').max().cast(pl.UInt8).alias('i_max_txn_hod'),
        pl.col('hod').std().cast(pl.Float32).alias('i_std_txn_hod')
    ])
    .with_columns([
        pl.col("i_mean_txn_hod").rank().alias('i_rmean_txn_hod'), # ranked mean transaction hour
        pl.col("i_median_txn_hod").rank().alias('i_rmedian_txn_hod'), # ranked median transaction hour
        pl.col("i_min_txn_hod").rank().alias('i_rmin_txn_hod'), # ranked min transaction hour
        pl.col("i_max_txn_hod").rank().alias('i_rmax_txn_hod') # ranked max transaction hour
    ])
    .with_columns([
        ( (pl.col('i_rmean_txn_hod') - pl.col('i_rmean_txn_hod').min())/(pl.col('i_rmean_txn_hod').max() - pl.col('i_rmean_txn_hod').min()) ).alias('i_rmean_txn_hod'), # normalization
        ( (pl.col('i_rmedian_txn_hod') - pl.col('i_rmedian_txn_hod').min())/(pl.col('i_rmedian_txn_hod').max() - pl.col('i_rmedian_txn_hod').min()) ).alias('i_rmedian_txn_hod'), # normalization
        ( (pl.col('i_rmin_txn_hod') - pl.col('i_rmin_txn_hod').min())/(pl.col('i_rmin_txn_hod').max() - pl.col('i_rmin_txn_hod').min()) ).alias('i_rmin_txn_hod'), # normalization
        ( (pl.col('i_rmax_txn_hod') - pl.col('i_rmax_txn_hod').min())/(pl.col('i_rmax_txn_hod').max() - pl.col('i_rmax_txn_hod').min()) ).alias('i_rmax_txn_hod') # normalization
    ])
)

item_transaction_hour.head(3)

CPU times: user 1min 1s, sys: 1.47 s, total: 1min 2s
Wall time: 4.08 s


aid,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod
i32,f32,f32,u8,u8,f32,f32,f32,f32,f32
1571968,13.4,13.5,5,20,5.538383,0.367922,0.355985,0.448751,0.349028
91872,14.428572,16.0,7,21,5.855401,0.642708,0.754839,0.668572,0.534714
845216,13.777778,14.0,1,23,5.353456,0.468546,0.444778,0.136244,1.0


In [10]:
%%time
# save user related feature to disk
item_type_count.write_parquet(f'{root_path}item_type_count.pgt')
item_transaction_hour.write_parquet(f'{root_path}item_transaction_hour.pgt')

CPU times: user 451 ms, sys: 45.2 ms, total: 496 ms
Wall time: 521 ms


### Generate interaction features only using test data

In [11]:
%%time
# item feature - is ordered item has been clicked or put into carts before
# one obersavtion is that clicked or carted items are more likely to be purchased
df_order = (
    test_df.filter(
        pl.col('type') == 2
    )
    .select(
        pl.col(['session', 'aid'])
    )
)

df_cart = (
    test_df.filter(
        pl.col('type') == 1
    )
    .with_column(
        pl.lit(1).cast(pl.Int8).alias('is_carted')
    )
    .select(
        pl.col(['session', 'aid', 'is_carted'])
    )
)

df_click = (
    test_df.filter(
        pl.col('type') == 0
    )
    .with_column(
        pl.lit(1).cast(pl.Int8).alias('is_clicked')
    )
    .select(
        pl.col(['session', 'aid', 'is_clicked'])
    )
)

# left join order dataframe with cart dataframe
item_order_interactions = (
    df_order.join(
        df_cart,
        on=['session', 'aid'],
        how='left'
    )
    .join(
        df_click,
        on=['session', 'aid'],
        how='left'
    )
    .unique(
        subset=['session', 'aid']
    )
    .with_columns([
        pl.col('is_clicked').fill_null(pl.lit(0)),
        pl.col('is_carted').fill_null(pl.lit(0))
    ])
)

item_order_interactions.head(3)

CPU times: user 2.16 s, sys: 535 ms, total: 2.7 s
Wall time: 214 ms


session,aid,is_carted,is_clicked
i32,i32,i8,i8
11098531,1728212,0,1
11098531,452188,0,1
11098531,1271998,0,1


In [12]:
%%time
# save user related feature to disk
item_order_interactions.write_parquet(f'{root_path}item_order_interactions.pgt')

CPU times: user 5.65 ms, sys: 141 µs, total: 5.79 ms
Wall time: 8.09 ms


## 4 - generate labels
- This rerank model will focus on rerank aids are likly to be ordered
- Hence if type = 2 (order) then we will give it a positive label, otherwise will give it a negative label

In [23]:
%%time

# if it's validation use test_labels.parquet file
if is_validation:
    # log
    print("Validtion -> True, use test label file to generate labels")
    # loading labels
    labels = pd.read_parquet(test_label)
    # only select oders - since this an order rerank model
    labels = labels.loc[ labels['type']=='orders' ]
    # expand items
    aids = labels.ground_truth.explode().astype('int32').rename('aid')
    labels = labels[['session']].astype('int32')
    labels = labels.merge(aids, left_index=True, right_index=True, how='left')
    labels['order'] = 1
    # convert to pl dataframe
    labels = pl.from_pandas(labels).with_column(pl.col('order').cast(pl.Int8))
# if it's not validation use all the test data
else:
    # log
    print("Vlidation -> False, use original test data to generate labels")
    labels = (
        test_df.filter(
            pl.col('type') == 2 # only select order
        )
        .with_columns([
            pl.col('session').cast(pl.Int32),
            pl.col('aid').cast(pl.Int32),
            pl.lit(1).cast(pl.Int8).alias('order')
        ])
        .select(
            pl.col(['session', 'aid', 'order'])
        )
    )

labels

Validtion -> True, use test label file to generate labels
CPU times: user 447 ms, sys: 108 ms, total: 555 ms
Wall time: 501 ms


session,aid,order
i32,i32,i8
11098528,990658,1
11098528,950341,1
11098528,1462506,1
11098528,1561739,1
11098528,907564,1
11098528,369774,1
11098528,440367,1
11098528,92401,1
11098528,11830,1
11098528,1199737,1


In [4]:
%%time
# loading candidates file - generated by co visitation matrix
candidates = pl.read_parquet(co_candidates)
candidates = candidates[['session_type','labels']]
# retrieve order candidates generated by co visitation matrix
orders_candidates = (
    candidates
    .filter(
        pl.col('session_type').str.contains('orders')  # only select orders
    )
    .with_columns([
        pl.col('session_type').str.split('_').alias('session_type'), # split session and type
        pl.col('labels').str.split(' ').alias('aid') # split candidate and rename back to aid
    ])
    .with_columns([
        pl.col('session_type').arr.get(i).alias('session' if i == 0 else 'type') for i in range(2) # expand session and type to new columns
    ])
    .select(
        pl.col(['session', 'aid'])
    )
    .explode('aid')
    .with_columns([
        pl.col('session').cast(pl.Int32),
        pl.col('aid').cast(pl.Int32)
    ])
)
orders_candidates

CPU times: user 8.2 s, sys: 4.89 s, total: 13.1 s
Wall time: 7.35 s


session,aid
i32,i32
11098528,11830
11098528,1732105
11098528,588923
11098528,1157882
11098528,884502
11098528,876129
11098528,571762
11098528,1182614
11098528,231487
11098528,1790438


In [24]:
%%time
# merge candidates and label file to genreate training data
train_df = (
    orders_candidates.join(
        labels,
        on=['session', 'aid'],
        how='left'
    )
    .with_column(
        pl.col('order').fill_null(0)
    )
)

train_df

CPU times: user 9.01 s, sys: 2.92 s, total: 11.9 s
Wall time: 1.45 s


session,aid,order
i32,i32,i8
11098528,11830,1
11098528,1732105,0
11098528,588923,0
11098528,1157882,0
11098528,884502,0
11098528,876129,0
11098528,571762,0
11098528,1182614,0
11098528,231487,0
11098528,1790438,0


In [25]:
%%time
user_type_count = pl.read_parquet(f'{root_path}user_type_count.pgt')
user_transaction_hour = pl.read_parquet(f'{root_path}user_transaction_hour.pgt')
item_transaction_hour = pl.read_parquet(f'{root_path}item_transaction_hour.pgt')
item_type_count = pl.read_parquet(f'{root_path}item_type_count.pgt')
item_order_interactions = pl.read_parquet(f'{root_path}item_order_interactions.pgt')

CPU times: user 350 ms, sys: 99.5 ms, total: 450 ms
Wall time: 70.3 ms


In [26]:
%%time
# join with session features
train_df = train_df.join(user_type_count, on=['session'], how='left')
train_df = train_df.join(user_transaction_hour, on=['session'], how='left')

# join with item features
train_df = train_df.join(item_transaction_hour, on=['aid'], how='left')
train_df = train_df.join(item_type_count, on=['aid'], how='left')

# join with item interaction
train_df = train_df.join(item_order_interactions, on=['session', 'aid'], how='left')
# fill na
train_df = (
    train_df.with_columns([
        pl.col('is_clicked').fill_null(pl.lit(0)),
        pl.col('is_carted').fill_null(pl.lit(0))
    ])
)
train_df

CPU times: user 41.6 s, sys: 10.3 s, total: 51.9 s
Wall time: 6.42 s


session,aid,order,u_clicks,u_carts,u_orders,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod,i_clicks,i_carts,i_orders,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio,is_carted,is_clicked
i32,i32,i8,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,i8,i8
11098528,11830,1,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985539,0.984579,0.986235,0.980365,13.348798,14.0,0,23,4.964787,0.356143,0.444778,0.0,1.0,28997,3682,1097,33776,0.999956,0.999971,0.999955,0.126979,0.037832,0.297936,0,0
11098528,1732105,0,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985539,0.984579,0.986235,0.980365,13.237467,14.0,0,23,5.098406,0.329817,0.444778,0.0,1.0,8049,1877,526,10452,0.9994,0.999875,0.999766,0.233197,0.06535,0.280234,0,0
11098528,588923,0,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985539,0.984579,0.986235,0.980365,13.306938,14.0,0,23,4.999054,0.345474,0.444778,0.0,1.0,22162,1770,656,24588,0.999916,0.999861,0.999843,0.079866,0.0296,0.370621,0,0
11098528,1157882,0,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985539,0.984579,0.986235,0.980365,13.841642,14.0,0,23,4.875664,0.486966,0.444778,0.0,1.0,25238,2324,779,28341,0.999936,0.999913,0.999893,0.092083,0.030866,0.335198,0,0
11098528,884502,0,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985539,0.984579,0.986235,0.980365,13.222987,14.0,0,23,4.975208,0.327466,0.444778,0.0,1.0,28184,2408,930,31522,0.999951,0.999921,0.999925,0.085439,0.032997,0.386213,0,0
11098528,876129,0,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985539,0.984579,0.986235,0.980365,13.834414,14.0,0,23,4.95326,0.485729,0.444778,0.0,1.0,13293,1604,666,15563,0.999751,0.999833,0.999847,0.120665,0.050102,0.415212,0,0
11098528,571762,0,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985539,0.984579,0.986235,0.980365,13.890341,14.0,0,23,5.097111,0.50072,0.444778,0.0,1.0,17021,1257,544,18822,0.99985,0.99973,0.99978,0.07385,0.031961,0.432776,0,0
11098528,1182614,0,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985539,0.984579,0.986235,0.980365,13.901481,14.0,0,23,4.816454,0.503644,0.444778,0.0,1.0,32360,3303,1426,37089,0.999966,0.999962,0.999975,0.10207,0.044067,0.431729,0,0
11098528,231487,0,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985539,0.984579,0.986235,0.980365,13.479405,14.0,0,23,4.923593,0.386731,0.444778,0.0,1.0,56302,9191,3895,69388,0.999995,0.999997,1.0,0.163245,0.06918,0.423784,0,0
11098528,1790438,0,1,0,0,1,0.231695,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985539,0.984579,0.986235,0.980365,13.839099,14.0,0,23,4.922565,0.486481,0.444778,0.0,1.0,20478,1314,439,22231,0.999896,0.999756,0.999677,0.064166,0.021438,0.334094,0,0


## 04 - data down smapling
-  down sampling on negative samples

In [27]:
%%time
down_smaple_frac = 0.05 # downsample 20x

train_df = train_df.to_pandas()
positives = train_df.loc[train_df['order']==1]
negatives = train_df.loc[train_df['order']==0].sample(frac=down_smaple_frac)
# merge positive and negative back to generate downsampled dataframe
train_df = pd.concat([positives, negatives], axis=0, ignore_index=True)

# log
print(f"After downsmapling training shape is {train_df.shape}, postive shape is {positives.shape} negative shape is {negatives.shape}.")

After downsmapling training shape is (5961446, 43), postive shape is (219285, 43) negative shape is (5742161, 43).
CPU times: user 27.3 s, sys: 10.6 s, total: 37.8 s
Wall time: 14.1 s


In [28]:
# define feature columns and target columns
aid_column = 'aid'
user_feature_columns = [c for c in train_df.columns if c.startswith('u')]
item_feature_columns = [c for c in train_df.columns if c.startswith('i')]
# all features
feature_columns = user_feature_columns + item_feature_columns 
# target
target = 'order'


# if non_important_features is defined remove it from feature columns
if 'non_important_features' in globals():
    feature_columns = [c for c in feature_columns if c not in non_important_features]

print(f"All feature columns are {feature_columns} and target column is [{target}]")

All feature columns are ['u_clicks', 'u_carts', 'u_orders', 'u_sess_len', 'u_order_cart_ratio', 'u_mean_txn_hod', 'u_median_txn_hod', 'u_min_txn_hod', 'u_max_txn_hod', 'u_std_txn_hod', 'i_mean_txn_hod', 'i_median_txn_hod', 'i_min_txn_hod', 'i_max_txn_hod', 'i_std_txn_hod', 'i_rmean_txn_hod', 'i_clicks', 'i_carts', 'i_orders', 'i_inter_len', 'i_cart_click_ratio', 'i_order_click_ratio', 'i_order_cart_ratio', 'is_carted', 'is_clicked'] and target column is [order]


In [10]:
# release memory
# del user_type_count, user_transaction_hour
# del item_type_count, item_transaction_hour
# del item_interaction, item_cart_interactions, item_order_interactions
# del df_click, df_cart, df_order
# _ = gc.collect()

### 4 - model building and training

In [29]:
%%time
# sort training data and generate group information
train_df = pl.from_pandas(train_df)
train_df = train_df.sort('session')

# generate group information for LGB-ranker
group = train_df.groupby('session').agg([ pl.col('session').count().alias('session_length')])['session_length'].to_numpy()

group[:10]

CPU times: user 13.5 s, sys: 1.26 s, total: 14.8 s
Wall time: 877 ms


array([1, 2, 4, 9, 1, 3, 6, 4, 6, 7], dtype=uint32)

In [30]:
### model build
lgb = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=20,
    importance_type='gain',
    n_jobs=30,
)

lgb

In [31]:
%%time
# fit model
ranker = lgb.fit(
    train_df[feature_columns].to_pandas(),
    train_df[target].to_pandas(),
    group=group
)

CPU times: user 1min 5s, sys: 723 ms, total: 1min 5s
Wall time: 2.85 s


In [32]:
feature_importance = pl.DataFrame(
    {
        "feature_name": ranker.feature_name_,
        "importance_gain": ranker.feature_importances_
    }
)

feature_importance.sort('importance_gain', reverse=True)

feature_name,importance_gain
str,f64
"""u_carts""",212164.689079
"""i_order_click_...",55963.39212
"""is_carted""",19648.415245
"""i_inter_len""",17483.778004
"""i_cart_click_r...",12570.262539
"""i_std_txn_hod""",7662.135498
"""i_carts""",6833.271809
"""u_sess_len""",6831.326328
"""i_order_cart_r...",6754.73304
"""i_mean_txn_hod...",6069.447178


In [33]:
# locate feature importance equals to 0
non_important_features = feature_importance.filter(pl.col('importance_gain')  == 0)['feature_name'].to_list()
non_important_features

['u_clicks', 'i_rmean_txn_hod']

In [34]:
%%time
# save model
ranker.booster_.save_model(model_path)

CPU times: user 4.15 ms, sys: 2.68 ms, total: 6.83 ms
Wall time: 3.37 ms


<lightgbm.basic.Booster at 0x7fc6041c80a0>

### 5 - make prediction and run validation on validation dataset

In [35]:
def compute_metric(preditions, labels, topn=20):
    # init variables
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    # compute metric
    for t in ['clicks','carts','orders']:
        sub = preditions.loc[preditions.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:topn]])
        labels_sub = labels.loc[labels['type']==t]
        labels_sub = labels_sub.merge(sub, how='left', on=['session'])
        labels_sub['hits'] = labels_sub.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        labels_sub['gt_count'] = labels_sub.ground_truth.str.len().clip(0,20)
        recall = labels_sub['hits'].sum() / labels_sub['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)
    print('=============')
    print('overall recall =',score)
    print('=============')

In [36]:
%%time
# loading candidates file generate by co visitation matrix
candidates = pl.read_parquet(co_candidates)
candidates = candidates[['session_type','labels']]

# loading labels
labels = pd.read_parquet('./val_data/test_labels.parquet')
labels

CPU times: user 2.71 s, sys: 604 ms, total: 3.32 s
Wall time: 3.14 s


Unnamed: 0,session,type,ground_truth
0,11098528,clicks,[1679529]
1,11098528,carts,[1199737]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369..."
3,11098529,clicks,[1105029]
4,11098530,orders,[409236]
...,...,...,...
2212687,12899774,clicks,[1399483]
2212688,12899775,clicks,[1760714]
2212689,12899776,clicks,[1737908]
2212690,12899777,clicks,[384045]


In [19]:
%%time
# run validation for candidates
print("=====validation recall for pure co visitation matrix")
compute_metric(candidates.to_pandas(), labels, topn=20)

=====validation recall for pure co visitation matrix
clicks recall = 0.5239066859428527
carts recall = 0.4061063485069785
orders recall = 0.6289023724637173
overall recall = 0.5515639966246092
CPU times: user 33.5 s, sys: 1.55 s, total: 35 s
Wall time: 34.9 s


## 5 - Make inference

In [40]:
%%time
# Model inference
scores = ranker.predict(train_df[feature_columns].to_pandas())
# Appending the model score to the original dataframe
pred = train_df[['session', 'aid']].with_columns(pl.Series(name='rank', values=scores))
# Getting the top 20 candidates from the prediction
pred = pred.sort(['session', 'rank'], reverse=True).groupby('session').agg([
    pl.col('aid').limit(20).list().alias('labels')
])
# concate orders
pred = pred.with_columns(
    pl.col('session') + '_orders'
).to_pandas()

# converting to reqiured format
pred['labels'] = pred.labels.parallel_apply(lambda x: " ".join(map(str,x)))
pred

CPU times: user 9.08 s, sys: 8.74 s, total: 17.8 s
Wall time: 13 s


Unnamed: 0,session,labels
0,12899778_orders,1771704 1006198
1,12899777_orders,62659 1636805
2,12899776_orders,798010 1341910 1503 695829
3,12899775_orders,329725 264612
4,12899773_orders,493268 22269 132574 702910
...,...,...
1665647,11098532_orders,1462420 933576 461190
1665648,11098531_orders,535150 1758580 248335 889789 214041 32813 8187...
1665649,11098530_orders,611167 409236 310671 1066554
1665650,11098529_orders,1049489 792887 890962


In [41]:
%%time
# if it's validation run compute metric function output scoring
if is_validation:
    # loading candidates file generate by co visitation matrix
    candidates = pd.read_parquet(co_candidates)
    candidates = candidates[['session_type','labels']]
    # replace order candidates generate by order-reranker model
    candidates = pd.concat([
        pred.rename(columns={'session':'session_type'}),
        candidates.loc[~candidates['session_type'].isin(pred['session'])]
    ])

    # run validation for candidates
    compute_metric(candidates, labels, topn=20)
# if it's not valiation output prediction file for later submission
else:
    # conver to pd dataframe
    candidates = candidates.to_pandas()
    # concat with candidate dataframe to supplement missing sessions
    pred = pd.concat([
        pred.rename(columns={'session':'session_type'}),
        candidates.loc[~candidates['session_type'].isin(pred['session'])]
    ])
    # only select orders
    pred = pred.loc[pred.session_type.str.contains('orders')]
    # save to disk
    pred.to_parquet(f'{root_path}order_rerank_result.pgt')

clicks recall = 0.5239066859428527
carts recall = 0.4061063485069785
orders recall = 0.6992176902232025
overall recall = 0.5937531872803004
CPU times: user 36.8 s, sys: 2.87 s, total: 39.7 s
Wall time: 39.4 s


In [34]:
# validation score on order rerank model recall improve from 0.62 to 0.69

clicks recall = 0.5239066859428527
carts recall = 0.4061063485069785
orders recall = 0.6991889640380079
overall recall = 0.5937359515691836
CPU times: user 28.7 s, sys: 651 ms, total: 29.3 s
Wall time: 29.3 s
