In [1]:
import pandas as pd, numpy as np, polars as pl
from pandarallel import pandarallel
from tqdm.notebook import tqdm
import os, sys, glob, gc
from collections import Counter
import itertools

num_of_cpu = 16
pandarallel.initialize(nb_workers=num_of_cpu, progress_bar=True)

fast_experiment = True
type_labels = {'clicks':0, 'carts':1, 'orders':2}

train_file_path = './processed_data/train_parquet/*'
test_file_path = './processed_data/test_parquet/*'
test_label_file_path = './processed_data/test_labels.parquet'

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
def load_data(file_path):    
    dfs = []
    for e, chunk_file in tqdm(enumerate(glob.glob(file_path))):
        df = pl.read_parquet(chunk_file)
        df = df.with_columns([
            (pl.col('ts') / 1000).cast(pl.Int32).alias('ts'),
            pl.col('type').apply(lambda t : type_labels[t]).cast(pl.Int8).alias('type')
        ])
        dfs.append(df)
    return pl.concat(dfs)

### Loading training data
- Subset of original training dataset 163955180 data in total

In [3]:
# local training dataset
train_df = load_data(train_file_path)
print('Train data has shape',train_df.shape)
train_df.head()

0it [00:00, ?it/s]

Train data has shape (163955180, 4)


session,aid,ts,type
i32,i32,i32,i8
10321698,210316,1660924932,0
10321698,680568,1660924967,1
10321698,680568,1660927077,0
10321698,1543107,1660927107,0
10321698,1146385,1660927210,0


In [4]:
# for fast experiment only select 1/10 of the data
if fast_experiment:
    train_df = train_df.filter(pl.col('session') % 10 == 0)
    
train_df

session,aid,ts,type
i32,i32,i32,i8
10321700,651124,1660924932,0
10321700,651124,1660925059,0
10321710,446025,1660924934,0
10321710,1576577,1660924987,0
10321710,479985,1660925006,0
10321720,892324,1660924937,0
10321730,1649879,1660924938,0
10321730,381537,1660942127,0
10321730,1649879,1660942236,0
10321730,381537,1660942372,0


### Loading testing data
- Subset of original training dataset 7683577 data in total
- Unique session only in exist in local test data set (same property as LB test data)

In [5]:
# local test dataset
test_df = load_data(test_file_path)
print('Test data has shape',test_df.shape)
test_df.head()

0it [00:00, ?it/s]

Test data has shape (7683577, 4)


session,aid,ts,type
i32,i32,i32,i8
12179284,387693,1661493373,0
12179284,387693,1661493447,0
12179284,1146382,1661493519,0
12179285,1334074,1661493374,0
12179286,137164,1661493375,0


In [6]:
# concat training and testing dataset before feature generation
all_df = pl.concat([train_df, test_df])

# releaes memory
del train_df
_ = gc.collect()

all_df

session,aid,ts,type
i32,i32,i32,i8
10321700,651124,1660924932,0
10321700,651124,1660925059,0
10321710,446025,1660924934,0
10321710,1576577,1660924987,0
10321710,479985,1660925006,0
10321720,892324,1660924937,0
10321730,1649879,1660924938,0
10321730,381537,1660942127,0
10321730,1649879,1660942236,0
10321730,381537,1660942372,0


### Generate user(session) features
- number of clicks/carts/orders and ratio for each session - can indicate whether this customer browse a lot buy little or browse a lot and buy a lot
- transaction hours - can indicate what kind of this user is(AM or PM user), may help model to cluster customers

In [26]:
%%time
# user related features - number of clicks/carts/orders and its ratio for each session
user_type_count = (
    all_df.groupby(
        ['session', 'type']
    )
    .count()
    .pivot(
        values = 'count', index = 'session', columns='type', aggregate_fn='sum'
    )
    .with_column(
        pl.col("*").fill_null(pl.lit(0))
    )
    .rename(
        {
            '0':'u_clicks', # user(session) clicks
            '1':'u_carts', # user(session) carts
            '2':'u_orders' # user(session) orders
        }
    )
    .with_columns([
        (pl.col('u_clicks') + pl.col('u_carts') + pl.col('u_orders')).alias('u_sess_len'), # user session length
        pl.col("u_clicks").rank().alias('ur_clicks'), # user ranked clicks
        pl.col("u_carts").rank().alias('ur_carts'), # user ranked carts
        pl.col("u_orders").rank().alias('ur_orders'), # user ranked orders
        pl.when(pl.col('u_clicks') == 0).then(0).otherwise(pl.col('u_carts') / pl.col('u_clicks')).cast(pl.Float32).alias('u_click_cart_ratio'),
        pl.when(pl.col('u_clicks') == 0).then(0).otherwise(pl.col('u_orders') / pl.col('u_clicks')).cast(pl.Float32).alias('u_order_click_ratio'),
        pl.when(pl.col('u_carts') == 0).then(0).otherwise(pl.col('u_orders') / pl.col('u_carts')).cast(pl.Float32).alias('u_order_cart_ratio')
    ])
    .with_columns([
        ( (pl.col('ur_clicks') - pl.col('ur_clicks').min())/(pl.col('ur_clicks').max() - pl.col('ur_clicks').min()) ).alias('ur_clicks'), # normalization
        ( (pl.col('ur_carts') - pl.col('ur_carts').min())/(pl.col('ur_carts').max() - pl.col('ur_carts').min()) ).alias('ur_carts'), # normalization
        ( (pl.col('ur_orders') - pl.col('ur_orders').min())/(pl.col('ur_orders').max() - pl.col('ur_orders').min()) ).alias('ur_orders') # normalization
    ])
)

user_type_count.head(3)

CPU times: user 6.23 s, sys: 1.2 s, total: 7.42 s
Wall time: 1.68 s


session,u_clicks,u_orders,u_carts,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio
i32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32
6307760,2,0,0,2,0.400304,0.0,0.0,0.0,0.0,0.0
6131840,3,0,0,3,0.549269,0.0,0.0,0.0,0.0,0.0
1580560,5,3,4,12,0.697404,0.926686,0.971361,0.8,0.6,0.75


In [27]:
%%time
# user related features - user interaction hour - mean, min, max, std, rank across dataset
user_transaction_hour = (
    all_df.select(
        pl.col(['session', 'ts'])
    )
    .with_column(
        pl.from_epoch("ts", unit="s").alias('ts'),
    )
    .with_column(
        pl.col('ts').dt.hour().alias('hod') # hour of day
    )
    .select(
        pl.col(['session', 'hod'])
    )
    .groupby(
        'session'
    )
    .agg([
        pl.col('hod').mean().cast(pl.Float32).alias('u_mean_txn_hod'),
        pl.col('hod').median().cast(pl.UInt8).alias('u_median_txn_hod'),
        pl.col('hod').min().cast(pl.UInt8).alias('u_min_txn_hod'),
        pl.col('hod').max().cast(pl.UInt8).alias('u_max_txn_hod'),
        pl.col('hod').std().cast(pl.Float32).alias('u_std_txn_hod')
    ])
    .with_columns([
        pl.col("u_mean_txn_hod").rank().alias('u_rmean_txn_hod'), # ranked mean transaction hour
        pl.col("u_median_txn_hod").rank().alias('u_rmedian_txn_hod'), # ranked median transaction hour
        pl.col("u_min_txn_hod").rank().alias('u_rmin_txn_hod'), # ranked min transaction hour
        pl.col("u_max_txn_hod").rank().alias('u_rmax_txn_hod') # ranked max transaction hour
    ])
    .with_columns([
        ( (pl.col('u_rmean_txn_hod') - pl.col('u_rmean_txn_hod').min())/(pl.col('u_rmean_txn_hod').max() - pl.col('u_rmean_txn_hod').min()) ).alias('u_rmean_txn_hod'), # normalization
        ( (pl.col('u_rmedian_txn_hod') - pl.col('u_rmedian_txn_hod').min())/(pl.col('u_rmedian_txn_hod').max() - pl.col('u_rmedian_txn_hod').min()) ).alias('u_rmedian_txn_hod'), # normalization
        ( (pl.col('u_rmin_txn_hod') - pl.col('u_rmin_txn_hod').min())/(pl.col('u_rmin_txn_hod').max() - pl.col('u_rmin_txn_hod').min()) ).alias('u_rmin_txn_hod'), # normalization
        ( (pl.col('u_rmax_txn_hod') - pl.col('u_rmax_txn_hod').min())/(pl.col('u_rmax_txn_hod').max() - pl.col('u_rmax_txn_hod').min()) ).alias('u_rmax_txn_hod') # normalization
    ])
)

user_transaction_hour.head(3)

CPU times: user 20.3 s, sys: 641 ms, total: 20.9 s
Wall time: 1.97 s


session,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod
i32,f32,u8,u8,u8,f32,f32,f32,f32,f32
5767440,18.961164,20,13,23,2.531846,0.837181,0.911297,0.539928,1.0
11986208,9.0,9,9,9,0.0,0.17859,0.194544,0.297612,0.153351
12755248,13.0,13,13,13,0.0,0.431362,0.434226,0.539928,0.347912


### Generate item(aid) features
- global aid counts for click/cart/order - can show popularity of certain aids
- global aid transaction hour

In [1]:
%%time
# item related features - number of clicks/carts/orders and its ratio for each aid
item_type_count = (
    all_df.groupby(
        ['aid', 'type']
    )
    .count()
    .pivot(
        values = 'count', index = 'aid', columns='type', aggregate_fn='sum'
    )
    .with_column(
        pl.col("*").fill_null(pl.lit(0))
    )
    .rename(
        {
            '0':'i_clicks', # item clicks
            '1':'i_carts', # item carts
            '2':'i_orders' # item orders
        }
    )
    .with_columns([
        (pl.col('i_clicks') + pl.col('i_carts') + pl.col('i_orders')).alias('i_inter_len'), # item number of interaction in total
        pl.col("i_clicks").rank().alias('ir_clicks'), # item ranked clicks
        pl.col("i_carts").rank().alias('ir_carts'), # item ranked carts
        pl.col("i_orders").rank().alias('ir_orders'), # item ranked orders
        pl.when(pl.col('i_clicks') == 0).then(0).otherwise(pl.col('i_carts') / pl.col('i_clicks')).cast(pl.Float32).alias('i_cart_click_ratio'),
        pl.when(pl.col('i_clicks') == 0).then(0).otherwise(pl.col('i_orders') / pl.col('i_clicks')).cast(pl.Float32).alias('i_order_click_ratio'),
        pl.when(pl.col('i_carts') == 0).then(0).otherwise(pl.col('i_orders') / pl.col('i_carts')).cast(pl.Float32).alias('i_order_cart_ratio')
    ])
    .with_columns([
        ( (pl.col('ir_clicks') - pl.col('ir_clicks').min())/(pl.col('ir_clicks').max() - pl.col('ir_clicks').min()) ).alias('ir_clicks'),
        ( (pl.col('ir_carts') - pl.col('ir_carts').min())/(pl.col('ir_carts').max() - pl.col('ir_carts').min()) ).alias('ir_carts'),
        ( (pl.col('ir_orders') - pl.col('ir_orders').min())/(pl.col('ir_orders').max() - pl.col('ir_orders').min()) ).alias('ir_orders')
    ])
)
item_type_count.head(3)

NameError: name 'all_df' is not defined

In [29]:
%%time
# item time related features - item interaction hour - mean, min, max, std across dataset
item_transaction_hour = (
    all_df.select(
        pl.col(['aid', 'ts'])
    )
    .with_column(
        pl.from_epoch("ts", unit="s").alias('ts'),
    )
    .with_column(
        pl.col('ts').dt.hour().alias('hod') # hour of day
    )
    .select(
        pl.col(['aid', 'hod'])
    )
    .groupby(
        'aid'
    )
    .agg([
        pl.col('hod').mean().cast(pl.Float32).alias('i_mean_txn_hod'),
        pl.col('hod').median().cast(pl.Float32).alias('i_median_txn_hod'),
        pl.col('hod').min().cast(pl.UInt8).alias('i_min_txn_hod'),
        pl.col('hod').max().cast(pl.UInt8).alias('i_max_txn_hod'),
        pl.col('hod').std().cast(pl.Float32).alias('i_std_txn_hod')
    ])
    .with_columns([
        pl.col("i_mean_txn_hod").rank().alias('i_rmean_txn_hod'), # ranked mean transaction hour
        pl.col("i_median_txn_hod").rank().alias('i_rmedian_txn_hod'), # ranked median transaction hour
        pl.col("i_min_txn_hod").rank().alias('i_rmin_txn_hod'), # ranked min transaction hour
        pl.col("i_max_txn_hod").rank().alias('i_rmax_txn_hod') # ranked max transaction hour
    ])
    .with_columns([
        ( (pl.col('i_rmean_txn_hod') - pl.col('i_rmean_txn_hod').min())/(pl.col('i_rmean_txn_hod').max() - pl.col('i_rmean_txn_hod').min()) ).alias('i_rmean_txn_hod'), # normalization
        ( (pl.col('i_rmedian_txn_hod') - pl.col('i_rmedian_txn_hod').min())/(pl.col('i_rmedian_txn_hod').max() - pl.col('i_rmedian_txn_hod').min()) ).alias('i_rmedian_txn_hod'), # normalization
        ( (pl.col('i_rmin_txn_hod') - pl.col('i_rmin_txn_hod').min())/(pl.col('i_rmin_txn_hod').max() - pl.col('i_rmin_txn_hod').min()) ).alias('i_rmin_txn_hod'), # normalization
        ( (pl.col('i_rmax_txn_hod') - pl.col('i_rmax_txn_hod').min())/(pl.col('i_rmax_txn_hod').max() - pl.col('i_rmax_txn_hod').min()) ).alias('i_rmax_txn_hod') # normalization
    ])
)

item_transaction_hour.head(3)

CPU times: user 12.9 s, sys: 325 ms, total: 13.2 s
Wall time: 1.31 s


aid,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod
i32,f32,f32,u8,u8,f32,f32,f32,f32,f32
960144,9.0,9.0,7,11,2.828427,0.093514,0.11652,0.363406,0.116231
785856,17.5,17.5,14,21,4.949748,0.853544,0.802671,0.772858,0.78442
33200,13.333333,13.0,8,19,5.507571,0.41706,0.375717,0.441276,0.527821


### Generate labels - implict score for click events only
- use time weight to create implict scores for click event
- group by session and aid and calc sum scoring for duplicate click event under same session
- use percentile to cutoff and get ranking label

In [11]:
%%time
# min ts (from training set) , max ts (from testing set)
ts_min, ts_max = 1659304800, 1662328791

all_df_click = (
    all_df.filter(
        pl.col('type') == 0 # only select click event
    )
    .with_columns([
        ( 3*(pl.col('ts') - ts_min)/(ts_max-ts_min) ).cast(pl.Float32).alias('score')
    ])
    .select(
        pl.col(['session', 'aid', 'score'])
    )
    .groupby(
        ['session', 'aid']
    )
    .agg([
        pl.col('score').sum().alias('score')
    ])
    .with_columns([
        pl.when(pl.col('score') < pl.col('score').quantile(.25)).then(1)
        .otherwise(
            pl.when(pl.col('score') < pl.col('score').quantile(.5)).then(2)
            .otherwise(
                pl.when(pl.col('score') < pl.col('score').quantile(.75)).then(3)
                .otherwise(
                    pl.when(pl.col('score') < pl.col('score').quantile(.9)).then(4).otherwise(5)
                )
            )
        ).cast(pl.Int8).alias('rank')
    ])
    .select(
        pl.col(['session', 'aid', 'rank'])
    )
)

all_df_click

CPU times: user 17.1 s, sys: 1.43 s, total: 18.5 s
Wall time: 3.09 s


session,aid,rank
i32,i32,i8
11608064,1618795,3
3892960,84074,1
8207360,1246988,4
9079680,175982,4
1324000,1283678,4
2504720,893597,2
12325168,105542,4
12047792,366619,4
2341040,174159,2
1070480,767020,2


### join with previous features

In [12]:
%%time
# join with session features
all_df_click = all_df_click.join(user_type_count, on=['session'], how='left')
all_df_click = all_df_click.join(user_transaction_hour, on=['session'], how='left')

# join with item features
all_df_click = all_df_click.join(item_transaction_hour, on=['aid'], how='left')
all_df_click = all_df_click.join(item_type_count, on=['aid'], how='left')

all_df_click

CPU times: user 13.1 s, sys: 1.14 s, total: 14.3 s
Wall time: 1.83 s


session,aid,rank,u_clicks,u_carts,u_orders,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod,i_clicks,i_orders,i_carts,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio
i32,i32,i8,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32
11608064,1618795,3,5,0,0,5,0.697404,0.0,0.0,0.0,0.0,0.0,18.0,18,18,18,0.0,0.793686,0.771953,0.821165,0.672622,14.333333,17.0,8,18,5.507571,0.564572,0.76486,0.441276,0.430582,3,0,0,3,0.468919,0.0,0.0,0.0,0.0,0.0
3892960,84074,1,2,0,0,2,0.400304,0.0,0.0,0.0,0.0,0.0,10.0,10,10,10,0.0,0.23343,0.248999,0.359035,0.196833,12.589474,14.0,0,22,4.734425,0.321297,0.478913,0.0,0.898884,92,1,2,95,0.972531,0.795548,0.844944,0.021739,0.01087,0.5
8207360,1246988,4,13,4,0,17,0.875846,0.926686,0.0,0.307692,0.0,0.0,9.352942,9,9,10,0.492592,0.201799,0.194544,0.297612,0.196833,13.477465,13.0,0,23,5.215408,0.432419,0.375717,0.0,1.0,646,13,51,710,0.998002,0.996288,0.994629,0.078947,0.020124,0.254902
9079680,175982,4,12,0,0,12,0.865101,0.0,0.0,0.0,0.0,0.0,20.0,20,20,20,0.0,0.923597,0.911297,0.931386,0.851313,14.124679,14.0,0,23,5.293072,0.539858,0.478913,0.0,1.0,758,0,20,778,0.998459,0.985471,0.0,0.026385,0.0,0.0
1324000,1283678,4,109,3,0,112,0.993946,0.898883,0.0,0.027523,0.0,0.0,10.5,11,0,23,6.833938,0.263066,0.308065,0.0,1.0,14.389744,15.0,0,23,5.590115,0.571204,0.586598,0.0,1.0,185,0,10,195,0.988123,0.964486,0.0,0.054054,0.0,0.0
2504720,893597,2,103,3,0,106,0.993217,0.898883,0.0,0.029126,0.0,0.0,11.641509,11,6,20,3.450849,0.331707,0.308065,0.117389,0.851313,13.639344,16.0,4,21,4.775049,0.457259,0.682753,0.141427,0.78442,29,9,23,61,0.910544,0.9881,0.990705,0.793103,0.310345,0.391304
12325168,105542,4,31,0,0,31,0.95505,0.0,0.0,0.0,0.0,0.0,14.483871,17,6,18,5.10471,0.539809,0.701458,0.117389,0.672622,15.44,17.0,7,21,4.263019,0.696683,0.76486,0.363406,0.78442,22,0,3,25,0.885272,0.863458,0.0,0.136364,0.0,0.0
12047792,366619,4,51,2,0,53,0.977552,0.849235,0.0,0.039216,0.0,0.0,15.54717,16,14,17,0.991619,0.618222,0.634149,0.597928,0.595263,12.666667,13.0,7,19,5.573748,0.328605,0.375717,0.363406,0.527821,6,0,0,6,0.670178,0.0,0.0,0.0,0.0,0.0
2341040,174159,2,23,0,0,23,0.93473,0.0,0.0,0.0,0.0,0.0,15.652174,16,8,16,1.668115,0.62064,0.634149,0.235879,0.527378,15.028571,16.0,7,21,4.598502,0.666787,0.682753,0.363406,0.78442,32,0,3,35,0.918362,0.863458,0.0,0.09375,0.0,0.0
1070480,767020,2,54,5,1,60,0.979409,0.944403,0.916656,0.092593,0.018519,0.2,15.333333,14,9,19,2.022473,0.609734,0.500364,0.297612,0.760512,17.5,17.5,14,21,3.511885,0.853544,0.802671,0.772858,0.78442,4,0,0,4,0.559575,0.0,0.0,0.0,0.0,0.0


In [13]:
# release memory
# del user_type_count
# del user_transaction_hour
# del item_transaction_hour
# del item_type_count

# _ = gc.collect()

In [14]:
%%time
# sort training data and generate group information
all_df_click = all_df_click.sort('session')

# generate group information for LGB-ranker
group = all_df_click.groupby('session').agg([ pl.col('session').count().alias('session_length')])['session_length'].to_numpy()

group[:10]

CPU times: user 17.6 s, sys: 1.32 s, total: 18.9 s
Wall time: 3 s


array([112,  25,   7,  14,   2,   1,   2,   3,   2,   1], dtype=uint32)

In [15]:
### model build
from lightgbm import LGBMRanker

lgb = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=20,
    importance_type='gain',
    n_jobs=16,
)

lgb

LGBMRanker(boosting_type='dart', importance_type='gain', metric='ndcg',
           n_estimators=20, n_jobs=16, objective='lambdarank')

In [16]:
# define feature columns and target columns
aid_column = 'aid'
user_feature_columns = [c for c in all_df_click.columns if c.startswith('u')]
item_feature_columns = [c for c in all_df_click.columns if c.startswith('i')]
# all features
feature_columns = [aid_column] + user_feature_columns + item_feature_columns 
# target
target = 'rank'

print(f"All feature columns are {feature_columns} and target column is {target}")

All feature columns are ['aid', 'u_clicks', 'u_carts', 'u_orders', 'u_sess_len', 'ur_clicks', 'ur_carts', 'ur_orders', 'u_click_cart_ratio', 'u_order_click_ratio', 'u_order_cart_ratio', 'u_mean_txn_hod', 'u_median_txn_hod', 'u_min_txn_hod', 'u_max_txn_hod', 'u_std_txn_hod', 'u_rmean_txn_hod', 'u_rmedian_txn_hod', 'u_rmin_txn_hod', 'u_rmax_txn_hod', 'i_mean_txn_hod', 'i_median_txn_hod', 'i_min_txn_hod', 'i_max_txn_hod', 'i_std_txn_hod', 'i_rmean_txn_hod', 'i_rmedian_txn_hod', 'i_rmin_txn_hod', 'i_rmax_txn_hod', 'i_clicks', 'i_orders', 'i_carts', 'i_inter_len', 'ir_clicks', 'ir_carts', 'ir_orders', 'i_cart_click_ratio', 'i_order_click_ratio', 'i_order_cart_ratio'] and target column is rank


In [17]:
%%time
# fit model
ranker = lgb.fit(
    all_df_click[feature_columns].to_pandas(),
    all_df_click[target].to_pandas(),
    group=group
)

CPU times: user 3min 38s, sys: 3.39 s, total: 3min 41s
Wall time: 19.9 s


In [18]:
feature_importance = pl.DataFrame(
    {
        "feature_name": ranker.feature_name_,
        "importance_gain": ranker.feature_importances_
    }
)

feature_importance.sort('importance_gain', reverse=True).transpose()

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,column_36,column_37,column_38
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""u_clicks""","""u_std_txn_hod""","""i_std_txn_hod""","""i_clicks""","""u_sess_len""","""i_inter_len""","""i_carts""","""u_median_txn_h...","""i_median_txn_h...","""u_mean_txn_hod...","""u_click_cart_r...","""i_cart_click_r...","""i_order_cart_r...","""i_max_txn_hod""","""u_max_txn_hod""","""u_carts""","""i_order_click_...","""u_order_click_...","""i_orders""","""u_min_txn_hod""","""i_mean_txn_hod...","""i_min_txn_hod""","""aid""","""u_orders""","""ur_clicks""","""ur_carts""","""ur_orders""","""u_order_cart_r...","""u_rmean_txn_ho...","""u_rmedian_txn_...","""u_rmin_txn_hod...","""u_rmax_txn_hod...","""i_rmean_txn_ho...","""i_rmedian_txn_...","""i_rmin_txn_hod...","""i_rmax_txn_hod...","""ir_clicks""","""ir_carts""","""ir_orders"""
"""295539.3141479...","""143574.9413146...","""135951.1379241...","""118196.1740570...","""61763.61694335...","""56698.87368774...","""55101.94389343...","""30830.39901733...","""29709.87088012...","""24892.24581909...","""24610.18902587...","""22777.06695556...","""9246.871978759...","""5059.214019775...","""4497.502929687...","""4328.669982910...","""3826.273986816...","""3746.869873046...","""3522.768005371...","""3156.368927001...","""2660.011993408...","""1859.774993896...","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0""","""0.0"""


### Generate candidates
- customer historical interaction in test set
- top 40 candidate from co visitation matrix
- top 20 clicks in the dataset

In [19]:
%%time
# top 20 clicks from training & testing set
top_20_clicks = all_df_click[['aid']].to_pandas().value_counts().index.values[:20]
top_20_clicks = [t[0] for t in top_20_clicks]


# Improved speed by using polars. 
def pqt_to_dict(path):
    return pl.read_parquet(path).groupby('aid_x').agg(pl.col('aid_y').list()).to_pandas().set_index('aid_x').aid_y.apply(list).to_dict()

DISK_PIECES = 4
# LOAD top 40 CO-VISITATION MATRICES
top_40_clicks_co = pqt_to_dict(f'./processed_data/co_matrix_top_40/top_40_clicks_v1_0.pqt')
for k in range(1,DISK_PIECES): 
    top_40_clicks_co.update(pd.read_parquet(f'./processed_data/co_matrix_top_40/top_40_clicks_v1_{k}.pqt') )  # f'./processed_data/co_matrix_top_40/top_40_clicks_v1_{k}.pqt'

CPU times: user 7.47 s, sys: 1.57 s, total: 9.04 s
Wall time: 12.6 s


In [20]:
def generate_clicks_candidates(df, 
                               cov_matrix=top_40_clicks_co, 
                               top_clicks=top_20_clicks, 
                               num_of_candidates=40, 
                               type_weight_multipliers={0:0.5, 1:9, 2:0.5}):
    """ generate clicks candidates
    """
    # USER HISTORY AIDS AND TYPES
    aids = df.aid.tolist()
    types = df.type.tolist()
    unique_aids = list(dict.fromkeys(aids[::-1]))
    
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= num_of_candidates:
        weights = np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid, w,t in zip(aids, weights, types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        sorted_aids = [k for k,v in aids_temp.most_common(num_of_candidates)]
        return sorted_aids
    # USE "CLICKS" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[cov_matrix[aid] for aid in unique_aids if aid in cov_matrix]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2).most_common(num_of_candidates) if aid2 not in unique_aids]    
    result = top_aids2[:num_of_candidates]
    # USE TOP20 TEST CLICKS
    return result + list(top_clicks)[:num_of_candidates-len(result)]

In [21]:
%%time
# send candidates into ranker model to do final ranking
pred_df_clicks = test_df.to_pandas().sort_values(["session", "ts"]).groupby(["session"]).parallel_apply(
    lambda x: generate_clicks_candidates(x)
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=112579), Label(value='0 / 112579')…

CPU times: user 2min 12s, sys: 10.8 s, total: 2min 23s
Wall time: 3min 22s


In [22]:
%%time
clicks_pred_df = pd.DataFrame(pred_df_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index()
clicks_pred_df.columns = ['session_type', 'labels']
clicks_pred_df["labels"] = clicks_pred_df.labels.apply(lambda x: " ".join(map(str,x)))
clicks_pred_df

CPU times: user 17.3 s, sys: 310 ms, total: 17.6 s
Wall time: 17.6 s


Unnamed: 0,session_type,labels
0,11098528_clicks,588923 1732105 571762 884502 876129 1157882 11...
1,11098529_clicks,485256 108125 1460571 29735 184976 1502122 959...
2,11098530_clicks,1603001 963957 254154 583026 364155 210880 752...
3,11098531_clicks,318068 1630025 698990 786626 1689916 476369 92...
4,11098532_clicks,108125 1402537 659399 738098 24318 612920 1673...
...,...,...
1801246,12899774_clicks,1539309 819288 95488 771913 270852 31490 74397...
1801247,12899775_clicks,485256 108125 1460571 29735 184976 1502122 959...
1801248,12899776_clicks,485256 108125 1460571 29735 184976 1502122 959...
1801249,12899777_clicks,1688215 1308634 703474 395762 1486067 613752 2...


### Run local validation - directly use candidates (without LGBM re-rank)

In [23]:
# loading test labels
test_labels = pd.read_parquet(test_label_file_path)
# concat predictions
pred_df = pd.concat([clicks_pred_df])

def calc_cv_score(pred_df, labels=test_labels):
    # init variable
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    # only have clicks right now
    for t in ['clicks']: # 'carts','orders'
        sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')])
        labels = labels.loc[test_labels['type']==t]
        labels = labels.merge(sub, how='left', on=['session'])
        labels = labels.dropna()
        labels['hits'] = labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        labels['gt_count'] = labels.ground_truth.str.len().clip(0,20)
        recall = labels['hits'].sum() / labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)
    print('=============')
    print('Overall Recall =',score)
    print('=============')

In [24]:
%%time
# calc score
calc_cv_score(pred_df, test_labels)

clicks recall = 0.137959162283385
Overall Recall = 0.013795916228338501
CPU times: user 50.3 s, sys: 1.19 s, total: 51.5 s
Wall time: 51.5 s


### Run local validation (with LGBM re-rank)

In [72]:
%%time
# generate candidate dataframe 
pred_df = pd.DataFrame(pred_df_clicks, columns=["aid"]).reset_index()
pred_df = pred_df.explode('aid')
# convert to pl
pred_df_dl = pl.from_pandas(pred_df).with_columns([
    pl.col('*').cast(pl.Int32)
])
# release memory
del pred_df
_ = gc.collect()

pred_df_dl

CPU times: user 13.8 s, sys: 1.02 s, total: 14.8 s
Wall time: 14.6 s


session,aid
i32,i32
11098528,588923
11098528,1732105
11098528,571762
11098528,884502
11098528,876129
11098528,1157882
11098528,1182614
11098528,1790438
11098528,307904
11098528,231487


In [73]:
%%time
# join with session features
pred_df_dl = pred_df_dl.join(user_type_count, on=['session'], how='left')
pred_df_dl = pred_df_dl.join(user_transaction_hour, on=['session'], how='left')

# join with item features
pred_df_dl = pred_df_dl.join(item_transaction_hour, on=['aid'], how='left')
pred_df_dl = pred_df_dl.join(item_type_count, on=['aid'], how='left')

pred_df_dl

CPU times: user 23.7 s, sys: 3.13 s, total: 26.9 s
Wall time: 3.83 s


session,aid,u_clicks,u_orders,u_carts,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod,i_clicks,i_orders,i_carts,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio
i32,i32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32
11098528,588923,1,0,0,1,0.152625,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.98794,0.985622,0.988855,0.970873,13.110331,13.0,0,23,4.943953,0.393077,0.375717,0.0,1.0,2676,83,232,2991,0.999835,0.99973,0.999801,0.086697,0.031016,0.357759
11098528,1732105,1,0,0,1,0.152625,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.98794,0.985622,0.988855,0.970873,13.220752,14.0,0,23,5.151707,0.403728,0.478913,0.0,1.0,1104,72,260,1436,0.99915,0.999777,0.999739,0.235507,0.065217,0.276923
11098528,571762,1,0,0,1,0.152625,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.98794,0.985622,0.988855,0.970873,13.974326,14.0,0,23,5.03755,0.494629,0.478913,0.0,1.0,2125,55,157,2337,0.999738,0.999435,0.999558,0.073882,0.025882,0.350318
11098528,884502,1,0,0,1,0.152625,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.98794,0.985622,0.988855,0.970873,13.231216,14.0,0,23,4.978339,0.404873,0.478913,0.0,1.0,3714,99,313,4126,0.999923,0.999853,0.999862,0.084276,0.026656,0.316294
11098528,876129,1,0,0,1,0.152625,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.98794,0.985622,0.988855,0.970873,13.909857,15.0,0,23,4.977603,0.490147,0.586598,0.0,1.0,2048,68,258,2374,0.999719,0.999774,0.999702,0.125977,0.033203,0.263566
11098528,1157882,1,0,0,1,0.152625,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.98794,0.985622,0.988855,0.970873,13.736248,14.0,0,23,4.783912,0.470128,0.478913,0.0,1.0,3084,83,287,3454,0.999872,0.999815,0.999801,0.093061,0.026913,0.289199
11098528,1182614,1,0,0,1,0.152625,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.98794,0.985622,0.988855,0.970873,14.022025,15.0,0,23,4.80494,0.532228,0.586598,0.0,1.0,4752,155,496,5403,0.999958,0.999939,0.99995,0.104377,0.032618,0.3125
11098528,1790438,1,0,0,1,0.152625,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.98794,0.985622,0.988855,0.970873,14.043156,14.0,0,23,4.655535,0.533391,0.478913,0.0,1.0,2610,46,171,2827,0.999828,0.999524,0.999383,0.065517,0.017625,0.269006
11098528,307904,1,0,0,1,0.152625,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.98794,0.985622,0.988855,0.970873,13.76655,14.0,0,23,4.870729,0.474818,0.478913,0.0,1.0,1038,27,83,1148,0.999057,0.998336,0.99836,0.079961,0.026012,0.325301
11098528,231487,1,0,0,1,0.152625,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.98794,0.985622,0.988855,0.970873,13.437048,14.0,0,23,4.896045,0.429084,0.478913,0.0,1.0,7745,416,1243,9404,0.999992,0.999995,1.0,0.160491,0.053712,0.334674


In [74]:
%%time
# Model inference
scores = ranker.predict(pred_df_dl[feature_columns].to_pandas())

# Appending the model score to the original dataframe
pred_df_dl = pred_df_dl.with_columns(pl.Series(name='rank', values=scores))

# Getting the top 20 candidates from the prediction
pred_df_dl = pred_df_dl.sort(['session', 'rank'], reverse=True).groupby('session').agg([
    pl.col('aid').limit(20).list().alias('labels')
])

# Converting to pandas format and making it align with result format
clicks_pred_df = pred_df_dl.with_columns(
    pl.col('session') + '_clicks'
).to_pandas()
    
clicks_pred_df

CPU times: user 1min 17s, sys: 11.9 s, total: 1min 29s
Wall time: 21.3 s


Unnamed: 0,session,labels
0,12899778_clicks,"[485256, 108125, 1460571, 29735, 184976, 15021..."
1,12899777_clicks,"[1688215, 1308634, 703474, 395762, 1486067, 61..."
2,12899776_clicks,"[485256, 108125, 1460571, 29735, 184976, 15021..."
3,12899775_clicks,"[485256, 108125, 1460571, 29735, 184976, 15021..."
4,12899774_clicks,"[1539309, 819288, 95488, 771913, 270852, 31490..."
...,...,...
1801246,11098532_clicks,"[108125, 612920, 659399, 1754057, 39615, 14025..."
1801247,11098531_clicks,"[348358, 485256, 1169866, 1177944, 108125, 146..."
1801248,11098530_clicks,"[485256, 1603001, 1604220, 1596897, 254154, 10..."
1801249,11098529_clicks,"[485256, 108125, 1460571, 29735, 184976, 15021..."


In [75]:
%%time
# convert to required format
clicks_pred_df.columns = ['session_type', 'labels']
clicks_pred_df["labels"] = clicks_pred_df.labels.apply(lambda x: " ".join(map(str,x)))
clicks_pred_df

CPU times: user 19.7 s, sys: 215 ms, total: 19.9 s
Wall time: 19.9 s


Unnamed: 0,session_type,labels
0,12899778_clicks,485256 108125 1460571 29735 184976 1502122 959...
1,12899777_clicks,1688215 1308634 703474 395762 1486067 613752 2...
2,12899776_clicks,485256 108125 1460571 29735 184976 1502122 959...
3,12899775_clicks,485256 108125 1460571 29735 184976 1502122 959...
4,12899774_clicks,1539309 819288 95488 771913 270852 31490 74397...
...,...,...
1801246,11098532_clicks,108125 612920 659399 1754057 39615 1402537 738...
1801247,11098531_clicks,348358 485256 1169866 1177944 108125 1460571 2...
1801248,11098530_clicks,485256 1603001 1604220 1596897 254154 1066554 ...
1801249,11098529_clicks,485256 108125 1460571 29735 184976 1502122 959...


In [76]:
%%time
# concat predictions
pred_df = pd.concat([clicks_pred_df])
# calc score
calc_cv_score(pred_df, test_labels)

clicks recall = 0.09095352183438202
Overall Recall = 0.009095352183438201
CPU times: user 44.2 s, sys: 918 ms, total: 45.1 s
Wall time: 45.1 s
