In [1]:
import pandas as pd, numpy as np, polars as pl
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
from pandarallel import pandarallel
from lightgbm import LGBMRanker
import lightgbm as lgb

pandarallel.initialize(nb_workers=28, progress_bar=False)

# co visitation matrix version
VER = 1
is_validation = False
type_labels = {'clicks':0, 'carts':1, 'orders':2}

# for validation
if is_validation:
    # co visitation matrix path
    co_visitation_matrix_path = './val_co_visitation_matrix/'
    # data path
    train_data_path = './val_data/train_parquet/*'
    test_data_path = './val_data/test_parquet/*'
    # label path
    test_label = './val_data/test_labels.parquet'
    # co visitation matrix
    co_candidates = './val_data/cv_candidates.pgt'
    # model path
    model_path = './model/val_order_model.txt'
    # root path
    root_path = './val_data/'
# for full dataset
else:
    # co visitation matrix path
    co_visitation_matrix_path = './co_visitation_matrix/'
    # data path
    train_data_path = './data/train_parquet/*parquet'
    test_data_path = './data/test_parquet/*parquet'
    # label path - no label files for whole datset
    test_label = ''
    # co visitation matrix
    co_candidates = './data/candidates.pgt'
    # model path
    model_path = './model/full_cart_model.txt'
    # root path
    root_path = './data/'

def read_file_into_mem(f):
    return (
        pl.read_parquet(
            f
        )
        .with_columns([
            (pl.col('ts') / 1000).cast(pl.Int32).alias('ts'),
            pl.col('type').apply(lambda x: type_labels[x]).cast(pl.Int8).alias('type'),
            pl.col('session').cast(pl.Int32),
            pl.col('aid').cast(pl.Int32)
        ])
    )

INFO: Pandarallel will run on 28 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 1 - loading train data

In [2]:
%%time
# init variables
type_labels = {'clicks':0, 'carts':1, 'orders':2}
train_df = None
# grab files
files = glob.glob(train_data_path)
# log
print(f"Total loaded {len(files)} files, start to load into memory")
# loading file into memroy
for f in files:
    if train_df is None:
        train_df = read_file_into_mem(f)
    else:
        train_df = train_df.vstack(read_file_into_mem(f))
train_df

Total loaded 129 files, start to load into memory
CPU times: user 24.6 s, sys: 10 s, total: 34.7 s
Wall time: 28.1 s


session,aid,ts,type
i32,i32,i32,i8
0,1517085,1659304800,0
0,1563459,1659304904,0
0,1309446,1659367439,0
0,16246,1659367719,0
0,1781822,1659367871,0
0,1152674,1659367885,0
0,1649869,1659369893,1
0,461689,1659369898,1
0,305831,1659370027,2
0,461689,1659370027,2


## 2 - loading test data

In [3]:
test_df = None
# grab files
files = glob.glob(test_data_path)
# log
print(f"Total loaded {len(files)} files, start to load into memory")
# loading file into memroy
for f in files:
    if test_df is None:
        test_df = read_file_into_mem(f)
    else:
        test_df = test_df.vstack(read_file_into_mem(f))
test_df

Total loaded 17 files, start to load into memory


session,aid,ts,type
i32,i32,i32,i8
12899779,59625,1661724000,0
12899780,1142000,1661724000,0
12899780,582732,1661724058,0
12899780,973453,1661724109,0
12899780,736515,1661724136,0
12899780,1142000,1661724155,0
12899781,141736,1661724000,0
12899781,199008,1661724022,0
12899781,57315,1661724170,0
12899781,194067,1661724246,0


## 3 - feature engineering


### Loading feature file

In [4]:
%%time
# user related features - loading feature generated by order rerank model
user_type_count = pl.read_parquet(f'{root_path}user_type_count.pgt')
user_transaction_hour = pl.read_parquet(f'{root_path}user_transaction_hour.pgt')
# item related features
item_type_count = pl.read_parquet(f'{root_path}item_type_count.pgt')
item_transaction_hour = pl.read_parquet(f'{root_path}item_transaction_hour.pgt')

CPU times: user 368 ms, sys: 63.3 ms, total: 431 ms
Wall time: 86.8 ms


### Generate interaction features only using test data

In [5]:
%%time
# item feature - is carted item has been clicked or not
df_cart = (
    test_df.filter(
        pl.col('type') == 1
    )
    .select(
        pl.col(['session', 'aid'])
    )
)

df_click = (
    test_df.filter(
        pl.col('type') == 0
    )
    .with_column(
        pl.lit(1).cast(pl.Int8).alias('is_clicked')
    )
    .select(
        pl.col(['session', 'aid', 'is_clicked'])
    )
)

# left join order dataframe with cart dataframe
item_cart_interactions = (
    df_cart.join(
        df_click,
        on=['session', 'aid'],
        how='left'
    )
    .unique(
        subset=['session', 'aid']
    )
    .with_columns([
        pl.col('is_clicked').fill_null(pl.lit(0))
    ])
)

item_cart_interactions.head(3)

CPU times: user 1.99 s, sys: 645 ms, total: 2.63 s
Wall time: 233 ms


session,aid,is_clicked
i32,i32,i8
12899781,199008,1
12899782,1494780,1
12899782,413962,1


In [6]:
%%time
# save user related feature to disk
item_cart_interactions.write_parquet(f'{root_path}item_cart_interactions.pgt')

CPU times: user 18.3 ms, sys: 4.16 ms, total: 22.5 ms
Wall time: 37.2 ms


## 4 - generate train labels
- This rerank model will focus on rerank aids are likly to be ordered
- Hence if type = 2 (order) then we will give it a positive label, otherwise will give it a negative label

In [7]:
%%time

# if it's validation use test_labels.parquet file
if is_validation:
    # log
    print("Validtion -> True, use test label file to generate labels")
    # loading labels
    labels = pd.read_parquet(test_label)
    # only select oders - since this an order rerank model
    labels = labels.loc[ labels['type'] == 'carts' ]
    # expand items
    aids = labels.ground_truth.explode().astype('int32').rename('aid')
    labels = labels[['session']].astype('int32')
    labels = labels.merge(aids, left_index=True, right_index=True, how='left')
    labels['cart'] = 1
    # convert to pl dataframe
    labels = pl.from_pandas(labels).with_column(pl.col('cart').cast(pl.Int8))
# if it's not validation use all the test data
else:
    # log
    print("Vlidation -> False, use original test data to generate labels")
    labels = (
        test_df.filter(
            pl.col('type') == 1 # only select cart
        )
        .with_columns([
            pl.col('session').cast(pl.Int32),
            pl.col('aid').cast(pl.Int32),
            pl.lit(1).cast(pl.Int8).alias('cart')
        ])
        .select(
            pl.col(['session', 'aid', 'cart'])
        )
    )

labels

Vlidation -> False, use original test data to generate labels
CPU times: user 6.15 ms, sys: 37.1 ms, total: 43.2 ms
Wall time: 13.8 ms


session,aid,cart
i32,i32,i8
12899781,199008,1
12899782,1494780,1
12899782,413962,1
12899782,779477,1
12899782,562753,1
12899782,476063,1
12899782,779477,1
12899782,975116,1
12899782,595994,1
12899782,1344773,1


In [8]:
%%time
# loading candidates file - generated by co visitation matrix
candidates = pl.read_parquet(co_candidates)
candidates = candidates[['session_type','labels']]
# retrieve order candidates generated by co visitation matrix
carts_candidates = (
    candidates
    .filter(
        pl.col('session_type').str.contains('carts')  # only select orders
    )
    .with_columns([
        pl.col('session_type').str.split('_').alias('session_type'), # split session and type
        pl.col('labels').str.split(' ').alias('aid') # split candidate and rename back to aid
    ])
    .with_columns([
        pl.col('session_type').arr.get(i).alias('session' if i == 0 else 'type') for i in range(2) # expand session and type to new columns
    ])
    .select(
        pl.col(['session', 'aid'])
    )
    .explode('aid')
    .with_columns([
        pl.col('session').cast(pl.Int32),
        pl.col('aid').cast(pl.Int32)
    ])
)
carts_candidates

CPU times: user 7.68 s, sys: 4.49 s, total: 12.2 s
Wall time: 7.23 s


session,aid
i32,i32
12899779,59625
12899779,469285
12899779,438191
12899779,731692
12899779,737445
12899779,1253524
12899779,1790770
12899779,3295
12899779,45290
12899779,94230


In [9]:
# merge candidates and label file to genreate training data
train_df = (
    carts_candidates.join(
        labels,
        on=['session', 'aid'],
        how='left'
    )
    .with_column(
        pl.col('cart').fill_null(0)
    )
)

train_df

session,aid,cart
i32,i32,i8
12899779,59625,0
12899779,469285,0
12899779,438191,0
12899779,731692,0
12899779,737445,0
12899779,1253524,0
12899779,1790770,0
12899779,3295,0
12899779,45290,0
12899779,94230,0


In [10]:
%%time
# join with session features
train_df = train_df.join(user_type_count, on=['session'], how='left')
train_df = train_df.join(user_transaction_hour, on=['session'], how='left')

# join with item features
train_df = train_df.join(item_transaction_hour, on=['aid'], how='left')
train_df = train_df.join(item_type_count, on=['aid'], how='left')

# join with item interaction
train_df = train_df.join(item_cart_interactions, on=['session', 'aid'], how='left')
# fill na
train_df = (
    train_df.with_columns([
        pl.col('is_clicked').fill_null(pl.lit(0))
    ])
)
train_df

CPU times: user 41.2 s, sys: 9.91 s, total: 51.1 s
Wall time: 6.1 s


session,aid,cart,u_clicks,u_carts,u_orders,u_sess_len,ur_clicks,ur_carts,ur_orders,u_click_cart_ratio,u_order_click_ratio,u_order_cart_ratio,u_mean_txn_hod,u_median_txn_hod,u_min_txn_hod,u_max_txn_hod,u_std_txn_hod,u_rmean_txn_hod,u_rmedian_txn_hod,u_rmin_txn_hod,u_rmax_txn_hod,i_mean_txn_hod,i_median_txn_hod,i_min_txn_hod,i_max_txn_hod,i_std_txn_hod,i_rmean_txn_hod,i_rmedian_txn_hod,i_rmin_txn_hod,i_rmax_txn_hod,i_clicks,i_carts,i_orders,i_inter_len,ir_clicks,ir_carts,ir_orders,i_cart_click_ratio,i_order_click_ratio,i_order_cart_ratio,is_clicked
i32,i32,i8,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,u32,u32,u32,u32,f32,f32,f32,f32,f32,f32,i8
12899779,59625,0,1,0,0,1,0.233788,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985525,0.984585,0.986192,0.98053,15.692307,18.0,2,22,5.437288,0.872403,0.930464,0.234818,0.702331,13,0,0,13,0.387713,0.0,0.0,0.0,0.0,0.0,0
12899779,469285,0,1,0,0,1,0.233788,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985525,0.984585,0.986192,0.98053,15.333333,16.0,7,21,3.384456,0.83003,0.769353,0.737596,0.473259,42,3,0,45,0.710978,0.599684,0.0,0.071429,0.0,0.0,0
12899779,438191,0,1,0,0,1,0.233788,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985525,0.984585,0.986192,0.98053,14.223774,15.0,0,23,5.449549,0.603536,0.6219,0.0,1.0,3533,114,4,3651,0.996691,0.984853,0.848279,0.032267,0.001132,0.035088,0
12899779,731692,0,1,0,0,1,0.233788,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985525,0.984585,0.986192,0.98053,14.398497,15.0,3,22,4.696775,0.649181,0.6219,0.302915,0.702331,111,17,5,133,0.860102,0.890593,0.872933,0.153153,0.045045,0.294118,0
12899779,737445,0,1,0,0,1,0.233788,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985525,0.984585,0.986192,0.98053,13.571428,14.0,0,23,5.440594,0.403044,0.435298,0.0,1.0,943,16,0,959,0.981671,0.884908,0.0,0.016967,0.0,0.0,0
12899779,1253524,0,1,0,0,1,0.233788,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985525,0.984585,0.986192,0.98053,14.21147,15.0,0,23,5.426594,0.599675,0.6219,0.0,1.0,273,6,0,279,0.934397,0.751733,0.0,0.021978,0.0,0.0,0
12899779,1790770,0,1,0,0,1,0.233788,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985525,0.984585,0.986192,0.98053,11.5,10.0,2,21,4.538723,0.083942,0.057681,0.234818,0.473259,33,2,1,36,0.658333,0.491272,0.585776,0.060606,0.030303,0.5,0
12899779,3295,0,1,0,0,1,0.233788,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985525,0.984585,0.986192,0.98053,13.769737,14.0,2,23,4.997311,0.463465,0.435298,0.234818,1.0,151,1,0,152,0.890795,0.310196,0.0,0.006623,0.0,0.0,0
12899779,45290,0,1,0,0,1,0.233788,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985525,0.984585,0.986192,0.98053,15.157895,17.0,2,23,4.665642,0.804701,0.866744,0.234818,1.0,207,2,0,209,0.916331,0.491272,0.0,0.009662,0.0,0.0,0
12899779,94230,0,1,0,0,1,0.233788,0.0,0.0,0.0,0.0,0.0,22.0,22,22,22,0.0,0.985525,0.984585,0.986192,0.98053,13.659686,14.0,0,23,4.871538,0.429135,0.435298,0.0,1.0,558,14,1,573,0.967393,0.871472,0.585776,0.02509,0.001792,0.071429,0


## 04 - data down smapling
-  down sampling on negative samples

In [11]:
%%time
down_smaple_frac = 0.05 # 20x downsampling

train_df = train_df.to_pandas()
positives = train_df.loc[train_df['cart']==1]
negatives = train_df.loc[train_df['cart']==0].sample(frac=down_smaple_frac)
train_df = pd.concat([positives, negatives], axis=0, ignore_index=True)

# log
print(f"After downsmapling training shape is {train_df.shape}, postive shape is {positives.shape} negative shape is {negatives.shape}.")

After downsmapling training shape is (5910860, 42), postive shape is (569987, 42) negative shape is (5340873, 42).
CPU times: user 25.4 s, sys: 10.2 s, total: 35.7 s
Wall time: 13.4 s


In [12]:
# define feature columns and target columns
aid_column = 'aid'
user_feature_columns = [c for c in train_df.columns if c.startswith('u')]
item_feature_columns = [c for c in train_df.columns if c.startswith('i')]
# all features
feature_columns = user_feature_columns + item_feature_columns 
# target
target = 'cart'


# if non_important_features is defined remove it from feature columns
if 'non_important_features' in globals():
    feature_columns = [c for c in feature_columns if c not in non_important_features]

print(f"All feature columns are {feature_columns} and target column is [{target}]")

All feature columns are ['u_clicks', 'u_carts', 'u_orders', 'u_sess_len', 'ur_clicks', 'ur_carts', 'ur_orders', 'u_click_cart_ratio', 'u_order_click_ratio', 'u_order_cart_ratio', 'u_mean_txn_hod', 'u_median_txn_hod', 'u_min_txn_hod', 'u_max_txn_hod', 'u_std_txn_hod', 'u_rmean_txn_hod', 'u_rmedian_txn_hod', 'u_rmin_txn_hod', 'u_rmax_txn_hod', 'i_mean_txn_hod', 'i_median_txn_hod', 'i_min_txn_hod', 'i_max_txn_hod', 'i_std_txn_hod', 'i_rmean_txn_hod', 'i_rmedian_txn_hod', 'i_rmin_txn_hod', 'i_rmax_txn_hod', 'i_clicks', 'i_carts', 'i_orders', 'i_inter_len', 'ir_clicks', 'ir_carts', 'ir_orders', 'i_cart_click_ratio', 'i_order_click_ratio', 'i_order_cart_ratio', 'is_clicked'] and target column is [cart]


In [14]:
# release memory
# del user_type_count, user_transaction_hour
# del item_type_count, item_transaction_hour
# del item_interaction, item_cart_interactions, item_order_interactions
# del df_click, df_cart, df_order
# _ = gc.collect()

### 4 - model building and training

In [13]:
%%time
# sort training data and generate group information
train_df = pl.from_pandas(train_df)
train_df = train_df.sort('session')

# generate group information for LGB-ranker
group = train_df.groupby('session').agg([ pl.col('session').count().alias('session_length')])['session_length'].to_numpy()

group[:10]

CPU times: user 12.5 s, sys: 1.33 s, total: 13.8 s
Wall time: 873 ms


array([ 2,  1,  4,  7,  7, 11,  3,  1,  1,  2], dtype=uint32)

In [14]:
### model build
lgb = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=20,
    importance_type='gain',
    n_jobs=30,
)

lgb

In [15]:
%%time
# fit model
ranker = lgb.fit(
    train_df[feature_columns].to_pandas(),
    train_df[target].to_pandas(),
    group=group
)

CPU times: user 1min 3s, sys: 1.04 s, total: 1min 4s
Wall time: 3.21 s


In [16]:
feature_importance = pl.DataFrame(
    {
        "feature_name": ranker.feature_name_,
        "importance_gain": ranker.feature_importances_
    }
)

feature_importance.sort('importance_gain', reverse=True)

feature_name,importance_gain
str,f64
"""is_clicked""",2.7639e6
"""u_carts""",256354.922119
"""i_cart_click_r...",85818.134642
"""i_clicks""",32174.309782
"""u_clicks""",14126.428232
"""u_click_cart_r...",4852.067352
"""i_order_cart_r...",4180.846432
"""i_inter_len""",1717.055107
"""i_orders""",1455.212294
"""i_order_click_...",757.4709


In [17]:
# locate feature importance equals to 0
non_important_features = feature_importance.filter(pl.col('importance_gain')  == 0)['feature_name'].to_list()
non_important_features

['u_orders',
 'ur_clicks',
 'ur_carts',
 'ur_orders',
 'u_mean_txn_hod',
 'u_min_txn_hod',
 'u_max_txn_hod',
 'u_rmean_txn_hod',
 'u_rmedian_txn_hod',
 'u_rmin_txn_hod',
 'u_rmax_txn_hod',
 'i_median_txn_hod',
 'i_min_txn_hod',
 'i_max_txn_hod',
 'i_rmean_txn_hod',
 'i_rmedian_txn_hod',
 'i_rmin_txn_hod',
 'i_rmax_txn_hod',
 'ir_clicks',
 'ir_carts',
 'ir_orders']

In [18]:
%%time
# save model
ranker.booster_.save_model(model_path)

CPU times: user 296 ms, sys: 6.12 ms, total: 302 ms
Wall time: 26.1 ms


<lightgbm.basic.Booster at 0x7f1ac365ad00>

### 5 - make prediction and run validation on validation dataset

In [19]:
def compute_metric(preditions, labels, topn=20):
    # init variables
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    # compute metric
    for t in ['clicks','carts','orders']:
        sub = preditions.loc[preditions.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:topn]])
        labels_sub = labels.loc[labels['type']==t]
        labels_sub = labels_sub.merge(sub, how='left', on=['session'])
        labels_sub['hits'] = labels_sub.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        labels_sub['gt_count'] = labels_sub.ground_truth.str.len().clip(0,20)
        recall = labels_sub['hits'].sum() / labels_sub['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)
    print('=============')
    print('overall recall =',score)
    print('=============')

In [31]:
%%time
# loading candidates file generate by co visitation matrix
candidates = pl.read_parquet(co_candidates)
candidates = candidates[['session_type','labels']]

# loading labels
labels = pd.read_parquet('./val_data/test_labels.parquet')
labels

CPU times: user 2.84 s, sys: 595 ms, total: 3.44 s
Wall time: 3.26 s


Unnamed: 0,session,type,ground_truth
0,11098528,clicks,[1679529]
1,11098528,carts,[1199737]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369..."
3,11098529,clicks,[1105029]
4,11098530,orders,[409236]
...,...,...,...
2212687,12899774,clicks,[1399483]
2212688,12899775,clicks,[1760714]
2212689,12899776,clicks,[1737908]
2212690,12899777,clicks,[384045]


In [32]:
%%time
# run validation for candidates
print("=====validation recall for pure co visitation matrix")
compute_metric(candidates.to_pandas(), labels, topn=20)

=====validation recall for pure co visitation matrix
clicks recall = 0.5239066859428527
carts recall = 0.4061063485069785
orders recall = 0.6289023724637173
overall recall = 0.5515639966246092
CPU times: user 34.9 s, sys: 2.04 s, total: 36.9 s
Wall time: 36.9 s


## 5 - Make inference

In [20]:
%%time
# Model inference
scores = ranker.predict(train_df[feature_columns].to_pandas())

# Appending the model score to the original dataframe
pred = train_df.with_columns(pl.Series(name='rank', values=scores))

# Getting the top 20 candidates from the prediction
pred = pred.sort(['session', 'rank'], reverse=True).groupby('session').agg([
    pl.col('aid').limit(20).list().alias('labels')
])

# Converting to pandas format and making it align with result format
pred = pred.with_columns(
    pl.col('session') + '_carts'
).to_pandas()
    
pred

CPU times: user 6.34 s, sys: 1.4 s, total: 7.74 s
Wall time: 1.25 s


Unnamed: 0,session,labels
0,14571581_carts,[285653]
1,14571580_carts,"[989688, 1246483, 891417]"
2,14571578_carts,"[538003, 326064, 519105, 1811714]"
3,14571577_carts,"[932022, 842555]"
4,14571576_carts,"[337471, 1198015, 1200606, 68703, 1697138, 182..."
...,...,...
1555190,12899783_carts,"[1535573, 1216820, 735204, 1411071]"
1555191,12899782_carts,"[595994, 834354, 834354, 740494, 987399, 88967..."
1555192,12899781_carts,"[199008, 1013328, 374037, 1508281, 1228668, 33..."
1555193,12899780_carts,"[404612, 1519088, 87442]"


In [21]:
%%time
# converting to reqiured format
pred['labels'] = pred.labels.parallel_apply(lambda x: " ".join(map(str,x)))
pred

CPU times: user 2.01 s, sys: 3.34 s, total: 5.34 s
Wall time: 6.06 s


Unnamed: 0,session,labels
0,14571581_carts,285653
1,14571580_carts,989688 1246483 891417
2,14571578_carts,538003 326064 519105 1811714
3,14571577_carts,932022 842555
4,14571576_carts,337471 1198015 1200606 68703 1697138 1826099
...,...,...
1555190,12899783_carts,1535573 1216820 735204 1411071
1555191,12899782_carts,595994 834354 834354 740494 987399 889671 9751...
1555192,12899781_carts,199008 1013328 374037 1508281 1228668 333908 1...
1555193,12899780_carts,404612 1519088 87442


In [22]:
%%time
# if it's validation run compute metric function output scoring
if is_validation:
    # loading candidates file generate by co visitation matrix
    candidates = pd.read_parquet(co_candidates)
    candidates = candidates[['session_type','labels']]
    # replace order candidates generate by order-reranker model
    candidates = pd.concat([
        pred.rename(columns={'session':'session_type'}),
        candidates.loc[~candidates['session_type'].isin(pred['session'])]
    ])

    # run validation for candidates
    compute_metric(candidates, labels, topn=20)
# if it's not valiation output prediction file for later submission
else:
    # conver to pd dataframe
    candidates = candidates.to_pandas()
    # concat with candidate dataframe to supplement missing sessions
    pred = pd.concat([
        pred.rename(columns={'session':'session_type'}),
        candidates.loc[~candidates['session_type'].isin(pred['session'])]
    ])
    # only select orders
    pred = pred.loc[pred.session_type.str.contains('carts')]
    # save to disk
    pred.to_parquet(f'{root_path}cart_rerank_result.pgt')

CPU times: user 3.63 s, sys: 861 ms, total: 4.49 s
Wall time: 4.6 s


In [37]:
# validation score on cart rerank model recall improve from 0.40 to 0.49

clicks recall = 0.5239066859428527
carts recall = 0.49106476871784377
orders recall = 0.6289023724637173
overall recall = 0.5770515226878687
CPU times: user 30.5 s, sys: 467 ms, total: 31 s
Wall time: 31 s
