In [None]:
%pip install -U lightgbm==3.3.2



In [None]:
%pip install implicit



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

In [None]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
sys.path.append("/content/drive/MyDrive/HM-new/") # path to the `src` folder
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Register `progress_apply` on pandas objects (used by `predict` further down).
tqdm.pandas()

In [None]:
from src.data import DataHelper
from src.data.metrics import map_at_k, hr_at_k, recall_at_k

from src.retrieval.rules import (
    OrderHistory,
    OrderHistoryDecay,
    ItemPair,
    UserGroupTimeHistory,
    UserGroupSaleTrend,
    TimeHistory,
    TimeHistoryDecay,
    SaleTrend,
    OutOfStock,
)
from src.retrieval.collector import RuleCollector

from src.features import full_sale, week_sale, repurchase_ratio, popularity, period_sale

from src.utils import (
    calc_valid_date,
    merge_week_data,
    reduce_mem_usage,
    calc_embd_similarity,
)

In [None]:
# Project data / model roots on the mounted Google Drive (absolute Colab paths;
# adjust both when running outside Colab).
data_dir = Path("/content/drive/MyDrive/HM-new/data/")
model_dir = Path("/content/drive/MyDrive/HM-new/models/")

In [None]:
# Number of weeks used as training data.
TRAIN_WEEK_NUM = 4
# Total number of week indices handled by the loops below:
# week 0 = test, week 1 = validation, weeks > 1 = training.
WEEK_NUM = TRAIN_WEEK_NUM + 2

# Sub-folder name used under data/interim and data/processed for this run's artifacts.
VERSION_NAME = "Recall 1"
TEST = True # * Set to `False` when running local experiments to save time

In [None]:
import os

# Create the per-version output folders.
# `parents=True` also creates a missing `interim` / `processed` parent (plain
# `os.mkdir` raises FileNotFoundError in that case) and `exist_ok=True` makes
# the cell safe to re-run.
for stage in ("interim", "processed"):
    (data_dir / stage / VERSION_NAME).mkdir(parents=True, exist_ok=True)

Prepare data: encoding ids and preprocessing

In [None]:
# Project data helper (`src.data.DataHelper`), constructed with the data root directory.
dh = DataHelper(data_dir)

In [None]:
# Encode ids and preprocess the raw data.
# * run only once, processed data will be saved (the next cell loads it by the same name)
data = dh.preprocess_data(save=True, name="encoded_full")

In [None]:
# Load the dataset stored under the name "encoded_full" (the name the preprocessing
# cell saves with). The result is indexed below as data["user"], data["item"], data["inter"].
data = dh.load_data(name="encoded_full")

In [None]:
# Map the raw customer ids of the sample submission to the integer indices
# stored in the pickled user_id -> index dictionary.
# The `with` block closes the file handle (the bare `open(...)` never did).
with open(data_dir/"index_id_map/user_id2index.pkl", "rb") as uid_file:
    # NOTE: pickle can execute arbitrary code; only load files produced by this project.
    uid2idx = pickle.load(uid_file)
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

## Retrieval


Generate candidates for each week

In [None]:
# Age bucket edges. `pd.cut` uses right-closed intervals by default:
# (-1, 19], (19, 29], ..., (69, 119].
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
data['user']['age_bins'] = pd.cut(data['user']['age'], listBin)

In [None]:
# * week = 0: test
# * week = 1: valid
# * week > 1: train
# Week 0 (test) is intentionally not produced here: the range starts at 1 and the
# test candidates are generated by the dedicated `if TEST:` cell below. The former
# `week == 0` branches inside this loop could never run and have been removed.
for week in range(1, WEEK_NUM):
    # * use sliding window to generate candidates
    trans = data["inter"]

    start_date, end_date = calc_valid_date(week)
    print(f"Week {week}: [{start_date}, {end_date})")

    train, valid = dh.split_data(trans, start_date, end_date)
    train = train.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')

    # Transactions of the 7 days preceding `start_date`.
    last_week_start = pd.to_datetime(start_date) - pd.Timedelta(days=7)
    last_week_start = last_week_start.strftime("%Y-%m-%d")
    last_week = train.loc[train.t_dat >= last_week_start]

    # Transactions of the 3 days preceding `start_date`.
    last_3day_start = pd.to_datetime(start_date) - pd.Timedelta(days=3)
    last_3day_start = last_3day_start.strftime("%Y-%m-%d")
    last_3days = train.loc[train.t_dat >= last_3day_start]

    # Candidates are generated for the customers present in this week's `valid` split.
    customer_list = valid["customer_id"].values

    # * ========================== Retrieval Strategies ==========================

    candidates = RuleCollector().collect(
        week_num = week,
        trans_df = trans,
        customer_list=customer_list,
        rules=[
            OrderHistory(train, days=3, name='1'),
            OrderHistory(train, days=7, name='2'),
            OrderHistoryDecay(train, days=3, n=50, name='1'),
            OrderHistoryDecay(train, days=7, n=50, name='2'),
            ItemPair(OrderHistory(train, days=3).retrieve(), name='1'),
            ItemPair(OrderHistory(train, days=7).retrieve(), name='2'),
            ItemPair(OrderHistoryDecay(train, days=3, n=50).retrieve(), name='3'),
            ItemPair(OrderHistoryDecay(train, days=7, n=50).retrieve(), name='4'),
            UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50, name='1'),
            UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50, name='2'),
            UserGroupSaleTrend(data, customer_list, train, ['age_bins'], days=7, n=50),
            TimeHistory(customer_list, last_week, n=50, name='1'),
            TimeHistory(customer_list, last_3days, n=50, name='2'),
            TimeHistoryDecay(customer_list, train, days=3, n=50, name='1'),
            TimeHistoryDecay(customer_list, train, days=7, n=50, name='2'),
            SaleTrend(customer_list, train, days=7, n=50),
        ],
        filters=[OutOfStock(trans)],
        min_pos_rate=0.006,
        compress=False,
    )

    # One row per (customer, article); one score column per retrieval method.
    candidates = (
        pd.pivot_table(
            candidates,
            values="score",
            index=["customer_id", "article_id"],
            columns=["method"],
            aggfunc=np.sum,
        )
        .reset_index()
    )

    candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
    valid.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{week}_label.pqt")

In [None]:
# * Test week (week 0): use the thresholds found for week 1 to generate candidates
# * for the test data, see the log in the upper cell.
# Same steps as the training-week loop above, but with per-rule `n` cut-offs
# hard-coded instead of the common n=50, and with `customer_list` taken from the
# submission file.
if TEST:
    week = 0
    trans = data["inter"]
    
    start_date, end_date = calc_valid_date(week)
    print(f"Week {week}: [{start_date}, {end_date})")
    
    train, valid = dh.split_data(trans, start_date, end_date)
    train = train.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')

    # Transactions of the 7 days preceding `start_date`.
    last_week_start = pd.to_datetime(start_date) - pd.Timedelta(days=7)
    last_week_start = last_week_start.strftime("%Y-%m-%d")
    last_week = train.loc[train.t_dat >= last_week_start]
    
    # Transactions of the 3 days preceding `start_date`.
    last_3day_start = pd.to_datetime(start_date) - pd.Timedelta(days=3)
    last_3day_start = last_3day_start.strftime("%Y-%m-%d")
    last_3days = train.loc[train.t_dat >= last_3day_start]

    # Every customer of the submission file gets candidates.
    customer_list = submission['customer_id'].values

    # * ========================== Retrieval Strategies ==========================

    candidates = RuleCollector().collect(
        week_num = week,
        trans_df = trans,
        customer_list=customer_list,
        rules=[
            OrderHistory(train, days=3, name='1'),
            OrderHistory(train, days=7, name='2'),
            OrderHistoryDecay(train, days=3, n=50, name='1'),
            OrderHistoryDecay(train, days=7, n=50, name='2'),
            ItemPair(OrderHistory(train, days=3).retrieve(), name='1'),
            ItemPair(OrderHistory(train, days=7).retrieve(), name='2'),
            # The second positional argument here is presumably `days`
            # (the training loop passes days=3 / days=7 by keyword) - TODO confirm.
            ItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),
            ItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),
            UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=15, name='1'),
            # NOTE(review): n=20.5 is the only non-integer cut-off; confirm the rule
            # accepts a float `n` and that this value is intended.
            UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=20.5, name='2'),
            UserGroupSaleTrend(data, customer_list, train, ['age_bins'], days=7, n=2),
            TimeHistory(customer_list, last_week, n=9, name='1'),
            TimeHistory(customer_list, last_3days, n=16, name='2'),
            TimeHistoryDecay(customer_list, train, days=3, n=12, name='1'),
            TimeHistoryDecay(customer_list, train, days=7, n=8, name='2'),
            SaleTrend(customer_list, train, days=7, n=2),
        ],
        filters=[OutOfStock(trans)],
        min_pos_rate=0.006,
        compress=False,
    )
    
    # Only this cell shrinks the candidate frame before pivoting
    # (the training-week loop pivots directly).
    candidates, _ = reduce_mem_usage(candidates)
    # One row per (customer, article); one score column per retrieval method.
    candidates = (
        pd.pivot_table(
            candidates,
            values="score",
            index=["customer_id", "article_id"],
            columns=["method"],
            aggfunc=np.sum,
        )
        .reset_index()
    )

    candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
    # The `valid` split of week 0 is written as the week-0 label file.
    valid.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{week}_label.pqt")

In [None]:
# Release the retrieval-stage frames before feature engineering.
# `last_3days` is a slice of `train` just like `last_week`; leaving it alive kept
# that data referenced, so it is dropped here as well.
del train, valid, last_week, last_3days, customer_list, candidates
gc.collect()

## Feature engineering


In [None]:
# Short aliases for the three tables. These are references, not copies:
# in-place column assignments on `inter` below also modify data["inter"].
user = data["user"]
item = data["item"]
inter = data["inter"]

In [None]:
# calculate week number
# Weeks are counted backwards from 2020-09-29 in 7-day steps:
# t_dat in [2020-09-23, 2020-09-29] -> 0, [2020-09-16, 2020-09-22] -> 1, and so on.
inter['week'] = (pd.to_datetime('2020-09-29') - pd.to_datetime(inter['t_dat'])).dt.days // 7

In [None]:
# merge full candidates to transaction data (avoid feature missing in training data)
# Step 1: collect the distinct article ids over every generated candidate file.
article_ids = set()
for i in tqdm(range(WEEK_NUM)):
    if i == 0 and not TEST:
        # Week 0 (test) candidates only exist when TEST is True; the feature-merge
        # loop later in the notebook skips the file the same way.
        continue
    # Only `article_id` is needed, so do not load the score columns.
    candidate = pd.read_parquet(
        data_dir/"interim"/VERSION_NAME/f"week{i}_candidate.pqt", columns=["article_id"]
    )
    # Add the per-file unique ids instead of concatenating full Python lists.
    article_ids.update(candidate['article_id'].unique().tolist())
    del candidate
gc.collect()

# Step 2: build the (article_id, week) grid for weeks 1..WEEK_NUM+1.
num_candidates = len(article_ids)
# Sorted so the grid order is reproducible from run to run.
full_candidates = np.array(sorted(article_ids))
full_candidates = np.tile(full_candidates, WEEK_NUM + 1)
weeks = np.repeat(np.arange(1,WEEK_NUM+2), num_candidates)
full_candidates = pd.DataFrame({'article_id':full_candidates, 'week':weeks})

# Step 3: right-merge the recent transactions onto the grid. Real transactions keep
# valid == 1; grid rows without any transaction come out with NaN and are set to 0.
inter['valid'] = 1
in_train = inter[inter['week']<=WEEK_NUM + 1]
out_train = inter[inter['week']>WEEK_NUM + 1]

in_train = in_train.merge(full_candidates, on=['article_id','week'], how='right')
in_train['valid'] = in_train['valid'].fillna(0)
inter = pd.concat([in_train, out_train], ignore_index=True)
inter = inter.sort_values(["valid"], ascending=False).reset_index(drop=True)

100%|██████████| 6/6 [00:18<00:00,  3.07s/it]


17

In [None]:
# merge `product_code`
# Left join so every interaction row is kept; the product-level features below group by this column.
inter = inter.merge(item[["article_id", "product_code"]], on="article_id", how="left")

In [None]:
inter.shape

(31837313, 8)

In [None]:
# Period sales per article (`i_`) and per product code (`p_`).
# `period_sale` returns three values, stored as `<x>_sale`, `<x>_sale_rank`
# and `<x>_sale_norm`.
# Fix: the 1-week features were computed with days=14, which made them exact
# duplicates of the 2-week features. The other windows use 7 days per week
# (2w -> 14, 3w -> 21, 4w -> 28), so 1w uses days=7.
# The first returned value is discarded for the 1-week window.
_, inter["i_1w_sale_rank"], inter["i_1w_sale_norm"] = period_sale(
    inter, ["article_id"], days=7, rank=True, norm=True, week_num=WEEK_NUM
)
_, inter["p_1w_sale_rank"], inter["p_1w_sale_norm"] = period_sale(
    inter, ["product_code"], days=7, rank=True, norm=True, week_num=WEEK_NUM
)
inter["i_2w_sale"], inter["i_2w_sale_rank"], inter["i_2w_sale_norm"] = period_sale(
    inter, ["article_id"], days=14, rank=True, norm=True, week_num=WEEK_NUM
)
inter["p_2w_sale"], inter["p_2w_sale_rank"], inter["p_2w_sale_norm"] = period_sale(
    inter, ["product_code"], days=14, rank=True, norm=True, week_num=WEEK_NUM
)

In [None]:
# 3- and 4-week period sales per article (`i_`) and per product code (`p_`).
# The three values returned by `period_sale` are stored as `<x>_sale`,
# `<x>_sale_rank` and `<x>_sale_norm`, in the same column order as before.
for n_weeks in (3, 4):
    for prefix, key in (("i", "article_id"), ("p", "product_code")):
        sale, sale_rank, sale_norm = period_sale(
            inter, [key], days=7 * n_weeks, rank=True, norm=True, week_num=WEEK_NUM
        )
        inter[f"{prefix}_{n_weeks}w_sale"] = sale
        inter[f"{prefix}_{n_weeks}w_sale_rank"] = sale_rank
        inter[f"{prefix}_{n_weeks}w_sale_norm"] = sale_norm

In [None]:
inter.shape

(31837313, 30)

In [None]:
# Repurchase-ratio features from `src.features.repurchase_ratio`,
# one per article (`i_`) and one per product code (`p_`).
inter['i_repurchase_ratio'] = repurchase_ratio(inter, ['article_id'], week_num=WEEK_NUM)
inter['p_repurchase_ratio'] = repurchase_ratio(inter, ['product_code'], week_num=WEEK_NUM)

100%|██████████| 6/6 [02:28<00:00, 24.77s/it]
100%|██████████| 6/6 [02:00<00:00, 20.07s/it]


In [None]:
inter.shape

(31837313, 32)

In [None]:
# Shrink the frame's memory footprint with the project helper;
# the helper's second return value is not needed here.
inter, _ = reduce_mem_usage(inter)

In [None]:
# Weekly sale features per article (`i_`) and product code (`p_`).
# `*_uni` variants pass `True` as the third positional argument;
# `lw_*` variants pass `step=1` (last week).
inter["i_sale"] = week_sale(inter, ["article_id"], week_num=WEEK_NUM)
inter["p_sale"] = week_sale(inter, ["product_code"], week_num=WEEK_NUM)
inter["i_sale_uni"] = week_sale(inter, ["article_id"], True, week_num=WEEK_NUM)
inter["p_sale_uni"] = week_sale(inter, ["product_code"], True, week_num=WEEK_NUM)
inter["lw_i_sale"] = week_sale(inter, ["article_id"], step=1, week_num=WEEK_NUM) # * last week sale
inter["lw_p_sale"] = week_sale(inter, ["product_code"], step=1, week_num=WEEK_NUM)
inter["lw_i_sale_uni"] = week_sale(inter, ["article_id"], True, step=1, week_num=WEEK_NUM)
inter["lw_p_sale_uni"] = week_sale(inter, ["product_code"], True, step=1, week_num=WEEK_NUM)

# Article share of its product's sales (1e-6 avoids division by zero).
inter["i_sale_ratio"] = inter["i_sale"] / (inter["p_sale"] + 1e-6)
inter["i_sale_uni_ratio"] = inter["i_sale_uni"] / (inter["p_sale_uni"] + 1e-6)
inter["lw_i_sale_ratio"] = inter["lw_i_sale"] / (inter["lw_p_sale"] + 1e-6)
inter["lw_i_sale_uni_ratio"] = inter["lw_i_sale_uni"] / (inter["lw_p_sale_uni"] + 1e-6)

# Ratio of the plain sale value to its `_uni` counterpart.
inter["i_uni_ratio"] = inter["i_sale"] / (inter["i_sale_uni"] + 1e-6)
inter["p_uni_ratio"] = inter["p_sale"] / (inter["p_sale_uni"] + 1e-6)
inter["lw_i_uni_ratio"] = inter["lw_i_sale"] / (inter["lw_i_sale_uni"] + 1e-6)
inter["lw_p_uni_ratio"] = inter["lw_p_sale"] / (inter["lw_p_sale_uni"] + 1e-6)

# Relative change of this week's value versus last week's.
inter["i_sale_trend"] = (inter["i_sale"] - inter["lw_i_sale"]) / (inter["lw_i_sale"] + 1e-6)
inter["p_sale_trend"] = (inter["p_sale"] - inter["lw_p_sale"]) / (inter["lw_p_sale"] + 1e-6)

# Item attributes for which the same sale / last-week sale / trend features are built.
item_feats = [
    "product_type_no",
    # "product_group_name",
    # "graphical_appearance_no",
    # "colour_group_code",
    # "perceived_colour_value_id",
    # "perceived_colour_master_id",
]
inter = inter.merge(item[["article_id", *item_feats]], on="article_id", how="left")

for f in tqdm(item_feats):
    # NOTE(review): the third positional argument is the string f"{f}_sale" here, while the
    # `*_uni` calls above pass `True` in that position. If that parameter is a boolean flag,
    # a non-empty string is truthy and these two columns are computed as the `_uni` variant.
    # Confirm against the signature of `week_sale`.
    inter[f"{f}_sale"] = week_sale(inter, [f], f"{f}_sale", week_num=WEEK_NUM)
    inter[f"lw_{f}_sale"] = week_sale(inter, [f], f"{f}_sale", step=1, week_num=WEEK_NUM)
    inter[f"{f}_sale_trend"] = (inter[f"{f}_sale"] - inter[f"lw_{f}_sale"]) / (inter[f"lw_{f}_sale"] + 1e-6)

100%|██████████| 1/1 [00:22<00:00, 22.43s/it]


In [None]:
inter.shape

(31837313, 54)

In [None]:
# * Date related
# Week index x -> first value returned by calc_valid_date(x - 1).
curr_date_dict = {x:calc_valid_date(x-1)[0] for x in range(100)}
# Reference date of each row, derived from its week index.
current_dat = inter['week'].map(curr_date_dict)
# Rows with valid == 0 are the placeholder (article, week) rows added by the
# right-merge above; they have no transaction date, so they get the reference date.
mask = inter['valid']==0
inter.loc[mask, 't_dat'] = inter.loc[mask, 'week'].map(curr_date_dict)
# Earliest `t_dat` per article.
first_date = inter.groupby('article_id')['t_dat'].min().reset_index(name='first_dat')
inter = pd.merge(inter, first_date, on='article_id', how='left')
# df = pd.merge(df, last_date, on='article_id', how='left')
# `first_dat` becomes the number of days between the row's reference date and the
# article's earliest date. `current_dat` was built before the merge and lines up by index.
inter['first_dat'] = (pd.to_datetime(current_dat)-pd.to_datetime(inter['first_dat'])).dt.days

In [None]:
inter.shape

(31837313, 55)

In [None]:
# Full-history sale values per article (`i_`) and product code (`p_`).
inter['i_full_sale'] = full_sale(inter, ['article_id'], week_num=WEEK_NUM)
inter['p_full_sale'] = full_sale(inter, ['product_code'], week_num=WEEK_NUM)

# Average per-day sale since the article's first date (`first_dat` is in days).
# A zero `first_dat` yields inf/NaN here; non-finite values are replaced later
# by the `np.nan_to_num` pass in the "Merge Features" section.
inter['i_daily_sale'] = inter['i_full_sale'] / inter['first_dat']
inter['p_daily_sale'] = inter['p_full_sale'] / inter['first_dat']
inter['i_daily_sale_ratio'] = inter['i_daily_sale'] / inter['p_daily_sale']
inter['i_w_full_sale_ratio'] = inter['i_sale'] / inter['i_full_sale']

# Share of the full-history sales made in the last one / two weeks.
inter['i_2w_full_sale_ratio'] = inter['i_2w_sale'] / inter['i_full_sale']
inter['p_w_full_sale_ratio'] = inter['p_sale'] / inter['p_full_sale']
inter['p_2w_full_sale_ratio'] = inter['p_2w_sale'] / inter['p_full_sale']

# Recent per-day average minus the long-run per-day average.
inter['i_week_above_daily_sale'] = inter['i_sale'] / 7 - inter['i_daily_sale']
# Fix: this subtracted `i_full_sale` (an article-level total) from a product-level
# per-day average. The three sibling features subtract the matching `*_daily_sale`,
# so `p_daily_sale` is used. The column name is kept unchanged for downstream consumers.
inter['p_week_above_full_sale'] = inter['p_sale'] / 7 - inter['p_daily_sale']
inter['i_2w_week_above_daily_sale'] = inter['i_2w_sale'] / 14 - inter['i_daily_sale']
inter['p_2w_week_above_daily_sale'] = inter['p_2w_sale'] / 14 - inter['p_daily_sale']

In [None]:
gc.collect()

94

In [None]:
# For every item attribute in `item_feats`: compute its average per-day sale since the
# attribute's earliest date, then express the article / product daily sale relative to it.
for f in tqdm(item_feats):
    inter[f'{f}_full_sale'] = full_sale(inter, [f], week_num=WEEK_NUM)
    # Earliest `t_dat` per attribute value.
    f_first_date = inter.groupby(f)['t_dat'].min().reset_index(name=f'{f}_first_dat')
    inter = inter.merge(f_first_date, on=f, how='left')
    # `current_dat` comes from the "Date related" cell and is matched by index.
    inter[f'{f}_daily_sale'] = inter[f'{f}_full_sale'] / (pd.to_datetime(current_dat) - pd.to_datetime(inter[f'{f}_first_dat'])).dt.days
    inter[f'i_{f}_daily_sale_ratio'] = inter['i_daily_sale'] / inter[f'{f}_daily_sale']
    inter[f'p_{f}_daily_sale_ratio'] = inter['p_daily_sale'] / inter[f'{f}_daily_sale']
    # The intermediate columns are only needed for the ratios above.
    del inter[f'{f}_full_sale'], inter[f'{f}_first_dat']
    gc.collect()

  0%|          | 0/1 [00:00<?, ?it/s]

17

100%|██████████| 1/1 [01:21<00:00, 81.40s/it]


In [None]:
# Drop the merged item-attribute columns and the two full-sale columns;
# only the features derived from them are kept.
for f in item_feats + ['i_full_sale','p_full_sale']:
    del inter[f]

In [None]:
# Popularity features from `src.features.popularity`, per article and per product code.
inter['i_pop'] = popularity(inter, 'article_id', week_num=WEEK_NUM)
inter['p_pop'] = popularity(inter, 'product_code', week_num=WEEK_NUM)

In [None]:
# Keep only the rows with a week index up to WEEK_NUM + 2.
inter = inter.loc[inter['week'] <= WEEK_NUM + 2]

In [None]:
inter.to_parquet(data_dir / "processed/processed_inter.pqt")

## Merge Features


In [None]:
# Reload the feature table written at the end of the feature-engineering section,
# so this section can be run without recomputing the features.
inter = pd.read_parquet(data_dir / "processed/processed_inter.pqt")
# Same week filter as applied before saving (a no-op on a freshly written file).
inter = inter[inter['week'] <= WEEK_NUM + 2]

In [None]:
# Pre-computed user / item embeddings from three external models.
# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects; only load files
# produced by this project.
#* embeddings from DSSM model
dssm_user_embd = np.load(data_dir / "external/dssm_user_embd.npy", allow_pickle=True)
dssm_item_embd = np.load(data_dir / "external/dssm_item_embd.npy", allow_pickle=True)
# * embeddings from YouTubeDNN model
yt_user_embd = np.load(data_dir / "external/yt_user_embd.npy", allow_pickle=True)
yt_item_embd = np.load(data_dir / "external/yt_item_embd.npy", allow_pickle=True)
# * embeddings from Word2Vector model
w2v_user_embd = np.load(data_dir/'external'/'w2v_user_embd.npy', allow_pickle=True)
w2v_item_embd = np.load(data_dir/'external'/'w2v_item_embd.npy', allow_pickle=True)

In [None]:
# Replace NaN with 0 and +/-inf with large finite values (np.nan_to_num defaults)
# in every column, so the divisions in the feature section cannot leave
# non-finite values behind.
# NOTE(review): the call runs on all columns, including non-float ones such as `t_dat`;
# restricting it to float columns would avoid needless copies. Confirm no dtype is changed.
for col in inter.columns:
    inter[col] = np.nan_to_num(inter[col])

In [None]:
# Attach the engineered features and the embedding similarities to each week's candidates.
for i in tqdm(range(WEEK_NUM)):
    if i == 0 and not TEST:
        continue
    candidate = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate.pqt")
    if i == 0:
        # The test-week table is processed in two halves to bound peak memory.
        # Ceil division keeps it at two chunks for an odd row count, and max(1, ...)
        # avoids a zero `range` step on a table with fewer than two rows.
        chunk_size = max(1, -(-candidate.shape[0] // 2))
        for chunk,batch in enumerate(range(0, candidate.shape[0], chunk_size)):
            # `iloc` slices exclude the end position, so the end is batch + chunk_size.
            # The former `- 1` silently dropped the last row of every chunk.
            sub_candidate = candidate.iloc[batch:batch+chunk_size]
            # * merge features
            sub_candidate = merge_week_data(data, inter, i, sub_candidate)
            sub_candidate['article_id'] = sub_candidate['article_id'].astype(int)
            sub_candidate['customer_id'] = sub_candidate['customer_id'].astype(int)
            # * merge DSSM user and item embeddings
            sub_candidate["dssm_similarity"] = calc_embd_similarity(sub_candidate, dssm_user_embd, dssm_item_embd)
            # * merge YouTubeDNN user and item embeddings
            sub_candidate["yt_similarity"] = calc_embd_similarity(sub_candidate, yt_user_embd, yt_item_embd)
            # * merge Word2Vector user and item embeddings
            sub_candidate["wv_similarity"] = calc_embd_similarity(sub_candidate, w2v_user_embd, w2v_item_embd, sub=False)
            print(f"Chunk {chunk} done...")
            sub_candidate.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_candidate_{chunk}.pqt")
    else:
        # * merge features
        candidate = merge_week_data(data, inter, i, candidate)
        print(candidate['week'].unique())
        # * merge DSSM user and item embeddings
        candidate["dssm_similarity"] = calc_embd_similarity(candidate, dssm_user_embd, dssm_item_embd)
        # * merge YouTubeDNN user and item embeddings
        candidate["yt_similarity"] = calc_embd_similarity(candidate, yt_user_embd, yt_item_embd)
        # * merge Word2Vector user and item embeddings
        candidate["wv_similarity"] = calc_embd_similarity(candidate, w2v_user_embd, w2v_item_embd, sub=False)
    # NOTE(review): for i == 0 this writes the raw, un-merged candidate table next to the
    # per-chunk files. Kept as in the original; confirm whether anything reads it.
    candidate.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_candidate.pqt")

  0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/2766 [00:00<?, ?it/s][A
  0%|          | 3/2766 [00:00<01:35, 28.95it/s][A
  1%|          | 30/2766 [00:00<00:16, 167.70it/s][A
  2%|▏         | 57/2766 [00:00<00:12, 212.10it/s][A
  3%|▎         | 85/2766 [00:00<00:11, 235.70it/s][A
  4%|▍         | 111/2766 [00:00<00:10, 241.59it/s][A
  5%|▍         | 138/2766 [00:00<00:10, 249.29it/s][A
  6%|▌         | 165/2766 [00:00<00:10, 253.18it/s][A
  7%|▋         | 192/2766 [00:00<00:09, 257.92it/s][A
  8%|▊         | 220/2766 [00:00<00:09, 263.76it/s][A
  9%|▉         | 248/2766 [00:01<00:09, 266.11it/s][A
 10%|▉         | 276/2766 [00:01<00:09, 267.63it/s][A
 11%|█         | 303/2766 [00:01<00:09, 265.62it/s][A
 12%|█▏        | 330/2766 [00:01<00:09, 265.89it/s][A
 13%|█▎        | 357/2766 [00:01<00:09, 263.42it/s][A
 14%|█▍        | 384/2766 [00:01<00:09, 263.66it/s][A
 15%|█▍        | 411/2766 [00:01<00:08, 264.25it/s][A
 16%|█▌        | 438/2766 [00:01<00:08, 261.

Chunk 0 done...



  0%|          | 0/2766 [00:00<?, ?it/s][A
  0%|          | 2/2766 [00:00<02:21, 19.56it/s][A
  1%|          | 26/2766 [00:00<00:18, 144.59it/s][A
  2%|▏         | 50/2766 [00:00<00:14, 184.76it/s][A
  3%|▎         | 74/2766 [00:00<00:13, 204.61it/s][A
  4%|▎         | 99/2766 [00:00<00:12, 220.46it/s][A
  5%|▍         | 125/2766 [00:00<00:11, 230.52it/s][A
  5%|▌         | 150/2766 [00:00<00:11, 234.62it/s][A
  6%|▋         | 175/2766 [00:00<00:10, 238.09it/s][A
  7%|▋         | 200/2766 [00:00<00:10, 240.21it/s][A
  8%|▊         | 225/2766 [00:01<00:10, 243.08it/s][A
  9%|▉         | 250/2766 [00:01<00:10, 243.88it/s][A
 10%|▉         | 275/2766 [00:01<00:10, 241.91it/s][A
 11%|█         | 300/2766 [00:01<00:10, 239.40it/s][A
 12%|█▏        | 325/2766 [00:01<00:10, 240.68it/s][A
 13%|█▎        | 350/2766 [00:01<00:09, 243.02it/s][A
 14%|█▎        | 375/2766 [00:01<00:09, 242.33it/s][A
 14%|█▍        | 400/2766 [00:01<00:09, 244.15it/s][A
 15%|█▌        | 425/2766 [

Chunk 1 done...


 17%|█▋        | 1/6 [11:46<58:51, 706.31s/it]

68984
[1]



  0%|          | 0/306 [00:00<?, ?it/s][A
  6%|▌         | 19/306 [00:00<00:01, 187.89it/s][A
 12%|█▏        | 38/306 [00:00<00:02, 124.71it/s][A
 17%|█▋        | 52/306 [00:00<00:02, 98.73it/s] [A
 24%|██▍       | 74/306 [00:00<00:01, 131.32it/s][A
 31%|███▏      | 96/306 [00:00<00:01, 155.20it/s][A
 39%|███▊      | 118/306 [00:00<00:01, 172.72it/s][A
 46%|████▌     | 140/306 [00:00<00:00, 185.60it/s][A
 53%|█████▎    | 161/306 [00:00<00:00, 191.10it/s][A
 59%|█████▉    | 182/306 [00:01<00:00, 196.52it/s][A
 67%|██████▋   | 205/306 [00:01<00:00, 204.30it/s][A
 75%|███████▍  | 228/306 [00:01<00:00, 210.19it/s][A
 82%|████████▏ | 250/306 [00:01<00:00, 211.66it/s][A
 89%|████████▉ | 272/306 [00:01<00:00, 211.45it/s][A
100%|██████████| 306/306 [00:01<00:00, 183.96it/s]

  0%|          | 0/306 [00:00<?, ?it/s][A
  7%|▋         | 20/306 [00:00<00:01, 190.14it/s][A
 13%|█▎        | 41/306 [00:00<00:01, 201.03it/s][A
 20%|██        | 62/306 [00:00<00:01, 201.34it/s][A
 27%|

72019
[2]



  0%|          | 0/225 [00:00<?, ?it/s][A
 10%|█         | 23/225 [00:00<00:00, 226.86it/s][A
 20%|██        | 46/225 [00:00<00:01, 167.87it/s][A
 28%|██▊       | 64/225 [00:00<00:01, 148.20it/s][A
 39%|███▊      | 87/225 [00:00<00:00, 172.75it/s][A
 49%|████▉     | 110/225 [00:00<00:00, 189.40it/s][A
 59%|█████▉    | 133/225 [00:00<00:00, 200.61it/s][A
 70%|██████▉   | 157/225 [00:00<00:00, 210.06it/s][A
 80%|████████  | 180/225 [00:00<00:00, 213.44it/s][A
 90%|████████▉ | 202/225 [00:01<00:00, 214.58it/s][A
100%|██████████| 225/225 [00:01<00:00, 199.38it/s]

  0%|          | 0/225 [00:00<?, ?it/s][A
 11%|█         | 24/225 [00:00<00:00, 236.68it/s][A
 21%|██▏       | 48/225 [00:00<00:00, 230.10it/s][A
 32%|███▏      | 72/225 [00:00<00:00, 231.24it/s][A
 43%|████▎     | 96/225 [00:00<00:00, 228.23it/s][A
 53%|█████▎    | 119/225 [00:00<00:00, 227.90it/s][A
 63%|██████▎   | 142/225 [00:00<00:00, 223.38it/s][A
 73%|███████▎  | 165/225 [00:00<00:00, 219.49it/s][A
 83%|

75822
[3]



  0%|          | 0/280 [00:00<?, ?it/s][A
  9%|▉         | 25/280 [00:00<00:01, 244.42it/s][A
 18%|█▊        | 50/280 [00:00<00:01, 174.76it/s][A
 26%|██▌       | 73/280 [00:00<00:01, 160.21it/s][A
 35%|███▍      | 97/280 [00:00<00:01, 182.88it/s][A
 43%|████▎     | 121/280 [00:00<00:00, 199.49it/s][A
 51%|█████▏    | 144/280 [00:00<00:00, 207.53it/s][A
 60%|██████    | 168/280 [00:00<00:00, 216.70it/s][A
 68%|██████▊   | 191/280 [00:00<00:00, 220.48it/s][A
 76%|███████▋  | 214/280 [00:01<00:00, 222.69it/s][A
 85%|████████▌ | 239/280 [00:01<00:00, 229.08it/s][A
100%|██████████| 280/280 [00:01<00:00, 213.52it/s]

  0%|          | 0/280 [00:00<?, ?it/s][A
  9%|▊         | 24/280 [00:00<00:01, 237.80it/s][A
 17%|█▋        | 48/280 [00:00<00:00, 235.43it/s][A
 26%|██▌       | 72/280 [00:00<00:00, 233.45it/s][A
 34%|███▍      | 96/280 [00:00<00:00, 234.77it/s][A
 43%|████▎     | 120/280 [00:00<00:00, 233.02it/s][A
 52%|█████▏    | 145/280 [00:00<00:00, 235.72it/s][A
 60%|

80253
[4]



  0%|          | 0/298 [00:00<?, ?it/s][A
  7%|▋         | 22/298 [00:00<00:01, 217.48it/s][A
 15%|█▍        | 44/298 [00:00<00:01, 158.17it/s][A
 20%|██        | 61/298 [00:00<00:01, 141.02it/s][A
 29%|██▊       | 85/298 [00:00<00:01, 171.40it/s][A
 36%|███▌      | 107/298 [00:00<00:01, 186.27it/s][A
 44%|████▍     | 131/298 [00:00<00:00, 201.13it/s][A
 52%|█████▏    | 156/298 [00:00<00:00, 215.09it/s][A
 60%|██████    | 179/298 [00:00<00:00, 219.37it/s][A
 68%|██████▊   | 203/298 [00:01<00:00, 224.78it/s][A
 76%|███████▌  | 227/298 [00:01<00:00, 226.68it/s][A
 84%|████████▍ | 250/298 [00:01<00:00, 225.81it/s][A
 92%|█████████▏| 273/298 [00:01<00:00, 226.74it/s][A
100%|██████████| 298/298 [00:01<00:00, 207.64it/s]

  0%|          | 0/298 [00:00<?, ?it/s][A
  8%|▊         | 23/298 [00:00<00:01, 224.23it/s][A
 15%|█▌        | 46/298 [00:00<00:01, 223.14it/s][A
 23%|██▎       | 70/298 [00:00<00:00, 230.46it/s][A
 32%|███▏      | 94/298 [00:00<00:00, 229.35it/s][A
 39%|

72035
[5]



  0%|          | 0/178 [00:00<?, ?it/s][A
 13%|█▎        | 24/178 [00:00<00:00, 231.89it/s][A
 27%|██▋       | 48/178 [00:00<00:00, 176.27it/s][A
 38%|███▊      | 67/178 [00:00<00:00, 161.01it/s][A
 50%|█████     | 89/178 [00:00<00:00, 177.62it/s][A
 63%|██████▎   | 113/178 [00:00<00:00, 196.81it/s][A
 78%|███████▊  | 138/178 [00:00<00:00, 211.13it/s][A
100%|██████████| 178/178 [00:00<00:00, 203.60it/s]

  0%|          | 0/178 [00:00<?, ?it/s][A
 14%|█▍        | 25/178 [00:00<00:00, 240.54it/s][A
 28%|██▊       | 50/178 [00:00<00:00, 235.23it/s][A
 42%|████▏     | 74/178 [00:00<00:00, 223.04it/s][A
 54%|█████▍    | 97/178 [00:00<00:00, 218.99it/s][A
 67%|██████▋   | 119/178 [00:00<00:00, 218.18it/s][A
 79%|███████▉  | 141/178 [00:00<00:00, 216.85it/s][A
100%|██████████| 178/178 [00:00<00:00, 220.95it/s]

  0%|          | 0/178 [00:00<?, ?it/s][A
  7%|▋         | 12/178 [00:00<00:01, 118.54it/s][A
 13%|█▎        | 24/178 [00:00<00:01, 118.38it/s][A
 20%|██        | 36/

In [None]:
# Release the DSSM and YouTubeDNN embedding matrices.
# NOTE(review): `w2v_user_embd` / `w2v_item_embd` are not released here; if no later
# cell needs them they can be deleted as well.
del dssm_user_embd, dssm_item_embd, yt_user_embd, yt_item_embd
gc.collect()

858

## Ranking


In [None]:
# Load the feature-merged candidates and the labels of every non-test week (1..WEEK_NUM-1),
# keyed by week index.
candidates, labels = {}, {}
processed_dir = data_dir/"processed"/VERSION_NAME
for week_idx in tqdm(range(1, WEEK_NUM)):
    candidates[week_idx] = pd.read_parquet(processed_dir/f"week{week_idx}_candidate.pqt")
    labels[week_idx] = pd.read_parquet(processed_dir/f"week{week_idx}_label.pqt")

100%|██████████| 5/5 [00:06<00:00,  1.36s/it]


In [None]:
# Columns of the candidate table that are not model inputs.
non_feature_cols = {"label", "sales_channel_id", "t_dat", "week"}
# Every remaining column of the week-1 candidates is used as a feature.
feats = [col for col in candidates[1].columns if col not in non_feature_cols]

# Features that are converted to pandas categoricals and passed to LightGBM
# as `categorical_feature`.
cat_features = [
    "customer_id", "article_id", "product_code",
    "FN", "Active", "club_member_status", "fashion_news_frequency", "age",
    "product_type_no", "product_group_name", "graphical_appearance_no",
    "colour_group_code", "perceived_colour_value_id", "perceived_colour_master_id",
    "user_gender", "article_gender", "season_type",
]

In [None]:
# * Convert categorical features to `CategoricalDtype`
# Each feature's categories are taken from the table that owns the column
# (user, then item, else inter), so every week is encoded with the same category set.
cate_dict = {}
for feat in tqdm(cat_features):
    if feat in data['user'].columns:
        source = data['user']
    elif feat in data['item'].columns:
        source = data['item']
    else:
        source = data['inter']
    # Build the categories from a list, not a set: set iteration order is arbitrary
    # (and varies between processes for strings), which made the category codes
    # non-reproducible. Nulls are dropped because categories may not contain NaN.
    categories = list(source[feat].dropna().unique())
    try:
        # Ascending order keeps the category order identical to the value order.
        categories.sort()
    except TypeError:
        # Values that cannot be compared with each other: keep first-appearance order.
        pass
    cate_dict[feat] = CategoricalDtype(categories=categories)

100%|██████████| 17/17 [00:00<00:00, 18.05it/s]


In [None]:
# Stack the candidates of weeks 1..WEEK_NUM-1 into one frame (split again by `week` before training).
full_data = pd.concat([candidates[i] for i in range(1, WEEK_NUM)], ignore_index=True)

### Extra Features

In [None]:
# Transaction history used for the extra features: only rows strictly before 2020-08-19.
inter = data['inter']
inter = inter[inter['t_dat']<'2020-08-19'] # * start date of the last valid week
# Same backwards week index as in the feature-engineering section (counted from 2020-09-29).
# NOTE(review): this assigns a column on a filtered frame; pandas would emit
# SettingWithCopyWarning here if warnings were not silenced.
inter['week'] = (pd.to_datetime('2020-09-29') - pd.to_datetime(inter['t_dat'])).dt.days // 7
inter = inter.merge(data['item'][["article_id", "product_code"]], on="article_id", how="left")

In [None]:
# Temporal statistics over the filtered history, mapped onto the candidates by id.
# Mean week index of each article's transactions.
tmp = inter.groupby('article_id').week.mean()
full_data['article_time_mean'] = full_data['article_id'].map(tmp)

# Week index of each customer's last / first row in `inter` order.
# `.last()` / `.first()` return one value per customer, indexed by customer_id, on every
# pandas version. `.nth()` keeps the original row index from pandas 2.0 on, which makes
# the id-based `.map` below look up the wrong keys. (`last`/`first` skip NaN; `week` is
# derived from `t_dat`, so this matches `nth(-1)` / `nth(0)`.)
tmp = inter.groupby('customer_id').week.last()
full_data['customer_id_last_time'] = full_data['customer_id'].map(tmp)

tmp = inter.groupby('customer_id').week.first()
full_data['customer_id_first_time'] = full_data['customer_id'].map(tmp)

# Mean week index of each customer's transactions.
tmp = inter.groupby('customer_id').week.mean()
full_data['customer_id_time_mean'] = full_data['customer_id'].map(tmp)

# Difference between the first-row and last-row week index of each customer.
full_data['customer_id_gap'] = full_data['customer_id_first_time'] - full_data['customer_id_last_time']

In [None]:
# Register the extra temporal features as model inputs.
extra_feats = [
    'article_time_mean', 
    'customer_id_last_time', 
    'customer_id_first_time', 
    'customer_id_time_mean',
    'customer_id_gap'
]
# Append only the names that are missing, so re-running this cell cannot duplicate
# feature columns (a plain `feats += [...]` added them again on every run).
feats += [name for name in extra_feats if name not in feats]

In [None]:
del tmp
gc.collect()

450

### Train


In [None]:
# Cast each categorical feature to its pre-built CategoricalDtype; values that are
# not among the dtype's categories become NaN.
for feat in tqdm(cat_features):
    full_data[feat] = full_data[feat].astype(cate_dict[feat])

100%|██████████| 17/17 [00:10<00:00,  1.66it/s]


In [None]:
# Week 1 is held out for validation; weeks > 1 form the training set.
train = full_data.loc[full_data['week']>1]
valid = full_data.loc[full_data['week']==1]

# The concatenated frame is no longer needed once it has been split.
del full_data
gc.collect()

88

In [None]:
# LightGBM parameters shared by both training helpers below.
# The trailing comments hold the values for the ranking variant (lambdarank / map).
params = {
    "objective": "binary",#"lambdarank",
    "boosting_type": "gbdt",
    "metric": "auc",#"map",
    "max_depth": 8,
    "num_leaves": 128,
    "learning_rate": 0.03,

    "verbose": -1,
    # Evaluation cut-off for ranking metrics (ndcg / map).
    "eval_at": 12,
}

In [None]:
def train_rank_model(train, valid, train_group, valid_group):
    """Train a grouped LightGBM model and save it as ``lgb_small_ranker.model``.

    Args:
        train: Training frame containing the ``feats`` columns and ``label``.
        valid: Validation frame with the same columns, used for early stopping.
        train_group: Group sizes passed to the training ``lgb.Dataset``.
        valid_group: Group sizes passed to the validation ``lgb.Dataset``.

    Returns:
        The trained ``lgb.Booster``.

    Reads the notebook-level ``feats``, ``cat_features``, ``params`` and ``model_dir``.
    NOTE(review): the objective comes from the global ``params``; set it to
    "lambdarank" there before calling this helper if a ranking model is wanted.
    """

    def _make_dataset(frame, group):
        # Both splits are built with the same feature list and Dataset params.
        return lgb.Dataset(
            data=frame[feats],
            label=frame["label"],
            group=group,
            feature_name=feats,
            categorical_feature=cat_features,
            params=params,
        )

    train_set = _make_dataset(train, train_group)
    valid_set = _make_dataset(valid, valid_group)

    # At most 300 rounds; stop after 30 rounds without improvement; log every 10 rounds.
    ranker = lgb.train(
        params,
        train_set,
        num_boost_round=300,
        valid_sets=[valid_set],
        early_stopping_rounds=30,
        verbose_eval=10,
    )

    # Persist only the trees up to the best validation iteration.
    model_path = model_dir / "lgb_small_ranker.model"
    ranker.save_model(model_path, num_iteration=ranker.best_iteration)
    return ranker

In [None]:
def train_binary_model(train, valid):
    """Train a LightGBM model without group information and save it as ``lgb_small_binary.model``.

    Args:
        train: Training frame containing the ``feats`` columns and ``label``.
        valid: Validation frame with the same columns, used for early stopping.

    Returns:
        The trained ``lgb.Booster``.

    Reads the notebook-level ``feats``, ``cat_features``, ``params`` and ``model_dir``.
    """
    # Keyword arguments shared by the training and validation datasets.
    dataset_kwargs = dict(
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )
    train_set = lgb.Dataset(data=train[feats], label=train["label"], **dataset_kwargs)
    valid_set = lgb.Dataset(data=valid[feats], label=valid["label"], **dataset_kwargs)

    # At most 300 rounds; stop after 30 rounds without improvement; log every 10 rounds.
    ranker = lgb.train(
        params,
        train_set,
        num_boost_round=300,
        valid_sets=[valid_set],
        early_stopping_rounds=30,
        verbose_eval=10,
    )

    # Persist only the trees up to the best validation iteration.
    model_path = model_dir / "lgb_small_binary.model"
    ranker.save_model(model_path, num_iteration=ranker.best_iteration)
    return ranker

In [None]:
# Release the per-week candidate dict; its frames were already concatenated into
# `full_data` and split into `train` / `valid`.
del candidates
gc.collect()

568

In [None]:
print("Train positive rate:", train.label.mean())

Train positive rate: 0.0067018773363074985


In [None]:
# Order the rows by (week, customer_id) / customer_id before the group-size arrays are built.
# NOTE(review): `customer_id` is categorical at this point, so the sort follows the category
# order, while the group sizes below are computed on int-cast ids (numeric order). Confirm
# that both orders agree before training with groups.
train = train.sort_values(by=["week", "customer_id"], ascending=True).reset_index(drop=True)
valid = valid.sort_values(by=["customer_id"], ascending=True).reset_index(drop=True)

In [None]:
# Number of candidate rows per (week, customer) group in the training set.
train_group = (
    train[["customer_id", "article_id", "week"]]
    .astype("int32")  # * convert to int to avoid `0` in groupby count result
    .groupby(["week", "customer_id"])
    .size()
    .values
)

In [None]:
# Number of candidate rows per customer group in the validation set.
valid_group = (
    valid[["customer_id", "article_id"]]
    .astype("int32")  # * convert to int to avoid `0` in groupby count result
    .groupby(["customer_id"])
    .size()
    .values
)

In [None]:
train = train[feats+['label']]
valid = valid[feats+['label']]

In [None]:
gc.collect()

247

In [None]:
# Train the binary model (saved as "lgb_small_binary.model"); the commented line is the
# grouped / ranking alternative.
# ranker = train_rank_model(train, valid, train_group, valid_group)
ranker = train_binary_model(train, valid)

Training until validation scores don't improve for 30 rounds
[10]	valid_0's auc: 0.690798
[20]	valid_0's auc: 0.699496
[30]	valid_0's auc: 0.703928
[40]	valid_0's auc: 0.706298
[50]	valid_0's auc: 0.708892
[60]	valid_0's auc: 0.709672
[70]	valid_0's auc: 0.710949
[80]	valid_0's auc: 0.713591
[90]	valid_0's auc: 0.715312
[100]	valid_0's auc: 0.716153
[110]	valid_0's auc: 0.717036
[120]	valid_0's auc: 0.717173
[130]	valid_0's auc: 0.717293
[140]	valid_0's auc: 0.717737
[150]	valid_0's auc: 0.717838
[160]	valid_0's auc: 0.717651
[170]	valid_0's auc: 0.717697
Early stopping, best iteration is:
[148]	valid_0's auc: 0.717971


### Inference

In [None]:
# Load the model that the training cell above actually produces: `train_binary_model`
# saves "lgb_small_binary.model". The previous default loaded the ranker file, which is
# not written in this notebook and may be missing or stale.
# Switch to the second line only after running `train_rank_model`.
ranker = lgb.Booster(model_file=model_dir / "lgb_small_binary.model")
# ranker = lgb.Booster(model_file=model_dir / "lgb_small_ranker.model")

In [None]:
# Feature importances of the loaded model, sorted from most to least important.
# Assumes `feats` has the same length and order as the features the model was trained on.
feat_importance = pd.DataFrame(
    {"feature": feats, "importance": ranker.feature_importance()}
).sort_values(by="importance", ascending=False)
# Tall figure so each feature gets its own readable bar.
plt.figure(figsize=(8, 22))
sns.barplot(y="feature", x="importance", data=feat_importance)

### Validate

In [None]:
val_candidates = valid.reset_index(drop=True)

In [None]:
def predict(ranker, candidates, batch_size = 5_000_000, feature_cols=None):
    """Score candidates in batches and return each customer's ranked article list.

    Parameters
    ----------
    ranker
        Any object exposing ``predict(X)`` that returns one score per row of
        ``X`` (e.g. a ``lgb.Booster``).
    candidates : pd.DataFrame
        Must contain ``customer_id``, ``article_id`` and the feature columns.
        A ``prob`` column holding the scores is written to it in place.
    batch_size : int
        Maximum number of rows passed to ``ranker.predict`` per call.
    feature_cols : list of str, optional
        Columns fed to the model. Defaults to the notebook-level ``feats``.

    Returns
    -------
    pd.DataFrame
        One row per customer with columns ``customer_id`` and ``prediction``
        (list of unique article ids, highest score first).
    """
    if feature_cols is None:
        feature_cols = feats

    n_rows = candidates.shape[0]
    probs = np.zeros(n_rows)
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        # Positional slicing works for any index; label-based slicing is only
        # correct when the index is exactly 0..n-1.
        probs[start:stop] = ranker.predict(candidates.iloc[start:stop][feature_cols])
    candidates["prob"] = probs

    pred_lgb = (
        candidates[["customer_id", "article_id", "prob"]]
        .sort_values(by=["customer_id", "prob"], ascending=False)
        .reset_index(drop=True)
        .rename(columns={"article_id": "prediction"})
        # After the descending sort, `keep="first"` retains the best-scored copy.
        .drop_duplicates(["customer_id", "prediction"], keep="first")
        .assign(customer_id=lambda frame: frame["customer_id"].astype(int))
    )

    grouped = pred_lgb.groupby("customer_id")["prediction"]
    # `progress_apply` only exists after `tqdm.pandas()` has been called; fall
    # back to the plain `apply` so the function also works without it.
    collect = getattr(grouped, "progress_apply", grouped.apply)
    return collect(list).reset_index()

In [None]:
# Per-customer ranked predictions for the validation candidates
# (`predict` also writes a `prob` column into `val_candidates`).
pred = predict(ranker, val_candidates)

100%|██████████| 68984/68984 [00:06<00:00, 10932.32it/s]


In [None]:
# Attach the predicted lists to the label frame taken from `labels[1]`
# (presumably the validation-week ground truth — confirm). The left join keeps
# every labelled customer, including those without any prediction.
label = labels[1].merge(pred, on="customer_id", how="left")

In [None]:
# Score the predicted lists against the labelled articles with `map_at_k`, k=12.
map_at_k(label["article_id"], label["prediction"], k=12)

# Reference scores recorded from earlier runs:
# 0.029813727108367518 ranker
# 0.029791925075924913 binary

0.029791925075924913

In [None]:
# Score the validation candidates again, this time keeping the flat
# (customer_id, prediction, prob) rows instead of per-customer lists.
batch_size = 5_000_000
n_rows = val_candidates.shape[0]
probs = np.zeros(n_rows)
for start in range(0, n_rows, batch_size):
    stop = start + batch_size
    # Positional slicing: does not depend on `val_candidates` having a 0..n-1 index.
    probs[start:stop] = ranker.predict(val_candidates.iloc[start:stop][feats])
val_candidates["prob"] = probs

pred_lgb = (
    val_candidates[['customer_id','article_id','prob']]
    .sort_values(by=["customer_id","prob"], ascending=False)
    .reset_index(drop=True)
    .rename(columns={'article_id':'prediction'})
    # After the descending sort, keep='first' retains the best-scored copy.
    .drop_duplicates(['customer_id', 'prediction'], keep='first')
    .assign(customer_id=lambda frame: frame['customer_id'].astype(int))
)

In [None]:
# Persist the flat validation scores (customer_id, prediction, prob).
# NOTE(review): the file name says "binary", but in a top-to-bottom run `ranker`
# is the booster loaded from "lgb_small_ranker.model" — confirm the name
# matches the model that produced these scores.
pred_lgb.to_parquet(data_dir/"processed"/"small_binary_valid.pqt")

### Test

In [None]:
# Drop the `candidates` reference and run the garbage collector before test inference.
# Note: re-running this cell raises NameError because the name no longer exists.
del candidates
gc.collect()

414

In [None]:
# * Extra Features: lookup tables ======================
# These depend only on `inter`, so they are built once instead of being
# recomputed for every candidate chunk.
article_week_mean = inter.groupby('article_id').week.mean()
customer_week_mean = inter.groupby('customer_id').week.mean()
# `week` of the last / first `inter` row of each customer, indexed by customer_id.
# `groupby(...).nth()` is avoided on purpose: from pandas 2.0 it keeps the
# original row index, which would make the `.map()` calls below look up by row
# number instead of by customer.
customer_last_week = (
    inter.drop_duplicates('customer_id', keep='last').set_index('customer_id')['week']
)
customer_first_week = (
    inter.drop_duplicates('customer_id', keep='first').set_index('customer_id')['week']
)
# * ==================================================

batch_size = 5_000_000
test_pred = []
for chunk in range(2):
    print(f"Chunk {chunk}")
    test_candidates = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week0_candidate_{chunk}.pqt")
    # Restore the categorical dtypes recorded in `cate_dict`.
    for feat in cat_features:
        test_candidates[feat] = test_candidates[feat].astype(cate_dict[feat])

    # * Extra Features ===================================
    test_candidates['article_time_mean'] = test_candidates['article_id'].map(article_week_mean)
    test_candidates['customer_id_last_time'] = test_candidates['customer_id'].map(customer_last_week)
    test_candidates['customer_id_first_time'] = test_candidates['customer_id'].map(customer_first_week)
    test_candidates['customer_id_time_mean'] = test_candidates['customer_id'].map(customer_week_mean)
    # NOTE(review): computed as first - last; left unchanged because the
    # training features presumably use the same definition — confirm.
    test_candidates['customer_id_gap'] = test_candidates['customer_id_first_time'] - test_candidates['customer_id_last_time']

    gc.collect()
    # * ==================================================

    n_rows = test_candidates.shape[0]
    probs = np.zeros(n_rows)
    for start in tqdm(range(0, n_rows, batch_size)):
        stop = start + batch_size
        # Positional slicing: independent of whatever index the parquet file carries.
        probs[start:stop] = ranker.predict(test_candidates.iloc[start:stop][feats])
    test_candidates["prob"] = probs

    pred_lgb = (
        test_candidates[['customer_id','article_id','prob']]
        .sort_values(by=["customer_id","prob"], ascending=False)
        .reset_index(drop=True)
        .rename(columns={'article_id':'prediction'})
        # After the descending sort, keep='first' retains the best-scored copy.
        .drop_duplicates(['customer_id', 'prediction'], keep='first')
        .assign(customer_id=lambda frame: frame['customer_id'].astype(int))
    )
    test_pred.append(pred_lgb)

    # Release the chunk before loading the next one.
    del test_candidates
    gc.collect()

Chunk 0


53

100%|██████████| 6/6 [02:57<00:00, 29.52s/it]


24

Chunk 1


0

100%|██████████| 6/6 [02:26<00:00, 24.38s/it]


4

In [None]:
# Stack the per-chunk prediction frames into one frame with a fresh 0..n-1 index.
pred_lgb = pd.concat(test_pred, ignore_index=True)

In [None]:
# Persist the concatenated test scores. The commented line is the file name for
# the binary model; keep the active name in sync with the booster loaded into `ranker`.
# pred_lgb.to_parquet(data_dir/"processed"/"small_binary_test.pqt")
pred_lgb.to_parquet(data_dir/"processed"/"small_rank_test.pqt")