In [1]:
# %pip install -U lightgbm==3.3.2

In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

In [2]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

warnings.filterwarnings("ignore")
sys.path.append("../") # path to the `src`` folder
InteractiveShell.ast_node_interactivity = "all"
tqdm.pandas()

In [3]:
from src.data import DataHelper
from src.data.metrics import map_at_k, hr_at_k, recall_at_k

from src.retrieval.rules import (
    OrderHistory,
    OrderHistoryDecay,
    ItemPair,
    UserGroupTimeHistory,
    UserGroupSaleTrend,
    TimeHistory,
    TimeHistoryDecay,
    SaleTrend,
    OutOfStock,
)
from src.retrieval.collector import RuleCollector

from src.features import cum_sale, week_sale, repurchase_ratio, purchased_before, popularity

from src.utils import (
    calc_valid_date,
    merge_week_data,
    reduce_mem_usage,
    calc_embd_similarity,
)

In [4]:
data_dir = Path("../data/")
model_dir = Path("../models/")

In [5]:
TRAIN_WEEK_NUM = 4
WEEK_NUM = TRAIN_WEEK_NUM + 2

VERSION_NAME = "pivot"
TEST = True # * Set as `False` when do local experiments to save time

In [6]:
import os
# Create the per-version output folders for candidates ("interim") and for
# labels / merged features ("processed").
# `parents=True` also creates a missing "interim"/"processed" parent folder
# (plain `os.mkdir` raises FileNotFoundError in that case) and `exist_ok=True`
# keeps the cell safely re-runnable.
for stage in ("interim", "processed"):
    (data_dir / stage / VERSION_NAME).mkdir(parents=True, exist_ok=True)

Prepare data: encoding ids and preprocessing

In [7]:
dh = DataHelper(data_dir)

In [8]:
# data = dh.preprocess_data(save=True, name="encoded_full") # * run only once, processed data will be saved

In [9]:
data = dh.load_data(name="encoded_full")

In [10]:
# Load the raw-id -> index mapping for customers and apply it to the sample
# submission, so `submission['customer_id']` holds the mapped indices.
# The file is opened in a `with` block so the handle is closed right after the
# load instead of being leaked.
# NOTE: `pickle.load` executes arbitrary code -- only load files created by this project.
with open(data_dir/"index_id_map/user_id2index.pkl", "rb") as f:
    uid2idx = pickle.load(f)
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

## Retrieval


Generate candidates for each week

In [11]:
# Bucket customers by age; `age_bins` is the grouping key passed to the
# user-group retrieval rules below.
# With `pd.cut`'s default right-closed bins the buckets are
# (-1, 19], (19, 29], ..., (69, 119]; ages outside that range (or missing)
# end up as NaN.
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
data['user']['age_bins'] = pd.cut(data['user']['age'], listBin)

In [13]:
# * week = 0: test  (not generated here -- it has its own `if TEST:` cell)
# * week = 1: valid
# * week > 1: train
#
# The loop starts at week 1, so the former `week == 0` skip and the
# "use the submission customers" branch could never run and were removed.
trans = data["inter"]  # full transaction log; identical for every week
for week in range(1,WEEK_NUM):
    # * use sliding window to generate candidates
    start_date, end_date = calc_valid_date(week)
    print(f"Week {week}: [{start_date}, {end_date})")

    # `valid` is saved below as this week's label file; `train` feeds the rules.
    train, valid = dh.split_data(trans, start_date, end_date)
    train = train.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')

    # Slices of `train` starting 7 / 3 days before `start_date`.
    last_week_start = pd.to_datetime(start_date) - pd.Timedelta(days=7)
    last_week_start = last_week_start.strftime("%Y-%m-%d")
    last_week = train.loc[train.t_dat >= last_week_start]

    last_3day_start = pd.to_datetime(start_date) - pd.Timedelta(days=3)
    last_3day_start = last_3day_start.strftime("%Y-%m-%d")
    last_3days = train.loc[train.t_dat >= last_3day_start]

    # Candidates are only generated for the customers that appear in `valid`.
    customer_list = valid["customer_id"].values

    # * ========================== Retrieval Strategies ==========================

    candidates = RuleCollector().collect(
        week_num = week,
        trans_df = trans,
        customer_list=customer_list,
        rules=[
            OrderHistory(train, days=3, name='1'),
            OrderHistory(train, days=7, name='2'),
            OrderHistoryDecay(train, days=3, n=50, name='1'),
            OrderHistoryDecay(train, days=7, n=50, name='2'),
            ItemPair(OrderHistory(train, days=3).retrieve(), name='1'),
            ItemPair(OrderHistory(train, days=7).retrieve(), name='2'),
            ItemPair(OrderHistoryDecay(train, days=3, n=50).retrieve(), name='3'),
            ItemPair(OrderHistoryDecay(train, days=7, n=50).retrieve(), name='4'),
            UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50, name='1'),
            UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50, name='2'),
            UserGroupSaleTrend(data, customer_list, train, ['age_bins'], days=7, n=50),
            TimeHistory(customer_list, last_week, n=50, name='1'),
            TimeHistory(customer_list, last_3days, n=50, name='2'),
            TimeHistoryDecay(customer_list, train, days=3, n=50, name='1'),
            TimeHistoryDecay(customer_list, train, days=7, n=50, name='2'),
            SaleTrend(customer_list, train, days=7, n=50),
        ],
        filters=[OutOfStock(trans)],
        min_pos_rate=0.006,
        compress=False,
    )

    # One row per (customer, article) pair and one column per retrieval method
    # holding the summed score; NaN where a method did not retrieve the pair.
    candidates = (
        pd.pivot_table(
            candidates,
            values="score",
            index=["customer_id", "article_id"],
            columns=["method"],
            aggfunc=np.sum,
        )
        .reset_index()
    )

    candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
    valid.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{week}_label.pqt")

Week 1: [2020-09-16, 2020-09-23)


Retrieve items by rules:   6%|▋         | 1/16 [00:22<05:44, 22.97s/it]

Positive rate: 0.03038


Retrieve items by rules:  12%|█▎        | 2/16 [00:47<05:35, 23.93s/it]

Positive rate: 0.02859


Retrieve items by rules:  19%|█▉        | 3/16 [01:54<09:26, 43.61s/it]

Positive rate: 0.01413


Retrieve items by rules:  25%|██▌       | 4/16 [03:03<10:42, 53.51s/it]

Positive rate: 0.01295
Positive rate: 0.01519


Retrieve items by rules:  31%|███▏      | 5/16 [03:23<07:36, 41.50s/it]

Positive rate: 0.01472


Retrieve items by rules:  38%|███▊      | 6/16 [03:46<05:53, 35.35s/it]

Positive rate: 0.00908


Retrieve items by rules:  44%|████▍     | 7/16 [04:23<05:20, 35.62s/it]

Positive rate: 0.00853


Retrieve items by rules:  50%|█████     | 8/16 [05:10<05:15, 39.42s/it]

TOP15.0 Positive rate: 0.00602


Retrieve items by rules:  56%|█████▋    | 9/16 [05:18<03:26, 29.53s/it]

TOP20.5 Positive rate: 0.00608


Retrieve items by rules:  62%|██████▎   | 10/16 [05:25<02:16, 22.68s/it]

TOP2.0 Positive rate: 0.00652


Retrieve items by rules:  69%|██████▉   | 11/16 [05:41<01:42, 20.42s/it]

TOP9.0 Positive rate: 0.00631


Retrieve items by rules:  75%|███████▌  | 12/16 [05:47<01:05, 16.32s/it]

TOP16.0 Positive rate: 0.00601


Retrieve items by rules:  81%|████████▏ | 13/16 [05:55<00:40, 13.50s/it]

TOP12.0 Positive rate: 0.00607


Retrieve items by rules:  88%|████████▊ | 14/16 [06:29<00:39, 19.96s/it]

TOP8.0 Positive rate: 0.00616


Retrieve items by rules:  94%|█████████▍| 15/16 [07:02<00:23, 23.84s/it]

TOP2.0 Positive rate: 0.00758


Retrieve items by rules: 100%|██████████| 16/16 [07:15<00:00, 27.22s/it]


Week 2: [2020-09-09, 2020-09-16)


Retrieve items by rules:   6%|▋         | 1/16 [00:23<05:52, 23.47s/it]

Positive rate: 0.03008


Retrieve items by rules:  12%|█▎        | 2/16 [01:07<08:19, 35.65s/it]

Positive rate: 0.02825
Positive rate: 0.01354


Retrieve items by rules:  19%|█▉        | 3/16 [03:17<17:02, 78.67s/it]

Positive rate: 0.01244


Retrieve items by rules:  25%|██▌       | 4/16 [05:40<20:47, 103.98s/it]

Positive rate: 0.01435


Retrieve items by rules:  31%|███▏      | 5/16 [06:25<15:09, 82.67s/it] 

Positive rate: 0.01376


Retrieve items by rules:  38%|███▊      | 6/16 [07:16<11:59, 71.91s/it]

Positive rate: 0.00886


Retrieve items by rules:  44%|████▍     | 7/16 [08:34<11:06, 74.07s/it]

Positive rate: 0.00830


Retrieve items by rules:  50%|█████     | 8/16 [10:27<11:31, 86.48s/it]

TOP10.0 Positive rate: 0.00610


Retrieve items by rules:  56%|█████▋    | 9/16 [10:49<07:43, 66.17s/it]

TOP7.0 Positive rate: 0.00611


Retrieve items by rules:  69%|██████▉   | 11/16 [11:41<03:50, 46.02s/it]

skip


Retrieve items by rules:  75%|███████▌  | 12/16 [11:55<02:25, 36.35s/it]

skip


Retrieve items by rules:  81%|████████▏ | 13/16 [12:10<01:29, 29.93s/it]

skip
TOP6.0 Positive rate: 0.00606


Retrieve items by rules:  88%|████████▊ | 14/16 [13:33<01:31, 45.99s/it]

TOP7.0 Positive rate: 0.00603


Retrieve items by rules: 100%|██████████| 16/16 [15:18<00:00, 57.39s/it]

skip





Week 3: [2020-09-02, 2020-09-09)


Retrieve items by rules:   6%|▋         | 1/16 [00:22<05:41, 22.75s/it]

Positive rate: 0.02822


Retrieve items by rules:  12%|█▎        | 2/16 [00:46<05:25, 23.24s/it]

Positive rate: 0.02679


Retrieve items by rules:  19%|█▉        | 3/16 [01:52<09:15, 42.74s/it]

Positive rate: 0.01321
Positive rate: 0.01237


Retrieve items by rules:  25%|██▌       | 4/16 [03:01<10:40, 53.35s/it]

Positive rate: 0.01490


Retrieve items by rules:  31%|███▏      | 5/16 [03:23<07:40, 41.90s/it]

Positive rate: 0.01435


Retrieve items by rules:  38%|███▊      | 6/16 [03:46<05:55, 35.56s/it]

Positive rate: 0.00910


Retrieve items by rules:  44%|████▍     | 7/16 [04:33<05:51, 39.08s/it]

Positive rate: 0.00873


Retrieve items by rules:  50%|█████     | 8/16 [05:29<05:57, 44.64s/it]

TOP16.0 Positive rate: 0.00601


Retrieve items by rules:  56%|█████▋    | 9/16 [05:38<03:54, 33.46s/it]

TOP10.5 Positive rate: 0.00612


Retrieve items by rules:  69%|██████▉   | 11/16 [06:01<01:51, 22.28s/it]

skip
TOP11.0 Positive rate: 0.00619


Retrieve items by rules:  75%|███████▌  | 12/16 [06:08<01:10, 17.66s/it]

TOP9.0 Positive rate: 0.00619


Retrieve items by rules:  81%|████████▏ | 13/16 [06:15<00:43, 14.57s/it]

TOP10.0 Positive rate: 0.00601


Retrieve items by rules:  88%|████████▊ | 14/16 [06:51<00:41, 20.82s/it]

TOP9.0 Positive rate: 0.00622


Retrieve items by rules: 100%|██████████| 16/16 [07:37<00:00, 28.61s/it]

skip





Week 4: [2020-08-26, 2020-09-02)


Retrieve items by rules:   6%|▋         | 1/16 [00:23<05:49, 23.32s/it]

Positive rate: 0.02406


Retrieve items by rules:  12%|█▎        | 2/16 [00:46<05:29, 23.51s/it]

Positive rate: 0.02268


Retrieve items by rules:  19%|█▉        | 3/16 [01:53<09:21, 43.16s/it]

Positive rate: 0.01182


Retrieve items by rules:  25%|██▌       | 4/16 [03:03<10:44, 53.74s/it]

Positive rate: 0.01123
Positive rate: 0.01353


Retrieve items by rules:  31%|███▏      | 5/16 [03:25<07:44, 42.20s/it]

Positive rate: 0.01306


Retrieve items by rules:  38%|███▊      | 6/16 [03:49<06:01, 36.20s/it]

Positive rate: 0.00876


Retrieve items by rules:  44%|████▍     | 7/16 [04:39<06:04, 40.53s/it]

Positive rate: 0.00836


Retrieve items by rules:  50%|█████     | 8/16 [05:41<06:19, 47.39s/it]

TOP13.0 Positive rate: 0.00603


Retrieve items by rules:  56%|█████▋    | 9/16 [05:50<04:08, 35.45s/it]

TOP14.5 Positive rate: 0.00606


Retrieve items by rules:  69%|██████▉   | 11/16 [06:14<01:57, 23.54s/it]

skip
TOP5.0 Positive rate: 0.00631


Retrieve items by rules:  75%|███████▌  | 12/16 [06:21<01:14, 18.71s/it]

TOP8.0 Positive rate: 0.00650


Retrieve items by rules:  81%|████████▏ | 13/16 [06:29<00:45, 15.25s/it]

TOP7.0 Positive rate: 0.00607


Retrieve items by rules:  88%|████████▊ | 14/16 [07:03<00:41, 20.96s/it]

TOP3.0 Positive rate: 0.00660


Retrieve items by rules: 100%|██████████| 16/16 [07:49<00:00, 29.35s/it]

skip





Week 5: [2020-08-19, 2020-08-26)


Retrieve items by rules:   6%|▋         | 1/16 [00:23<05:55, 23.71s/it]

Positive rate: 0.02329


Retrieve items by rules:  12%|█▎        | 2/16 [00:48<05:37, 24.10s/it]

Positive rate: 0.02211
Positive rate: 0.01187


Retrieve items by rules:  19%|█▉        | 3/16 [02:36<13:36, 62.77s/it]

Positive rate: 0.01101


Retrieve items by rules:  25%|██▌       | 4/16 [04:48<18:00, 90.03s/it]

Positive rate: 0.01325


Retrieve items by rules:  31%|███▏      | 5/16 [05:11<12:04, 65.89s/it]

Positive rate: 0.01269


Retrieve items by rules:  38%|███▊      | 6/16 [05:36<08:40, 52.00s/it]

Positive rate: 0.00869


Retrieve items by rules:  44%|████▍     | 7/16 [06:23<07:31, 50.13s/it]

Positive rate: 0.00826


Retrieve items by rules:  50%|█████     | 8/16 [07:22<07:05, 53.21s/it]

TOP2.5 Positive rate: 0.00600


Retrieve items by rules:  56%|█████▋    | 9/16 [07:29<04:30, 38.59s/it]

TOP8.0 Positive rate: 0.00605


Retrieve items by rules:  69%|██████▉   | 11/16 [07:49<02:00, 24.00s/it]

skip
TOP2.0 Positive rate: 0.00657


Retrieve items by rules:  81%|████████▏ | 13/16 [07:59<00:42, 14.27s/it]

skip
TOP1.0 Positive rate: 0.00644


Retrieve items by rules:  94%|█████████▍| 15/16 [09:01<00:22, 22.73s/it]

skip


Retrieve items by rules: 100%|██████████| 16/16 [09:13<00:00, 34.57s/it]

skip





Week 6: [2020-08-12, 2020-08-19)


Retrieve items by rules:   6%|▋         | 1/16 [00:22<05:40, 22.67s/it]

Positive rate: 0.02325


Retrieve items by rules:  12%|█▎        | 2/16 [00:46<05:23, 23.14s/it]

Positive rate: 0.02198


Retrieve items by rules:  19%|█▉        | 3/16 [01:51<09:14, 42.64s/it]

Positive rate: 0.01166


Retrieve items by rules:  25%|██▌       | 4/16 [03:00<10:36, 53.05s/it]

Positive rate: 0.01050
Positive rate: 0.01124


Retrieve items by rules:  31%|███▏      | 5/16 [03:22<07:37, 41.57s/it]

Positive rate: 0.01112


Retrieve items by rules:  38%|███▊      | 6/16 [03:46<05:57, 35.72s/it]

Positive rate: 0.00785


Retrieve items by rules:  44%|████▍     | 7/16 [04:35<06:00, 40.02s/it]

Positive rate: 0.00721


Retrieve items by rules:  50%|█████     | 8/16 [05:43<06:31, 48.96s/it]

TOP2.0 Positive rate: 0.00629


Retrieve items by rules:  62%|██████▎   | 10/16 [05:50<02:30, 25.07s/it]

skip


Retrieve items by rules:  69%|██████▉   | 11/16 [06:00<01:42, 20.41s/it]

skip


Retrieve items by rules:  75%|███████▌  | 12/16 [06:02<00:59, 14.81s/it]

skip


Retrieve items by rules:  81%|████████▏ | 13/16 [06:04<00:32, 10.90s/it]

skip


Retrieve items by rules:  88%|████████▊ | 14/16 [06:34<00:33, 16.79s/it]

skip


Retrieve items by rules:  94%|█████████▍| 15/16 [07:04<00:20, 20.75s/it]

skip


Retrieve items by rules: 100%|██████████| 16/16 [07:12<00:00, 27.04s/it]

skip





Week 7: [2020-08-05, 2020-08-12)


Retrieve items by rules:   6%|▋         | 1/16 [00:23<05:48, 23.21s/it]

Positive rate: 0.01753


Retrieve items by rules:  12%|█▎        | 2/16 [00:47<05:32, 23.73s/it]

Positive rate: 0.01667


Retrieve items by rules:  19%|█▉        | 3/16 [01:54<09:24, 43.39s/it]

Positive rate: 0.00867
Positive rate: 0.00809


Retrieve items by rules:  25%|██▌       | 4/16 [03:02<10:40, 53.34s/it]

Positive rate: 0.00971


Retrieve items by rules:  31%|███▏      | 5/16 [03:25<07:43, 42.16s/it]

Positive rate: 0.00959


Retrieve items by rules:  38%|███▊      | 6/16 [03:48<05:59, 35.95s/it]

Positive rate: 0.00631


Retrieve items by rules:  44%|████▍     | 7/16 [04:50<06:37, 44.17s/it]

TOP17.0 Positive rate: 0.00600


Retrieve items by rules:  56%|█████▋    | 9/16 [06:09<04:29, 38.50s/it]

skip


Retrieve items by rules:  62%|██████▎   | 10/16 [06:13<02:47, 27.88s/it]

skip


Retrieve items by rules:  69%|██████▉   | 11/16 [06:23<01:52, 22.43s/it]

skip


Retrieve items by rules:  75%|███████▌  | 12/16 [06:26<01:06, 16.66s/it]

skip


Retrieve items by rules:  81%|████████▏ | 13/16 [06:29<00:37, 12.40s/it]

skip


Retrieve items by rules:  88%|████████▊ | 14/16 [07:00<00:36, 18.04s/it]

skip


Retrieve items by rules:  94%|█████████▍| 15/16 [07:30<00:21, 21.65s/it]

skip


Retrieve items by rules: 100%|██████████| 16/16 [07:37<00:00, 28.62s/it]

skip





In [12]:
# * use the threshold in week 1 to generate candidates for test data, see the log in the upper cell
# Week 0 (the test week) has no labels, so the per-rule cut-offs `n` cannot be
# tuned here; the values passed below (15, 20.5, 2, 9, 16, 12, 8, 2) are copied
# from the "TOP<n>" lines the collector logged for week 1 in the cell above.
if TEST:
    week = 0
    trans = data["inter"]
    
    start_date, end_date = calc_valid_date(week)
    print(f"Week {week}: [{start_date}, {end_date})")
    
    # `valid` is still written out at the end as the week-0 label file.
    train, valid = dh.split_data(trans, start_date, end_date)
    train = train.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')

    # Slices of `train` starting 7 / 3 days before `start_date`.
    last_week_start = pd.to_datetime(start_date) - pd.Timedelta(days=7)
    last_week_start = last_week_start.strftime("%Y-%m-%d")
    last_week = train.loc[train.t_dat >= last_week_start]
    
    last_3day_start = pd.to_datetime(start_date) - pd.Timedelta(days=3)
    last_3day_start = last_3day_start.strftime("%Y-%m-%d")
    last_3days = train.loc[train.t_dat >= last_3day_start]

    # Test candidates are generated for every customer of the sample submission.
    customer_list = submission['customer_id'].values

    # * ========================== Retrieval Strategies ==========================

    # Same rule list (and order) as the training loop; only the `n` cut-offs of
    # the last eight rules differ.
    # NOTE(review): `n=20.5` is a float copied verbatim from the week-1 log --
    # confirm the rule accepts a non-integer cut-off.
    candidates = RuleCollector().collect(
        week_num = week,
        trans_df = trans,
        customer_list=customer_list,
        rules=[
            OrderHistory(train, days=3, name='1'),
            OrderHistory(train, days=7, name='2'),
            OrderHistoryDecay(train, days=3, n=50, name='1'),
            OrderHistoryDecay(train, days=7, n=50, name='2'),
            ItemPair(OrderHistory(train, days=3).retrieve(), name='1'),
            ItemPair(OrderHistory(train, days=7).retrieve(), name='2'),
            ItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),
            ItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),
            UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=15, name='1'),
            UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=20.5, name='2'),
            UserGroupSaleTrend(data, customer_list, train, ['age_bins'], days=7, n=2),
            TimeHistory(customer_list, last_week, n=9, name='1'),
            TimeHistory(customer_list, last_3days, n=16, name='2'),
            TimeHistoryDecay(customer_list, train, days=3, n=12, name='1'),
            TimeHistoryDecay(customer_list, train, days=7, n=8, name='2'),
            SaleTrend(customer_list, train, days=7, n=2),
        ],
        filters=[OutOfStock(trans)],
        min_pos_rate=0.006,
        compress=False,
    )
    
    # NOTE(review): the training loop pivots without calling `reduce_mem_usage`
    # first; if that helper changes the dtype of `score`, train and test scores
    # may differ in precision -- TODO confirm this is intended.
    candidates, _ = reduce_mem_usage(candidates)
    # One row per (customer, article) pair and one column per retrieval method
    # holding the summed score; NaN where a method did not retrieve the pair.
    candidates = (
        pd.pivot_table(
            candidates,
            values="score",
            index=["customer_id", "article_id"],
            columns=["method"],
            aggfunc=np.sum,
        )
        .reset_index()
    )

    candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
    valid.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{week}_label.pqt")

Week 0: [2020-09-23, 2020-09-30)


Retrieve items by rules: 100%|██████████| 16/16 [07:00<00:00, 26.27s/it]


In [13]:
del train, valid, last_week, customer_list, candidates
gc.collect()

41

## Feature engineering


In [13]:
user = data["user"]
item = data["item"]
inter = data["inter"]

In [14]:
# merge `product_code`
inter = inter.merge(item[["article_id", "product_code"]], on="article_id", how="left")
# calculate week number
inter['week'] = (pd.to_datetime('2020-09-29') - pd.to_datetime(inter['t_dat'])).dt.days // 7

In [15]:
inter = inter.sort_values(["t_dat"]).reset_index(drop=True)

In [16]:
inter.shape

(31788324, 7)

Week Sale, Last Week Sale, Week Sale Trend

In [17]:
# Weekly sale features per article (`i_`) and per product code (`p_`).
# `lw_` columns are the same statistic requested with `step=1` (last week) and
# `_uni` columns pass `True` as the third argument of `week_sale`
# (presumably a "count unique" flag -- verify against `src.features.week_sale`).
inter["i_sale"] = week_sale(inter, ["article_id"])
inter["p_sale"] = week_sale(inter, ["product_code"])
inter["i_sale_uni"] = week_sale(inter, ["article_id"], True)
inter["p_sale_uni"] = week_sale(inter, ["product_code"], True)
inter["lw_i_sale"] = week_sale(inter, ["article_id"], step=1) # * last week sale
inter["lw_p_sale"] = week_sale(inter, ["product_code"], step=1)
inter["lw_i_sale_uni"] = week_sale(inter, ["article_id"], True, step=1)
inter["lw_p_sale_uni"] = week_sale(inter, ["product_code"], True, step=1)

# Article sales relative to the sales of its product code.
# The `1e-6` added to every denominator below avoids division by zero.
inter["i_sale_ratio"] = inter["i_sale"] / (inter["p_sale"] + 1e-6)
inter["i_sale_uni_ratio"] = inter["i_sale_uni"] / (inter["p_sale_uni"] + 1e-6)
inter["lw_i_sale_ratio"] = inter["lw_i_sale"] / (inter["lw_p_sale"] + 1e-6)
inter["lw_i_sale_uni_ratio"] = inter["lw_i_sale_uni"] / (inter["lw_p_sale_uni"] + 1e-6)

# Plain sale statistic divided by its `_uni` counterpart.
inter["i_uni_ratio"] = inter["i_sale"] / (inter["i_sale_uni"] + 1e-6)
inter["p_uni_ratio"] = inter["p_sale"] / (inter["p_sale_uni"] + 1e-6)
inter["lw_i_uni_ratio"] = inter["lw_i_sale"] / (inter["lw_i_sale_uni"] + 1e-6)
inter["lw_p_uni_ratio"] = inter["lw_p_sale"] / (inter["lw_p_sale_uni"] + 1e-6)

# Relative change of this week's sales versus last week's.
inter["i_sale_trend"] = (inter["i_sale"] - inter["lw_i_sale"]) / (inter["lw_i_sale"] + 1e-6)
inter["p_sale_trend"] = (inter["p_sale"] - inter["lw_p_sale"]) / (inter["lw_p_sale"] + 1e-6)

# The same sale / last-week sale / trend triple for several item attributes.
# The attribute columns are merged in temporarily and dropped again at the end.
item_feats = [
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
]
inter = inter.merge(item[["article_id", *item_feats]], on="article_id", how="left")

for f in tqdm(item_feats):
    # NOTE(review): the third positional argument is a column-name string here,
    # while every call above passes `True` (or nothing) in that position. If
    # that parameter is a boolean flag, the non-empty string is truthy and
    # these columns use the `True` variant -- TODO confirm against the
    # `week_sale` signature.
    inter[f"{f}_sale"] = week_sale(inter, [f], f"{f}_sale")
    inter[f"lw_{f}_sale"] = week_sale(inter, [f], f"{f}_sale", step=1)
    inter[f"{f}_sale_trend"] = (inter[f"{f}_sale"] - inter[f"lw_{f}_sale"]) / (inter[f"lw_{f}_sale"] + 1e-6)

inter = inter.drop(columns=item_feats)

100%|██████████| 6/6 [02:29<00:00, 24.99s/it]


Repurchase Ratio

In [18]:
inter['i_repurchase_ratio'] = repurchase_ratio(inter, ['article_id'], week_num=WEEK_NUM)
inter['p_repurchase_ratio'] = repurchase_ratio(inter, ['product_code'], week_num=WEEK_NUM)

Popularity

In [20]:
inter['i_pop'] = popularity(inter, 'article_id', week_num=WEEK_NUM)
inter['p_pop'] = popularity(inter, 'product_code', week_num=WEEK_NUM)

Already Bought Item

In [21]:
# inter["purchased_item"] = purchased_before(inter, ["article_id"])
# inter["purchased_pro"] = purchased_before(inter, ["product_code"])

Save data

In [22]:
inter.shape

(31788324, 47)

In [23]:
inter.to_parquet(data_dir / "processed/processed_inter.pqt")

## Merge Features


In [11]:
inter = pd.read_parquet(data_dir / "processed/processed_inter.pqt")
data["inter"] = inter

In [12]:
# Pre-computed user / item embeddings from two external models; they are
# turned into similarity features in the merge loop below.
# NOTE(review): `allow_pickle=True` lets `np.load` unpickle arbitrary Python
# objects, which can execute code. Only load files produced by this project;
# if the arrays are plain numeric, the flag can presumably be dropped -- verify.
#* embeddings from DSSM model
dssm_user_embd = np.load(data_dir / "external/dssm_user_embd.npy", allow_pickle=True)
dssm_item_embd = np.load(data_dir / "external/dssm_item_embd.npy", allow_pickle=True)
# * embeddings from YouTubeDNN model
yt_user_embd = np.load(data_dir / "external/yt_user_embd.npy", allow_pickle=True)
yt_item_embd = np.load(data_dir / "external/yt_item_embd.npy", allow_pickle=True)

In [13]:
# For every week: load the retrieved candidates, attach the engineered
# features and the two embedding-similarity scores, then store the result in
# the "processed" folder. Week 0 (test) is only handled when `TEST` is set.
for i in tqdm(range(WEEK_NUM)):
    if TEST or i != 0:
        # * merge features
        candidate = merge_week_data(
            data,
            i,
            pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate.pqt"),
        )
        # * similarity between DSSM user and item embeddings
        candidate["dssm_similarity"] = calc_embd_similarity(
            candidate, dssm_user_embd, dssm_item_embd
        )
        # * similarity between YouTubeDNN user and item embeddings
        candidate["yt_similarity"] = calc_embd_similarity(
            candidate, yt_user_embd, yt_item_embd
        )

        candidate.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_candidate.pqt")

100%|██████████| 5532/5532 [00:37<00:00, 147.02it/s]
100%|██████████| 5532/5532 [00:34<00:00, 160.03it/s]
100%|██████████| 306/306 [00:01<00:00, 183.83it/s]
100%|██████████| 306/306 [00:01<00:00, 180.93it/s]
100%|██████████| 225/225 [00:01<00:00, 174.28it/s]
100%|██████████| 225/225 [00:01<00:00, 181.39it/s]
100%|██████████| 280/280 [00:01<00:00, 178.55it/s]
100%|██████████| 280/280 [00:01<00:00, 187.86it/s]
100%|██████████| 298/298 [00:01<00:00, 171.52it/s]
100%|██████████| 298/298 [00:01<00:00, 173.69it/s]
100%|██████████| 178/178 [00:01<00:00, 176.01it/s]
100%|██████████| 178/178 [00:01<00:00, 171.28it/s]
100%|██████████| 6/6 [25:02<00:00, 250.49s/it]


In [14]:
del dssm_user_embd, dssm_item_embd, yt_user_embd, yt_item_embd
gc.collect()

20

## Ranking


In [104]:
candidates = {}
labels = {}
for i in tqdm(range(1, WEEK_NUM)):
    candidates[i] = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_candidate.pqt")
    labels[i] = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_label.pqt")    

100%|██████████| 5/5 [00:10<00:00,  2.15s/it]


In [105]:
# Model inputs: every column of the week-1 candidate frame except the target
# and the bookkeeping columns listed below. The commented-out names are
# features that were excluded in earlier experiments.
feats = [
    x
    for x in candidates[1].columns
    if x
    not in [
        "label",
        "sales_channel_id",
        "t_dat",
        "week",
        # "WeekSaleTrend_item",
        # "WeekSaleTrend_pro",
        # "ThreeDaySaleTrend_item",
        # "ThreeDaySaleTrend_pro",
        # "Item_Popularity",
        # "Pro_Popularity",
    ]
]
# Columns that are cast to `CategoricalDtype` below and handed to LightGBM as
# categorical features.
# NOTE(review): `age` is treated as categorical rather than numeric here --
# confirm this is intended.
cat_features = [
    "customer_id",
    "article_id",
    "product_code",
    "FN",
    "Active",
    "club_member_status",
    "fashion_news_frequency",
    "age",
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",

    "user_gender",
    "article_gender",
    "season_type"
]


In [106]:
# * Convert categorical features as `CategoricalDtype`
# The category list of each feature is taken from the table that owns the
# column (user table first, then item table, otherwise the interactions).
# The values are sorted and NaN is dropped so that
#   - the category order (hence the integer codes and the sort order of the
#     resulting categorical columns) is the same on every run, which iterating
#     over a plain `set` does not guarantee, and
#   - a missing value can never end up in the category list, which
#     `CategoricalDtype` rejects.
# NOTE: `sorted` requires the values of one feature to be mutually comparable.
cate_dict = {}
for feat in tqdm(cat_features):
    if feat in data['user'].columns:
        source = data['user']
    elif feat in data['item'].columns:
        source = data['item']
    else:
        source = data['inter']
    feat_values = sorted(source[feat].dropna().unique())
    cate_dict[feat] = CategoricalDtype(categories=feat_values)

100%|██████████| 17/17 [00:00<00:00, 19.21it/s]


In [107]:
for i in tqdm(range(1,WEEK_NUM)):
    for feat in cat_features:
        candidates[i][feat] = candidates[i][feat].astype(cate_dict[feat])

100%|██████████| 5/5 [00:09<00:00,  1.80s/it]


### Train


In [108]:
params = {
    "objective": "lambdarank",#"",
    "boosting_type": "gbdt",
    "metric": "map",#"map",
    "max_depth": 8,
    "num_leaves": 128,
    "learning_rate": 0.03,

    "verbose": -1,
    "eval_at": 12,
    # 'device':'gpu'
}

In [109]:
# * some rules are skipped for some weeks, we need to concat them together
# * to merge the columns
full_data = pd.concat([candidates[i] for i in range(1, WEEK_NUM)], ignore_index=True)

In [110]:
rule_feats = ['ItemPairRetrieve_1', 'ItemPairRetrieve_2',
       'ItemPairRetrieve_3', 'ItemPairRetrieve_4', 'OrderHistoryDecay_1',
       'OrderHistoryDecay_2', 'OrderHistory_1', 'OrderHistory_2',
       'SaleTrend_1', 'TimeHistoryDecay_1', 'TimeHistoryDecay_2',
       'TimeHistory_1', 'TimeHistory_2', 'UGSaleTrend_1', 'UGTimeHistory_1',
       'UGTimeHistory_2']
# for rule in tqdm(rule_feats):
#     mask = full_data[rule].isna()
#     full_data.loc[mask, rule] = 0
#     full_data.loc[~mask, rule] = 1
# full_data['full_score'] = full_data[rule_feats].mean(axis=1)
# full_data['rank'] = full_data.groupby(['week','customer_id'])['full_score'].rank(ascending=False)
# feats += ['full_score', 'rank']

In [111]:
def _query_group_sizes(frame, keys):
    """Return the number of rows of every query group of `frame`, in row order.

    `frame` must already be sorted so that the rows of one group are contiguous.
    The key columns are cast to plain integers first, because grouping directly
    on categorical columns would also report zero-sized groups for unobserved
    categories. `sort=False` keeps the groups in order of first appearance, so
    the sizes line up with the rows even when the categorical sort order of the
    keys differs from their numeric order.
    """
    return frame[keys].astype("int").groupby(keys, sort=False).size().values


def train_model(full_data, valid_week_num, train_week_num=4):
    """Train a LightGBM ranker and validate it on a single week.

    Args:
        full_data: candidate frame containing the columns in the notebook-level
            `feats` list plus `week`, `customer_id`, `article_id` and `label`.
        valid_week_num: week number used as validation set (early stopping).
        train_week_num: how many weeks to train on; rows with
            ``valid_week_num < week <= valid_week_num + train_week_num`` are used.

    Returns:
        The trained `lgb.Booster`. As a side effect the model is saved, truncated
        to its best iteration, as ``model_dir / "lgb_ranker_<valid_week_num>.model"``.

    Relies on the notebook-level `feats`, `cat_features`, `params` and `model_dir`.
    """
    print("Validating week:", valid_week_num)
    in_train_window = (valid_week_num < full_data["week"]) & (
        full_data["week"] <= valid_week_num + train_week_num
    )
    train = full_data[in_train_window]
    valid = full_data[full_data["week"] == valid_week_num]

    # LightGBM expects the rows of one query (here: one customer in one week)
    # to be contiguous, with `group` listing the query sizes in row order.
    train = train.sort_values(by=["week", "customer_id"], ascending=True).reset_index(
        drop=True
    )
    valid = valid.sort_values(by=["customer_id"], ascending=True).reset_index(drop=True)
    print("Train positive rate:", train.label.mean())

    train_group = _query_group_sizes(train, ["week", "customer_id"])
    valid_group = _query_group_sizes(valid, ["customer_id"])

    train_set = lgb.Dataset(
        data=train[feats],
        label=train["label"],
        group=train_group,
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )

    valid_set = lgb.Dataset(
        data=valid[feats],
        label=valid["label"],
        group=valid_group,
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )

    # Early stopping / logging are passed as callbacks: the
    # `early_stopping_rounds` and `verbose_eval` keyword arguments are
    # deprecated in LightGBM 3.3 and removed in 4.0, callbacks work in both.
    ranker = lgb.train(
        params,
        train_set,
        num_boost_round=300,
        valid_sets=[valid_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=30),
            lgb.log_evaluation(period=10),
        ],
    )
    ranker.save_model(
        model_dir / f"lgb_ranker_{valid_week_num}.model",
        num_iteration=ranker.best_iteration,
    )
    return ranker

In [112]:
def train_binary_model(full_data, valid_week_num, train_week_num=4):
    """Train a point-wise (binary classification) LightGBM model on the candidates.

    Args:
        full_data: candidate frame containing the columns in the notebook-level
            `feats` list plus `week` and `label`.
        valid_week_num: week number used as validation set (early stopping).
        train_week_num: how many weeks to train on; rows with
            ``valid_week_num < week <= valid_week_num + train_week_num`` are used.

    Returns:
        The trained `lgb.Booster`.

    Relies on the notebook-level `feats`, `cat_features`, `params` and `model_dir`.

    NOTE(review): the model is saved under the same file name as the one written
    by `train_model` (``lgb_ranker_<valid_week_num>.model``) and therefore
    overwrites it -- confirm this is intended.
    """
    print("Validating week:", valid_week_num)
    in_train_window = (valid_week_num < full_data["week"]) & (
        full_data["week"] <= valid_week_num + train_week_num
    )
    train = full_data[in_train_window]
    valid = full_data[full_data["week"] == valid_week_num]

    print("Train positive rate:", train.label.mean())

    # No query groups are supplied here, but LightGBM's ranking objectives and
    # ranking metrics require them. When the shared `params` are configured for
    # ranking, fall back to a binary objective / AUC so the function is usable
    # without editing the global dict; any other setting is passed through as is.
    binary_params = dict(params)
    if binary_params.get("objective") in ("lambdarank", "rank_xendcg"):
        binary_params["objective"] = "binary"
    if binary_params.get("metric") in ("map", "ndcg"):
        binary_params["metric"] = "auc"

    train_set = lgb.Dataset(
        data=train[feats],
        label=train["label"],
        feature_name=feats,
        categorical_feature=cat_features,
        params=binary_params,
    )

    valid_set = lgb.Dataset(
        data=valid[feats],
        label=valid["label"],
        feature_name=feats,
        categorical_feature=cat_features,
        params=binary_params,
    )

    # Early stopping / logging are passed as callbacks: the
    # `early_stopping_rounds` and `verbose_eval` keyword arguments are
    # deprecated in LightGBM 3.3 and removed in 4.0, callbacks work in both.
    ranker = lgb.train(
        binary_params,
        train_set,
        num_boost_round=300,
        valid_sets=[valid_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=30),
            lgb.log_evaluation(period=10),
        ],
    )
    ranker.save_model(
        model_dir / f"lgb_ranker_{valid_week_num}.model",
        num_iteration=ranker.best_iteration,
    )
    return ranker

In [None]:
ranker = train_model(full_data, 1, 4) # 0.82846 4-week 0.853142
# ranker = train_binary_model(full_data, 1, 4)
# 0.703171 -> 0.709413

### Inference

In [114]:
ranker = lgb.Booster(model_file=model_dir / "lgb_ranker_1.model")

In [None]:
# Plot the importance of every feature of the loaded model.
# The names are read from the booster itself instead of the notebook-level
# `feats` list: `ranker` is loaded from disk, and if it was trained with a
# different feature list the two arrays would not have the same length.
feat_importance = pd.DataFrame(
    {"feature": ranker.feature_name(), "importance": ranker.feature_importance()}
).sort_values(by="importance", ascending=False)
plt.figure(figsize=(8, 18))
sns.barplot(y="feature", x="importance", data=feat_importance)

### Validate

In [116]:
val_candidates = full_data[full_data["week"] == 1].reset_index(drop=True)

In [117]:
def predict(ranker, candidates, batch_size = 5_000_000, feature_cols=None):
    """Score all candidates and return each customer's articles, best first.

    Args:
        ranker: object with a `predict(frame)` method returning one score per row.
        candidates: frame with `customer_id`, `article_id` and the feature columns.
            NOTE: a `prob` column holding the scores is added to this frame
            in place (kept for backward compatibility).
        batch_size: number of rows scored per `ranker.predict` call.
        feature_cols: columns passed to the ranker; defaults to the
            notebook-level `feats` list.

    Returns:
        DataFrame with one row per customer: `customer_id` (int) and
        `prediction`, the list of that customer's distinct article ids ordered
        by descending score.
    """
    if feature_cols is None:
        feature_cols = feats

    n_rows = candidates.shape[0]
    probs = np.zeros(n_rows)
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        # Positional slicing: correct for any index, whereas label-based `.loc`
        # slicing only works when the index is a fresh 0..n-1 RangeIndex.
        probs[start:stop] = ranker.predict(candidates.iloc[start:stop][feature_cols])
    candidates["prob"] = probs

    pred_lgb = (
        candidates[['customer_id','article_id','prob']]
        .sort_values(by=["customer_id","prob"], ascending=False)
        .reset_index(drop=True)
        .rename(columns={'article_id':'prediction'})
        # rows are sorted by descending score, so the best-scored duplicate is kept
        .drop_duplicates(['customer_id', 'prediction'], keep='first')
        .assign(customer_id=lambda df: df['customer_id'].astype(int))
    )

    grouped = pred_lgb.groupby("customer_id")["prediction"]
    # `progress_apply` only exists after `tqdm.pandas()` was called; fall back to
    # the plain `apply` otherwise so the function does not depend on cell order.
    collect = getattr(grouped, "progress_apply", grouped.apply)
    return collect(list).reset_index()

In [118]:
pred = predict(ranker, val_candidates)

100%|██████████| 80093/80093 [00:05<00:00, 13625.84it/s]


In [119]:
label = labels[1]
label = pd.merge(label, pred, on="customer_id", how="left")

In [None]:
map_at_k(label["article_id"], label["prediction"], k=12)

# 0.02820525160663368 1-week add normalization
# 0.028936597823123886 4-week min_pos_rate 0.006 0.0274
# 0.029111989281461418 4-week drop full-negative customer
# 0.029165419468984943 3-week 0.0270
# 0.028694388096248934 4-week min_pos_rate 0.005 lr=0.03
# 0.028927037894290773 4-week min_pos_rate 0.005 lr=0.01
# 0.028936597823123886 4-week min_pos_rate 0.005 lr=0.01 max_depth=9, num_leaves=256

# 0.029035548891779315 LB:0.0267+
# 0.02936299322497085 LB:0.0272

### Test

In [41]:
del full_data, val_candidates, candidates
gc.collect()

64

In [42]:
test_candidates = pd.read_parquet(data_dir/"processed"/VERSION_NAME/"week0_candidate.pqt")
for feat in cat_features:
    test_candidates[feat] = test_candidates[feat].astype(cate_dict[feat])

In [71]:
# test_candidates['full_score'] = test_candidates[rule_feats].sum(axis=1)
# test_candidates['rank'] = full_data.groupby(['week','customer_id'])['full_score'].rank(ascending=False)

In [72]:
test_pred = predict(ranker, test_candidates)

100%|██████████| 1371980/1371980 [01:36<00:00, 14239.74it/s]


In [73]:
# Reverse mappings (index -> raw id) for customers and articles, needed to
# write the submission with the original ids.
# `with` closes the file handles right after loading instead of leaking them.
# NOTE: `pickle.load` executes arbitrary code -- only load files created by this project.
with open(data_dir/"index_id_map/user_index2id.pkl", "rb") as f:
    idx2uid = pickle.load(f)
with open(data_dir/"index_id_map/item_index2id.pkl", "rb") as f:
    idx2iid = pickle.load(f)

In [74]:
def parse(x, k=12, id_map=None):
    """Format one customer's ranked article indices as a submission string.

    Parameters
    ----------
    x : iterable
        Encoded article indices, best candidate first.
    k : int, default 12
        Maximum number of articles to emit.
    id_map : mapping, optional
        Encoded index -> original article id. When omitted, the notebook-level
        ``idx2iid`` mapping is used.

    Returns
    -------
    str
        At most ``k`` article ids joined by single spaces, each prefixed with
        ``"0"`` (presumably restoring the leading zero lost when the ids were
        stored as integers — confirm against the raw data).

    Raises
    ------
    KeyError
        If one of the first ``k`` indices is missing from the mapping.
    """
    if id_map is None:
        id_map = idx2iid
    # Truncate before mapping: only the top-k entries are ever written, so
    # converting the whole candidate list would be wasted work.
    top_k = list(x)[:k]
    return ' '.join('0' + str(id_map[i]) for i in top_k)

In [75]:
# Turn each customer's list of article indices into the space-separated
# string expected in the submission file.
test_pred['prediction'] = test_pred['prediction'].progress_apply(parse)

100%|██████████| 1371980/1371980 [00:20<00:00, 68122.23it/s]


In [76]:
# Reload the sample submission and encode its customer ids so it can be
# joined with `test_pred` on the encoded `customer_id`.
# The context manager closes the pickle file instead of leaking the handle.
# NOTE: pickle can execute arbitrary code on load — only read trusted files.
with open(data_dir/"index_id_map/user_id2index.pkl", "rb") as f:
    uid2idx = pickle.load(f)
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

In [77]:
# Replace the sample file's `prediction` column with the model output, then
# decode customer ids back to their original form for the submission file.
submission = (
    submission
    .drop(columns='prediction')
    .merge(test_pred, on='customer_id', how='left')
)
submission['customer_id'] = submission['customer_id'].map(idx2uid)

In [78]:
# Write the submission to the working directory; index=False keeps the row
# index out of the file.
submission.to_csv('submission.csv', index=False)

In [80]:
# Eyeball the first rows to confirm the customer id and prediction formats.
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0921906005 0918522001 0751471043 0924243002 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0866731001 0863583001 0918522001 0788575002 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0866731001 0805000001 0788575004 05...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0910601003 0918522001 0910601002 0929275001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0927530006 0896152001 0910601002 0918522001 08...


### Submit

In [46]:
# Free-text description for the Kaggle submission: a score line, the list of
# retrieval rules, and a few settings, separated by `---` lines.
submit_msg = """
0.028927037894290773 4-week drop
---
OrderHistory(train, days=3, name='1'),
OrderHistory(train, days=7, name='2'),
OrderHistoryDecay(train, days=3, n=50, name='1'),
OrderHistoryDecay(train, days=7, n=50, name='2'),
ItemPair(OrderHistory(train, days=3).retrieve(), name='1'),
ItemPair(OrderHistory(train, days=7).retrieve(), name='2'),
ItemPair(OrderHistoryDecay(train, days=3, n=50).retrieve(), name='3'),
ItemPair(OrderHistoryDecay(train, days=7, n=50).retrieve(), name='4'),
UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50, name='1'),
UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50, name='2'),
UserGroupSaleTrend(data, customer_list, train, ['age_bins'], days=7, n=50),
TimeHistory(customer_list, last_week, n=50, name='1'),
TimeHistory(customer_list, last_3days, n=50, name='2'),
TimeHistoryDecay(customer_list, train, days=3, n=50, name='1'),
TimeHistoryDecay(customer_list, train, days=7, n=50, name='2'),
SaleTrend(customer_list, train, days=7, n=50)
---
min_pos_rate = 0.005
pivot
"""

In [47]:
# Echo the message for a final check before submitting.
# NOTE(review): the saved output of this cell shows a different message than
# the current `submit_msg` definition — re-run the cell to refresh it.
submit_msg

"\n0.029111989281461418 4-week drop full negative user\n---\nOrderHistory(train, 3),\nOrderHistory(train, 7),\nOrderHistoryDecay(train, 3, n=50),\nOrderHistoryDecay(train, 7, n=50),\nItemPair(OrderHistory(train, 3).retrieve(), name='1'),\nItemPair(OrderHistory(train, 7).retrieve(), name='2'),\nItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),\nItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),\nUserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),\nUserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),\nUserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),\nTimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),\nTimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),\nTimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),\nTimeHistoryDecay(customer_list, train, 7, n=50 if w

In [60]:
# ! mkdir ~/.kaggle
# ! cp ../kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json

In [65]:
# %pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 3.9 MB/s eta 0:00:011
Collecting python-slugify
  Downloading python_slugify-6.1.2-py2.py3-none-any.whl (9.4 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 5.7 MB/s eta 0:00:011
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73052 sha256=2aaa2ba8e16413cb4c49862167277c1d8f29337d273f3f9b02c7f2a0d2b3a357
  Stored in directory: /root/.cache/pip/wheels/29/da/11/144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106
Successfully built kaggle
Installing collected packages: text-unidecode, python-slugify, kaggle
Successfully installed kaggle-1.5.12 python-slugify-6.1.2 text-unidecode-1.3
You should consider upgrading via the '/usr/bin/python3 -m pip instal

In [48]:
! kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f ./submission.csv -m '\n0.029111989281461418 4-week drop full negative user\n---\nOrderHistory(train, 3),\nOrderHistory(train, 7),\nOrderHistoryDecay(train, 3, n=50),\nOrderHistoryDecay(train, 7, n=50),\nItemPair(OrderHistory(train, 3).retrieve(), name='1'),\nItemPair(OrderHistory(train, 7).retrieve(), name='2'),\nItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),\nItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),\nUserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),\nUserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),\nUserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),\nTimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),\nTimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),\nTimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),\nTimeHistoryDecay(customer_list, train, 7, n=50 if week!=0 else 8),\nSaleTrend(customer_list, train, 7, n=50 if week!=0 else 2)\n---\nmin_pos_rate = 0.006\npivot\n'

100%|████████████████████████████████████████| 258M/258M [00:11<00:00, 22.7MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations