In [1]:
# %pip install -U lightgbm==3.3.2

In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

In [2]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

warnings.filterwarnings("ignore")
sys.path.append("../src/") # path to the `src`` folder
InteractiveShell.ast_node_interactivity = "all"
tqdm.pandas()

In [3]:
from data import DataHelper
from data.metrics import map_at_k, hr_at_k, recall_at_k

from retrieval.rules import (
    OrderHistory,
    OrderHistoryDecay,
    ItemPair,

    UserGroupTimeHistory,
    UserGroupSaleTrend,

    TimeHistory,
    TimeHistoryDecay,
    SaleTrend,

    OutOfStock,
)
from retrieval.collector import RuleCollector

from features import cum_sale, week_sale, repurchase_ratio, purchased_before

from utils import calc_valid_date, merge_week_data


In [4]:
data_dir = Path("../data/")

In [5]:
TRAIN_WEEK_NUM = 4
WEEK_NUM = TRAIN_WEEK_NUM + 2
VERSION_NAME = "pivot"# "v1"

In [6]:
import os
if not os.path.exists(data_dir/"interim"/VERSION_NAME):
    os.mkdir(data_dir/"interim"/VERSION_NAME)

In [7]:
TEST = True # * Set as `False` when do local experiments to save time

Pepare data: encoding ids and preprocessing

In [8]:
dh = DataHelper(data_dir)

In [9]:
data = dh.preprocess_data(save=True, name="encoded_full") # * run only once, processed data will be saved

In [10]:
data = dh.load_data(name="encoded_full")

In [11]:
uid2idx = pickle.load(open(data_dir/"index_id_map/user_id2index.pkl", "rb"))
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

## Retrieval


Generate candidates for each week

In [12]:
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
data['user']['age_bins'] = pd.cut(data['user']['age'], listBin)

In [14]:
# * WEEK_NUM = 0: test
# * WEEK_NUM = 1: valid
# * WEEK_NUM > 1: train
for week in range(1,WEEK_NUM):
    # * use sliding window to generate candidates
    if week == 0 and not TEST:
        continue
    trans = data["inter"]

    start_date, end_date = calc_valid_date(week)
    print(f"Week {week}: [{start_date}, {end_date})")
    
    last_week_start = pd.to_datetime(start_date) - pd.Timedelta(days=7)
    last_week_start = last_week_start.strftime("%Y-%m-%d")
    
    last_3day_start = pd.to_datetime(start_date) - pd.Timedelta(days=3)
    last_3day_start = last_3day_start.strftime("%Y-%m-%d")

    train, valid = dh.split_data(trans, start_date, end_date)

    last_week = train.loc[train.t_dat >= last_week_start]
    last_week = last_week.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')
    
    last_3days = train.loc[train.t_dat >= last_3day_start]
    last_3days = last_3days.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')
    
    train = train.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')

    if week != 0:
        customer_list = valid["customer_id"].values
    else:
        customer_list = submission['customer_id'].values

    # * ========================== Retrieval Strategies ==========================

    candidates = RuleCollector().collect(
        valid = valid,
        customer_list=customer_list,
        rules=[
            OrderHistory(train, 3),
            OrderHistory(train, 7),
            OrderHistoryDecay(train, 3, n=50),
            OrderHistoryDecay(train, 7, n=50),
            ItemPair(OrderHistory(train, 3).retrieve(), name='1'),
            ItemPair(OrderHistory(train, 7).retrieve(), name='2'),
            ItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),
            ItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),
            UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50, name='1'),
            UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50, name='2'),
            UserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50),
            TimeHistory(customer_list, last_week, n=50, name='1'),
            TimeHistory(customer_list, last_3days, n=50, name='2'),
            TimeHistoryDecay(customer_list, train, 3, n=50),
            TimeHistoryDecay(customer_list, train, 7, n=50),
            SaleTrend(customer_list, train, 7, n=50),
        ],
        filters=[OutOfStock(trans)],
        min_pos_rate=0.005,
        compress=False,
    )
    
    candidates = (
        pd.pivot_table(
            candidates,
            values="score",
            index=["customer_id", "article_id"],
            columns=["method"],
            aggfunc=np.sum,
        )
        .reset_index()
        # .fillna(0)
    )

    candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
    if WEEK_NUM != 0:
        valid.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_label.pqt")

Week 1: [2020-09-16, 2020-09-23)


Retrieve items by rules:   6% 1/16 [00:31<07:46, 31.11s/it]

Positive rate: 0.03038



Retrieve items by rules:  12% 2/16 [01:05<07:41, 32.98s/it]

Positive rate: 0.02859
Positive rate: 0.01413



Retrieve items by rules:  19% 3/16 [02:25<11:48, 54.48s/it]

Positive rate: 0.01295



Retrieve items by rules:  25% 4/16 [03:48<13:09, 65.81s/it]

Positive rate: 0.01519



Retrieve items by rules:  31% 5/16 [04:17<09:36, 52.39s/it]

Positive rate: 0.01472



Retrieve items by rules:  38% 6/16 [04:49<07:35, 45.59s/it]

Positive rate: 0.00908



Retrieve items by rules:  44% 7/16 [05:37<06:56, 46.32s/it]

Positive rate: 0.00853



Retrieve items by rules:  50% 8/16 [06:38<06:48, 51.10s/it]

TOP35.0 Positive rate: 0.00501



Retrieve items by rules:  56% 9/16 [08:09<07:24, 63.56s/it]

TOP38.5 Positive rate: 0.00508



Retrieve items by rules:  62% 10/16 [09:38<07:08, 71.49s/it]

TOP3.0 Positive rate: 0.00542



Retrieve items by rules:  69% 11/16 [11:20<06:43, 80.78s/it]

TOP25.0 Positive rate: 0.00503



Retrieve items by rules:  75% 12/16 [12:48<05:32, 83.02s/it]

TOP30.0 Positive rate: 0.00505



Retrieve items by rules:  81% 13/16 [14:16<04:13, 84.48s/it]

TOP31.0 Positive rate: 0.00503



Retrieve items by rules:  88% 14/16 [16:36<03:22, 101.26s/it]

TOP24.0 Positive rate: 0.00506



Retrieve items by rules:  94% 15/16 [18:55<01:52, 112.39s/it]

TOP4.0 Positive rate: 0.00568


Retrieve items by rules: 100% 16/16 [20:30<00:00, 76.93s/it] 


Week 2: [2020-09-09, 2020-09-16)


Retrieve items by rules:   6% 1/16 [00:31<07:54, 31.61s/it]

Positive rate: 0.03008



Retrieve items by rules:  12% 2/16 [01:05<07:37, 32.67s/it]

Positive rate: 0.02825
Positive rate: 0.01354



Retrieve items by rules:  19% 3/16 [02:24<11:41, 53.94s/it]

Positive rate: 0.01244



Retrieve items by rules:  25% 4/16 [03:47<13:05, 65.44s/it]

Positive rate: 0.01435



Retrieve items by rules:  31% 5/16 [04:15<09:33, 52.14s/it]

Positive rate: 0.01376



Retrieve items by rules:  38% 6/16 [04:46<07:29, 44.90s/it]

Positive rate: 0.00886



Retrieve items by rules:  44% 7/16 [05:37<07:02, 46.95s/it]

Positive rate: 0.00830



Retrieve items by rules:  50% 8/16 [06:44<07:06, 53.33s/it]

TOP21.0 Positive rate: 0.00501



Retrieve items by rules:  56% 9/16 [08:19<07:44, 66.32s/it]

TOP11.0 Positive rate: 0.00515


Retrieve items by rules:  69% 11/16 [11:38<07:00, 84.13s/it]

skip
TOP13.0 Positive rate: 0.00502



Retrieve items by rules:  75% 12/16 [13:10<05:45, 86.46s/it]

TOP6.0 Positive rate: 0.00544



Retrieve items by rules:  81% 13/16 [14:42<04:23, 87.98s/it]

TOP11.0 Positive rate: 0.00500



Retrieve items by rules:  88% 14/16 [17:05<03:29, 104.58s/it]

TOP11.0 Positive rate: 0.00517


Retrieve items by rules: 100% 16/16 [21:07<00:00, 79.19s/it] 

skip





Week 3: [2020-09-02, 2020-09-09)


Retrieve items by rules:   6% 1/16 [00:31<07:49, 31.29s/it]

Positive rate: 0.02822



Retrieve items by rules:  12% 2/16 [01:04<07:30, 32.21s/it]

Positive rate: 0.02679
Positive rate: 0.01321



Retrieve items by rules:  19% 3/16 [02:24<11:43, 54.08s/it]

Positive rate: 0.01237



Retrieve items by rules:  25% 4/16 [03:45<12:59, 64.94s/it]

Positive rate: 0.01490



Retrieve items by rules:  31% 5/16 [04:13<09:27, 51.59s/it]

Positive rate: 0.01435



Retrieve items by rules:  38% 6/16 [04:45<07:30, 45.01s/it]

Positive rate: 0.00910



Retrieve items by rules:  44% 7/16 [05:42<07:20, 48.90s/it]

Positive rate: 0.00873



Retrieve items by rules:  50% 8/16 [06:54<07:27, 56.00s/it]

TOP31.0 Positive rate: 0.00502



Retrieve items by rules:  56% 9/16 [08:35<08:11, 70.21s/it]

TOP29.5 Positive rate: 0.00505


Retrieve items by rules:  69% 11/16 [12:02<07:20, 88.09s/it]

skip
TOP20.0 Positive rate: 0.00501



Retrieve items by rules:  75% 12/16 [13:38<06:01, 90.43s/it]

TOP17.0 Positive rate: 0.00519



Retrieve items by rules:  81% 13/16 [15:14<04:36, 92.28s/it]

TOP17.0 Positive rate: 0.00515



Retrieve items by rules:  88% 14/16 [17:43<03:38, 109.21s/it]

TOP18.0 Positive rate: 0.00522


Retrieve items by rules: 100% 16/16 [21:52<00:00, 82.03s/it] 

skip





Week 4: [2020-08-26, 2020-09-02)


Retrieve items by rules:   6% 1/16 [00:31<07:54, 31.61s/it]

Positive rate: 0.02406



Retrieve items by rules:  12% 2/16 [01:04<07:35, 32.54s/it]

Positive rate: 0.02268
Positive rate: 0.01182



Retrieve items by rules:  19% 3/16 [02:25<11:49, 54.59s/it]

Positive rate: 0.01123



Retrieve items by rules:  25% 4/16 [03:47<13:03, 65.29s/it]

Positive rate: 0.01353



Retrieve items by rules:  31% 5/16 [04:15<09:32, 52.05s/it]

Positive rate: 0.01306



Retrieve items by rules:  38% 6/16 [04:47<07:31, 45.11s/it]

Positive rate: 0.00876



Retrieve items by rules:  44% 7/16 [05:49<07:35, 50.59s/it]

Positive rate: 0.00836



Retrieve items by rules:  50% 8/16 [07:02<07:42, 57.77s/it]

TOP26.0 Positive rate: 0.00514



Retrieve items by rules:  56% 9/16 [08:39<08:10, 70.04s/it]

TOP27.5 Positive rate: 0.00505


Retrieve items by rules:  69% 11/16 [12:04<07:18, 87.63s/it]

skip
TOP26.0 Positive rate: 0.00503



Retrieve items by rules:  75% 12/16 [13:35<05:54, 88.52s/it]

TOP26.0 Positive rate: 0.00508



Retrieve items by rules:  81% 13/16 [15:10<04:31, 90.47s/it]

TOP19.0 Positive rate: 0.00500



Retrieve items by rules:  88% 14/16 [17:36<03:34, 107.21s/it]

TOP14.0 Positive rate: 0.00502


Retrieve items by rules: 100% 16/16 [21:45<00:00, 81.59s/it] 

skip





Week 5: [2020-08-19, 2020-08-26)


Retrieve items by rules:   6% 1/16 [00:30<07:44, 30.97s/it]

Positive rate: 0.02329



Retrieve items by rules:  12% 2/16 [01:03<07:25, 31.85s/it]

Positive rate: 0.02211
Positive rate: 0.01187



Retrieve items by rules:  19% 3/16 [02:20<11:24, 52.63s/it]

Positive rate: 0.01101



Retrieve items by rules:  25% 4/16 [03:39<12:35, 62.95s/it]

Positive rate: 0.01325



Retrieve items by rules:  31% 5/16 [04:08<09:15, 50.53s/it]

Positive rate: 0.01269



Retrieve items by rules:  38% 6/16 [04:38<07:18, 43.82s/it]

Positive rate: 0.00869



Retrieve items by rules:  44% 7/16 [05:34<07:10, 47.81s/it]

Positive rate: 0.00826



Retrieve items by rules:  50% 8/16 [06:48<07:26, 55.87s/it]

TOP15.5 Positive rate: 0.00506



Retrieve items by rules:  56% 9/16 [07:49<06:44, 57.76s/it]

TOP22.0 Positive rate: 0.00502


Retrieve items by rules:  69% 11/16 [10:42<06:06, 73.26s/it]

skip
TOP6.0 Positive rate: 0.00517



Retrieve items by rules:  75% 12/16 [11:48<04:43, 70.92s/it]

TOP11.0 Positive rate: 0.00502



Retrieve items by rules:  81% 13/16 [13:07<03:40, 73.49s/it]

TOP6.0 Positive rate: 0.00513



Retrieve items by rules:  88% 14/16 [15:14<02:59, 89.78s/it]

TOP10.0 Positive rate: 0.00505


Retrieve items by rules: 100% 16/16 [18:45<00:00, 70.34s/it]

skip





In [15]:
# * use the threshold in week 1 to generate candidates for test data, see the log in the upper cell 
for week in range(1):
    if week == 0 and not TEST:
        continue
    trans = data["inter"]

    start_date, end_date = calc_valid_date(week)
    print(f"Week {week}: [{start_date}, {end_date})")
    
    last_5week_start = pd.to_datetime(start_date) - 5*pd.Timedelta(days=7)
    last_5week_start = last_5week_start.strftime("%Y-%m-%d")
    
    last_week_start = pd.to_datetime(start_date) - pd.Timedelta(days=7)
    last_week_start = last_week_start.strftime("%Y-%m-%d")
    
    last_3day_start = pd.to_datetime(start_date) - pd.Timedelta(days=3)
    last_3day_start = last_3day_start.strftime("%Y-%m-%d")

    train, valid = dh.split_data(trans, start_date, end_date)
    
    last_5week = train.loc[train.t_dat >= last_5week_start]
    last_5week = last_5week.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')

    last_week = train.loc[train.t_dat >= last_week_start]
    last_week = last_week.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')
    
    last_3days = train.loc[train.t_dat >= last_3day_start]
    last_3days = last_3days.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')
    
    train = train.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')
    last_5week = last_5week.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')

    if week != 0:
        customer_list = valid["customer_id"].values
    else:
        customer_list = submission['customer_id'].values

    # * ========================== Retrieval Strategies ==========================

    candidates = RuleCollector().collect(
        valid = valid,
        customer_list=customer_list,
        rules=[
            OrderHistory(train, 3),
            OrderHistory(train, 7),
            OrderHistoryDecay(train, 3, n=50),
            OrderHistoryDecay(train, 7, n=50),
            ItemPair(OrderHistory(train, 3).retrieve(), name='1'),
            ItemPair(OrderHistory(train, 7).retrieve(), name='2'),
            ItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),
            ItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),
            UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=35, name='1'),
            UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=38.5, name='2'),
            UserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=3),
            TimeHistory(customer_list, last_week, n=25, name='1'),
            TimeHistory(customer_list, last_3days, n=30, name='2'),
            TimeHistoryDecay(customer_list, train, 3, n=31),
            TimeHistoryDecay(customer_list, train, 7, n=24),
            SaleTrend(customer_list, train, 7, n=4),
        ],
        filters=[OutOfStock(trans)],
        min_pos_rate=0.005,
        compress=False,
    )
    
    candidates = (
        pd.pivot_table(
            candidates,
            values="score",
            index=["customer_id", "article_id"],
            columns=["method"],
            aggfunc=np.sum,
        )
        .reset_index()
        # .fillna(0)
    )

    candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
    if WEEK_NUM != 0:
        valid.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_label.pqt")

Week 0: [2020-09-23, 2020-09-30)


Retrieve items by rules: 100% 16/16 [14:26<00:00, 54.15s/it]


In [16]:
del train, valid, last_week, customer_list, candidates
gc.collect()

2361

## Feature engineering


In [17]:
user = data["user"]
item = data["item"]
inter = data["inter"]

In [18]:
# merge `product_code`
inter = inter.merge(item[["article_id", "product_code"]], on="article_id", how="left")
# calculate week number
inter['week'] = (pd.to_datetime('2020-09-29') - pd.to_datetime(inter['t_dat'])).dt.days // 7

In [19]:
inter = inter.sort_values(["customer_id", "t_dat"]).reset_index(drop=True)

In [20]:
inter.shape

(31788324, 7)

Week Sale

In [19]:
inter["item_sale"] = week_sale(inter, ["article_id"])
inter["pro_sale"] = week_sale(inter, ["product_code"])
inter["item_sale_uni"] = week_sale(inter, ["article_id"], True)
inter["pro_sale_uni"] = week_sale(inter, ["product_code"], True)

inter["item_sale_ratio"] = inter["item_sale"] / (inter["pro_sale"] + 1e-6)
inter["item_sale_uni_ratio"] = inter["pro_sale_uni"] / (inter["pro_sale_uni"] + 1e-6)

item_feats = [
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
]
inter = inter.merge(item[["article_id", *item_feats]], on="article_id", how="left")

for feat in tqdm(item_feats):
    inter[f"{feat}_sale"] = week_sale(inter, [feat], f"{feat}_sale")

inter = inter.drop(columns=item_feats)

100% 6/6 [01:03<00:00, 10.52s/it]


Repurchase Ratio

In [20]:
inter['i_repurchase_ratio'] = repurchase_ratio(inter, ['article_id'])
inter['p_repurchase_ratio'] = repurchase_ratio(inter, ['product_code'])

Already Bought Item

In [21]:
inter["purchased_item"] = purchased_before(inter, ["article_id"])
inter["purchased_pro"] = purchased_before(inter, ["product_code"])

Save data

In [22]:
inter.shape

(31788324, 23)

In [23]:
inter.to_parquet(data_dir / "interim/processed_inter.pqt")

## Merge Features


In [90]:
candidates = {}
for i in tqdm(range(WEEK_NUM)):
    if i == 0 and not TEST:
        continue
        
    tmp_candidate = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate.pqt")
    candidates[i] = tmp_candidate

100% 6/6 [00:19<00:00,  3.30s/it]


In [91]:
inter = pd.read_parquet(data_dir / "interim/processed_inter.pqt")
data["inter"] = inter

Merge features

In [None]:
for i in tqdm(range(WEEK_NUM)):
    if i in candidates:
        candidates[i] = merge_week_data(data, i, candidates[i])

 50% 3/6 [03:05<03:03, 61.18s/it]

Merge user and item embeddings

In [None]:
#* embedding from DSSM model
dssm_user_embd = np.load(data_dir / "external/dssm_user_embd.npy", allow_pickle=True)
dssm_item_embd = np.load(data_dir / "external/dssm_item_embd.npy", allow_pickle=True)

for i in range(WEEK_NUM):
    if i not in candidates:
        continue
    tmp = candidates[i]
    sim = np.zeros(tmp.shape[0])
    batch_size = 10000
    for batch in tqdm(range(0, tmp.shape[0], batch_size)):
        tmp_users = tmp.loc[batch : batch + batch_size - 1, 'customer_id'].values-1
        tmp_items = tmp.loc[batch : batch + batch_size - 1, 'article_id'].values-1
        tmp_user_embd = np.expand_dims(dssm_user_embd[tmp_users],1) # (batch_size, 1, dim)
        tmp_item_embd = np.expand_dims(dssm_item_embd[tmp_items],2) # (batch_size, dim, 1)
        tmp_sim = np.einsum('ijk,ikj->ij',tmp_user_embd,tmp_item_embd)
        sim[batch : batch + batch_size] = tmp_sim.reshape(-1)
        
    tmp["dssm_similarity"] = sim
    candidates[i] = tmp

del dssm_user_embd, dssm_item_embd
gc.collect()

In [None]:
# * embedding from YouTubeDNN model
yt_user_embd = np.load(data_dir / "external/yt_user_embd.npy", allow_pickle=True)
yt_item_embd = np.load(data_dir / "external/yt_item_embd.npy", allow_pickle=True)

for i in range(WEEK_NUM):
    if i not in candidates:
        continue
    tmp = candidates[i]
    sim = np.zeros(tmp.shape[0])
    batch_size = 10000
    for batch in tqdm(range(0, tmp.shape[0], batch_size)):
        tmp_users = tmp.loc[batch : batch + batch_size - 1, 'customer_id'].values-1
        tmp_items = tmp.loc[batch : batch + batch_size - 1, 'article_id'].values-1
        tmp_user_embd = np.expand_dims(yt_user_embd[tmp_users],1) # (batch_size, 1, dim)
        tmp_item_embd = np.expand_dims(yt_item_embd[tmp_items],2) # (batch_size, dim, 1)
        tmp_sim = np.einsum('ijk,ikj->ij',tmp_user_embd,tmp_item_embd)
        sim[batch : batch + batch_size] = tmp_sim.reshape(-1)
        
    tmp["yt_similarity"] = sim
    # tmp["yt_similarity_cos"] = cos_sim
    candidates[i] = tmp

del yt_user_embd, yt_item_embd
gc.collect()

Save results

In [None]:
for i in tqdm(candidates.keys()):
    candidates[i].to_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate_full.pqt")

## Ranking


In [None]:
candidates = {}
labels = {}
for i in tqdm(range(WEEK_NUM)):
    if i==0 and not TEST:
        continue
    candidates[i] = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate_full.pqt")
    if i != 0:
        tmp_label = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_label.pqt")
        labels[i] = tmp_label
    else:
        labels[i] = None

In [None]:
feats = [
    x
    for x in candidates[1].columns
    if x
    not in [
        "label",
        "sales_channel_id",
        "t_dat",
        "week",
        "WeekSaleTrend_item",
        "WeekSaleTrend_pro",
        "ThreeDaySaleTrend_item",
        "ThreeDaySaleTrend_pro",
    ]
]
cat_features = [
    "customer_id",
    "article_id",
    "product_code",
    "FN",
    "Active",
    "club_member_status",
    "fashion_news_frequency",
    "age",
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
]


In [None]:
# * convert categorical featues as `CategoricalDtype`
cate_dict = {f:set() for f in cat_features}
for i in tqdm(range(WEEK_NUM)):
    if i==0 and not TEST:
        continue
    for feat in cat_features:
        cate_dict[feat] = cate_dict[feat] | set(candidates[i][feat].unique())
        
for feat in cat_features:
    cate_dict[feat] = CategoricalDtype(categories=cate_dict[feat])

for i in tqdm(range(WEEK_NUM)):
    if i == 0 and not TEST:
        continue
    for feat in cat_features:
        candidates[i][feat] = candidates[i][feat].astype(cate_dict[feat])

### Train


In [None]:
params = {
    "objective": "lambdarank",
    "boosting_type": "gbdt",
    "metric": "map",
    "max_depth": 8,
    "num_leaves": 128,
    "learning_rate": 0.03,

    "verbose": -1,
    "eval_at": 12,
    # 'device':'gpu'
}

In [None]:
full_data = pd.concat(candidates[i] for i in range(1, WEEK_NUM))

In [None]:
train = full_data[full_data["week"] != 1]
valid = full_data[full_data["week"] == 1]
del full_data

In [None]:
# # * Remove customers with 0 hit_rate
# null_candidates = None
# for i in tqdm(range(WEEK_NUM)):
#     if i==0:
#         continue
#     tmp = pd.read_parquet(data_dir/"interim"/"v1"/f"week{i}_candidate.pqt")
#     tmp = tmp.groupby("customer_id")['hit_rate'].sum().reset_index()
#     tmp = tmp.loc[tmp['hit_rate']==0, ['customer_id']]
#     tmp['week'] = i
#     null_candidates = pd.concat([null_candidates, tmp], ignore_index=True)

# null_candidates['remove'] = 1
# train = pd.merge(train, null_candidates, on=['customer_id','week'], how='left')
# train = train[train['remove']!=1]
# del train['remove']
# train['customer_id'] = train['customer_id'].astype(cate_dict['customer_id'])

In [None]:
train = train.sort_values(by=['week', 'customer_id'],ascending=True).reset_index(drop=True)

In [None]:
sum(train['label']==0) / len(train)
# 0.9946384702188372 4-week
# 0.994541076811467 4-week remove all-negative customer

In [None]:
# valid = candidates[1]
valid = valid.sort_values(by=['customer_id'],ascending=True).reset_index(drop=True)

In [None]:
train_group = train[["customer_id", "article_id", "week"]]
train_group = train_group.astype("int")
train_group = train_group.groupby(["week","customer_id"])['article_id'].count().values

valid_group = valid[["customer_id", "article_id"]]
valid_group = valid_group.astype("int")
valid_group = valid_group.groupby(["customer_id"])['article_id'].count().values

In [None]:
train_set = lgb.Dataset(
    data=train[feats],
    label=train["label"],
    group=train_group,
    feature_name=feats,
    categorical_feature=cat_features,
    params=params,
)

valid_set = lgb.Dataset(
    data=valid[feats],
    label=valid["label"],
    group=valid_group,
    feature_name=feats,
    categorical_feature=cat_features,
    params=params,
)

ranker = lgb.train(
    params,
    train_set,
    num_boost_round=300,
    valid_sets=[valid_set],
    early_stopping_rounds=30,
    verbose_eval=10
)
# 0.82846 4-week
# 0.828932 3-week

In [None]:
ranker.save_model(data_dir / "interim/lgb_ranker.model", num_iteration=ranker.best_iteration)

### Inference

In [None]:
ranker = lgb.Booster(model_file=data_dir / "interim/lgb_ranker.model")

In [None]:
feat_importance = pd.DataFrame(
    {"feature": feats, "importance": ranker.feature_importance()}
).sort_values(by="importance", ascending=False)
plt.figure(figsize=(8, 12))
sns.barplot(y="feature", x="importance", data=feat_importance)

### Validate

In [None]:
val_candidates = valid.reset_index(drop=True)

In [None]:
probs = np.zeros(val_candidates.shape[0])
batch_size = 5_000_000
for batch in tqdm(range(0, val_candidates.shape[0], batch_size)):
    outputs = ranker.predict(val_candidates.loc[batch : batch + batch_size - 1, feats])
    probs[batch : batch + batch_size] = outputs

In [None]:
val_candidates["prob"] = probs

In [None]:
pred_lgb = val_candidates[['customer_id','article_id','prob']]
pred_lgb = pred_lgb.sort_values(by=["customer_id","prob"], ascending=False).reset_index(drop=True)
pred_lgb.rename(columns={'article_id':'prediction'}, inplace=True)

In [None]:
pred_lgb = pred_lgb.drop_duplicates(['customer_id', 'prediction'], keep='first')

In [None]:
pred_lgb = pred_lgb.groupby("customer_id")["prediction"].progress_apply(list).reset_index()

In [None]:
label = labels[1]
label = pd.merge(label, pred_lgb, on="customer_id", how="left")

In [None]:
map_at_k(label["article_id"], label["prediction"], k=12)

# 0.02820525160663368 1-week add normalization
# 0.028936597823123886 4-week 0.0274
# 0.029111989281461418 4-week drop full-negative customer
# 0.029165419468984943 3-week 0.0270

### Test

In [35]:
test_candidates = candidates[0].reset_index(drop=True)

In [36]:
# As the threshold is set mannually when generating candidates for test set, we need to change the name of the columns
test_candidates.rename(columns = {
    'SaleTrend_7_top4':'SaleTrend_7_top50', 
    'TimeHistoryDecay_3_top31':'TimeHistoryDecay_3_top50',
    'TimeHistoryDecay_7_top24':'TimeHistoryDecay_7_top50',
    'TimeHistory_25_1':'TimeHistory_50_1', 
    'TimeHistory_30_2':'TimeHistory_50_2', 
    'UGSaleTrend_3_top2':'UGSaleTrend_7_top50', 
    'UGTimeHistory_age_bins_351':'UGTimeHistory_age_bins_501',
    'UGTimeHistory_age_bins_38.52':'UGTimeHistory_age_bins_502'
},inplace=True)

In [37]:
probs = np.zeros(test_candidates.shape[0])
batch_size = 5_000_000
for batch in tqdm(range(0, test_candidates.shape[0], batch_size)):
    outputs = ranker.predict(test_candidates.loc[batch : batch + batch_size - 1, feats])
    probs[batch : batch + batch_size] = outputs

100% 12/12 [05:23<00:00, 26.98s/it]


In [38]:
test_candidates["prob"] = probs

In [39]:
pred_lgb = test_candidates[['customer_id','article_id','prob']]
pred_lgb = pred_lgb.sort_values(by=["customer_id","prob"], ascending=False).reset_index(drop=True)
pred_lgb.rename(columns={'article_id':'prediction'}, inplace=True)

In [40]:
idx2uid = pickle.load(open(data_dir/"index_id_map/user_index2id.pkl", "rb"))
idx2iid = pickle.load(open(data_dir/"index_id_map/item_index2id.pkl", "rb"))

In [41]:
pred_lgb['prediction'] = pred_lgb['prediction'].map(idx2iid).progress_apply(lambda x:'0'+str(x))
pred_lgb = pred_lgb.groupby('customer_id')['prediction'].progress_apply(list).reset_index()
pred_lgb['prediction'] = pred_lgb['prediction'].progress_apply(lambda x: ' '.join(x[:12]))

  0% 77317/55310622 [00:00<04:13, 217935.74it/s]
100% 1371980/1371980 [02:44<00:00, 8339.20it/s]
100% 1371980/1371980 [00:03<00:00, 391006.78it/s]


In [42]:
uid2idx = pickle.load(open(data_dir/"index_id_map/user_id2index.pkl", "rb"))
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

In [43]:
del submission['prediction']
submission = submission.merge(pred_lgb, on='customer_id', how='left')
submission['customer_id'] = submission['customer_id'].map(idx2uid)

In [44]:
submission.to_csv('submission.csv', index=False)

In [45]:
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0779781015 0915529003 0918522001 0751471043 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0918292001 0918522001 0915529003 0448509014 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0805000001 0918292001 0918522001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0751471043 0918522001 0915529003 0751471001 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152001 0927530006 0791587001 0852584001 07...


### Submit

In [46]:
submit_msg = """
0.029111989281461418 4-week drop full negative user
---
OrderHistory(train, 3),
OrderHistory(train, 7),
OrderHistoryDecay(train, 3, n=50),
OrderHistoryDecay(train, 7, n=50),
ItemPair(OrderHistory(train, 3).retrieve(), name='1'),
ItemPair(OrderHistory(train, 7).retrieve(), name='2'),
ItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),
ItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),
UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),
UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),
UserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),
TimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),
TimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),
TimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),
TimeHistoryDecay(customer_list, train, 7, n=50 if week!=0 else 8),
SaleTrend(customer_list, train, 7, n=50 if week!=0 else 2)
---
min_pos_rate = 0.006
pivot
"""

In [47]:
submit_msg

"\n0.029111989281461418 4-week drop full negative user\n---\nOrderHistory(train, 3),\nOrderHistory(train, 7),\nOrderHistoryDecay(train, 3, n=50),\nOrderHistoryDecay(train, 7, n=50),\nItemPair(OrderHistory(train, 3).retrieve(), name='1'),\nItemPair(OrderHistory(train, 7).retrieve(), name='2'),\nItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),\nItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),\nUserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),\nUserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),\nUserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),\nTimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),\nTimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),\nTimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),\nTimeHistoryDecay(customer_list, train, 7, n=50 if w

In [60]:
# ! mkdir ~/.kaggle
# ! cp ../kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json

In [65]:
# %pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 3.9 MB/s eta 0:00:011
Collecting python-slugify
  Downloading python_slugify-6.1.2-py2.py3-none-any.whl (9.4 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 5.7 MB/s eta 0:00:011
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73052 sha256=2aaa2ba8e16413cb4c49862167277c1d8f29337d273f3f9b02c7f2a0d2b3a357
  Stored in directory: /root/.cache/pip/wheels/29/da/11/144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106
Successfully built kaggle
Installing collected packages: text-unidecode, python-slugify, kaggle
Successfully installed kaggle-1.5.12 python-slugify-6.1.2 text-unidecode-1.3
You should consider upgrading via the '/usr/bin/python3 -m pip instal

In [48]:
! kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f ./submission.csv -m '\n0.029111989281461418 4-week drop full negative user\n---\nOrderHistory(train, 3),\nOrderHistory(train, 7),\nOrderHistoryDecay(train, 3, n=50),\nOrderHistoryDecay(train, 7, n=50),\nItemPair(OrderHistory(train, 3).retrieve(), name='1'),\nItemPair(OrderHistory(train, 7).retrieve(), name='2'),\nItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),\nItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),\nUserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),\nUserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),\nUserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),\nTimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),\nTimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),\nTimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),\nTimeHistoryDecay(customer_list, train, 7, n=50 if week!=0 else 8),\nSaleTrend(customer_list, train, 7, n=50 if week!=0 else 2)\n---\nmin_pos_rate = 0.006\npivot\n'

100%|████████████████████████████████████████| 258M/258M [00:11<00:00, 22.7MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations