In [1]:
# %pip install -U lightgbm==3.3.2

In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

In [2]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

warnings.filterwarnings("ignore")
sys.path.append("../src/")
InteractiveShell.ast_node_interactivity = "all"
tqdm.pandas()

In [3]:
from data import DataHelper
from data.metrics import map_at_k, hr_at_k, recall_at_k

from retrieval.rules import (
    OrderHistory,
    OrderHistoryDecay,
    ItemPair,

    UserGroupTimeHistory,
    UserGroupSaleTrend,

    TimeHistory,
    TimeHistoryDecay,
    SaleTrend,

    OutOfStock,
)
from retrieval.collector import RuleCollector

from features import cum_sale, week_sale, repurchase_ratio, purchased_before

from utils import calc_valid_date, merge_week_data


In [4]:
data_dir = Path("../data/")

In [67]:
TRAIN_WEEK_NUM = 4
WEEK_NUM = TRAIN_WEEK_NUM + 2
VERSION_NAME = "pivot"# "v1"

In [6]:
import os
if not os.path.exists(data_dir/"interim"/VERSION_NAME):
    os.mkdir(data_dir/"interim"/VERSION_NAME)

In [7]:
TEST = True # * Set as `False` when do local experiments

Pepare data: encoding ids and preprocessing

In [8]:
dh = DataHelper(data_dir)

In [9]:
# data = dh.preprocess_data(save=True, name="encoded_full") # * run only once

In [10]:
data = dh.load_data(name="encoded_full")

In [11]:
uid2idx = pickle.load(open(data_dir/"index_id_map/user_id2index.pkl", "rb"))
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

## Retrieval


Generate candidates for each week

In [12]:
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
data['user']['age_bins'] = pd.cut(data['user']['age'], listBin)

In [13]:
# * WEEK_NUM = 0: test
# * WEEK_NUM = 1: valid
# * WEEK_NUM > 1: train
for week in range(WEEK_NUM):
    if week == 0 and not TEST:
        continue
    trans = data["inter"]

    start_date, end_date = calc_valid_date(week)
    print(f"Week {week}: [{start_date}, {end_date})")
    
    last_week_start = pd.to_datetime(start_date) - pd.Timedelta(days=7)
    last_week_start = last_week_start.strftime("%Y-%m-%d")
    
    last_3day_start = pd.to_datetime(start_date) - pd.Timedelta(days=3)
    last_3day_start = last_3day_start.strftime("%Y-%m-%d")

    train, valid = dh.split_data(trans, start_date, end_date)

    last_week = train.loc[train.t_dat >= last_week_start]
    last_week = last_week.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')
    
    last_3days = train.loc[train.t_dat >= last_3day_start]
    last_3days = last_3days.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')
    
    train = train.merge(data['user'][['customer_id','age_bins']], on='customer_id', how='left')

    if week != 0:
        customer_list = valid["customer_id"].values
    else:
        customer_list = submission['customer_id'].values

    # * ========================== Retrieval Strategies ==========================

    candidates = RuleCollector().collect(
        valid = valid,
        customer_list=customer_list,
        rules=[
            OrderHistory(train, 3),
            OrderHistory(train, 7),
            OrderHistoryDecay(train, 3, n=50),
            OrderHistoryDecay(train, 7, n=50),
            ItemPair(OrderHistory(train, 3).retrieve(), name='1'),
            ItemPair(OrderHistory(train, 7).retrieve(), name='2'),
            ItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),
            ItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),
            UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),
            UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),
            UserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),
            TimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),
            TimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),
            TimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),
            TimeHistoryDecay(customer_list, train, 7, n=50 if week!=0 else 8),
            SaleTrend(customer_list, train, 7, n=50 if week!=0 else 2),
        ],
        filters=[OutOfStock(trans)],
        min_pos_rate=0.006,
        compress=False,
    )
    
#     candidates = (
#         pd.pivot_table(
#             candidates,
#             values="score",
#             index=["customer_id", "article_id"],
#             columns=["method"],
#             aggfunc=np.sum,
#         )
#         .reset_index()
#         # .fillna(0)
#     )

    candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
    if WEEK_NUM != 0:
        valid.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_label.pqt")

Week 0: [2020-09-23, 2020-09-30)


Retrieve items by rules: 100% 16/16 [09:56<00:00, 37.27s/it]


Week 1: [2020-09-16, 2020-09-23)


Retrieve items by rules:   6% 1/16 [00:30<07:44, 30.99s/it]

Positive rate: 0.03038



Retrieve items by rules:  12% 2/16 [01:03<07:28, 32.03s/it]

Positive rate: 0.02859
Positive rate: 0.01413



Retrieve items by rules:  19% 3/16 [02:23<11:36, 53.61s/it]

Positive rate: 0.01295



Retrieve items by rules:  25% 4/16 [03:44<12:55, 64.60s/it]

Positive rate: 0.01519



Retrieve items by rules:  31% 5/16 [04:11<09:22, 51.18s/it]

Positive rate: 0.01472



Retrieve items by rules:  38% 6/16 [04:43<07:24, 44.45s/it]

Positive rate: 0.00908



Retrieve items by rules:  44% 7/16 [05:30<06:47, 45.29s/it]

Positive rate: 0.00853



Retrieve items by rules:  50% 8/16 [06:28<06:35, 49.38s/it]

TOP15.0 Positive rate: 0.00602



Retrieve items by rules:  56% 9/16 [07:59<07:15, 62.27s/it]

TOP20.5 Positive rate: 0.00608



Retrieve items by rules:  62% 10/16 [09:27<07:01, 70.29s/it]

TOP2.0 Positive rate: 0.00652



Retrieve items by rules:  69% 11/16 [11:09<06:39, 79.94s/it]

TOP9.0 Positive rate: 0.00631



Retrieve items by rules:  75% 12/16 [12:38<05:31, 82.92s/it]

TOP16.0 Positive rate: 0.00601



Retrieve items by rules:  81% 13/16 [14:11<04:17, 85.80s/it]

TOP12.0 Positive rate: 0.00607



Retrieve items by rules:  88% 14/16 [16:30<03:23, 102.00s/it]

TOP8.0 Positive rate: 0.00616



Retrieve items by rules:  94% 15/16 [18:47<01:52, 112.63s/it]

TOP2.0 Positive rate: 0.00758


Retrieve items by rules: 100% 16/16 [20:23<00:00, 76.48s/it] 


Week 2: [2020-09-09, 2020-09-16)


Retrieve items by rules:   6% 1/16 [00:31<07:45, 31.02s/it]

Positive rate: 0.03008



Retrieve items by rules:  12% 2/16 [01:03<07:26, 31.89s/it]

Positive rate: 0.02825
Positive rate: 0.01354



Retrieve items by rules:  19% 3/16 [02:23<11:41, 53.99s/it]

Positive rate: 0.01244



Retrieve items by rules:  25% 4/16 [03:46<13:05, 65.45s/it]

Positive rate: 0.01435



Retrieve items by rules:  31% 5/16 [04:14<09:31, 51.91s/it]

Positive rate: 0.01376



Retrieve items by rules:  38% 6/16 [04:46<07:29, 44.95s/it]

Positive rate: 0.00886



Retrieve items by rules:  44% 7/16 [05:36<07:00, 46.75s/it]

Positive rate: 0.00830



Retrieve items by rules:  50% 8/16 [06:41<06:59, 52.41s/it]

TOP10.0 Positive rate: 0.00610



Retrieve items by rules:  56% 9/16 [08:16<07:40, 65.78s/it]

TOP7.0 Positive rate: 0.00611



Retrieve items by rules:  62% 10/16 [09:51<07:29, 74.86s/it]

TOP13.0 Positive rate: 0.00236



Retrieve items by rules:  69% 11/16 [11:37<07:02, 84.47s/it]

TOP6.0 Positive rate: 0.00595



Retrieve items by rules:  75% 12/16 [13:12<05:50, 87.63s/it]

TOP5.0 Positive rate: 0.00589



Retrieve items by rules:  81% 13/16 [14:46<04:28, 89.54s/it]

TOP6.0 Positive rate: 0.00606



Retrieve items by rules:  88% 14/16 [17:10<03:31, 105.91s/it]

TOP7.0 Positive rate: 0.00603



Retrieve items by rules:  94% 15/16 [19:31<01:56, 116.66s/it]

TOP12.0 Positive rate: 0.00226


Retrieve items by rules: 100% 16/16 [21:12<00:00, 79.53s/it] 


Week 3: [2020-09-02, 2020-09-09)


Retrieve items by rules:   6% 1/16 [00:31<07:53, 31.60s/it]

Positive rate: 0.02822



Retrieve items by rules:  12% 2/16 [01:04<07:33, 32.43s/it]

Positive rate: 0.02679
Positive rate: 0.01321



Retrieve items by rules:  19% 3/16 [02:26<11:53, 54.90s/it]

Positive rate: 0.01237



Retrieve items by rules:  25% 4/16 [03:48<13:10, 65.85s/it]

Positive rate: 0.01490



Retrieve items by rules:  31% 5/16 [04:17<09:34, 52.25s/it]

Positive rate: 0.01435



Retrieve items by rules:  38% 6/16 [04:47<07:29, 44.97s/it]

Positive rate: 0.00910



Retrieve items by rules:  44% 7/16 [05:44<07:19, 48.85s/it]

Positive rate: 0.00873



Retrieve items by rules:  50% 8/16 [06:54<07:24, 55.51s/it]

TOP16.0 Positive rate: 0.00601



Retrieve items by rules:  56% 9/16 [08:37<08:13, 70.43s/it]

TOP10.5 Positive rate: 0.00612



Retrieve items by rules:  62% 10/16 [10:19<07:59, 79.98s/it]

TOP1.0 Positive rate: 0.00443



Retrieve items by rules:  69% 11/16 [12:08<07:25, 89.14s/it]

TOP11.0 Positive rate: 0.00619



Retrieve items by rules:  75% 12/16 [13:46<06:07, 91.78s/it]

TOP9.0 Positive rate: 0.00619



Retrieve items by rules:  81% 13/16 [15:26<04:42, 94.31s/it]

TOP10.0 Positive rate: 0.00601



Retrieve items by rules:  88% 14/16 [17:57<03:42, 111.18s/it]

TOP9.0 Positive rate: 0.00622



Retrieve items by rules:  94% 15/16 [20:25<02:02, 122.30s/it]

TOP3.0 Positive rate: 0.00457


Retrieve items by rules: 100% 16/16 [22:13<00:00, 83.36s/it] 


Week 4: [2020-08-26, 2020-09-02)


Retrieve items by rules:   6% 1/16 [00:30<07:42, 30.80s/it]

Positive rate: 0.02406



Retrieve items by rules:  12% 2/16 [01:02<07:19, 31.42s/it]

Positive rate: 0.02268
Positive rate: 0.01182



Retrieve items by rules:  19% 3/16 [02:23<11:41, 53.93s/it]

Positive rate: 0.01123



Retrieve items by rules:  25% 4/16 [03:46<13:05, 65.49s/it]

Positive rate: 0.01353



Retrieve items by rules:  31% 5/16 [04:15<09:34, 52.25s/it]

Positive rate: 0.01306



Retrieve items by rules:  38% 6/16 [04:46<07:31, 45.20s/it]

Positive rate: 0.00876



Retrieve items by rules:  44% 7/16 [05:49<07:38, 50.97s/it]

Positive rate: 0.00836



Retrieve items by rules:  50% 8/16 [07:06<07:52, 59.04s/it]

TOP13.0 Positive rate: 0.00603



Retrieve items by rules:  56% 9/16 [08:49<08:30, 72.99s/it]

TOP14.5 Positive rate: 0.00606



Retrieve items by rules:  62% 10/16 [10:32<08:13, 82.17s/it]

TOP43.0 Positive rate: 0.00349



Retrieve items by rules:  69% 11/16 [12:30<07:45, 93.15s/it]

TOP5.0 Positive rate: 0.00631



Retrieve items by rules:  75% 12/16 [14:08<06:17, 94.48s/it]

TOP8.0 Positive rate: 0.00650



Retrieve items by rules:  81% 13/16 [15:49<04:49, 96.62s/it]

TOP7.0 Positive rate: 0.00607



Retrieve items by rules:  88% 14/16 [18:24<03:48, 114.32s/it]

TOP3.0 Positive rate: 0.00660



Retrieve items by rules:  94% 15/16 [20:54<02:04, 124.97s/it]

TOP7.0 Positive rate: 0.00346


Retrieve items by rules: 100% 16/16 [22:47<00:00, 85.46s/it] 


Week 5: [2020-08-19, 2020-08-26)


Retrieve items by rules:   6% 1/16 [00:30<07:42, 30.83s/it]

Positive rate: 0.02329



Retrieve items by rules:  12% 2/16 [01:02<07:19, 31.38s/it]

Positive rate: 0.02211
Positive rate: 0.01187



Retrieve items by rules:  19% 3/16 [02:19<11:16, 52.08s/it]

Positive rate: 0.01101



Retrieve items by rules:  25% 4/16 [03:37<12:28, 62.37s/it]

Positive rate: 0.01325



Retrieve items by rules:  31% 5/16 [04:04<09:05, 49.62s/it]

Positive rate: 0.01269



Retrieve items by rules:  38% 6/16 [04:35<07:12, 43.25s/it]

Positive rate: 0.00869



Retrieve items by rules:  44% 7/16 [05:30<07:05, 47.24s/it]

Positive rate: 0.00826



Retrieve items by rules:  50% 8/16 [06:43<07:22, 55.32s/it]

TOP2.5 Positive rate: 0.00600



Retrieve items by rules:  56% 9/16 [07:47<06:46, 58.05s/it]

TOP8.0 Positive rate: 0.00605



Retrieve items by rules:  62% 10/16 [09:09<06:32, 65.40s/it]

TOP1.0 Positive rate: 0.00443



Retrieve items by rules:  69% 11/16 [10:44<06:12, 74.43s/it]

TOP2.0 Positive rate: 0.00657



Retrieve items by rules:  75% 12/16 [11:51<04:49, 72.29s/it]

TOP4.0 Positive rate: 0.00580



Retrieve items by rules:  81% 13/16 [13:13<03:45, 75.19s/it]

TOP1.0 Positive rate: 0.00644



Retrieve items by rules:  88% 14/16 [15:22<03:03, 91.60s/it]

TOP7.0 Positive rate: 0.00569



Retrieve items by rules:  94% 15/16 [17:20<01:39, 99.30s/it]

TOP9.0 Positive rate: 0.00369


Retrieve items by rules: 100% 16/16 [18:57<00:00, 71.10s/it]


In [16]:
for i in tqdm(range(WEEK_NUM)):
    if i == 0 and not TEST:
        continue

    tmp_candidate = pd.read_parquet(data_dir/"interim"/"v1"/f"week{i}_candidate.pqt")
    print(tmp_candidate.shape, end='\t')
    tmp_candidate = (
        pd.pivot_table(
            tmp_candidate,
            values="score",
            index=["customer_id", "article_id"],
            columns=["method"],
            aggfunc=np.sum,
        )
        .reset_index()
        # .fillna(0)
    )
    print(tmp_candidate.shape)
    
    tmp_candidate.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate.pqt")



  0% 0/6 [00:00<?, ?it/s]

(142403016, 5)	(55310622, 18)



 17% 1/6 [05:16<26:20, 316.12s/it]

(32932743, 5)	(13987231, 18)



 33% 2/6 [06:29<11:32, 173.11s/it]

(31893823, 5)	(14341918, 18)



 50% 3/6 [07:39<06:18, 126.28s/it]

(32084589, 5)	(13701711, 18)



 67% 4/6 [08:51<03:29, 104.69s/it]

(33840059, 5)	(16144422, 18)



 83% 5/6 [10:06<01:34, 94.01s/it] 

(26593215, 5)	(12332815, 18)


100% 6/6 [11:05<00:00, 110.84s/it]


In [14]:
del train, valid, last_week, customer_list, candidates
gc.collect()

2047

## Feature engineering


In [15]:
user = data["user"]
item = data["item"]
inter = data["inter"]

In [16]:
# merge `product_code`
inter = inter.merge(item[["article_id", "product_code"]], on="article_id", how="left")
# calculate week number
inter['week'] = (pd.to_datetime('2020-09-29') - pd.to_datetime(inter['t_dat'])).dt.days // 7

In [17]:
inter = inter.sort_values(["customer_id", "t_dat"]).reset_index(drop=True)

In [18]:
inter.shape

(31788324, 7)

Week Sale

In [19]:
inter["item_sale"] = week_sale(inter, ["article_id"])
inter["pro_sale"] = week_sale(inter, ["product_code"])
inter["item_sale_uni"] = week_sale(inter, ["article_id"], True)
inter["pro_sale_uni"] = week_sale(inter, ["product_code"], True)

inter["item_sale_ratio"] = inter["item_sale"] / (inter["pro_sale"] + 1e-6)
inter["item_sale_uni_ratio"] = inter["pro_sale_uni"] / (inter["pro_sale_uni"] + 1e-6)

item_feats = [
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
]
inter = inter.merge(item[["article_id", *item_feats]], on="article_id", how="left")

for feat in tqdm(item_feats):
    inter[f"{feat}_sale"] = week_sale(inter, [feat], f"{feat}_sale")

inter = inter.drop(columns=item_feats)

100% 6/6 [01:03<00:00, 10.52s/it]


Repurchase Ratio

In [20]:
inter['i_repurchase_ratio'] = repurchase_ratio(inter, ['article_id'])
inter['p_repurchase_ratio'] = repurchase_ratio(inter, ['product_code'])

Already Bought Item

In [21]:
inter["purchased_item"] = purchased_before(inter, ["article_id"])
inter["purchased_pro"] = purchased_before(inter, ["product_code"])

Save data

In [22]:
inter.shape

(31788324, 23)

In [23]:
inter.to_parquet(data_dir / "interim/processed_inter.pqt")

## Merge Features


In [68]:
candidates = {}
labels = {}
for i in tqdm(range(WEEK_NUM)):
    if i == 0 and not TEST:
        continue

    tmp_candidate = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate.pqt")
    candidates[i] = tmp_candidate
    if i == 0:
        labels[i] = None
    else:
        tmp_label = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_label.pqt")
        labels[i] = tmp_label

 60% 3/5 [00:23<00:15,  7.98s/it]


KeyboardInterrupt: 

In [None]:
inter = pd.read_parquet(data_dir / "interim/processed_inter.pqt")
data["inter"] = inter

Merge features

In [None]:
for i in range(WEEK_NUM):
    if i in candidates:
        candidates[i] = merge_week_data(data, i, candidates[i], labels[i])

Merge user and item embeddings

In [None]:
dssm_user_embd = np.load(data_dir / "external/dssm_user_embd.npy", allow_pickle=True)
dssm_item_embd = np.load(data_dir / "external/dssm_item_embd.npy", allow_pickle=True)

for i in range(WEEK_NUM):
    if i not in candidates:
        continue
    tmp = candidates[i]
    sim = np.zeros(tmp.shape[0])
    batch_size = 10000
    for batch in tqdm(range(0, tmp.shape[0], batch_size)):
        tmp_users = tmp.loc[batch : batch + batch_size - 1, 'customer_id'].values-1
        tmp_items = tmp.loc[batch : batch + batch_size - 1, 'article_id'].values-1
        tmp_user_embd = np.expand_dims(dssm_user_embd[tmp_users],1) # (batch_size, 1, dim)
        tmp_item_embd = np.expand_dims(dssm_item_embd[tmp_items],2) # (batch_size, dim, 1)
        tmp_sim = np.einsum('ijk,ikj->ij',tmp_user_embd,tmp_item_embd)
        sim[batch : batch + batch_size] = tmp_sim.reshape(-1)
        
    tmp["dssm_similarity"] = sim
    candidates[i] = tmp

del dssm_user_embd, dssm_item_embd
gc.collect()

In [None]:
yt_user_embd = np.load(data_dir / "external/yt_user_embd.npy", allow_pickle=True)
yt_item_embd = np.load(data_dir / "external/yt_item_embd.npy", allow_pickle=True)

for i in range(WEEK_NUM):
    if i not in candidates:
        continue
    tmp = candidates[i]
    sim = np.zeros(tmp.shape[0])
    batch_size = 10000
    for batch in tqdm(range(0, tmp.shape[0], batch_size)):
        tmp_users = tmp.loc[batch : batch + batch_size - 1, 'customer_id'].values-1
        tmp_items = tmp.loc[batch : batch + batch_size - 1, 'article_id'].values-1
        tmp_user_embd = np.expand_dims(yt_user_embd[tmp_users],1) # (batch_size, 1, dim)
        tmp_item_embd = np.expand_dims(yt_item_embd[tmp_items],2) # (batch_size, dim, 1)
        tmp_sim = np.einsum('ijk,ikj->ij',tmp_user_embd,tmp_item_embd)
        sim[batch : batch + batch_size] = tmp_sim.reshape(-1)
        
    tmp["yt_similarity"] = sim
    # tmp["yt_similarity_cos"] = cos_sim
    candidates[i] = tmp

del yt_user_embd, yt_item_embd
gc.collect()

Save results

In [24]:
for i in tqdm(candidates.keys()):
    candidates[i].to_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate_full.pqt")

100% 6/6 [01:33<00:00, 15.62s/it]


## Ranking


In [69]:
candidates = {}
labels = {}
for i in range(WEEK_NUM):
    if i==0 and not TEST:
        continue
    candidates[i] = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate_full.pqt")
    if i != 0:
        tmp_label = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_label.pqt")
        labels[i] = tmp_label
    else:
        labels[i] = None

In [70]:
feats = [
    x
    for x in candidates[1].columns
    if x
    not in [
        "label",
        "sales_channel_id",
        "t_dat",
        "week",
        "WeekSaleTrend_item",
        "WeekSaleTrend_pro",
        "ThreeDaySaleTrend_item",
        "ThreeDaySaleTrend_pro",
#         "rule_similarity",
#         "hit_rate",
#         "match_score",
        # "yt_similarity",
        # "dssm_similarity",
    ]
]
cat_features = [
#     "method",  # * retrieval method
    "customer_id",
    "article_id",
    "product_code",
    "FN",
    "Active",
    "club_member_status",
    "fashion_news_frequency",
    "age",
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
]


In [71]:
# * convert categorical featues as `CategoricalDtype`
cate_dict = {f:set() for f in cat_features}
for i in tqdm(range(WEEK_NUM)):
    if i==0 and not TEST:
        continue
    for feat in cat_features:
        cate_dict[feat] = cate_dict[feat] | set(candidates[i][feat].unique())
        
for feat in cat_features:
    cate_dict[feat] = CategoricalDtype(categories=cate_dict[feat])

for i in tqdm(range(WEEK_NUM)):
    if i == 0 and not TEST:
        continue
    for feat in cat_features:
        candidates[i][feat] = candidates[i][feat].astype(cate_dict[feat])

100% 5/5 [00:06<00:00,  1.26s/it]
100% 5/5 [01:02<00:00, 12.43s/it]


### Train


In [72]:
params = {
    "objective": "lambdarank",
    "boosting_type": "gbdt",
    "metric": "map",
    "max_depth": 8,
    "num_leaves": 128,
    "learning_rate": 0.03,

    "verbose": -1,
    "eval_at": 12,
    # 'device':'gpu'
}

In [73]:
train = pd.concat(candidates[i] for i in range(2, WEEK_NUM))

In [74]:
# # * Remove customers with 0 hit_rate
# null_candidates = None
# for i in tqdm(range(WEEK_NUM)):
#     if i==0:
#         continue
#     tmp = pd.read_parquet(data_dir/"interim"/"v1"/f"week{i}_candidate.pqt")
#     tmp = tmp.groupby("customer_id")['hit_rate'].sum().reset_index()
#     tmp = tmp.loc[tmp['hit_rate']==0, ['customer_id']]
#     tmp['week'] = i
#     null_candidates = pd.concat([null_candidates, tmp], ignore_index=True)

# null_candidates['remove'] = 1
# train = pd.merge(train, null_candidates, on=['customer_id','week'], how='left')
# train = train[train['remove']!=1]
# del train['remove']
# train['customer_id'] = train['customer_id'].astype(cate_dict['customer_id'])

In [75]:
train = train.sort_values(by=['week', 'customer_id'],ascending=True).reset_index(drop=True)

In [76]:
sum(train['label']==0) / len(train)
# 0.9946384702188372

0.9947380165899505

In [77]:
valid = candidates[1]
valid = valid.sort_values(by=['customer_id'],ascending=True).reset_index(drop=True)

In [78]:
train_group = train[["customer_id", "article_id", "week"]]
train_group = train_group.astype("int")
train_group = train_group.groupby(["week","customer_id"])['article_id'].count().values

valid_group = valid[["customer_id", "article_id"]]
valid_group = valid_group.astype("int")
valid_group = valid_group.groupby(["customer_id"])['article_id'].count().values

In [None]:
train_set = lgb.Dataset(
    data=train[feats],
    label=train["label"],
    group=train_group,
    feature_name=feats,
    categorical_feature=cat_features,
    params=params,
)

valid_set = lgb.Dataset(
    data=valid[feats],
    label=valid["label"],
    group=valid_group,
    feature_name=feats,
    categorical_feature=cat_features,
    params=params,
)

ranker = lgb.train(
    params,
    train_set,
    num_boost_round=300,
    valid_sets=[valid_set],
    early_stopping_rounds=50,
    verbose_eval=10
)
# 0.82846 4-week

Training until validation scores don't improve for 50 rounds
[10]	valid_0's map@12: 0.827321


In [None]:
ranker.save_model(data_dir / "interim/lgb_ranker.model", num_iteration=ranker.best_iteration)

### Inference

In [None]:
ranker = lgb.Booster(model_file=data_dir / "interim/lgb_ranker.model")

In [None]:
feat_importance = pd.DataFrame(
    {"feature": feats, "importance": ranker.feature_importance()}
).sort_values(by="importance", ascending=False)
plt.figure(figsize=(8, 12))
sns.barplot(y="feature", x="importance", data=feat_importance)

### Validate

In [None]:
val_candidates = candidates[1].reset_index(drop=True)

In [None]:
probs = np.zeros(val_candidates.shape[0])
batch_size = 5_000_000
for batch in tqdm(range(0, val_candidates.shape[0], batch_size)):
    outputs = ranker.predict(val_candidates.loc[batch : batch + batch_size - 1, feats])
    probs[batch : batch + batch_size] = outputs

In [None]:
val_candidates["prob"] = probs

In [None]:
pred_lgb = val_candidates[['customer_id','article_id','prob']]
pred_lgb = pred_lgb.sort_values(by=["customer_id","prob"], ascending=False).reset_index(drop=True)
pred_lgb.rename(columns={'article_id':'prediction'}, inplace=True)

In [None]:
pred_lgb = pred_lgb.drop_duplicates(['customer_id', 'prediction'], keep='first')

In [None]:
pred_lgb = pred_lgb.groupby("customer_id")["prediction"].progress_apply(list).reset_index()

In [None]:
label = labels[1]
label = pd.merge(label, pred_lgb, on="customer_id", how="left")

In [None]:
map_at_k(label["article_id"], label["prediction"], k=12)

# 0.02820525160663368 1-week add normalization
# 0.028936597823123886 4-week 0.0274

### Test

In [45]:
test_candidates = candidates[0].reset_index(drop=True)

In [50]:
test_candidates.rename(columns = {
    'SaleTrend_7_top2':'SaleTrend_7_top50', 
    'TimeHistoryDecay_3_top12':'TimeHistoryDecay_3_top50',
    'TimeHistoryDecay_7_top8':'TimeHistoryDecay_7_top50',
    'TimeHistory_9_1':'TimeHistory_50_1', 
    'TimeHistory_16_2':'TimeHistory_50_2', 
    'UGSaleTrend_7_top2':'UGSaleTrend_7_top50', 
    'UGTimeHistory_age_bins_151':'UGTimeHistory_age_bins_501',
    'UGTimeHistory_age_bins_20.52':'UGTimeHistory_age_bins_502'
},inplace=True)

In [51]:
probs = np.zeros(test_candidates.shape[0])
batch_size = 5_000_000
for batch in tqdm(range(0, test_candidates.shape[0], batch_size)):
    outputs = ranker.predict(test_candidates.loc[batch : batch + batch_size - 1, feats])
    probs[batch : batch + batch_size] = outputs

100% 12/12 [04:18<00:00, 21.52s/it]


In [52]:
test_candidates["prob"] = probs

In [53]:
pred_lgb = test_candidates[['customer_id','article_id','prob']]
pred_lgb = pred_lgb.sort_values(by=["customer_id","prob"], ascending=False).reset_index(drop=True)
pred_lgb.rename(columns={'article_id':'prediction'}, inplace=True)

In [54]:
idx2uid = pickle.load(open(data_dir/"index_id_map/user_index2id.pkl", "rb"))
idx2iid = pickle.load(open(data_dir/"index_id_map/item_index2id.pkl", "rb"))

In [55]:
pred_lgb['prediction'] = pred_lgb['prediction'].map(idx2iid).progress_apply(lambda x:'0'+str(x))
pred_lgb = pred_lgb.groupby('customer_id')['prediction'].progress_apply(list).reset_index()
pred_lgb['prediction'] = pred_lgb['prediction'].progress_apply(lambda x: ' '.join(x[:12]))

  0% 77317/55310622 [00:00<04:18, 213741.39it/s]
100% 1371980/1371980 [02:33<00:00, 8932.50it/s]
100% 1371980/1371980 [00:03<00:00, 381710.56it/s]


In [56]:
del submission['prediction']
submission = submission.merge(pred_lgb, on='customer_id', how='left')
submission['customer_id'] = submission['customer_id'].map(idx2uid)

In [57]:
submission.to_csv('submission.csv', index=False)

In [58]:
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0779781015 0751471043 0918522001 0573085028 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0918522001 0918292001 0448509014 0915529003 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0805000001 0918292001 0918522001 06...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0751471043 0918522001 0915529003 0751471001 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152001 0927530006 0791587001 0730683050 08...


### Submit

In [62]:
submit_msg = """
0.028936597823123886 4-week
---
OrderHistory(train, 3),
OrderHistory(train, 7),
OrderHistoryDecay(train, 3, n=50),
OrderHistoryDecay(train, 7, n=50),
ItemPair(OrderHistory(train, 3).retrieve(), name='1'),
ItemPair(OrderHistory(train, 7).retrieve(), name='2'),
ItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),
ItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),
UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),
UserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),
UserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),
TimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),
TimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),
TimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),
TimeHistoryDecay(customer_list, train, 7, n=50 if week!=0 else 8),
SaleTrend(customer_list, train, 7, n=50 if week!=0 else 2)
---
min_pos_rate = 0.006
pivot
"""

In [60]:
! mkdir ~/.kaggle
! cp ../kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [65]:
# %pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 3.9 MB/s eta 0:00:011
Collecting python-slugify
  Downloading python_slugify-6.1.2-py2.py3-none-any.whl (9.4 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 5.7 MB/s eta 0:00:011
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73052 sha256=2aaa2ba8e16413cb4c49862167277c1d8f29337d273f3f9b02c7f2a0d2b3a357
  Stored in directory: /root/.cache/pip/wheels/29/da/11/144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106
Successfully built kaggle
Installing collected packages: text-unidecode, python-slugify, kaggle
Successfully installed kaggle-1.5.12 python-slugify-6.1.2 text-unidecode-1.3
You should consider upgrading via the '/usr/bin/python3 -m pip instal

In [66]:
! kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f ./submission.csv -m "\n0.028936597823123886 4-week\n---\nOrderHistory(train, 3),\nOrderHistory(train, 7),\nOrderHistoryDecay(train, 3, n=50),\nOrderHistoryDecay(train, 7, n=50),\nItemPair(OrderHistory(train, 3).retrieve(), name='1'),\nItemPair(OrderHistory(train, 7).retrieve(), name='2'),\nItemPair(OrderHistoryDecay(train, 3, n=50).retrieve(), name='3'),\nItemPair(OrderHistoryDecay(train, 7, n=50).retrieve(), name='4'),\nUserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], n=50 if week!=0 else 15, name='1'),\nUserGroupTimeHistory(data, customer_list, last_3days, ['age_bins'], n=50 if week!=0 else 20.5, name='2'),\nUserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=50 if week!=0 else 2),\nTimeHistory(customer_list, last_week, n=50 if week!=0 else 9, name='1'),\nTimeHistory(customer_list, last_3days, n=50 if week!=0 else 16, name='2'),\nTimeHistoryDecay(customer_list, train, 3, n=50 if week!=0 else 12),\nTimeHistoryDecay(customer_list, train, 7, n=50 if week!=0 else 8),\nSaleTrend(customer_list, train, 7, n=50 if week!=0 else 2)\n---\nmin_pos_rate = 0.006\npivot\n"

100%|████████████████████████████████████████| 258M/258M [00:12<00:00, 22.0MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations