### H&M Recommendation System — 03 Ranking

In this notebook, based on (customer, candidate_item) pair generated by `02_recall.ipynb`to create feature and label and then train ranking model(LightGBM Ranker)

#### Step 1: Load Dataset

In [1]:
import os
import gc
from datetime import datetime, timedelta
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle
from tqdm import tqdm

pd.set_option("display.max_columns", 5)
pd.set_option("display.max_rows", 5)

HM_DIR = "../hm_data"
RECALL_DIR = "../data/recall"
TRAIN_DIR = "../data/train"
RES_DIR = "../data/ranking"

VALID_START = pd.to_datetime("2020-09-16")

In [2]:
# save recall results
def save_pickle(obj, path, overwrite=False):
    if os.path.exists(path) and not overwrite:
        print(f"[Skip] {path} already exists.")
        return
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, "rb") as f:
        return pickle.load(f)

In [3]:
cust_cand_pair = load_pickle(os.path.join(RECALL_DIR, "recall_final_merged.pkl"))
valid_gt = load_pickle(os.path.join(TRAIN_DIR, "validation_groundtruth.pkl"))
train_df = load_pickle(os.path.join(TRAIN_DIR, "train_df.pkl"))
valid_df = load_pickle(os.path.join(TRAIN_DIR, "valid_df.pkl"))

In [4]:
len(cust_cand_pair), len(valid_gt)

(1362281, 68984)

In [None]:
NEG_LIMIT = 50
rows = []
for cust in tqdm(valid_gt.keys(), desc="Building rank_df (valid users only)"):
    candidates = cust_cand_pair.get(cust, [])
    valid = valid_gt.get(cust, set())
    negatives = 0
    for article_id in candidates:
        label = 1 if article_id in valid else 0
        if label == 0:
            negatives += 1
        rows.append({"customer_id": cust, "article_id": article_id, "label": label})
        if(negatives > NEG_LIMIT):
            break
rank_df = pd.DataFrame(rows)

save_pickle(rank_df, os.path.join(RES_DIR, "rank_df.pkl"))
    

Building rank_df (valid users only):   6%|▋         | 4466/68984 [00:00<00:02, 22645.59it/s]

Building rank_df (valid users only): 100%|██████████| 68984/68984 [00:02<00:00, 26416.01it/s]


In [6]:
print(rank_df.head(5))
print(rank_df['label'].value_counts())

                                         customer_id  article_id  label
0  00039306476aaf41a07fed942884f16b30abfa83a2a8be...   636455004      0
1  00039306476aaf41a07fed942884f16b30abfa83a2a8be...   625939003      0
2  00039306476aaf41a07fed942884f16b30abfa83a2a8be...   671186001      0
3  00039306476aaf41a07fed942884f16b30abfa83a2a8be...   504155012      0
4  00039306476aaf41a07fed942884f16b30abfa83a2a8be...   632832001      0
label
0    3518184
1       9379
Name: count, dtype: int64


In [4]:
rank_df = load_pickle(os.path.join(RES_DIR, "rank_df.pkl"))

In [None]:
# Get user information

# Number of purchases
g = train_df.groupby("customer_id")
tx_cnt = g.size()
tx_cnt.head()
# How many unique items have been bought?
unique_items = g["article_id"].nunique()
# Number of days since the last purchase
train_df["t_dat"] = pd.to_datetime(train_df["t_dat"])
last_date = g["t_dat"].max()
recency_days = (VALID_START - last_date).dt.days

user_features = pd.DataFrame({
    "tx_cnt": tx_cnt,
    "unique_items": unique_items,
    "last_date": last_date,
    "recency_days": recency_days,
})
user_features = user_features.reset_index()
save_pickle(user_features, os.path.join(RES_DIR, "user_features.pkl"))

In [None]:
# merge user info into rank_df
rank_df = rank_df.merge(
    user_features,
    on="customer_id",
    how="left"
)

In [13]:
user_features.isna().mean()

customer_id     0.0
tx_cnt          0.0
unique_items    0.0
last_date       0.0
recency_days    0.0
dtype: float64

In [11]:
user_features.head()

Unnamed: 0,customer_id,tx_cnt,unique_items,last_date,recency_days
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,21,19,2020-09-05,11
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,86,64,2020-07-08,70
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,18,14,2020-09-15,1
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,2,2,2019-06-09,465
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,13,12,2020-08-12,35


In [12]:
rank_df.head()

Unnamed: 0,customer_id,article_id,...,last_date,recency_days
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,636455004,...,2020-02-06,223.0
1,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,625939003,...,2020-02-06,223.0
2,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,671186001,...,2020-02-06,223.0
3,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,504155012,...,2020-02-06,223.0
4,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,632832001,...,2020-02-06,223.0


In [14]:
# Get product information
articles = pd.read_csv(os.path.join(HM_DIR, "articles.csv"))
item_meta = articles[[
   "article_id",
   "product_type_no",
   "product_group_name",
   "colour_group_code",
   "department_no",
   "index_group_no",
   "garment_group_no"
]]

g_item = train_df.groupby("article_id")

# how many times this product has been purchased
item_popularity = g_item.size()
# How many different people have bought it
unique_buyers = g_item["customer_id"].nunique()
item_last_date = g_item["t_dat"].max()
# Number of days since the last purchase
item_recency_days = (VALID_START - item_last_date).dt.days

item_agg = pd.DataFrame({
    "item_popularity": item_popularity,
    "unique_buyers": unique_buyers,
    "item_last_date": item_last_date,
    "item_recency_days": item_recency_days
})
item_agg = item_agg.reset_index()
item_features = item_meta.merge(item_agg, on="article_id", how="left")

# NA processing method
item_features["item_popularity"] = item_features["item_popularity"].fillna(0).astype(int)
item_features["unique_buyers"] = item_features["unique_buyers"].fillna(0).astype(int)
very_old_date = pd.to_datetime("1970-01-01")
item_features["item_last_date"] = item_features["item_last_date"].fillna(very_old_date)
item_features["item_recency_days"] = item_features["item_recency_days"].fillna(999).astype(int)

save_pickle(item_features, os.path.join(RES_DIR, "item_features.pkl"))

In [15]:
item_features.head()

Unnamed: 0,article_id,product_type_no,...,item_last_date,item_recency_days
0,108775015,253,...,2020-07-22,56
1,108775044,253,...,2020-09-13,3
2,108775051,253,...,2019-06-28,446
3,110065001,306,...,2020-08-02,45
4,110065002,306,...,2020-08-05,42


In [16]:
item_features.isna().mean()

article_id           0.0
product_type_no      0.0
                    ... 
item_last_date       0.0
item_recency_days    0.0
Length: 11, dtype: float64

In [17]:
rank_df = rank_df.merge(
    item_features,
    on="article_id",
    how="left"
)

In [18]:
rank_df.head()

Unnamed: 0,customer_id,article_id,...,item_last_date,item_recency_days
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,636455004,...,2019-12-30,261
1,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,625939003,...,2020-08-25,22
2,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,671186001,...,2020-03-04,196
3,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,504155012,...,2020-02-01,228
4,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,632832001,...,2019-09-05,377


In [19]:
rank_df.columns

Index(['customer_id', 'article_id', 'label', 'tx_cnt', 'unique_items',
       'last_date', 'recency_days', 'product_type_no', 'product_group_name',
       'colour_group_code', 'department_no', 'index_group_no',
       'garment_group_no', 'item_popularity', 'unique_buyers',
       'item_last_date', 'item_recency_days'],
      dtype='object')

In [20]:
rank_df.isna().mean().sort_values(ascending=False)

unique_items         0.080814
recency_days         0.080814
                       ...   
item_last_date       0.000000
item_recency_days    0.000000
Length: 17, dtype: float64

In [25]:
save_pickle(rank_df, os.path.join(RES_DIR, "rank_df.pkl"), overwrite=True)

In [22]:
group_sizes = rank_df.groupby("customer_id").size().tolist()
group_sizes

[51,
 51,
 52,
 51,
 52,
 51,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 52,
 52,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 53,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 52,
 51,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 52,
 51,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 52,
 51,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 52,
 52,
 51,
 52,
 51,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 53,
 51,
 51,
 52,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 52,
 51,
 52,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 52,
 51,
 51,
 53,
 51,
 51,
 51,
 51,
 51,
 51,
 51,
 53,
 51,
 51,
 51,
 51,
 51,


In [23]:
save_pickle(group_sizes, os.path.join(RES_DIR, "group_sizes.pkl"))