Code modified from from my [another project](https://github.com/Wp-Zhang/H-M-Fashion-RecSys).

In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

In [2]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

warnings.filterwarnings("ignore")
sys.path.append("./") # path to the `src`` folder
InteractiveShell.ast_node_interactivity = "all"
tqdm.pandas()

In [3]:
from src.data.metrics import map_at_k, hr_at_k, recall_at_k
from src.retrieval.rules import (
    OrderHistory,
    ItemPair,
    TimeHistory,
    ItemCF
)
from src.retrieval.collector import RuleCollector
from src.utils import calc_valid_date

In [4]:
data_dir = Path("./")
model_dir = Path("./models/")

In [5]:
TRAIN_WEEK_NUM = 4
WEEK_NUM = TRAIN_WEEK_NUM + 2

VERSION_NAME = "Deploy"

In [6]:
import os
if not os.path.exists(data_dir/"interim"/VERSION_NAME):
    os.mkdir(data_dir/"interim"/VERSION_NAME)
if not os.path.exists(data_dir/"processed"/VERSION_NAME):
    os.mkdir(data_dir/"processed"/VERSION_NAME)

In [7]:
order = pd.read_csv('orders.csv')

In [8]:
del order['article_id']

In [9]:
order.head()

Unnamed: 0,t_dat,customer_id,price,order_id,product_code
0,2018-09-20,3,29.99,1,663713
1,2018-09-20,3,17.99,1,541518
2,2018-09-20,8,8.99,2,505221
3,2018-09-20,8,9.99,2,685687
4,2018-09-20,8,9.99,2,685687


In [10]:
order['t_dat'] = pd.to_datetime(order['t_dat'])

# Recall

In [53]:
def split_data(
        trans_data: pd.DataFrame,
        train_end_date: str,
        valid_end_date: str,
        item_id: str = "product_code",
    ):
        if item_id not in trans_data.columns:
            raise KeyError(f"{item_id} is not one of the columns")

        train_set = trans_data.loc[trans_data["t_dat"] < train_end_date]
        valid_set = trans_data.loc[
            (train_end_date <= trans_data["t_dat"])
            & (trans_data["t_dat"] < valid_end_date)
        ]
        valid_set = (
            valid_set.groupby(["customer_id"])[item_id].apply(list).reset_index()
        )

        return train_set, valid_set

In [13]:
# * WEEK_NUM = 0: test
# * WEEK_NUM = 1: valid
# * WEEK_NUM > 1: train
for week in range(1,WEEK_NUM):
    # * use sliding window to generate candidates
    trans = order

    start_date, end_date = calc_valid_date(week)
    print(f"Week {week}: [{start_date}, {end_date})")
    
    train, valid = split_data(trans, start_date, end_date)

    last_week = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=7)]
    last_2week = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=14)]
    last_80day = train[train['t_dat']>train['t_dat'].max()-pd.Timedelta(days=80)]

    if week != 0:
        customer_list = valid["customer_id"].values

    # * ========================== Retrieval Strategies ==========================

    candidates = RuleCollector().collect(
        week_num = week,
        trans_df = trans,
        customer_list=customer_list,
        rules=[
            OrderHistory(train, days=7, n=8, name=1),
            ItemPair(OrderHistory(train, days=7, n=8).retrieve(), name=2),
            ItemCF(last_80day, last_2week, top_k=8, name=3),
            TimeHistory(customer_list, last_week, n=8, name=4),
        ],
        norm=False,
        min_pos_rate=0,
        compress=False,
    )

    candidates.to_parquet(data_dir/"interim"/VERSION_NAME/f"week{week}_candidate.pqt")
    valid.to_parquet(data_dir/"processed"/VERSION_NAME/f"week{week}_label.pqt")

Week 1: [2020-09-16, 2020-09-23)


Retrieve items by rules:  25%|██▌       | 1/4 [00:20<01:00, 20.09s/it]

Positive rate: 0.03642


Retrieve items by rules:  50%|█████     | 2/4 [00:30<00:29, 14.57s/it]

Positive rate: 0.01468
Positive rate: 0.02520


Retrieve items by rules: 100%|██████████| 4/4 [01:40<00:00, 25.14s/it]

Positive rate: 0.01227





Week 2: [2020-09-09, 2020-09-16)


Retrieve items by rules:  25%|██▌       | 1/4 [00:18<00:56, 18.67s/it]

Positive rate: 0.03661


Retrieve items by rules:  50%|█████     | 2/4 [00:29<00:27, 13.97s/it]

Positive rate: 0.01584
Positive rate: 0.02530


Retrieve items by rules: 100%|██████████| 4/4 [01:44<00:00, 26.23s/it]

Positive rate: 0.01442





Week 3: [2020-09-02, 2020-09-09)


Retrieve items by rules:  25%|██▌       | 1/4 [00:19<00:59, 19.89s/it]

Positive rate: 0.03393


Retrieve items by rules:  50%|█████     | 2/4 [00:31<00:29, 14.87s/it]

Positive rate: 0.01528


Retrieve items by rules:  75%|███████▌  | 3/4 [01:54<00:45, 45.95s/it]

Positive rate: 0.02410


Retrieve items by rules: 100%|██████████| 4/4 [01:54<00:00, 28.70s/it]

Positive rate: 0.01552





Week 4: [2020-08-26, 2020-09-02)


Retrieve items by rules:  25%|██▌       | 1/4 [00:18<00:55, 18.35s/it]

Positive rate: 0.03033


Retrieve items by rules:  50%|█████     | 2/4 [00:29<00:28, 14.40s/it]

Positive rate: 0.01373


Retrieve items by rules:  75%|███████▌  | 3/4 [01:54<00:46, 46.57s/it]

Positive rate: 0.02173


Retrieve items by rules: 100%|██████████| 4/4 [01:55<00:00, 28.87s/it]

Positive rate: 0.01462





Week 5: [2020-08-19, 2020-08-26)


Retrieve items by rules:  25%|██▌       | 1/4 [00:18<00:56, 18.71s/it]

Positive rate: 0.03284


Retrieve items by rules:  50%|█████     | 2/4 [00:29<00:28, 14.14s/it]

Positive rate: 0.01488


Retrieve items by rules:  75%|███████▌  | 3/4 [01:54<00:46, 46.35s/it]

Positive rate: 0.02109


Retrieve items by rules: 100%|██████████| 4/4 [01:54<00:00, 28.73s/it]

Positive rate: 0.01174





In [15]:
pred = candidates.drop_duplicates(['customer_id','product_code']).groupby('customer_id')['product_code'].apply(list).reset_index()
label = valid.merge(pred.rename({'product_code':'pred'},axis=1),on='customer_id',how='left')
recall_at_k(label['product_code'], label['pred'], k=40)

0.0991061642860725

In [54]:
start_date, end_date = calc_valid_date(1)
train, valid = split_data(order, start_date, end_date)

In [57]:
df = order[order['t_dat']>=start_date][order['t_dat']<=end_date]

In [91]:
ip = ItemPair(df)
pair = ip._get_freq_pair()
del pair['count']
pair['method'] = 2
pair.to_csv('./recall_item_pair.csv',index=None)

In [75]:
sale = df.drop_duplicates(['customer_id','product_code']).groupby('product_code').size().reset_index(name='score')
sale = sale.sort_values(by='score',ascending=False).head(8)
sale['method'] = 4
sale.to_csv('./recall_sale.csv',index=None)

In [77]:
last_2week = order[order['t_dat']>order['t_dat'].max()-pd.Timedelta(days=14)]
last_80day = order[order['t_dat']>order['t_dat'].max()-pd.Timedelta(days=80)]
cf = ItemCF(last_80day, last_2week)
pred_next, pred_score = cf._item_cf(8)

df_l = []
for pid in tqdm(pred_next):
    tmp = pd.DataFrame()
    tmp['pair'] = pred_next[pid]
    tmp['score'] = pred_score[pid]
    tmp['product_code'] = pid
    df_l.append(tmp)

cf_df = pd.concat(df_l, ignore_index=True)
cf_df['method'] = 3
cf_df.to_csv('./recall_cf.csv', index=None)

In [93]:
cf_df.shape

(148761, 4)

# Feature Engineering

In [16]:
# calculate week number
order['week'] = (pd.to_datetime('2020-09-29') - pd.to_datetime(order['t_dat'])).dt.days // 7

In [17]:
feature_df = order.drop_duplicates(['product_code','week'])[['product_code','week']].reset_index(drop=True)

In [18]:
week_sale = order.groupby(['product_code','week']).size().reset_index(name="week_sale")
full_sale = order.groupby(['product_code']).size().reset_index(name="full_sale")
feature_df = feature_df.merge(week_sale, on=['product_code','week'], how='left')
feature_df = feature_df.merge(full_sale, on=['product_code'], how='left')

In [19]:
feature_df['week_sale_ratio'] = feature_df['week_sale'] / feature_df['full_sale']

In [20]:
feature_df.head()

Unnamed: 0,product_code,week,week_sale,full_sale,week_sale_ratio
0,663713,105,40,633,0.063191
1,541518,105,94,8161,0.011518
2,505221,105,38,127,0.299213
3,685687,105,4424,6613,0.668985
4,688873,105,1624,37368,0.04346


In [21]:
product = pd.read_json('./product.json')

In [22]:
cols = [
    "product_type_name",
    "garment_group_name",
    "section_name",
    "index_group_name",
]
for col in cols:
    product[col] = product[col].map(dict(zip(product[col].unique(),range(product[col].nunique()))))

In [23]:
prod_cols = [
        "product_code",
        "price",
        "product_type_name",
        "garment_group_name",
        "section_name",
        "index_group_name",
    ]

item_feats = week_sale[week_sale['week']==1][['product_code','week_sale']]
item_feats = item_feats.merge(full_sale, on=['product_code'], how='right')
item_feats = item_feats.fillna(0)
item_feats['week_sale_ratio'] = item_feats['week_sale'] / item_feats['full_sale']
item_feats['product_code'] = item_feats['product_code'].astype(int)
item_feats = item_feats.merge(product[prod_cols], on='product_code',how='left')
item_feats.to_csv('./item_features.csv',index=False)

In [40]:
for i in tqdm(range(1, WEEK_NUM)):
    candidate = pd.read_parquet(
        data_dir / "interim" / VERSION_NAME / f"week{i}_candidate.pqt"
    )
    label = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_label.pqt")    

    candidate["week"] = i

    candidate = candidate.merge(feature_df, on=["product_code", "week"], how="left")
    candidate = candidate.merge(label.rename({"product_code":'label'},axis=1),on='customer_id',how='right')
    candidate = candidate.merge(product[prod_cols],on='product_code',how='left')
    candidate['label'] = candidate.progress_apply(lambda x:x['product_code'] in x['label'], axis=1)

    candidate.to_parquet(
        data_dir / "processed" / VERSION_NAME / f"week{i}_candidate.pqt"
    )

100%|██████████| 1703852/1703852 [00:45<00:00, 37190.51it/s]
100%|██████████| 1826720/1826720 [00:49<00:00, 36540.27it/s]
100%|██████████| 1882870/1882870 [00:51<00:00, 36212.94it/s]
100%|██████████| 1948812/1948812 [00:55<00:00, 34834.76it/s]
100%|██████████| 1848024/1848024 [00:51<00:00, 35679.97it/s]
100%|██████████| 5/5 [05:04<00:00, 60.83s/it]


# Ranking

In [7]:
candidates = {}
labels = {}
for i in tqdm(range(1, WEEK_NUM)):
    candidates[i] = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_candidate.pqt")
    labels[i] = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week{i}_label.pqt")    

100%|██████████| 5/5 [00:01<00:00,  2.67it/s]


In [8]:
feats = [
    x
    for x in candidates[1].columns
    if x
    not in [
        "label",
        "t_dat",
        'week',
        'customer_id'
    ]
]
cat_features = [
    "method",
    # "customer_id",
    "product_code",
    "product_type_name",
    "garment_group_name",
    "section_name",
    "index_group_name",
]

In [26]:
full_data = pd.concat([candidates[i] for i in range(1, WEEK_NUM)], ignore_index=True)

## Train

In [27]:
train = full_data.loc[full_data['week']>1]
valid = full_data.loc[full_data['week']==1]

del full_data
gc.collect()

69

In [28]:
params = {
    "objective": "binary", # "lambdarank"
    "boosting_type": "gbdt",
    "metric": "auc", # "map"
    "max_depth": 8,
    "num_leaves": 128,
    "learning_rate": 0.03,

    "verbose": -1,
    "eval_at": 12,
}

In [29]:
def train_rank_model(train, valid, train_group, valid_group):

    train_set = lgb.Dataset(
        data=train[feats],
        label=train["label"],
        group=train_group,
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )

    valid_set = lgb.Dataset(
        data=valid[feats],
        label=valid["label"],
        group=valid_group,
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )

    ranker = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        early_stopping_rounds=100,
        verbose_eval=20,
    )
    ranker.save_model(
        model_dir / f"lgb_ranker.model",
        num_iteration=ranker.best_iteration,
    )
    return ranker

In [30]:
def train_binary_model(train, valid):

    train_set = lgb.Dataset(
        data=train[feats],
        label=train["label"],
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )

    valid_set = lgb.Dataset(
        data=valid[feats],
        label=valid["label"],
        feature_name=feats,
        categorical_feature=cat_features,
        params=params,
    )

    ranker = lgb.train(
        params,
        train_set,
        num_boost_round=100,
        valid_sets=[valid_set],
        early_stopping_rounds=10,
        verbose_eval=10,
    )
    ranker.save_model(
        model_dir / f"lgb_binary.model",
        num_iteration=ranker.best_iteration,
    )
    return ranker

In [31]:
print("Train positive rate:", train.label.mean())

Train positive rate: 0.02048524823930856


In [32]:
train = train.sort_values(by=["week", "customer_id"], ascending=True).reset_index(drop=True)
valid = valid.sort_values(by=["customer_id"], ascending=True).reset_index(drop=True)

In [33]:
train_group = train[["customer_id", "product_code", "week"]]
train_group = train_group.astype("int32")  # * convert to int to avoid `0` in groupby count result
train_group = (train_group.groupby(["week", "customer_id"]).size().values)

In [34]:
valid_group = valid[["customer_id", "product_code"]]
valid_group = valid_group.astype("int32")  # * convert to int to avoid `0` in groupby count result
valid_group = valid_group.groupby(["customer_id"]).size().values

In [20]:
# ranker = train_rank_model(train, valid, train_group, valid_group)
ranker = train_binary_model(train[feats+['label']], valid[feats+['label']])
# 0.728776

Training until validation scores don't improve for 10 rounds
[10]	valid_0's auc: 0.720171
[20]	valid_0's auc: 0.724597
[30]	valid_0's auc: 0.726479
[40]	valid_0's auc: 0.72779
[50]	valid_0's auc: 0.728788
[60]	valid_0's auc: 0.729495
[70]	valid_0's auc: 0.729859
[80]	valid_0's auc: 0.730414
[90]	valid_0's auc: 0.730779
[100]	valid_0's auc: 0.7311
Did not meet early stopping. Best iteration is:
[98]	valid_0's auc: 0.731131


## Validate

In [35]:
val_candidates = valid.reset_index(drop=True)

In [36]:
def predict(ranker, candidates, batch_size = 5_000_000):
    probs = np.zeros(candidates.shape[0])
    for batch in range(0, candidates.shape[0], batch_size):
        outputs = ranker.predict(candidates.loc[batch : batch + batch_size - 1, feats])
        probs[batch : batch + batch_size] = outputs
    candidates["prob"] = probs
    pred_lgb = candidates[['customer_id','product_code','prob']]
    pred_lgb = pred_lgb.sort_values(by=["customer_id","prob"], ascending=False).reset_index(drop=True)
    pred_lgb.rename(columns={'product_code':'prediction'}, inplace=True)
    pred_lgb = pred_lgb.drop_duplicates(['customer_id', 'prediction'], keep='first')
    pred_lgb['customer_id'] = pred_lgb['customer_id'].astype(int)
    pred_lgb = pred_lgb.groupby("customer_id")["prediction"].progress_apply(list).reset_index()
    return pred_lgb

In [37]:
pred = predict(ranker, val_candidates)

100%|██████████| 68952/68952 [00:01<00:00, 35582.41it/s]


In [38]:
label = labels[1]
label = pd.merge(label, pred, on="customer_id", how="left")

In [39]:
map_at_k(label["product_code"], label["prediction"], k=4)

# 0.03933704686674143

0.03894939797089119

In [40]:
recall_at_k(label["product_code"], label["prediction"], k=10)

0.09276867095395626

# Prepare for deploy

In [None]:
import mlflow
import mlflow.lightgbm
import lightgbm as lgb

In [None]:
model = lgb.Booster(model_file='./models/lgb_binary.model')
mlflow.lightgbm.save_model(model, './models/mlflow_lgb')