In [1]:
# %pip install -U lightgbm

In [1]:
import pandas as pd
import numpy as np

import pickle
from tqdm import tqdm
from pathlib import Path
import gc

In [2]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides warnings that could point at real problems.
warnings.filterwarnings("ignore")
# Add ../src/ (relative to the notebook's working directory) to the import path;
# presumably this is where the `data` and `retrieval` packages imported below live.
sys.path.append("../src/")
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
from data import DataHelper
from data.metrics import map_at_k, hr_at_k, recall_at_k
from retrieval.rules import (
    OrderHistory,
    OrderHistoryDecay,
    ItemPair,

    UserGroupTimeHistory,
    UserGroupSaleTrend,

    TimeHistory,
    TimeHistoryDecay,
    SaleTrend,

    OutOfStock,
)
from retrieval.collector import RuleCollector


In [4]:
# Root folder for all data files, relative to the notebook's working directory.
data_dir = Path("../data/")
# Project helper; later cells use it to load the dataset (`load_data`) and to
# split the interaction log (`split_data`).
dh = DataHelper(data_dir)

In [5]:
# data = dh.preprocess_data(save=True) # run only once

In [6]:
# Load the dataset stored under the name "encoded_full". Later cells read the
# "inter", "user" and "item" entries of the returned object.
data = dh.load_data(name="encoded_full")

In [7]:
# Edges of the customer age buckets. With pd.cut's default right-closed
# intervals these give (-1, 19], (19, 29], ..., (69, 119].
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
user_ages = data["user"]["age"]
data["user"]["age_bins"] = pd.cut(user_ages, bins=listBin)

In [8]:
# Left-join each article's product_code onto the interaction log.
# NOTE(review): `trans` is reassigned to plain data["inter"] at the top of the
# Retrieval section before anything reads it, so this merge result is currently
# discarded. Confirm whether product_code is still needed or this cell can go.
trans = data["inter"].merge(data['item'][['article_id','product_code']], on='article_id', how='left')

## Retrieval

In [9]:
# Hold-out split of the interaction log. The two dates are forwarded unchanged
# to DataHelper.split_data (presumably the bounds of the validation window;
# confirm against the helper).
trans = data["inter"]
train, valid = dh.split_data(trans, "2020-09-16", "2020-09-23")
customer_list = valid["customer_id"].values

# Recent slices of the training interactions, selected by transaction date.
from_sep_09 = train["t_dat"] >= "2020-09-09"
from_sep_13 = train["t_dat"] >= "2020-09-13"
last_week = train[from_sep_09]
last_3days = train[from_sep_13]

In [10]:
# Attach each customer's age bucket to the three interaction frames.
# Re-running this cell without re-running the split would merge `age_bins`
# a second time, so run it once per split.
user_age_bins = data["user"][["customer_id", "age_bins"]]
train = train.merge(user_age_bins, on="customer_id", how="left")
last_week = last_week.merge(user_age_bins, on="customer_id", how="left")
last_3days = last_3days.merge(user_age_bins, on="customer_id", how="left")

In [11]:
# Add the colour and product-group attributes of every article bought in the
# last-week slice (left join, so interaction rows are never dropped).
item_attrs = data["item"][["article_id", "perceived_colour_master_id", "product_group_name"]]
last_week = last_week.merge(item_attrs, on="article_id", how="left")

In [21]:
# Build the validation candidate set: the collector is given the customers of
# the validation split, a list of retrieval rules and an out-of-stock filter.
# NOTE(review): the evaluation cells below score the first k articles of each
# customer's list, so if the collector keeps rule order, the order of `rules`
# influences MAP@12. Confirm against RuleCollector.
candidates = RuleCollector().collect(
    # data=data,
    customer_list=customer_list,
    rules=[
        OrderHistory(train, 14),
        # ItemPair is seeded with the output of a second, identically
        # configured OrderHistory rule.
        ItemPair(OrderHistory(train, 14).retrieve()),
        UserGroupTimeHistory(data, customer_list, last_week, ['age_bins'], 24),
        OrderHistoryDecay(train, 14, n=14),
        # Rules below are disabled in this run.
        # UserGroupSaleTrend(data, customer_list, train, ['age_bins'], 7, n=24),
        # TimeHistory(customer_list, last_week, 7),
        # TimeHistoryDecay(customer_list, train, 7, n=12),
    ],
    filters=[OutOfStock(trans)],
    compress=False,
)

Retrieve items by rules: 100%|██████████| 4/4 [02:14<00:00, 33.52s/it]


In [22]:
# scores = (
#     pd.pivot_table(
#         candidates,
#         values="score",
#         index=["customer_id", "article_id"],
#         columns=["method"],
#         aggfunc=np.sum,
#     )
#     .reset_index()
#     .fillna(0)
# )
# hit_rate = (
#     pd.pivot_table(
#         candidates,
#         values="hit_rate",
#         index=["customer_id", "article_id"],
#         columns=["method"],
#         aggfunc=np.sum,
#     )
#     .reset_index()
#     .fillna(0)
# )
# methods = list(candidates['method'].unique())
# scores[methods] = hit_rate[methods] * scores[methods]
# scores['rule_similarity'] = scores[methods].sum(axis=1)

In [23]:
# Keep a single row per (customer, article) pair; the first occurrence wins.
pair_key = ["customer_id", "article_id"]
candidates = candidates.drop_duplicates(subset=pair_key)

In [24]:
# Collapse to one row per customer whose `article_id` cell holds the list of
# that customer's candidate articles.
articles_by_customer = candidates.groupby("customer_id")["article_id"]
candidates = articles_by_customer.apply(list).reset_index()

In [25]:
# Name the list column `prediction` and line it up with the validation rows.
# The left join keeps validation rows that received no candidates.
candidates = candidates.rename(columns={"article_id": "prediction"})
valid2 = valid.merge(candidates, on="customer_id", how="left")

In [26]:
# Score the candidate lists against the validation purchases at k=12 with three
# metrics. All three values are displayed because ast_node_interactivity is set
# to "all" near the top of the notebook.
map_at_k(valid2["article_id"], valid2["prediction"], k=12)
hr_at_k(valid2["article_id"], valid2["prediction"], k=12)
recall_at_k(valid2["article_id"], valid2["prediction"], k=12)
# Value recorded from an earlier run (presumably MAP@12):
# 0.025620866741013788

0.025671835027045554

0.11585295140902238

0.06050771316261749

In [27]:
# Average length of the candidate list over the rows of valid2.
# This raises if a row has no candidates, because the left join leaves NaN there.
valid2['prediction'].apply(len).mean()
# Values recorded from earlier runs:
# 34.577206308709265  0.09688025735821157
# 31.902049750666823

32.89101820712049

In [28]:
# Recall with k=1000. The mean list length shown above is about 33, so k here
# presumably exceeds every list and the whole candidate set is scored.
recall_at_k(valid2["article_id"], valid2["prediction"], k=1000)
# Values recorded from earlier runs:
# 0.09688025735821157
# 0.10422457776642167
# 0.14717778891791014

# 0.0945484902083186
# 0.11138181112336008

0.0961116033075716

In [29]:
# Recall at k=1000 divided by the mean candidate-list length, i.e. recall
# gained per retrieved candidate. Both terms are recomputed here.
recall_at_k(valid2["article_id"], valid2["prediction"], k=1000) / valid2['prediction'].apply(len).mean()
# Values recorded from earlier runs:
# 0.0028018532351414835
# 0.0023242087715079573
# 0.000930998471281766

0.002922123076346863

## Predict

In [19]:
# Id <-> index lookup tables stored as pickles under data_dir/index_id_map.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object is loaded, instead of being left open for the life of the kernel.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project.
index_map_dir = data_dir / "index_id_map"
with open(index_map_dir / "user_id2index.pkl", "rb") as fh:
    uid2idx = pickle.load(fh)
with open(index_map_dir / "user_index2id.pkl", "rb") as fh:
    idx2uid = pickle.load(fh)
with open(index_map_dir / "item_index2id.pkl", "rb") as fh:
    idx2iid = pickle.load(fh)

In [30]:
# Start from the sample submission under data_dir/raw.
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
# Replace each customer id with its value in uid2idx. Ids missing from the
# mapping become NaN.
submission['customer_id'] = submission['customer_id'].map(uid2idx)

In [27]:
# Candidate generation for the submission, using the full interaction log.
# NOTE: this rebinds `last_week` (the Retrieval section used a slice of `train`)
# and `candidates` (the validation candidates), so the evaluation cells above
# cannot be re-run after this cell without re-running the Retrieval section.
last_week = trans.loc[trans.t_dat >= "2020-09-16"]
candidates = RuleCollector().collect(
    customer_list=submission['customer_id'].values,
    rules=[
        OrderHistory(trans, 7),
        ItemPair(OrderHistory(trans, 7).retrieve()),
        # NOTE(review): the commented-out call in the Retrieval section passes
        # the customer list as the first argument to TimeHistory. Confirm which
        # signature is current.
        TimeHistory(last_week, 12),
    ],
    filters=[OutOfStock(trans)],
    compress=False,
)

Retrieve items by rules: 100%|██████████| 3/3 [00:47<00:00, 15.81s/it]


In [28]:
# Turn the per-row candidates into one space-separated prediction string per
# customer.

# Map internal article indices back through idx2iid and prefix a "0"
# (presumably restoring the leading zero of the original article ids; confirm).
candidates['article_id'] = candidates['article_id'].map(idx2iid).apply(lambda x: '0' + str(x))
# Several rules can propose the same article for a customer. Keep only the
# first occurrence, as the validation pipeline does, so repeated ids do not
# take up slots in the prediction string. Row order is preserved.
candidates = candidates.drop_duplicates(['customer_id', 'article_id'])
candidates = candidates.groupby('customer_id')['article_id'].apply(list).reset_index()
candidates['article_id'] = candidates['article_id'].apply(' '.join)
candidates = candidates.rename(columns={'article_id': 'prediction'})

In [33]:
# Swap the placeholder `prediction` column of the sample submission for the
# generated one. The left join keeps customers that received no candidates.
submission = (
    submission
    .drop(columns="prediction")
    .merge(candidates, on="customer_id", how="left")
)
# Map customer ids back through idx2uid for the output file.
submission["customer_id"] = submission["customer_id"].map(idx2uid)

In [None]:
# Write the submission to submission.csv in the current working directory,
# without the DataFrame index column.
submission.to_csv('submission.csv', index=False)