In [None]:
import gc
import os
from pathlib import Path
import pickle
import time
import catboost
import faiss
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from scipy import sparse

In [None]:
# Where the preprocessed competition tables live and where this notebook writes its artefacts.
data_dir = '/home/workspace/h-and-m-personalized-fashion-recommendations'
output_dir = './'

# Load the three pickled input tables, in the order: transactions, users, items.
transactions, users, items = (
    pd.read_pickle(path)
    for path in (
        f"{data_dir}/transactions_train.pkl",
        f"{data_dir}/users.pkl",
        f"{data_dir}/items.pkl",
    )
)

In [None]:
class CFG:
    """Notebook-wide settings for candidate generation, feature windows and the ranker."""

    # Ranker to train; the training cells branch on 'LightGBM' and 'CatBoost'.
    model_type = 'CatBoost'
    # Number of weekly training sets concatenated into the training frame.
    train_weeks = 6

    # Popularity candidates: how many weeks are counted and how many top items are kept.
    popular_weeks = 1
    popular_num_items = 60
    # Cap (on rank_meta) for the repurchase set that seeds the category-popular candidates.
    item2item_num_items_for_same_product_code = 12

    # Look-back window lengths, in weeks, used by attach_features.
    user_transaction_feature_weeks = 50
    item_transaction_feature_weeks = 16
    item_age_feature_weeks = 40
    user_volume_feature_weeks = 50
    item_volume_feature_weeks = 20
    user_item_volume_feature_weeks = 16


## 生成candidates

In [None]:
def create_candidates(transactions: pd.DataFrame, target_users: np.ndarray, week: int) -> pd.DataFrame:
    """Generate the candidate (user, item) rows for `target_users`.

    Three candidate sets are concatenated, each tagged in the `strategy` column:

    * ``repurchase`` - every item a target user bought in a week ``>= week``.
    * ``pop``        - the ``CFG.popular_num_items`` items bought by the most distinct users
                       during the ``CFG.popular_weeks`` weeks starting at ``week``; attached
                       to every target user.
    * ``cat_pop``    - for each ``department_no_idx`` found among a user's capped repurchase
                       items, the top 6 items of that department during week ``week``,
                       minus the pairs already present in ``repurchase``.

    Besides its arguments the function reads the module-level ``items`` table and ``CFG``,
    and prints volume statistics as a side effect.

    Parameters
    ----------
    transactions : DataFrame with at least `user`, `item`, `week` and `day` columns.
    target_users : array of distinct user ids (distinctness is asserted).
    week : first week whose transactions may be used.
           NOTE(review): week numbers appear to grow into the past (week 0 = latest) - confirm.

    Returns
    -------
    DataFrame with one row per (candidate, strategy); the same (user, item) pair may appear
    under several strategies. Columns whose name ends with ``_meta`` are removed.
    """
    print(f"create candidates (week: {week})")
    assert len(target_users) == len(set(target_users))

    def create_candidates_repurchase(
            strategy: str,
            transactions: pd.DataFrame,
            target_users: np.ndarray,
            week_start: int,
            max_items_per_user: int=1234567890
        ) -> pd.DataFrame:
        """Items each target user already bought in weeks >= `week_start`, with per-user ranks.

        At most the rows with ``rank_meta <= max_items_per_user`` are kept; because the rank
        uses ``method='min'``, ties can let slightly more rows through.
        """
        # Keep only the transactions of the requested users from the requested weeks onward.
        tr = transactions.query("user in @target_users and @week_start <= week")[['user', 'item', 'week', 'day']].drop_duplicates(ignore_index=True)

        gr_day = tr.groupby(['user', 'item'])['day'].min().reset_index(name='day')
        gr_week = tr.groupby(['user', 'item'])['week'].min().reset_index(name='week')
        gr_volume = tr.groupby(['user', 'item']).size().reset_index(name='volume')

        # Per-user ranks: smallest day / week first, largest volume first.
        gr_day['day_rank'] = gr_day.groupby('user')['day'].rank()
        gr_week['week_rank'] = gr_week.groupby('user')['week'].rank()
        gr_volume['volume_rank'] = gr_volume.groupby('user')['volume'].rank(ascending=False)

        candidates = gr_day.merge(gr_week, on=['user', 'item']).merge(gr_volume, on=['user', 'item'])

        # Order primarily by day_rank and break ties with volume_rank.
        candidates['rank_meta'] = 10**9 * candidates['day_rank'] + candidates['volume_rank']
        candidates['rank_meta'] = candidates.groupby('user')['rank_meta'].rank(method='min')

        candidates = candidates.query("rank_meta <= @max_items_per_user").reset_index(drop=True)

        candidates = candidates[['user', 'item', 'week_rank', 'volume_rank', 'rank_meta']].rename(columns={'week_rank': f'{strategy}_week_rank', 'volume_rank': f'{strategy}_volume_rank'})

        candidates['strategy'] = strategy
        return candidates.drop_duplicates(ignore_index=True)


    def  create_candidates_popular(
            transactions: pd.DataFrame,
            target_users: np.ndarray,
            week_start: int,
            num_weeks: int,
            num_items: int,
        ) -> pd.DataFrame:
        """Cross-join every target user with the most widely bought items of the window."""
        # One row per (user, item) so value_counts counts distinct buyers per item.
        tr = transactions.query("@week_start <= week < @week_start + @num_weeks")[['user', 'item']].drop_duplicates(ignore_index=True)
        popular_items = tr['item'].value_counts().index.values[:num_items]
        popular_items = pd.DataFrame({
            'item': popular_items,
            # The window may contain fewer than `num_items` distinct items, so the rank
            # column must follow the actual length or the constructor raises.
            'rank': range(len(popular_items)),
            'crossjoinkey': 1,
        })

        candidates = pd.DataFrame({
            'user': target_users,
            'crossjoinkey': 1,
        })

        candidates = candidates.merge(popular_items, on='crossjoinkey').drop('crossjoinkey', axis=1)
        candidates = candidates.rename(columns={'rank': f'pop_rank'})

        candidates['strategy'] = 'pop'
        return candidates.drop_duplicates(ignore_index=True)


    def create_candidates_category_popular(
        transactions: pd.DataFrame,
        items: pd.DataFrame,
        base_candidates: pd.DataFrame,
        week_start: int,
        num_weeks: int,
        num_items_per_category: int,
        category: str,
    ) -> pd.DataFrame:
        """Top items of every `category` value that occurs among a user's base candidates."""
        # Rank items inside each category by their number of distinct buyers in the window.
        tr = transactions.query("@week_start <= week < @week_start + @num_weeks")[['user', 'item']].drop_duplicates()
        tr = tr.groupby('item').size().reset_index(name='volume')
        tr = tr.merge(items[['item', category]], on='item')
        tr['cat_volume_rank'] = tr.groupby(category)['volume'].rank(ascending=False, method='min')
        tr = tr.query("cat_volume_rank <= @num_items_per_category").reset_index(drop=True)
        tr = tr[['item', category, 'cat_volume_rank']].reset_index(drop=True)

        # cat_volume = how many base candidates of the user fall into the category.
        candidates = base_candidates[['user', 'item']].merge(items[['item', category]], on='item')
        candidates = candidates.groupby(['user', category]).size().reset_index(name='cat_volume')
        candidates = candidates.merge(tr, on=category).drop(category, axis=1)
        candidates['strategy'] = 'cat_pop'
        return candidates


    candidates_repurchase = create_candidates_repurchase('repurchase', transactions, target_users, week)
    candidates_popular = create_candidates_popular(transactions, target_users, week, CFG.popular_weeks, CFG.popular_num_items)
    # Capped repurchase set; it is only used as the seed of the category-popular candidates.
    candidates_item2item2 = create_candidates_repurchase('item2item2', transactions, target_users, week, CFG.item2item_num_items_for_same_product_code)
    candidates_dept = create_candidates_category_popular(transactions, items, candidates_item2item2, week, 1, 6, 'department_no_idx')

    def drop_common_user_item(candidates_target: pd.DataFrame, candidates_reference: pd.DataFrame) -> pd.DataFrame:
        """Remove from `candidates_target` the (user, item) pairs present in `candidates_reference`."""
        tmp = candidates_reference[['user', 'item']].reset_index(drop=True)
        tmp['flag'] = 1
        candidates = candidates_target.merge(tmp, on=['user', 'item'], how='left')
        # Unmatched rows carry NaN in `flag`, and NaN != 1, so exactly those rows survive.
        return candidates.query("flag != 1").reset_index(drop=True).drop('flag', axis=1)
    candidates_dept = drop_common_user_item(candidates_dept, candidates_repurchase)
    candidates = [
        candidates_repurchase,
        candidates_popular,
        candidates_dept,
    ]
    candidates = pd.concat(candidates)

    print(f"volume: {len(candidates)}")
    print(f"duplicates: {len(candidates) / len(candidates[['user', 'item']].drop_duplicates())}")

    volumes = candidates.groupby('strategy').size().reset_index(name='volume').sort_values(by='volume', ascending=False).reset_index(drop=True)
    volumes['ratio'] = volumes['volume'] / volumes['volume'].sum()
    print(volumes)

    # Helper columns (suffix `_meta`) are only needed while building candidates.
    meta_columns = [c for c in candidates.columns if c.endswith('_meta')]
    return candidates.drop(meta_columns, axis=1)


In [None]:
# One candidate set per label week 0..CFG.train_weeks: the users who bought in `week`
# receive candidates built from the transactions of week + 1 onward.
candidates = []
for week in range(CFG.train_weeks + 1):
    target_users = transactions.query("week == @week")['user'].unique()
    weekly_candidates = create_candidates(transactions, target_users, week + 1)
    candidates.append(weekly_candidates)

## 生成candidates对应的labels

In [None]:
def merge_labels(candidates: pd.DataFrame, week: int) -> pd.DataFrame:
    """Attach the binary target `y` to `candidates` and print recall diagnostics.

    A candidate gets ``y = 1`` when its (user, item) pair occurs in the module-level
    ``transactions`` table for `week`, otherwise ``y = 0``.

    Parameters
    ----------
    candidates : candidate rows with `user`, `item` and `strategy` columns.
    week : week whose purchases are the positives.

    Returns
    -------
    `candidates` left-joined with the label column `y`.
    """
    print(f"merge labels (week: {week})")

    # Distinct purchases of the label week are the positives.
    in_week = transactions['week'] == week
    purchases = transactions.loc[in_week, ['user', 'item']].drop_duplicates(ignore_index=True)
    original_positives = len(purchases)
    purchases['y'] = 1

    labels = candidates.merge(purchases, on=['user', 'item'], how='left')
    labels['y'] = labels['y'].fillna(0)

    # Overall recall: a pair proposed by several strategies is counted once.
    unique_pairs = labels[['user', 'item', 'y']].drop_duplicates(ignore_index=True)
    recall = unique_pairs['y'].sum() / original_positives
    print(f"Recall: {recall}")

    # Per-strategy breakdown: positives found, candidate volume, recall and hit ratio.
    volume_by_strategy = candidates.groupby('strategy').size().reset_index(name='volume')
    per_strategy = labels.groupby('strategy')['y'].sum().reset_index()
    per_strategy = per_strategy.merge(volume_by_strategy, on='strategy')
    per_strategy['recall'] = per_strategy['y'] / original_positives
    per_strategy['hit_ratio'] = per_strategy['y'] / per_strategy['volume']
    per_strategy = per_strategy.sort_values(by='y', ascending=False).reset_index(drop=True)
    print(per_strategy)

    return labels

In [None]:
# The position of a candidate set in the list is also the week that supplies its labels.
for idx, weekly_candidates in enumerate(candidates):
    candidates[idx] = merge_labels(weekly_candidates, idx)

## candidates数据 再处理

In [None]:
# Stamp every candidate set with the week its labels come from.
for idx, weekly_candidates in enumerate(candidates):
    weekly_candidates['week'] = idx

# Untouched copy of the week-0 candidates, taken before trivial users are dropped below.
candidates_valid_all = candidates[0].copy()

def drop_trivial_users(labels):
    """Keep only users whose rows contain exactly two distinct `y` values.

    Users with a single distinct label (all positive or all negative) are removed.
    Prints the row count before and after and returns the filtered frame with a
    fresh 0..n-1 index.
    """
    bef = len(labels)

    # Count the distinct label values each user has.
    distinct_user_labels = labels[['user', 'y']].drop_duplicates()
    labels_per_user = distinct_user_labels.groupby('user').size().reset_index(name='sz')
    # Keep the users that have both positive and negative samples.
    mixed_users = labels_per_user.query("sz==2").user

    df = labels[labels['user'].isin(mixed_users)].reset_index(drop=True)
    aft = len(df)
    print(f"drop trivial queries: {bef} -> {aft}")
    return df
    
# Filter every weekly candidate set in place within the list.
for idx, weekly_candidates in enumerate(candidates):
    candidates[idx] = drop_trivial_users(weekly_candidates)

In [None]:
# candidates columns = ['user', 'item', 'repurchase_week_rank', 'repurchase_volume_rank', 'strategy', 'pop_rank', 'cat_volume', 'cat_volume_rank', 'y', 'week']

## attach features

In [None]:
def calc_embeddings(model_type: str, week: int, dim: int):
    """Load a pickled model from ``data_dir/lfm`` and return its user representations.

    The model file is selected by `model_type`, `week` and `dim`.
    NOTE(review): the `lfm` prefix suggests a LightFM model - confirm.

    Returns
    -------
    DataFrame with a `user` column (0..n_user-1) followed by `user_rep_0` ..
    `user_rep_{dim}`: the embedding columns and then the bias as the last column.
    """
    model_path = f"{data_dir}/lfm/lfm_{model_type}_week{week}_dim{dim}_model.pkl"
    # The pickle is produced by this project's own pre-training step; never point this at untrusted files.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    biases, embeddings = model.get_user_representations(None)
    n_user = len(biases)

    # Append the bias as one extra column after the embedding columns.
    representation = np.hstack([embeddings, biases.reshape(n_user, 1)])
    rep_columns = [f"user_rep_{i}" for i in range(dim + 1)]
    rep_frame = pd.DataFrame(representation, columns=rep_columns)
    user_ids = pd.DataFrame({'user': range(n_user)})

    user_embeddings = pd.concat([user_ids, rep_frame], axis=1)
    return user_embeddings

In [None]:
def attach_features(
    transactions: pd.DataFrame,
    users: pd.DataFrame,
    items: pd.DataFrame,
    candidates: pd.DataFrame,
    week: int,
    pretrain_week: int,
) -> pd.DataFrame:
    """Join user, item and user-item features onto a copy of `candidates`.

    Every aggregate is computed from transactions with ``week >= week``; windowed
    aggregates additionally stop before ``week + <window length from CFG>``.

    Parameters
    ----------
    transactions : transaction log with `user`, `item`, `week`, `day`, `price`,
                   `sales_channel_id` columns.
    users : user table; `user` and `age` are used.
    items : item table; `item` and every column whose name ends with `idx` are used.
    candidates : rows to enrich; must contain `user` and `item`.
    week : first week whose transactions may feed the features.
    pretrain_week : week identifying the pre-trained embedding model to load through
                    ``calc_embeddings``; embeddings are kept only for users with a
                    transaction in a week ``>= pretrain_week``.

    Returns
    -------
    A new DataFrame with the same number of rows as `candidates` (asserted).
    """
    print(f"attach features (week: {week})")
    n_original = len(candidates)
    df = candidates.copy()

    # Smallest `day` among the transactions of `week`: the reference point for the three
    # "*_day_min" features. Computed once instead of re-scanning the table for each of them.
    week_first_day = transactions.query("@week == week")['day'].min()

    # Static user attribute.
    df = df.merge(users[['user', 'age']], on='user')

    # Static item attributes: every index-encoded column of the item table.
    item_features = [c for c in items.columns if c.endswith('idx')]
    df = df.merge(items[['item'] + item_features], on='item')

    # Per-user mean/std of price and sales channel.
    week_end = week + CFG.user_transaction_feature_weeks
    tmp = transactions.query("@week <= week < @week_end").groupby('user')[['price', 'sales_channel_id']].agg(['mean', 'std'])
    tmp.columns = ['user_' + '_'.join(a) for a in tmp.columns.to_flat_index()]
    df = df.merge(tmp, on='user', how='left')

    # Per-item mean/std of price and sales channel.
    week_end = week + CFG.item_transaction_feature_weeks
    tmp = transactions.query("@week <= week < @week_end").groupby('item')[['price', 'sales_channel_id']].agg(['mean', 'std'])
    tmp.columns = ['item_' + '_'.join(a) for a in tmp.columns.to_flat_index()]
    df = df.merge(tmp, on='item', how='left')

    # Mean/std age of the buyers of each item.
    week_end = week + CFG.item_age_feature_weeks
    tmp = transactions.query("@week <= week < @week_end").merge(users[['user', 'age']], on='user')
    tmp = tmp.groupby('item')['age'].agg(['mean', 'std'])
    tmp.columns = [f'age_{a}' for a in tmp.columns.to_flat_index()]
    df = df.merge(tmp, on='item', how='left')

    # Smallest `day` of each item, relative to the reference day.
    tmp = transactions.query("@week <= week").groupby('item')['day'].min().reset_index(name='item_day_min')
    tmp['item_day_min'] -= week_first_day
    df = df.merge(tmp, on='item', how='left')

    # Number of transactions per item.
    week_end = week + CFG.item_volume_feature_weeks
    tmp = transactions.query("@week <= week < @week_end").groupby('item').size().reset_index(name='item_volume')
    df = df.merge(tmp, on='item', how='left')

    # Smallest `day` of each user, relative to the reference day.
    tmp = transactions.query("@week <= week").groupby('user')['day'].min().reset_index(name='user_day_min')
    tmp['user_day_min'] -= week_first_day
    df = df.merge(tmp, on='user', how='left')

    # Number of transactions per user.
    week_end = week + CFG.user_volume_feature_weeks
    tmp = transactions.query("@week <= week < @week_end").groupby('user').size().reset_index(name='user_volume')
    df = df.merge(tmp, on='user', how='left')

    # Smallest `day` of each (user, item) pair, relative to the reference day.
    tmp = transactions.query("@week <= week").groupby(['user', 'item'])['day'].min().reset_index(name='user_item_day_min')
    tmp['user_item_day_min'] -= week_first_day
    df = df.merge(tmp, on=['item', 'user'], how='left')

    # Number of transactions per (user, item) pair.
    week_end = week + CFG.user_item_volume_feature_weeks
    tmp = transactions.query("@week <= week < @week_end").groupby(['user', 'item']).size().reset_index(name='user_item_volume')
    df = df.merge(tmp, on=['user', 'item'], how='left')

    # Pre-trained user embeddings, restricted to users with a transaction in week >= pretrain_week;
    # all other users get NaN through the left join.
    seen_users = transactions.query("week >= @pretrain_week")['user'].unique()
    user_reps = calc_embeddings('i_i', pretrain_week, 16)
    user_reps = user_reps.query("user in @seen_users")
    df = df.merge(user_reps, on='user', how='left')

    # The two inner joins above must not drop rows and no join may duplicate them.
    assert len(df) == n_original, f"row count changed while attaching features: {n_original} -> {len(df)}"
    return df

# Features for the complete week-0 candidate set (no users dropped), scored during validation.
dataset_oof = attach_features(
    transactions, users, items, candidates_valid_all, 1, CFG.train_weeks + 1
)

# Features for every labelled weekly candidate set; set idx only uses weeks >= idx + 1.
datasets_train_valid = [
    attach_features(transactions, users, items, weekly_candidates, 1 + idx, CFG.train_weeks + 1)
    for idx, weekly_candidates in enumerate(candidates)
]

## 划分Train/Valid

In [None]:
 for idx in range(len(datasets_train_valid)): 
    datasets_train_valid[idx]['query_group'] = datasets_train_valid[idx]['week'].astype(str) + '_' + datasets_train_valid[idx]['user'].astype(str) # 新列 query_group = "week_user"
    datasets_train_valid[idx] = datasets_train_valid[idx].sort_values(by='query_group').reset_index(drop=True) 

def concat_train(datasets_train_valid, begin, num):
    """Stack `num` consecutive weekly datasets, starting at position `begin`, into one frame.

    The original row indices of the individual frames are kept.
    """
    selected = []
    for position in range(begin, begin + num):
        selected.append(datasets_train_valid[position])
    return pd.concat(selected)

# Weeks 1..CFG.train_weeks are used for training; week 0 is held out for validation.
train = concat_train(datasets_train_valid, 1, CFG.train_weeks)
valid = datasets_train_valid[0]

# Everything except the target and the bookkeeping columns is a model input.
feature_columns = list(filter(lambda c: c not in ['y', 'strategy', 'query_group', 'week'], valid.columns))
print(f"feature_columns:\n{feature_columns}")

# Categorical inputs are the index-encoded columns; CatBoost needs their positions.
cat_feature_values = [c for c in feature_columns if c.endswith('idx')]
cat_features = [feature_columns.index(name) for name in cat_feature_values]
print(f"\ncat_feature_values:\n{cat_feature_values}")
print(f"\ncat_features:\n{cat_features}")

## Model Training

In [None]:
def get_query_group(df):
    """Return the sizes of the consecutive query groups of `df`, in row order.

    This is the list LightGBM expects as ``group``. A new group starts wherever the
    query key changes between neighbouring rows, so `df` must already be sorted so
    that the rows of one query are contiguous.

    The key is the `query_group` column when it exists, otherwise `user`. Keying on
    `query_group` keeps the same user in two adjacent weeks from being merged into a
    single group.

    Returns an empty list for an empty frame.
    """
    if len(df) == 0:
        # The boundary trick below would report one group of size 1 for no rows at all.
        return []

    key_column = 'query_group' if 'query_group' in df.columns else 'user'
    keys = df[key_column].to_numpy()

    # Positions where a run starts, plus a sentinel one past the last row.
    run_starts, = np.concatenate(([True], keys[1:] != keys[:-1], [True])).nonzero()
    return np.ediff1d(run_starts).tolist()

In [None]:
# Fit the ranker selected by CFG.model_type on `train`, monitoring `valid`.
if CFG.model_type == 'LightGBM':
    group_train = get_query_group(train)
    group_valid = get_query_group(valid)

    train_dataset = lgb.Dataset(train[feature_columns], train['y'], group=group_train)
    valid_dataset = lgb.Dataset(valid[feature_columns], valid['y'], group=group_valid, reference=train_dataset)

    params = {
        'objective': 'xendcg',
        'boosting_type': 'gbdt',
        'learning_rate': 1e-6,
        'num_leaves': 255,
        'min_data_in_leaf': 100,
        'metric': 'map',
        'eval_at': 12,
    }

    model = lgb.train(
                        params,
                        train_dataset,
                        valid_sets=[train_dataset, valid_dataset],
                        num_boost_round=1000,
                        callbacks=[lgb.early_stopping(20)]
                    )

    lgb.plot_importance(model, importance_type='gain', figsize=(8, 16))

elif CFG.model_type == 'CatBoost':
    train_dataset = catboost.Pool(data=train[feature_columns], label=train['y'], group_id=train['query_group'], cat_features=cat_features)
    valid_dataset = catboost.Pool(data=valid[feature_columns], label=valid['y'], group_id=valid['query_group'], cat_features=cat_features)

    params = {
        'loss_function': 'YetiRank',
        'use_best_model': True,
        'one_hot_max_size': 300,
        'iterations': 10000,
    }
    model = catboost.CatBoost(params)
    model.fit(train_dataset, eval_set=valid_dataset)

    # Validation metric per iteration.
    plt.plot(model.get_evals_result()['validation']['PFound'])

    # Horizontal bar chart of the feature importances, smallest at the bottom.
    feature_importance = model.get_feature_importance(train_dataset)
    sorted_idx = np.argsort(feature_importance)
    plt.figure(figsize=(8, 16))
    plt.yticks(range(len(feature_columns)), np.array(feature_columns)[sorted_idx])
    plt.barh(range(len(feature_columns)), feature_importance[sorted_idx])

else:
    # Fail here with a clear message instead of a NameError on `model` further down.
    raise ValueError(f"unsupported CFG.model_type: {CFG.model_type!r} (expected 'LightGBM' or 'CatBoost')")

# The training frames are large; release them before the validation step.
del train, valid, train_dataset, valid_dataset
gc.collect()

with open(f'{output_dir}/model_for_validation.pkl', 'wb') as f:
    pickle.dump(model, f)

## Validation

In [None]:
# Score every week-0 candidate with the validation model.
pred = dataset_oof[['user', 'item']].reset_index(drop=True)
pred['pred'] = model.predict(dataset_oof[feature_columns])

# A pair can be proposed by several strategies: keep its best score, then take the
# 12 highest-scoring items of every user.
pred = pred.groupby(['user', 'item'])['pred'].max().reset_index()
pred = pred.sort_values(by=['user', 'pred'], ascending=False).reset_index(drop=True)
pred = pred.groupby('user')['item'].apply(lambda x: list(x)[:12]).reset_index()

# Ground truth: the items each user bought in week 0.
gt = (
    transactions.query("week == 0")
    .groupby('user')['item']
    .apply(list)
    .reset_index()
    .rename(columns={'item': 'gt'})
)

# Users without any prediction end up with an empty list.
merged = gt.merge(pred, on='user', how='left')
merged['item'] = merged['item'].fillna('').apply(list)

merged.to_pickle(f'{output_dir}/merged_100.pkl')
dataset_oof.to_pickle(f'{output_dir}/valid_all_100.pkl')

In [None]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
def apk(actual, predicted, k=12):
    """
    Average precision at k for a single query.

    Parameters
    ----------
    actual : list
             The relevant elements (order is irrelevant)
    predicted : list
                The ranked predictions (best first)
    k : int, optional
        Only the first k predictions are scored

    Returns
    -------
    score : double
            Sum of the precisions at every rank holding a first-time hit,
            divided by min(len(actual), k); 0.0 when `actual` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    if not actual:
        return 0.0

    hits = 0.0
    precision_sum = 0.0
    for position, candidate in enumerate(top_k, start=1):
        if candidate not in actual:
            continue
        # A repeated prediction is only credited the first time it appears.
        if candidate in top_k[:position - 1]:
            continue
        hits += 1.0
        precision_sum += hits / position

    return precision_sum / min(len(actual), k)


def mapk(actual, predicted, k=12):
    """
    Mean average precision at k over many queries.

    Parameters
    ----------
    actual : list
             One list of relevant elements per query
             (order inside each list is irrelevant)
    predicted : list
                One ranked prediction list per query, aligned with `actual`
    k : int, optional
        Only the first k predictions of each query are scored

    Returns
    -------
    score : double
            The mean of the per-query average precision at k

    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)


# Validation score of the top-12 predictions against the week-0 purchases.
validation_map12 = mapk(merged['gt'], merged['item'])
print('mAP@12:', validation_map12)

## submission

In [None]:
# Rebuild the features with the embedding model of week CFG.train_weeks. Only the first
# CFG.train_weeks candidate sets are trained on, so features are attached for exactly those.
datasets = [attach_features(transactions, users, items, candidates[idx], 1+idx, CFG.train_weeks) for idx in range(CFG.train_weeks)]

for idx in range(len(datasets)):
    # query_group = "<week>_<user>"; sorting makes the rows of one query contiguous.
    datasets[idx]['query_group'] = datasets[idx]['week'].astype(str) + '_' + datasets[idx]['user'].astype(str)
    datasets[idx] = datasets[idx].sort_values(by='query_group').reset_index(drop=True)

train = concat_train(datasets, 0, CFG.train_weeks)

# Refit on all labelled weeks, including week 0, for the number of rounds found during validation.
if CFG.model_type == 'LightGBM':
    group_train = get_query_group(train)
    train_dataset = lgb.Dataset(train[feature_columns], train['y'], group=group_train)

    best_iteration = model.best_iteration
    model = lgb.train(params, train_dataset, num_boost_round=best_iteration)
    lgb.plot_importance(model, importance_type='gain', figsize=(8, 16))


elif CFG.model_type == 'CatBoost':
    train_dataset = catboost.Pool(data=train[feature_columns], label=train['y'], group_id=train['query_group'], cat_features=cat_features)

    # get_best_iteration() is a zero-based iteration index, so the matching number of
    # trees is that index plus one.
    params['iterations'] = model.get_best_iteration() + 1
    params['use_best_model'] = False
    model = catboost.CatBoost(params)
    model.fit(train_dataset)

    # Horizontal bar chart of the feature importances, smallest at the bottom.
    feature_importance = model.get_feature_importance(train_dataset)
    sorted_idx = np.argsort(feature_importance)
    plt.figure(figsize=(8, 16))
    plt.yticks(range(len(feature_columns)), np.array(feature_columns)[sorted_idx])
    plt.barh(range(len(feature_columns)), feature_importance[sorted_idx])

else:
    # Fail here rather than silently pickling the validation model as the submission model.
    raise ValueError(f"unsupported CFG.model_type: {CFG.model_type!r} (expected 'LightGBM' or 'CatBoost')")


del train, train_dataset
gc.collect()
with open(f'{output_dir}/model_for_submission.pkl', 'wb') as f:
    pickle.dump(model, f)


# Nothing below needs the training artefacts; free the memory before predicting.
del datasets, dataset_oof, candidates, candidates_valid_all
gc.collect()

In [None]:
# Predict for every known user, in chunks, so the candidate frames stay small.
all_users = users['user'].values
preds = []

n_split_prediction = 10
# Ceiling division: chunk size that covers all users in n_split_prediction chunks.
n_chunk = (len(all_users) + n_split_prediction - 1)// n_split_prediction
for i in range(0, len(all_users), n_chunk):
    print(f"chunk: {i}")
    target_users = all_users[i:i+n_chunk]

    candidates = create_candidates(transactions, target_users, 0)
    candidates = attach_features(transactions, users, items, candidates, 0, CFG.train_weeks)
    candidates['pred'] = model.predict(candidates[feature_columns])

    # Best score per (user, item) pair, then the 12 highest-scoring items per user.
    pred = candidates.groupby(['user', 'item'])['pred'].max().reset_index()
    pred = pred.sort_values(by=['user', 'pred'], ascending=False).reset_index(drop=True)
    pred = pred.groupby('user')['item'].apply(lambda x: list(x)[:12]).reset_index()
    preds.append(pred)

pred = pd.concat(preds).reset_index(drop=True)

# Every user must have exactly one prediction row, in the order of the users table.
assert len(pred) == len(all_users)
assert np.array_equal(pred['user'].values, all_users)

In [None]:
# Lookup tables from the integer indices used in this notebook to the values stored in
# their `val` column. NOTE(review): presumably the original customer / article ids - confirm.
mp_user = pd.read_pickle(f"{data_dir}/mp_customer_id.pkl")
mp_item = pd.read_pickle(f"{data_dir}/mp_article_id.pkl")
a_user = mp_user['val'].values
a_item = mp_item['val'].values

# Translate the indices back and join each user's items into one space-separated string.
pred['customer_id'] = pred['user'].map(lambda x: a_user[x])
pred['prediction'] = pred['item'].apply(lambda x: [a_item[y] for y in x])
pred['prediction'] = pred['prediction'].apply(lambda x: ' '.join(str(v) for v in x))

submission = pred[['customer_id', 'prediction']]
submission.to_csv('submission.csv', index=False)