## Load Packages

In [None]:
import os, sys
import torch
import pandas as pd
import numpy as np

In [7]:
# uncomment the line below to install recbole
# %pip install recbole

In [8]:
from recbole.quick_start import load_data_and_model, run_recbole
from recbole.data.interaction import Interaction

## Prepare Atomic Files

In [3]:
# Name of the RecBole dataset; also used as the folder name and as the
# file stem of the atomic files written below.
DATASET = 'HM'

# Folder that will hold the <DATASET>.item / .user / .inter atomic files.
save_path = f'./recbole_data/{DATASET}'
os.makedirs(name=save_path, exist_ok=True)

### Items

In [4]:
# (raw article column, RecBole field type) pairs, in the column order of the
# atomic file. `article_id` is handled separately because it is renamed to
# RecBole's `item_id`.
item_field_types = [
    ('product_code', 'token'),
    ('product_type_no', 'float'),
    ('product_group_name', 'token_seq'),
    ('graphical_appearance_no', 'token'),
    ('colour_group_code', 'token'),
    ('perceived_colour_value_id', 'token'),
    ('perceived_colour_master_id', 'token'),
    ('department_no', 'token'),
    ('index_code', 'token'),
    ('index_group_no', 'token'),
    ('section_no', 'token'),
    ('garment_group_no', 'token'),
]

# Raw column name -> "<field>:<type>" header expected in a RecBole atomic file.
feature_map = {'article_id': 'item_id:token'}
for column_name, field_type in item_field_types:
    feature_map[column_name] = f'{column_name}:{field_type}'

# Read article ids as strings so they are not parsed as integers.
items_df = pd.read_csv(r"./input/articles.csv", dtype={'article_id': 'str'})

# Keep only the mapped columns, then switch to the RecBole headers.
selected_items = items_df[list(feature_map)]
tmp = selected_items.rename(columns=feature_map)

tmp.describe()

Unnamed: 0,product_code:token,product_type_no:float,graphical_appearance_no:token,colour_group_code:token,perceived_colour_value_id:token,perceived_colour_master_id:token,department_no:token,index_group_no:token,section_no:token,garment_group_no:token
count,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0
mean,698424.563378,234.861875,1009515.0,32.233822,3.206183,7.807972,4532.777833,3.171534,42.664219,1010.43829
std,128462.384432,75.049308,22413.59,28.086154,1.563839,5.376727,2712.692011,4.353234,23.260105,6.731023
min,108775.0,-1.0,-1.0,-1.0,-1.0,-1.0,1201.0,1.0,2.0,1001.0
25%,616992.5,252.0,1010008.0,9.0,2.0,4.0,1676.0,1.0,20.0,1005.0
50%,702213.0,259.0,1010016.0,14.0,4.0,5.0,4222.0,2.0,46.0,1009.0
75%,796703.0,272.0,1010016.0,52.0,4.0,11.0,7389.0,4.0,61.0,1017.0
max,959461.0,762.0,1010029.0,93.0,7.0,20.0,9989.0,26.0,97.0,1025.0


In [5]:
# `tmp` is reused by the item, user and interaction cells, so running cells
# out of order would silently write the wrong table. Check the headers first.
assert 'item_id:token' in tmp.columns and 'user_id:token' not in tmp.columns, \
    "`tmp` does not hold the item table; re-run the Items cell above first"

# Write the tab-separated item atomic file (<save_path>/<DATASET>.item).
tmp.to_csv(f"{save_path}/{DATASET}.item", index=False, sep='\t')

### Users

In [5]:
# (raw customer column, RecBole field type) pairs, in the column order of the
# atomic file. `customer_id` is handled separately because it is renamed to
# RecBole's `user_id`.
user_field_types = [
    ('club_member_status', 'token'),
    ('fashion_news_frequency', 'token'),
    ('age', 'float'),
    ('postal_code', 'token'),
]

# Raw column name -> "<field>:<type>" header expected in a RecBole atomic file.
feature_map = {'customer_id': 'user_id:token'}
for column_name, field_type in user_field_types:
    feature_map[column_name] = f'{column_name}:{field_type}'

users_df = pd.read_csv(r"./input/customers.csv", dtype={'customer_id': 'str'})

# Keep only the mapped columns, then switch to the RecBole headers.
selected_users = users_df[list(feature_map)]
tmp = selected_users.rename(columns=feature_map)

tmp.describe()

Unnamed: 0,age:float
count,1356119.0
mean,36.38696
std,14.31363
min,16.0
25%,24.0
50%,32.0
75%,49.0
max,99.0


In [7]:
# `tmp` is reused by the item, user and interaction cells, so running cells
# out of order would silently write the wrong table. Check the headers first.
assert 'user_id:token' in tmp.columns and 'item_id:token' not in tmp.columns, \
    "`tmp` does not hold the user table; re-run the Users cell above first"

# Write the tab-separated user atomic file (<save_path>/<DATASET>.user).
tmp.to_csv(f'{save_path}/{DATASET}.user', index=False, sep='\t')


### Interactions

In [6]:
# Raw transaction column -> "<field>:<type>" header expected in a RecBole atomic file.
feature_map = {
    'customer_id': 'user_id:token',
    'article_id': 'item_id:token',
    'price' : 'price:float',
    'timestamp': 'timestamp:float'
}

# Earliest purchase time (Unix seconds) to keep. 0 keeps everything;
# use 1577836800 to keep only transactions from 2020-01-01 onwards.
MIN_TIMESTAMP = 0

df = pd.read_csv(r"./input/transactions_train.csv",
                 dtype={'article_id': 'str'})
df['t_dat'] = pd.to_datetime(df['t_dat'], format="%Y-%m-%d")

# Convert to Unix seconds by dividing the offset from the epoch by one second.
# Unlike `astype('int64') // 10**9`, this does not assume the datetime column
# is stored with nanosecond resolution.
df['timestamp'] = (df['t_dat'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
df_filtered = df[df['timestamp'] >= MIN_TIMESTAMP]

# Keep only the mapped columns, then switch to the RecBole headers.
tmp = df_filtered[list(feature_map)].rename(columns=feature_map)

tmp.describe()

Unnamed: 0,price:float,timestamp:float
count,31788320.0,31788320.0
mean,0.02782927,1568568000.0
std,0.01918113,18258650.0
min,1.694915e-05,1537402000.0
25%,0.01581356,1553731000.0
50%,0.02540678,1566691000.0
75%,0.03388136,1585440000.0
max,0.5915254,1600733000.0


In [9]:
# `tmp` is reused by the item, user and interaction cells, so running cells
# out of order would silently write the wrong table. Check the headers first.
assert 'user_id:token' in tmp.columns and 'item_id:token' in tmp.columns, \
    "`tmp` does not hold the interaction table; re-run the Interactions cell above first"

# Write the tab-separated interaction atomic file (<save_path>/<DATASET>.inter).
tmp.to_csv(f'{save_path}/{DATASET}.inter', index=False, sep='\t')

## Train Model

In [10]:

# Item side-information fields. They are loaded from the .item file and also
# passed as `selected_features`, so the list is written once and reused.
ITEM_FEATURE_FIELDS = [
    'product_code', 'product_type_no', 'product_group_name', 'graphical_appearance_no',
    'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id',
    'department_no', 'index_code', 'index_group_no', 'section_no', 'garment_group_no',
]

# User fields loaded from the .user file.
USER_FIELDS = ['user_id', 'club_member_status', 'fashion_news_frequency', 'age', 'postal_code']

# Configuration overrides handed to RecBole via `config_dict`.
parameter_dict = {
    # --- environment ---
    'data_path': './recbole_data',
    'reproducibility': False,
    'save_dataset': False,
    'save_dataloaders': False,
    'show_progress': False,
    # --- dataset fields and filtering ---
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'user_inter_num_interval': "[5,inf)",
    'item_inter_num_interval': "[0,inf)",
    'load_col': {
        'inter': ['user_id', 'item_id', 'timestamp'],
        'item': ['item_id'] + ITEM_FEATURE_FIELDS,
        'user': list(USER_FIELDS),
    },
    'selected_features': list(ITEM_FEATURE_FIELDS),
    # --- training ---
    'neg_sampling': None,
    'stopping_step': 20,
    'eval_step': 1,
    # --- evaluation ---
    'eval_args': {
        # ratio split [train, valid, test]: everything goes to the training part
        'split': {'RS': [1, 0, 0]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full',
    },
    'topk': 12,
    'valid_metric': 'MAP@12',
    'metrics': ['MAP', 'Recall', 'MRR', 'Hit', 'Precision'],
}

run_recbole(model="BERT4Rec", dataset='HM', config_dict=parameter_dict)

  from .autonotebook import tqdm as notebook_tqdm


## Submission

The following code is primarily adapted from a [public Kaggle notebook](https://www.kaggle.com/code/astrung/lstm-model-with-item-infor-fix-missing-last-item/notebook).

### Load Model Checkpoint

In [None]:
# File stem of the checkpoint under ./saved; also reused below to name the
# prediction and submission files.
MODEL = "BERT4Rec-Apr-22-2022_01-43-12"
checkpoint_file = f'saved/{MODEL}.pth'

# Restore the trained model together with its config, the filtered dataset
# and the train/valid/test data loaders.
(config, model, dataset,
 train_data, valid_data, test_data) = load_data_and_model(model_file=checkpoint_file)



10 May 20:33    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = False
data_path = ./recbole_data/HM_medium
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 120
train_batch_size = 2048
learner = adam
learning_rate = 0.001
neg_sampling = None
eval_step = 1
stopping_step = 20
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.95, 0.5, 0]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}
repeatable = True
metrics = ['MAP', 'Recall', 'MRR', 'Hit', 'Precision']
topk = [12]
valid_metric = MAP@12
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user_id
ITEM_ID_FIELD = item_id
RATING_FIELD = rating
TIME_FIELD = 

### Generate model predictions

In [None]:
# Translate every internal user index back to its original customer id.
# Position 0 is RecBole's default "PAD" entry, so it is dropped.
all_internal_user_ids = list(range(dataset.user_num))
all_user_tokens = dataset.id2token(dataset.uid_field, all_internal_user_ids)
external_user_ids = all_user_tokens[1:]


def add_last_item(old_interaction, last_item_id, max_len=50):
    """Append `last_item_id` to the most recent item sequence of an interaction.

    The last row of `old_interaction['item_id_list']` is taken as the history
    and the last entry of `old_interaction['item_length']` as the number of
    slots already used in it.

    * If fewer than `max_len` slots are used, the item is written into the
      slot at index `item_length`.
    * Otherwise the sequence is shifted left by one (the oldest item is
      dropped) and the item is written into the final slot.

    The history is cloned first, so `old_interaction` is never modified.
    Previously the short-history branch wrote into a view of the caller's tensor.

    Args:
        old_interaction: mapping with an `'item_id_list'` tensor (rows of item
            ids) and an `'item_length'` tensor (one length per row).
        last_item_id: item id to append.
        max_len: sequence length at which the history counts as full.

    Returns:
        A tensor of shape `(1, sequence_length)` holding the updated sequence.
    """
    # Work on a copy so the caller's tensor is left untouched.
    seq_items = old_interaction['item_id_list'][-1].clone()
    used_slots = old_interaction['item_length'][-1].item()

    if used_slots < max_len:
        # There is still room: fill the slot after the current history.
        seq_items[used_slots] = last_item_id
    else:
        # History is full: drop the oldest item and put the new one last.
        seq_items = torch.roll(seq_items, -1)
        seq_items[-1] = last_item_id
    return seq_items.view(1, len(seq_items))

def predict_for_all_item(external_user_id, dataset, model, topk=12):
    """Score all items for one user and return the `topk` highest-scoring ones.

    The user's rows are selected from `dataset`; the target item of the last
    row is appended to that row's item history with `add_last_item`, and the
    resulting sequence is scored with `model.full_sort_predict`.

    Note: this function also reads the notebook-level `config` (for the
    device) and `test_data` (for the number of items), so the checkpoint must
    be loaded before it is called.

    Args:
        external_user_id: original (external) user token.
        dataset: dataset providing `token2id`, `inter_feat` and row indexing.
        model: trained sequential model exposing `max_seq_length` and
            `full_sort_predict`.
        topk: number of items to return per row (default 12, the value that
            was previously hard-coded).

    Returns:
        The result of `torch.topk` (values, indices) over the score matrix,
        with column 0 (the padding item) excluded by setting it to -inf.
    """
    model.eval()
    with torch.no_grad():
        # Select every interaction row that belongs to this user.
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        index = np.isin(dataset.inter_feat[dataset.uid_field].numpy(), uid_series)
        input_interaction = dataset[index]

        max_len = model.max_seq_length
        last_length = input_interaction['item_length'][-1].item()
        test = {
            'item_id_list': add_last_item(input_interaction,
                                          input_interaction['item_id'][-1].item(), max_len),
            # One more item was appended, but the length is capped at max_len.
            'item_length': torch.tensor([min(last_length + 1, max_len)])
        }
        new_inter = Interaction(test)
        new_inter = new_inter.to(config['device'])
        new_scores = model.full_sort_predict(new_inter)
        new_scores = new_scores.view(-1, test_data.dataset.item_num)
        new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    return torch.topk(new_scores, topk)


In [None]:
# One list of external item ids per user, in the order of `external_user_ids`.
topk_items = []
for external_user_id in external_user_ids:
    # torch.topk returns (values, indices); only the indices are needed.
    topk_iid_list = predict_for_all_item(external_user_id, dataset, model)[1]
    # Take the last row and map the internal item ids back to article ids.
    topk_items.append(
        dataset.id2token(dataset.iid_field, topk_iid_list[-1].cpu()).tolist()
    )

# Submission format: the recommended article ids joined by single spaces.
external_item_str = list(map(' '.join, topk_items))
result = pd.DataFrame(external_user_ids, columns=['customer_id'])
result['prediction'] = external_item_str

# store incomplete predictions
result.to_csv(f'./submit/result_{MODEL}.csv', index=False)

### Blend predictions with default submission 

In [None]:
# Default submission: one fallback prediction per customer.
submit_df = pd.read_csv('submit/default_submission.csv')

# After the merge, `prediction_x` is the default prediction and `prediction_y`
# the model prediction (missing for customers the model did not score).
submit_df = pd.merge(submit_df, result, on='customer_id', how='outer')

# Prefer the model prediction and fall back to the default one. Filling from
# the other column is vectorised and avoids putting a -1 sentinel into string
# columns and then scanning every row with a Python lambda.
submit_df['prediction'] = submit_df['prediction_y'].fillna(submit_df['prediction_x'])

submit_df = submit_df.drop(columns=['prediction_y', 'prediction_x'])
submit_df.to_csv(f'./submit/{MODEL}.csv', index=False)