In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
!pip install recbole mamba-ssm ray kmeans-pytorch

Collecting recbole
  Downloading recbole-1.2.0-py3-none-any.whl.metadata (1.4 kB)
Collecting mamba-ssm
  Downloading mamba_ssm-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Collecting kmeans-pytorch
  Downloading kmeans_pytorch-0.3-py3-none-any.whl.metadata (1.6 kB)
Collecting colorlog==4.7.2 (from recbole)
  Downloading colorlog-4.7.2-py2.py3-none-any.whl.metadata (9.9 kB)
Collecting colorama==0.4.4 (from recbole)
  Downloading colorama-0.4.4-py2.py3-none-any.whl.metadata (14 kB)
Collecting thop>=0.1.1.post2207130030 (from recbole)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting einops (from mamba-ssm)
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting triton (from mamba-ssm)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

In [3]:
import pandas as pd
import numpy as np
import torch
import time

# Data preparation

In [4]:
%cd /kaggle/working
%cp -r /kaggle/input/h-and-m-run-file ./
%cd h-and-m-run-file
%mkdir dataset
%mkdir dataset/hm

/kaggle/working
/kaggle/working/h-and-m-run-file


In [5]:
dtype={"article_id": str}
inter = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", dtype=dtype)

In [6]:
inter["timestamp"] = pd.to_datetime(inter["t_dat"]).astype(int) / 10 ** 9
data = inter[inter["timestamp"] > inter["timestamp"].quantile(1-1/64)][["customer_id", "article_id", "timestamp"]]

In [7]:
user_seqs = data.groupby("customer_id")["article_id"].agg(list).reset_index()["customer_id"]

In [8]:
from sklearn.model_selection import train_test_split

train_ids, test_ids = train_test_split(user_seqs, test_size=0.2, random_state=42)

In [9]:
train_df = data[data["customer_id"].isin(train_ids)]
test_df = data[data["customer_id"].isin(test_ids)]

In [10]:
train_df.columns = ["user_id:token", "item_id:token", "timestamp:float"]

In [11]:
train_df.to_csv("dataset/hm/hm.inter", sep="\t", index=False)

# Model training

In [12]:
import sys
import os
import logging
import argparse
from logging import getLogger
from recbole.utils import init_logger, init_seed
from recbole.trainer import Trainer
from mamba4rec import Mamba4Rec
# from mamba4rec_attr import Mamba4Rec
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.transform import construct_transform
from recbole.utils import (
    init_logger,
    get_model,
    get_trainer,
    init_seed,
    set_color,
    get_flops,
    get_environment,
)

In [13]:
config = Config(model=Mamba4Rec, config_file_list=["config_hm.yaml"])
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
logger.info(sys.argv)
logger.info(config)

# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

# model loading and initialization
init_seed(config["seed"] + config["local_rank"], config["reproducibility"])
model = Mamba4Rec(config, train_data.dataset).to(config['device'])
logger.info(model)

transform = construct_transform(config)
flops = get_flops(model, dataset, config["device"], logger, transform)
logger.info(set_color("FLOPs", "blue") + f": {flops}")

# trainer loading and initialization
trainer = Trainer(config, model)


if os.path.exists("saved/checkpoint.pth"):
    trainer.resume_checkpoint("saved/checkpoint.pth")
# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, show_progress=False,
    saved=True, verbose=True
)

# trainer.eval_collector.data_collect(train_data)
# model evaluation
test_result = trainer.evaluate(
    test_data, show_progress=False
)

environment_tb = get_environment(config)
print(
    "The running environment of this training is as follows:\n"
    + environment_tb.draw()
)

print(set_color("best valid ", "yellow") + f": {best_valid_result}")
print(set_color("test result", "yellow") + f": {test_result}")

The running environment of this training is as follows:
+-------------+----------------+
| Environment |     Usage      |
| CPU         |     2.50 %     |
+-------------+----------------+
| GPU         | 2.06 G/14.74 G |
+-------------+----------------+
| Memory      | 4.76 G/31.36 G |
+-------------+----------------+
best valid : OrderedDict([('map@10', 0.1472), ('map@12', 0.1481), ('ndcg@10', 0.1664), ('ndcg@12', 0.1694), ('mrr@10', 0.1472), ('mrr@12', 0.1481)])
test result: OrderedDict([('map@10', 0.1823), ('map@12', 0.1833), ('ndcg@10', 0.2017), ('ndcg@12', 0.2049), ('mrr@10', 0.1823), ('mrr@12', 0.1833)])


# Model analysis

## Item similarity

In [14]:
item = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv", dtype=dtype)
item = item[item["article_id"].isin(data["article_id"])].reset_index()

In [15]:
item.head()

Unnamed: 0,index,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,6,111565001,111565,20 den 1p Stockings,304,Underwear Tights,Socks & Tights,1010016,Solid,9,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Semi shiny nylon stockings with a wide, reinfo..."
2,8,111586001,111586,Shape Up 30 den 1p Tights,273,Leggings/Tights,Garment Lower body,1010016,Solid,9,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Tights with built-in support to lift the botto...
3,9,111593001,111593,Support 40 den 1p Tights,304,Underwear Tights,Socks & Tights,1010016,Solid,9,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Semi shiny tights that shape the tummy, thighs..."
4,10,111609001,111609,200 den 1p Tights,304,Underwear Tights,Socks & Tights,1010016,Solid,9,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Opaque matt tights. 200 denier.


In [16]:
print(f"Number of total items: {item['article_id'].nunique()}")
print(f"Number of valid items: {dataset.item_num-1}")

Number of total items: 22640
Number of valid items: 6837


In [17]:
item_str = item.select_dtypes("object")
item_str["description"] = item_str.drop(columns=["article_id", "index_code"]).apply(lambda x: ' '.join(x.dropna()), axis=1)
item = item_str[["article_id", "description"]]

In [18]:
item = item.sort_values("article_id")
item_mapper = {item["article_id"].iloc[i]: i for i in range(item["article_id"].nunique())}
item_inv_mapper = {i: item["article_id"].iloc[i] for i in range(item["article_id"].nunique())}

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

vect = TfidfVectorizer()
tfidf = vect.fit_transform(item["description"])

X = csr_matrix(tfidf)

svd = TruncatedSVD(n_components=16, n_iter=3, random_state=42)
X = svd.fit_transform(X)

In [20]:
valid_articles = dataset.id2token(dataset.iid_field, range(1, dataset.item_num))
valid_ids = [item_mapper[item] for item in valid_articles]
X_valid = X[valid_ids]
valid_mapper = {valid_articles[i]: i for i in range(len(valid_articles))}
valid_inv_mapper = {i: valid_articles[i] for i in range(len(valid_articles))}

In [21]:
sim_cosine_all = cosine_similarity(X, X)
sim_cosine_valid = cosine_similarity(X, X_valid)

## Map non-data items to valid items

In [22]:
from functools import lru_cache
from sklearn.metrics.pairwise import cosine_similarity

def to_valid_list(item_list):

    global convert2valid
    @lru_cache(maxsize=2048)  
    def convert2valid(item):
        item_id = item_mapper[item]
        sim_scores = sim_cosine_valid[item_id] # cosine_similarity(X[item_id].reshape(1,-1), X_valid)[0]
        sort_indices = np.argsort(-sim_scores)
        return valid_inv_mapper[sort_indices[0]]

    valid_list = []
    for i in range(len(item_list)-1):
        if item_list[i] not in valid_articles:
            valid_list.append(convert2valid(item_list[i]))
        else:
            valid_list.append(item_list[i])
            
    return ['[PAD]'] if len(valid_list) == 0 else valid_list

In [23]:
test_sequence = test_df.sort_values(['customer_id', 'timestamp']) \
                        .groupby('customer_id')['article_id'] \
                        .agg(list) \
                        .reset_index()
test_sequence.columns = ["customer_id", "sequence"]
print(len(test_sequence))
test_sequence.head()

25667


Unnamed: 0,customer_id,sequence
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,[0794321007]
1,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[0624486001]
2,001674dc3a5fd1be9256feaecbf7a8a9ebd17232cb2188...,"[0909924002, 0806131012, 0893059005, 094450600..."
3,0016df4fbb49288b9ed4a8a0fa3d2f7038fc4ab7f02e3c...,[0863937003]
4,00194061f3caa80bf10d615bf406bc5959a3bd799e4f21...,[0572998005]


In [24]:
test_sequence["item_id_list"] = test_sequence["sequence"].apply(to_valid_list)
test_sequence["item_length"] = test_sequence["item_id_list"].apply(len)

In [25]:
test_sequence.head()

Unnamed: 0,customer_id,sequence,item_id_list,item_length
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,[0794321007],[[PAD]],1
1,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[0624486001],[[PAD]],1
2,001674dc3a5fd1be9256feaecbf7a8a9ebd17232cb2188...,"[0909924002, 0806131012, 0893059005, 094450600...","[0909924002, 0806131012, 0893059005, 0944506001]",4
3,0016df4fbb49288b9ed4a8a0fa3d2f7038fc4ab7f02e3c...,[0863937003],[[PAD]],1
4,00194061f3caa80bf10d615bf406bc5959a3bd799e4f21...,[0572998005],[[PAD]],1


## Pre- and Post-processing step for non-data item

In [26]:
sim_based = True
K = 10

In [27]:
from sklearn.metrics import average_precision_score, ndcg_score

### NDCG on TRAINING test_data

In [28]:
total_len = 0
NDCG_sum = 0
NDCG_sim_sum = 0 
for i, epoch in enumerate(test_data):
    interaction = epoch[0]    
    scores = model.full_sort_predict(interaction.to("cuda"))
    y_scores = scores[:, 1:].cpu().detach().numpy()
    true_items = dataset.id2token(dataset.iid_field, epoch[3])
    
    
    y_true_cos_sim = sim_cosine_valid[[item_mapper[i] for i in true_items]]
    y_true_cos = np.eye(dataset.item_num-1)[(epoch[3] - 1)]

    NDCG_sim_sum += ndcg_score(y_true_cos_sim, y_scores) * len(epoch[2])
    NDCG_sum += ndcg_score(y_true_cos, y_scores) * len(epoch[2])
    total_len += len(epoch[2])

print(f"Hit NDCG on test data: {NDCG_sum / total_len:.3f}")
print(f"Similarity NDCG on test data: {NDCG_sim_sum / total_len:.3f}")

Hit NDCG on test data: 0.298
Similarity NDCG on test data: 0.898


### NDCG on NON-TRAINING data

In [29]:
# topk_list = []
shape = (len(test_sequence), dataset.item_num-1)

NDCG_pre = np.zeros(shape[0])
NDCG_sim_pre = np.zeros(shape[0])

for i, row in test_sequence.iterrows():
    item_id_list = np.array([dataset.token2id(dataset.iid_field, row["item_id_list"])])
    interaction = {
        "item_id_list": torch.LongTensor(item_id_list).to("cuda"),
        "item_length": torch.LongTensor(np.array([row["item_length"]])).to("cuda")
    }
    scores = model.full_sort_predict(interaction)[0]
    y_scores = scores[1:].cpu().detach().numpy().reshape(1,-1)

   

    true_item = row["sequence"][-1]
    true_item_id = item_mapper[true_item]
    y_true_cos_sim = sim_cosine_valid[true_item_id, : ].reshape(1,-1)

    true_item = convert2valid(true_item) if true_item not in valid_articles else true_item
    true_item_id = dataset.token2id(dataset.iid_field, true_item) - 1
    y_true_cos = np.zeros(shape=(1, shape[1]))
    y_true_cos[:, true_item_id] = 1
        
    NDCG_pre[i] = ndcg_score(y_true_cos, y_scores)
    NDCG_sim_pre[i] = ndcg_score(y_true_cos_sim, y_scores)
#print(len(topk_list))

In [30]:
print(f"Hit NDCG with preprocessing step: {NDCG_pre.mean():.3f}")
print(f"Similarity NDCG with preprocessing step: {NDCG_sim_pre.mean():.3f}")

Hit NDCG with preprocessing step: 0.227
Similarity NDCG with preprocessing step: 0.892


In [31]:
non_valid_ids = [item_mapper[item] for item in item_mapper.keys() if item not in valid_articles]
valid_ids = [item_mapper[item] for item in valid_articles]
sim_weights = sim_cosine_valid[non_valid_ids] / np.sum(sim_cosine_valid[non_valid_ids], axis=1).reshape(-1,1)
# print(sim_weights.shape)

def weighted_similarity(initial_scores):
    final_scores = np.zeros((X.shape[0]))
    final_scores[valid_ids] = initial_scores
    
    # sim_scores = sim_cosine_valid[non_valid_ids][:,match_item_ids]
    # weights = sim_scores / np.sum(sim_scores, axis=1).reshape(-1,1)
    final_scores[non_valid_ids] = np.dot(sim_weights, initial_scores)
    return final_scores.reshape(1,-1)

In [32]:
# topk_list = []

NDCG_sim_post = np.zeros(shape[0])
NDCG_post = np.zeros(shape[0])
N = shape[1]
k = 10
weights = np.array([1/(np.log2(a+2)) for a in range(k)])

for i, row in test_sequence.iterrows():
    item_id_list = np.array([dataset.token2id(dataset.iid_field, row["item_id_list"])])
    interaction = {
        "item_id_list": torch.LongTensor(item_id_list).to("cuda"),
        "item_length": torch.LongTensor(np.array([row["item_length"]])).to("cuda")
    }
    scores = model.full_sort_predict(interaction)[0]
    # scores[0] = -float("inf")
    scores = scores.cpu().detach().numpy()
    # top_indices = np.argsort(-scores)[:k]
    # top_valid_items = dataset.id2token(dataset.iid_field, top_indices)
    y_scores = weighted_similarity(scores[1:])

    true_item_id = item_mapper[row["sequence"][-1]]
    y_true_cos_sim = sim_cosine_all[true_item_id, :].reshape(1,-1)

    y_true_cos = np.zeros(shape=(1, X.shape[0]))
    y_true_cos[:, true_item_id] = 1
        
    NDCG_post[i] = ndcg_score(y_true_cos, y_scores)
    NDCG_sim_post[i] = ndcg_score(y_true_cos_sim, y_scores)
#print(len(topk_list))

In [33]:
print(f"Hit NDCG with pre- and post-processing step: {NDCG_post.mean():.3f}")
print(f"Similarity NDCG with pre- and post-processing step: {NDCG_sim_post.mean():.3f}")

Hit NDCG with pre- and post-processing step: 0.215
Similarity NDCG with pre- and post-processing step: 0.909
