In [16]:
import warnings

warnings.filterwarnings("ignore")

In [17]:
!pip install recbole mamba-ssm ray kmeans-pytorch



In [18]:
import pandas as pd
import numpy as np
import torch
import time

# Data preparation

In [19]:
%cd /kaggle/working
%cp -r /kaggle/input/h-and-m-run-file ./
%cd h-and-m-run-file
%mkdir dataset
%mkdir dataset/hm

/kaggle/working
/kaggle/working/h-and-m-run-file
mkdir: cannot create directory 'dataset': File exists
mkdir: cannot create directory 'dataset/hm': File exists


In [20]:
# Read article_id as text so that its leading zeros are preserved.
dtype = {"article_id": str}
inter = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    dtype=dtype,
)

In [21]:
# Purchase date as Unix seconds. The explicit "int64" keeps the nanosecond cast
# independent of the platform's default integer width.
inter["timestamp"] = pd.to_datetime(inter["t_dat"]).astype("int64") / 10 ** 9
# Keep only the rows whose timestamp lies above the (1 - 1/64) quantile,
# i.e. the most recent part of the transaction log.
data = inter.loc[
    inter["timestamp"] > inter["timestamp"].quantile(1 - 1 / 64),
    ["customer_id", "article_id", "timestamp"],
]

In [22]:
# Sorted unique customer ids (one row per customer, fresh 0..n-1 index).
# Computed directly instead of aggregating per-customer item lists that
# would be thrown away immediately.
user_seqs = (
    data["customer_id"]
    .dropna()
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the customers as test users; the fixed seed keeps the split reproducible.
train_ids, test_ids = train_test_split(
    user_seqs,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Route every transaction to the split that its customer was assigned to.
train_df = data.loc[data["customer_id"].isin(train_ids)]
test_df = data.loc[data["customer_id"].isin(test_ids)]

In [25]:
# Give the columns "<name>:<type>" headers before the frame is written out.
train_df = train_df.rename(
    columns={
        "customer_id": "user_id:token",
        "article_id": "item_id:token",
        "timestamp": "timestamp:float",
    }
)

In [26]:
# Tab-separated interaction file without the DataFrame index.
# Presumably the file RecBole loads for the `hm` dataset; confirm against config_hm.yaml.
train_df.to_csv(
    "dataset/hm/hm.inter",
    sep="\t",
    index=False,
)

# Model training

In [27]:
import sys
import os
import logging
import argparse
from logging import getLogger
from recbole.utils import init_logger, init_seed
from recbole.trainer import Trainer
from mamba4rec import Mamba4Rec
# from mamba4rec_attr import Mamba4Rec
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.transform import construct_transform
from recbole.utils import (
    init_logger,
    get_model,
    get_trainer,
    init_seed,
    set_color,
    get_flops,
    get_environment,
)

In [28]:
# End-to-end training run: build the config, dataset and model, optionally
# resume from a saved checkpoint, fit, evaluate on the test split and print
# the results together with the runtime environment.
config = Config(model=Mamba4Rec, config_file_list=["config_hm.yaml"])
init_seed(config['seed'], config['reproducibility'])

# logger initialization; the command line and the full config are logged
init_logger(config)
logger = getLogger()
logger.info(sys.argv)
logger.info(config)

# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting into train / validation / test data
train_data, valid_data, test_data = data_preparation(config, dataset)

# model loading and initialization
# (re-seed with the local rank added to the base seed before building the model)
init_seed(config["seed"] + config["local_rank"], config["reproducibility"])
model = Mamba4Rec(config, train_data.dataset).to(config['device'])
logger.info(model)

# compute the model's FLOPs and log them
transform = construct_transform(config)
flops = get_flops(model, dataset, config["device"], logger, transform)
logger.info(set_color("FLOPs", "blue") + f": {flops}")

# trainer loading and initialization
trainer = Trainer(config, model)


# resume from this specific checkpoint file when it is present on disk
if os.path.exists("saved/Mamba4Rec-Nov-12-2024_06-54-21.pth"):
    trainer.resume_checkpoint("saved/Mamba4Rec-Nov-12-2024_06-54-21.pth")
# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, show_progress=False,
    saved=True, verbose=True
)

# trainer.eval_collector.data_collect(train_data)
# model evaluation on the test split
test_result = trainer.evaluate(
    test_data, show_progress=False
)

# print the environment table followed by the validation and test results
environment_tb = get_environment(config)
print(
    "The running environment of this training is as follows:\n"
    + environment_tb.draw()
)

print(set_color("best valid ", "yellow") + f": {best_valid_result}")
print(set_color("test result", "yellow") + f": {test_result}")

The running environment of this training is as follows:
+-------------+----------------+
| Environment |     Usage      |
| CPU         |     5.50 %     |
+-------------+----------------+
| GPU         | 2.06 G/14.74 G |
+-------------+----------------+
| Memory      | 5.24 G/31.36 G |
+-------------+----------------+
best valid : OrderedDict([('map@10', 0.1478), ('map@12', 0.1487), ('ndcg@10', 0.1665), ('ndcg@12', 0.1695), ('mrr@10', 0.1478), ('mrr@12', 0.1487)])
test result: OrderedDict([('map@10', 0.185), ('map@12', 0.1859), ('ndcg@10', 0.2037), ('ndcg@12', 0.2065), ('mrr@10', 0.185), ('mrr@12', 0.1859)])


# Model analysis

## Item similarity

In [29]:
# Article metadata, restricted to the articles that occur in the sampled transactions.
item = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv",
    dtype=dtype,
)
item = item.loc[item["article_id"].isin(data["article_id"])].reset_index()

In [30]:
# Preview the first five article rows.
item.head(n=5)

Unnamed: 0,index,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,6,111565001,111565,20 den 1p Stockings,304,Underwear Tights,Socks & Tights,1010016,Solid,9,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Semi shiny nylon stockings with a wide, reinfo..."
2,8,111586001,111586,Shape Up 30 den 1p Tights,273,Leggings/Tights,Garment Lower body,1010016,Solid,9,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Tights with built-in support to lift the botto...
3,9,111593001,111593,Support 40 den 1p Tights,304,Underwear Tights,Socks & Tights,1010016,Solid,9,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Semi shiny tights that shape the tummy, thighs..."
4,10,111609001,111609,200 den 1p Tights,304,Underwear Tights,Socks & Tights,1010016,Solid,9,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Opaque matt tights. 200 denier.


In [31]:
# Articles in the sampled transactions vs. items known to the RecBole dataset
# (item_num minus one, as in the original count).
print(
    f"Number of total items: {item['article_id'].nunique()}",
    f"Number of valid items: {dataset.item_num-1}",
    sep="\n",
)

Number of total items: 22640
Number of valid items: 6837


In [32]:
# Keep only the text columns, then join every non-missing text field of a row
# (except the id and the index code) into one description string.
item_str = item.select_dtypes("object")
item_str = item_str.assign(
    description=item_str
    .drop(columns=["article_id", "index_code"])
    .apply(lambda row: ' '.join(row.dropna()), axis=1)
)
item = item_str.loc[:, ["article_id", "description"]]

In [33]:
item = item.sort_values("article_id")
# article_id <-> row position in the sorted frame. A single pass with enumerate
# replaces one .iloc lookup per row and covers every row of the frame.
item_mapper = {article: pos for pos, article in enumerate(item["article_id"])}
item_inv_mapper = dict(enumerate(item["article_id"]))

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF over the item descriptions, reduced to a 16-dimensional embedding
# with a truncated SVD (fixed seed).
vect = TfidfVectorizer()
tfidf = vect.fit_transform(item["description"])

svd = TruncatedSVD(n_components=16, n_iter=3, random_state=42)
X = svd.fit_transform(csr_matrix(tfidf))

In [35]:
# Tokens of the dataset's items with internal ids 1..item_num-1 (id 0 is skipped).
valid_articles = dataset.id2token(dataset.iid_field, range(1, dataset.item_num))
# Rows of X that belong to those items, in the same order.
valid_ids = [item_mapper[token] for token in valid_articles]
X_valid = X[valid_ids]
# token <-> position inside valid_articles / X_valid.
valid_mapper = {token: pos for pos, token in enumerate(valid_articles)}
valid_inv_mapper = dict(enumerate(valid_articles))

In [36]:
# Similarity of every article to every article, and of every article to the valid ones.
sim_cosine_all = cosine_similarity(X=X, Y=X)
sim_cosine_valid = cosine_similarity(X=X, Y=X_valid)

## Map non-data items to valid items

In [37]:
from functools import lru_cache
from sklearn.metrics.pairwise import cosine_similarity

@lru_cache(maxsize=2048)
def convert2valid(item):
    """Return the valid article that is most similar to ``item``.

    Looks up the row of ``sim_cosine_valid`` that belongs to ``item`` and
    returns the token of the valid article with the highest similarity.
    Defined at module level so the cache survives across calls to
    ``to_valid_list`` instead of being rebuilt on every call.

    Raises:
        KeyError: if ``item`` is not a key of ``item_mapper``.
    """
    item_id = item_mapper[item]
    sim_scores = sim_cosine_valid[item_id]
    # Only the best match is needed, so argmax replaces a full sort.
    return valid_inv_mapper[int(np.argmax(sim_scores))]


def to_valid_list(item_list):
    """Map a purchase sequence onto valid articles, dropping the last item.

    Every element except the last one is kept when it is a valid article and
    otherwise replaced by its most similar valid article (``convert2valid``).
    The last element is left out of the result.

    Args:
        item_list: sequence of article tokens.

    Returns:
        The converted list, or ``['[PAD]']`` when nothing remains.
    """
    valid_list = [
        # valid_mapper is keyed by the valid tokens, giving an O(1) membership test.
        article if article in valid_mapper else convert2valid(article)
        for article in item_list[:-1]
    ]
    return ['[PAD]'] if len(valid_list) == 0 else valid_list

In [38]:
# One row per test customer holding the purchases ordered by timestamp.
test_sequence = (
    test_df
    .sort_values(['customer_id', 'timestamp'])
    .groupby('customer_id')['article_id']
    .agg(list)
    .reset_index(name="sequence")
)
print(len(test_sequence))
test_sequence.head()

25667


Unnamed: 0,customer_id,sequence
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,[0794321007]
1,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[0624486001]
2,001674dc3a5fd1be9256feaecbf7a8a9ebd17232cb2188...,"[0909924002, 0806131012, 0893059005, 094450600..."
3,0016df4fbb49288b9ed4a8a0fa3d2f7038fc4ab7f02e3c...,[0863937003]
4,00194061f3caa80bf10d615bf406bc5959a3bd799e4f21...,[0572998005]


In [39]:
# Model input per customer: the converted history and its length.
test_sequence["item_id_list"] = test_sequence["sequence"].map(to_valid_list)
test_sequence["item_length"] = test_sequence["item_id_list"].map(len)

In [40]:
# Preview the first five rows with the new columns.
test_sequence.head(n=5)

Unnamed: 0,customer_id,sequence,item_id_list,item_length
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,[0794321007],[[PAD]],1
1,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[0624486001],[[PAD]],1
2,001674dc3a5fd1be9256feaecbf7a8a9ebd17232cb2188...,"[0909924002, 0806131012, 0893059005, 094450600...","[0909924002, 0806131012, 0893059005, 0944506001]",4
3,0016df4fbb49288b9ed4a8a0fa3d2f7038fc4ab7f02e3c...,[0863937003],[[PAD]],1
4,00194061f3caa80bf10d615bf406bc5959a3bd799e4f21...,[0572998005],[[PAD]],1


## Pre- and post-processing steps for items not in the training data

In [42]:
from sklearn.metrics import average_precision_score, ndcg_score

### NDCG on NON-TRAINING data

In [44]:
# Score every test sequence against the valid items only and compare with two
# kinds of ground truth: the single (converted) held-out item, and all valid
# items whose cosine similarity to the held-out item exceeds 0.9.
shape = (len(test_sequence), dataset.item_num-1)

# Per-row buffers kept from the original cell; nothing below writes to them.
NDCG = np.zeros(shape[0])
NDCG_sim = np.zeros(shape[0])

y_scores = np.zeros(shape)
y_true_cos_sim = np.zeros(shape)
y_true_cos = np.zeros(shape)

# Pure inference: switch to eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    # enumerate gives the positional row number, independent of the frame's index labels.
    for i, (_label, row) in enumerate(test_sequence.iterrows()):
        item_id_list = np.array([dataset.token2id(dataset.iid_field, row["item_id_list"])])
        # Build the inputs on the device the model was moved to.
        interaction = {
            "item_id_list": torch.LongTensor(item_id_list).to(config["device"]),
            "item_length": torch.LongTensor(np.array([row["item_length"]])).to(config["device"]),
        }
        scores = model.full_sort_predict(interaction)[0]
        # Drop the score of internal id 0 so columns line up with valid_articles.
        y_scores[i] = scores[1:].cpu().numpy()

        # The last purchase of the sequence is the prediction target.
        true_item = row["sequence"][-1]
        true_item_id = item_mapper[true_item]
        y_true_cos_sim[i] = (sim_cosine_valid[true_item_id, :] > 0.9).astype(int)

        # Map a target that is not a valid item onto its most similar valid item.
        true_item = true_item if true_item in valid_mapper else convert2valid(true_item)
        true_item_id = dataset.token2id(dataset.iid_field, true_item) - 1
        y_true_cos[i, true_item_id] = 1

NDCG_pre = ndcg_score(y_true_cos, y_scores)
NDCG_sim_pre = ndcg_score(y_true_cos_sim, y_scores)

In [45]:
# Report both scores rounded to three decimals.
print(
    f"Hit NDCG with preprocessing step: {NDCG_pre:.3f}",
    f"Similarity NDCG with preprocessing step: {NDCG_sim_pre:.3f}",
    sep="\n",
)

Hit NDCG with preprocessing step: 0.227
Similarity NDCG with preprocessing step: 0.568


In [46]:
from scipy.special import softmax
from sklearn.preprocessing import MinMaxScaler

In [47]:
# Row positions (in X) of the articles outside and inside the valid item set.
non_valid_ids = [pos for token, pos in item_mapper.items() if token not in valid_articles]
valid_ids = [item_mapper[token] for token in valid_articles]
# Similarity of each non-valid article to the valid ones, scaled so each row sums to 1.
sim_scores = sim_cosine_valid[non_valid_ids]
sim_scores = sim_scores / sim_scores.sum(axis=1, keepdims=True)


def weighted_similarity(initial_scores):
    """Extend scores over the valid items to every article.

    Valid articles keep their score from ``initial_scores``; every other
    article receives the ``sim_scores``-weighted sum of the valid scores.

    Args:
        initial_scores: one score per valid item, in ``valid_ids`` order.

    Returns:
        1-D array with one score per row of ``X``.
    """
    full_scores = np.zeros(X.shape[0])
    full_scores[valid_ids] = initial_scores
    full_scores[non_valid_ids] = sim_scores.dot(initial_scores)
    return full_scores

In [48]:
# topk_list = []
post_shape = (len(test_sequence), X.shape[0])

y_scores = np.zeros(post_shape)
y_true_cos_sim = np.zeros(post_shape)
y_true_cos = np.zeros(post_shape)

for i, row in test_sequence.iterrows():
    item_id_list = np.array([dataset.token2id(dataset.iid_field, row["item_id_list"])])
    interaction = {
        "item_id_list": torch.LongTensor(item_id_list).to("cuda"),
        "item_length": torch.LongTensor(np.array([row["item_length"]])).to("cuda")
    }
    scores = model.full_sort_predict(interaction)[0]
    scores = scores.cpu().detach().numpy()
    
    y_scores[i] = weighted_similarity(scores[1:])

    true_item_id = item_mapper[row["sequence"][-1]]
    y_true_cos_sim[i] = (sim_cosine_all[true_item_id, :] > 0.9).astype(int) #.reshape(1,-1)

    y_true_cos[i, true_item_id] = 1
        
NDCG_post = ndcg_score(y_true_cos, y_scores)
NDCG_sim_post = ndcg_score(y_true_cos_sim, y_scores)
#print(len(topk_list))

In [49]:
# Report both scores rounded to three decimals.
print(
    f"Hit NDCG with pre- and post-processing step: {NDCG_post:.3f}",
    f"Similarity NDCG with pre- and post-processing step: {NDCG_sim_post:.3f}",
    sep="\n",
)

Hit NDCG with pre- and post-processing step: 0.215
Similarity NDCG with pre- and post-processing step: 0.603
