In [1]:
import os
import re
import math
import joblib
import nltk
import numpy as np
import scipy.sparse as sparse
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from pymystem3 import Mystem
from nltk.corpus import stopwords

In [2]:
df_items = pd.read_csv('food-dataset-ru.csv')
labels = df_items['label'].values

In [3]:
# Эмбеддинги полученный с помощью BERT
bert_embed_items = np.load('bert_items_embed.npy')
bert_embed_items.shape

(37638, 312)

In [4]:
# Эмбеддинги полученный с помощью TF-IDF
tfidf_embed_items = sparse.load_npz('tfidf_items_embed.npz')
tfidf_embed_items.shape

(37638, 17523)

In [5]:
index_set = np.arange(df_items.shape[0])

np.random.seed(42)
np.random.shuffle(index_set)

train_index = index_set[:-int(len(index_set) * 0.2)]
test_index = index_set[-int(len(index_set) * 0.2):]

## Metrics

In [6]:
def eval_mapk(labels, items_embed, index_set, K=[3, 5, 7]):
    if type(K) is int:
        K = [K]

    # Вычисляем схожесть items
    score = cosine_similarity(tfidf_embed_items, tfidf_embed_items[index_set])
    # Сортируем по схожести items
    rating_arg = np.argsort(score, axis=0)[::-1]

    mapk_dict = {}
    for k in K:
        # Релевантный класс для каждого items
        relevant_cls = labels[rating_arg[0, :]]
        rating_cls = labels[rating_arg[1:k + 1, :]]

        # Сравнение полученных классов с релевантным
        relevant_mask = np.int16(rating_cls == relevant_cls)
        relevant_count = np.sum(relevant_mask, axis=0)
        relevant_count[relevant_count == 0] = 1

        # Average Precision Top K
        precision = np.cumsum(relevant_mask, axis=0) / np.arange(1, k + 1).reshape(k, 1)
        avg_precision = np.sum(precision * relevant_mask, axis=0) / relevant_count

        # Mean Average Precision Top K
        mapk = np.mean(avg_precision)
        mapk_dict[f"MAP@{k}"] = mapk
    return mapk_dict

In [7]:
bert_test_mapk = eval_mapk(labels, tfidf_embed_items, test_index, K=[3, 5, 7, 10])
tfidf_test_mapk = eval_mapk(labels, tfidf_embed_items, test_index, K=[3, 5, 7, 10])

In [8]:
# Метрика MAP@K для BERT RecSys
bert_test_mapk

{'MAP@3': 0.7532438775962091,
 'MAP@5': 0.7475205925335459,
 'MAP@7': 0.7359844295984193,
 'MAP@10': 0.7196004352835713}

In [9]:
# Метрика MAP@K для TF-IDF RecSys
tfidf_test_mapk

{'MAP@3': 0.7532438775962091,
 'MAP@5': 0.7475205925335459,
 'MAP@7': 0.7359844295984193,
 'MAP@10': 0.7196004352835713}