# ДЗ №4

1. Перенесите метрики в модуль src.metrics.py
2. Перенесите функцию prefilter_items в модуль src.utils.py
3. Создайте модуль src.recommenders.py. Напищите код для класса ниже 
(задание обсуждали на вебинаре, для первой функции практически сделали) и положите его в src.recommenders.py
4. Проверьте, что все модули корректно импортируются

In [278]:
import importlib
importlib.reload(src.metrics)
importlib.reload(src.utils)
importlib.reload(src.recommenders)
importlib.reload(src.dataset)
pass

In [279]:
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.dataset import load_data
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [55]:
train, test, product, baseline = load_data()

In [213]:
# найдем категории в которых есть продукты данного магазина
pcats = product[product['brand'] == 'Private'].groupby('department')['item_id'].nunique()
pcats = pcats[pcats>100]
pcats = product[product['department'].isin(pcats.index)]

In [280]:
n_items_before = train['item_id'].nunique()

data_train = prefilter_items(train, product_filter=pcats, top_n=5000)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


In [347]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


class MR:
    """Рекоммендации, которые можно получить из ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """

    def __init__(self, data, weighting=True):

        # your_code. Это не обязательная часть. Но если вам удобно что-либо посчитать тут - можно это сделать

        self.data = data
        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        self.user_top = self.top_user_actual(data)
        
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

    @staticmethod
    def top_user_actual(data):
        popularity = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        popularity.sort_values('quantity', ascending=False, inplace=True)

        popularity = popularity[popularity['item_id'] != 999999]
        popularity = popularity.groupby('user_id').head(5)

        popularity.sort_values('user_id', ascending=False, inplace=True)
        return popularity

    @staticmethod
    def prepare_matrix(data):
        return pd.pivot_table(data, 
                              index='user_id', columns='item_id', 
#                               values='quantity', aggfunc='count', 
                              values='weight', aggfunc='mean', 
                              fill_value=0).astype(float)

    @staticmethod
    def prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())

        return own_recommender

    @staticmethod
    def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""

        model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())

        return model
    
    def get_recommendations(self, user, N=5):
        res = [self.id_to_itemid[rec[0]] for rec in 
                        self.model.recommend(userid=self.userid_to_id[user], 
                                        user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                        N=N, 
                                        filter_already_liked_items=False, 
                                        filter_items=[self.itemid_to_id[999999]],
                                        recalculate_user=True)]
        return res    

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        def get_rec(x):
            recs = self.model.similar_items(self.itemid_to_id[x], N=2)
            top_rec = recs[1][0]
            return self.id_to_itemid[top_rec]
    
        self.user_top['similar_recommendation'] = self.user_top['item_id'].apply(lambda x: get_rec(x))
        res = self.user_top.groupby('user_id')['similar_recommendation'].unique().reset_index()
        res.columns=['user_id', 'similar_recommendation']

        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""

        topusers = self.model.similar_users(self.userid_to_id[user], N=6)
        topusers = [self.id_to_userid[i[0]] for i in topusers[1:]]
        
        data = self.data[self.data['user_id'].isin(topusers)]
        data = data[data['item_id'] != 999999]
        
        popularity = data.groupby('item_id')['quantity'].count().reset_index()
        popularity.sort_values('quantity', ascending=False, inplace=True)
        return popularity.drop_duplicates('item_id').head(5).item_id.to_list()

In [348]:
mr = MR(data_train)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [349]:
mr.get_similar_users_recommendation(2500)

[5569471, 1058997, 862349, 1070820, 893501]

In [299]:
baseline['mainrecommender'] = baseline['user_id'].apply(lambda x: mr.get_recommendations(x))

In [None]:
baseline['similar_users'] = baseline['user_id'].apply(lambda x: mr.get_similar_users_recommendation(x))

In [300]:
def compare_precision_at_5(row):
    flds = ['random_recommendation', 'popular_recommendation', 'itemitem', 'cosine', 
            'tfidf', 'own_purchases', 'mainrecommender', similar_users]
    for i in flds:
        row[f'precision_{i}'] = len(set(row[i]) & set(row['actual'])) / 5
    return row

precision = baseline.apply(lambda row: compare_precision_at_5(row), axis=1)
precision.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,2042.0,1257.93095,718.052041,1.0,648.5,1260.5,1879.75,2500.0
precision_random_recommendation,2042.0,0.000588,0.010828,0.0,0.0,0.0,0.0,0.2
precision_popular_recommendation,2042.0,0.15524,0.174668,0.0,0.0,0.2,0.2,0.8
precision_itemitem,2042.0,0.033595,0.085772,0.0,0.0,0.0,0.0,0.6
precision_cosine,2042.0,0.03526,0.087261,0.0,0.0,0.0,0.0,0.6
precision_tfidf,2042.0,0.036141,0.087462,0.0,0.0,0.0,0.0,0.6
precision_own_purchases,2042.0,0.179628,0.189525,0.0,0.0,0.2,0.2,0.8
precision_mainrecommender,2042.0,0.089814,0.151763,0.0,0.0,0.0,0.2,1.0


In [305]:
mr.model.similar_items(mr.itemid_to_id[960732], 5)

[(1743, 0.04261237),
 (3399, 0.03666466),
 (3414, 0.036328178),
 (2291, 0.03613097),
 (815, 0.036103453)]