# Вебинар 5. Домашнее задание

### Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items

### Data import

In [2]:
def import_data():
    data = pd.read_csv('raw_data/transaction_data.csv')

    data.columns = [col.lower() for col in data.columns]
    data.rename(columns={'household_key': 'user_id',
                        'product_id': 'item_id'},
               inplace=True)


    test_size_weeks = 3

    data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
    data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

#     data_train.head(2)
    
    return data_train, data_test

data_train, data_test = import_data()

In [3]:
data_train.shape

(2485538, 12)

In [4]:
item_features = pd.read_csv('raw_data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


### Preprocessing

In [5]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[879517, 934369, 1115576, 1124029, 5572301, 65..."
1,3,"[823704, 834117, 840244, 913785, 917816, 93870..."


In [6]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train, item_features)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 90386 to 391


In [7]:
data_train.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
221,2305,26996870743,2,859610,2,9.44,414,0.0,1300,1,0.0,0.0,3665,MEAT,National,BEEF,GRND/PATTY - SIRLOIN (90%+),
342,1617,27008755998,3,972143,1,5.95,440,0.0,1148,1,0.0,0.0,4312,MEAT,National,CHICKEN,CHICKEN WINGS,
408,347,27008813029,3,5593053,1,8.99,367,0.0,2110,1,0.0,0.0,69,FLORAL,Private,ROSES,ROSE CONSUMER BUNCH,7 STEM
528,2324,27008841880,3,8203606,1,6.5,32004,0.0,1547,1,0.0,0.0,3283,SEAFOOD,National,SEAFOOD-FRESH,SEAFOOD-FRE-RAW FINFISH-OTHER,
607,1238,27008911488,3,948650,1,6.04,358,0.0,1421,1,0.0,0.0,2872,MEAT,National,BEEF,CHOICE BEEF,


In [8]:
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [9]:
data_train.shape

(20967, 18)

### Model

In [10]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

user_item_matrix.head(3)

item_id,397896,819308,820296,823990,825226,825343,825749,827546,828106,829685,...,13003402,13007264,13007721,13072766,13158494,13158498,13380863,13504479,13506200,13653499
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
print(type(user_item_matrix), user_item_matrix)

userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

<class 'pandas.core.frame.DataFrame'> item_id  397896    819308    820296    823990    825226    825343    825749    \
user_id                                                                         
1             0.0       0.0       0.0       1.0       0.0       0.0       0.0   
2             0.0       0.0       0.0       0.0       0.0       0.0       0.0   
3             0.0       0.0       0.0       0.0       0.0       0.0       0.0   
4             0.0       0.0       0.0       0.0       0.0       0.0       0.0   
5             0.0       0.0       0.0       0.0       0.0       0.0       0.0   
...           ...       ...       ...       ...       ...       ...       ...   
2496          0.0       0.0       0.0       0.0       0.0       1.0       0.0   
2497          0.0       0.0       0.0       4.0       0.0       0.0       0.0   
2498          0.0       0.0       0.0       0.0       0.0       0.0       0.0   
2499          0.0       0.0       0.0       0.0       0.0       0.0    

In [12]:
user_item_matrix.head()

item_id,397896,819308,820296,823990,825226,825343,825749,827546,828106,829685,...,13003402,13007264,13007721,13072766,13158494,13158498,13380863,13504479,13506200,13653499
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
user_item_matrix = bm25_weight(user_item_matrix.T).T  # Применяется к item-user матрице ! 

In [14]:
%%time

model = AlternatingLeastSquares(factors=20, 
                                regularization=0.001,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Wall time: 97 ms


# Домашнее задание

### Домашнее задание. Описание.

1. Изучите структуру модуля src   [DONE]
2. Перенесите функции prefilter_items и postfilter_items из вебинара в модуль src.utils.py   [DONE]
3. Реализуйте функции get_similar_items_recommendation, get_similar_users_recommendation (они разбирались на вебинаре) и переместите в src.utils.py   [DONE]
4. Создайте модуль src.recommenders.py. Напищите код для класса ниже и положите его в src.recommenders.py
5. Проверьте, что все модули корректно импортируются
6. Если вы еще не прочитали [статью](https://habr.com/ru/company/hh/blog/347276/) о рекомендательных системах и поиске в hh.ru, то обязательно прочитайте   [DONE]

### Домашнее задание. Решение.

##### Prefilter, postfilter functions

In [15]:
def prefilter_items(data, item_features):
    
    data = data.merge(item_features, how='left', on='item_id')
    
    # Убираем самые популярные товары.
    popularity = data.groupby('item_id')['user_id'].nunique().reset_index()
    popularity['user_id'] = popularity['user_id']/ data['user_id'].nunique()
    
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
    popularity.sort_values('share_unique_users', inplace=True)
    
    top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]
    
    # Убираем самые непопулярыне товары.
    top_notpopular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]
    
    # Убараем товары, по которым не было продаж больше 12 месяцев.
    data = data[~(data['week_no'] > data['week_no'].max() - 12 * 4)]
    
    # Убираем товары из неинтересных категорий (department).
    deps_to_delete = ['GROCERY', 'MISC. TRANS.', 'PASTRY', 'DRUG GM', 'MEAT-PCKGD',
       'SEAFOOD-PCKGD', 'PRODUCE', 'NUTRITION', 'DELI', 'COSMETICS']
    data = data[~(data['department'].isin(deps_to_delete))]
    
    # Убираем самые дешевые товары. 
    data = data[~(data['sales_value'] < 5)]
    
    # Убираем самые дорогие товары.
    data = data[~(data['sales_value'] > 100)]
    
    return data

def postfilter_items(user_id, recommednations):
    pass


##### Get 'similar' functions

In [16]:
def get_similar_items_recommendation(user, model, user_item_matrix, N=5):
    """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""
    user_id = itemid_to_id[user]
    
    top_n_items = user_item_matrix.toarray()[user_id, :]
    top_n_items.sort()
    top_n_items = np.argsort(-top_n_items)
    top_n_items = top_n_items[:N]
    print(top_n_items)
    
    similar_items = []
    for item in top_n_items:
        similar_item = [val[0] for val in model.similar_items(itemid=item, N=2) if val[0] != item]
        similar_items.append(similar_item)
    similar_items = np.array(similar_items).flatten()
    
    return similar_items

def get_similar_users_recommendation(user, model, user_item_matrix, N=5):
    """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""
    
    similar_users = model.similar_users(userid=userid_to_id[user], N=N)
    similar_items = []
    
    for user in similar_users:
        user = user[0]
        top_n_items = user_item_matrix.toarray()[user, :]
        top_n_items = np.argsort(-top_n_items)[:N]
        print(top_n_items)
        similar_items.append(top_n_items)
    similar_items = list(set(np.array(similar_items).flatten()))
    
    return np.random.choice(similar_items, N)

get_similar_items_recommendation(397896, model, user_item_matrix, N=5)
get_similar_users_recommendation(2375, model, user_item_matrix, N=5)

[390 389 387 265 264]
[ 16  96 202  28   5]
[195  16  82  76  60]
[195  61   8  39 249]
[ 96  45 308  28 169]
[202  45   0 265 264]


array([ 0, 39, 61, 39, 76], dtype=int64)

##### Class incapsulate

In [17]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


class MainRecommender:
    def __init__(self, user_item_data, item_features, weighting=True):
        
        self.item_features = item_features
        self.user_item_matrix = self.prepare_matrix(user_item_data, self.item_features)
        self.prepare_dicts(self.user_item_matrix)
        
        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T 
        
        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
     
    def prepare_matrix(self, data, item_features):
        data = self.prefilter_items(data, item_features)
        
        user_item_matrix = pd.pivot_table(data, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

        user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit


        return user_item_matrix
    
    def prepare_dicts(self, user_item_matrix):
        """Подготавливает вспомогательные словари"""
        
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        self.id_to_itemid = dict(zip(matrix_itemids, itemids))
        self.id_to_userid = dict(zip(matrix_userids, userids))

        self.itemid_to_id = dict(zip(itemids, matrix_itemids))
        self.userid_to_id = dict(zip(userids, matrix_userids))
     
    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""
    
        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())
        
        return own_recommender
    
    def fit(self, user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""
        model = AlternatingLeastSquares(factors=n_factors, 
                                             regularization=regularization,
                                             iterations=iterations,  
                                             num_threads=num_threads)
        model.fit(csr_matrix(self.user_item_matrix).T.tocsr())
        
        return model
    
    def prefilter_items(self, data, item_features):
        data = data.merge(item_features, how='left', on='item_id', suffixes=('', 'f'))

        # Убираем самые популярные товары.
        popularity = data.groupby('item_id')['user_id'].nunique().reset_index()
        popularity['user_id'] = popularity['user_id']/ data['user_id'].nunique()

        popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
        popularity.sort_values('share_unique_users', inplace=True)

        top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist()
        data = data[~data['item_id'].isin(top_popular)]

        # Убираем самые непопулярыне товары.
        top_notpopular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist()
        data = data[~data['item_id'].isin(top_notpopular)]

        # Убараем товары, по которым не было продаж больше 12 месяцев.
        data = data[~(data['week_no'] > data['week_no'].max() - 12 * 4)]

        # Убираем товары из неинтересных категорий (department).
        deps_to_delete = ['GROCERY', 'MISC. TRANS.', 'PASTRY', 'DRUG GM', 'MEAT-PCKGD',
           'SEAFOOD-PCKGD', 'PRODUCE', 'NUTRITION', 'DELI', 'COSMETICS']
        data = data[~(data['department'].isin(deps_to_delete))]

        # Убираем самые дешевые товары. 
        data = data[~(data['sales_value'] < 5)]

        # Убираем самые дорогие товары.
        data = data[~(data['sales_value'] > 100)]
        return data


    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""
        
        model = self.model
        user_item_matrix = self.user_item_matrix
        
        user_id = self.itemid_to_id[user]

        top_n_items = user_item_matrix.toarray()[user_id, :]
        top_n_items.sort()
        top_n_items = np.argsort(-top_n_items)
        top_n_items = top_n_items[:N]
        print(top_n_items)

        similar_items = []
        for item in top_n_items:
            similar_item = [val[0] for val in model.similar_items(itemid=item, N=2) if val[0] != item]
            similar_items.append(similar_item)
        similar_items = np.array(similar_items).flatten()

        assert len(similar_items) == N, 'Количество рекомендаций != {}'.format(N)
        return similar_items

    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""
        
        model = self.model
        user_item_matrix = self.user_item_matrix
    
        similar_users = model.similar_users(userid=self.userid_to_id[user], N=N)
        similar_items = []

        for user in similar_users:
            user = user[0]
            top_n_items = user_item_matrix.toarray()[user, :]
            top_n_items = np.argsort(-top_n_items)[:N]
            print(top_n_items)
            similar_items.append(top_n_items)
        similar_items = list(set(np.array(similar_items).flatten()))

        return np.random.choice(similar_items, N)

In [18]:
data_train, data_test = import_data()

recommender = MainRecommender(data_train, item_features)

recommender.get_similar_items_recommendation(397896, N=5)
recommender.get_similar_users_recommendation(2375, N=5)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=391.0), HTML(value='')))


[390 389 387 265 264]
[ 16  96 202  28   5]
[195  16  82  76  60]
[195  61   8  39 249]
[ 16  61 176  40 179]
[ 61 159  74 266 265]


array([179, 265, 249,   8, 159], dtype=int64)

##### Check

In [19]:
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items, postfilter_items
from src.recommenders import MainRecommender

data_train, data_test = import_data()

recommender = MainRecommender(data_train, item_features)

recommender.get_similar_items_recommendation(397896, N=5)
recommender.get_similar_users_recommendation(2375, N=5)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=391.0), HTML(value='')))


[390 389 387 265 264]
[ 16  96 202  28   5]
[195  61   8  39 249]
[ 16  61 176  40 179]
[202  45   0 265 264]
[202  45   0 265 264]


array([195, 264,  39,  39, 179], dtype=int64)

In [20]:
data_train.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [21]:
import gc

gc.collect()

69