In [177]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

### Вопросы

In [178]:
data = pd.read_csv('../retail_train/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [179]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

# Оценивание
За выполнени каждого задания 1 балл

4 балла -> отл

3 балла -> хор

И тд

### Задание 0. Товар 999999
На вебинаре мы использовали товар 999999 - что это за товар?  
Зачем он нужен?  
Используя этот товар мы смещаем качество рекомендаций.
В какую сторону?   
Можно ли удалить этот товар?   
Уберите этот товар и сравните с качеством на семинаре.

In [180]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
popularity.head(3)

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1


In [181]:
#op_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

top_5000 = popularity.sort_values('n_sold',ascending=False).head(5000)['item_id'].tolist()
top_5000[:5]

[6534178, 6533889, 6534166, 6544236, 1404121]

In [182]:
#data_train_top = data_train.loc[data_train['item_id'].isin(top_5000)]
#data_train.head()

data_train_top_5000 = data_train.loc[data_train.item_id.isin(top_5000),:]
print(f'shape: {data_train_top_5000.shape}'),
print(f'unique_items_counted: {data_train_top_5000.item_id.nunique()}')  

data_train_top_5000.head(3)

shape: (1448716, 12)
unique_items_counted: 5000


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0


In [183]:
user_item_matrix = pd.pivot_table(data_train_top_5000, 
                                  index='user_id',
                                  columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()



In [184]:
user_item_matrix.head(2)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [185]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [186]:
recs

[(3407, 70158.0),
 (2148, 57211.0),
 (3586, 27050.0),
 (3946, 19897.0),
 (2307, 18127.0)]

In [187]:
# %%time

model = ItemItemRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [188]:
[id_to_itemid[rec[0]] for rec in recs]

[1082185, 981760, 1098066, 1127831, 995242]

In [189]:
print(f'reccomendations for user id: {id_to_userid[2]}')
df_recs = pd.DataFrame(recs) 
df_recs = df_recs.rename({0:'id', 1:'recommended_item_id'}, axis='columns')
df_recs

reccomendations for user id: 3


Unnamed: 0,id,recommended_item_id
0,3407,70158.0
1,2148,57211.0
2,3586,27050.0
3,3946,19897.0
4,2307,18127.0


In [190]:
result = pd.read_json('../predictions_basic.json')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[3867164, 6396393, 1059913, 9337501, 1973488]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[16769610, 167763, 1260949, 910856, 7409462]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"


## Данный код не работает, получаю ошибку key error 650, почему ?

In [175]:
%%time

result['itemitem_top_5000'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

KeyError: 650

## Получается этот вариант работает но он не персонализирует предложения item_id для всех user id

In [191]:
%%time

result['itemitem_top_5000'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in recs])

Wall time: 45 ms


In [192]:
result.head(3)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,itemitem_top_5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[3867164, 6396393, 1059913, 9337501, 1973488]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]","[1082185, 981760, 1098066, 1127831, 995242]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[16769610, 167763, 1260949, 910856, 7409462]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]","[1082185, 981760, 1098066, 1127831, 995242]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[955818, 308601, 830650, 1756535, 15596134]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[999999, 1082185, 1029743, 6534178, 1127831]","[1082185, 981760, 1098066, 1127831, 995242]"


In [193]:

def precision_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    flags = np.isin(bought_list, recommended_list)
    #print(flags)
    precision = flags.sum() / len(recommended_list)
    return precision

In [194]:
list_itemitem = [ precision_at_k(i,k ) for i,k in zip(result['itemitem'], result['actual'])]
array_itemitem = np.array(list_itemitem)

list_itemitem_top_5000 = [ precision_at_k(i,k ) for i,k in zip(result['itemitem_top_5000'], result['actual'])]
array_itemitem_top_5000 = np.array(list_itemitem_top_5000)

print(array_itemitem.mean())
print(array_itemitem_top_5000.mean())

0.13692458374143
0.14573947110675808


UPDATED

In [195]:
print('itemitem', round(result.apply(lambda row: precision_at_k(row['itemitem'], row['actual']), axis=1).mean(), 4))
print('itemitem_top_5000', round(result.apply(lambda row: precision_at_k(row['itemitem_top_5000'], row['actual']), axis=1).mean(), 4))

itemitem 0.1369
itemitem_top_5000 0.1457


Без учёта товара 999999 мы улучшаем точность с 13.69% до 14.57%

In [196]:
pd.DataFrame(
    [
        (model_, round(result.apply(lambda row: precision_at_k(recommended_list=row[str(model_)], bought_list=row['actual']), axis=1).mean(), 5)) for model_ in list(result)[2:]
    ], 
    columns=['metric', 'value']
)

Unnamed: 0,metric,value
0,random_recommendation,0.00029
1,popular_recommendation,0.15524
2,itemitem,0.13692
3,cosine,0.13291
4,tfidf,0.13898
5,own_purchases,0.17969
6,itemitem_top_5000,0.14574


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. вес = log(sales_sum товара)
- Придумайте пример 3 весов, посчитайте weighted_random_recommendation для разных весов

In [197]:
weights = data.groupby('item_id')['sales_value'].sum().reset_index()
weights['sales_value'] = weights['sales_value'].apply(lambda x: np.log(x))
weights.rename(columns={'sales_value': 'log_sales'}, inplace=True)

weights.head()

Unnamed: 0,item_id,log_sales
0,25671,3.041661
1,26081,-0.01005
2,26093,0.463734
3,26190,0.431782
4,26355,0.683097


In [198]:
def weighted_random_recommendation(w_items, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
    recs = np.random.choice(w_items.item_id, size=n, replace=False)
    recs = w_items.loc[w_items.item_id.isin(recs)].sort_values(by='log_sales', ascending=False).item_id

    return recs.tolist()
    


In [199]:
weighted_random_recommendation(w_items=weights)

[9836752, 8090999, 6034330, 8360414, 10355617]

In [200]:
%%time

result['weighted_random_recommendation'] = result['user_id'].apply(
    lambda x: weighted_random_recommendation(w_items= weights, n=5))
result.head(3)

Wall time: 16.1 s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,itemitem_top_5000,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[3867164, 6396393, 1059913, 9337501, 1973488]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]","[1082185, 981760, 1098066, 1127831, 995242]","[1062002, 10182790, 9553185, 3680492, 9842143]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[16769610, 167763, 1260949, 910856, 7409462]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]","[1082185, 981760, 1098066, 1127831, 995242]","[13210419, 9553061, 999146, 10311718, 1071160]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[955818, 308601, 830650, 1756535, 15596134]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[999999, 1082185, 1029743, 6534178, 1127831]","[1082185, 981760, 1098066, 1127831, 995242]","[888104, 962268, 902525, 846978, 1204451]"


Сделайте предсказания

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма (с вебинара и weighted_random_recommendation) с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество? Почему?

In [201]:
#result = pd.read_json('../predictions_basic.json')
result.loc[:2, 'actual':]

Unnamed: 0,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,itemitem_top_5000,weighted_random_recommendation
0,"[821867, 834484, 856942, 865456, 889248, 90795...","[3867164, 6396393, 1059913, 9337501, 1973488]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]","[1082185, 981760, 1098066, 1127831, 995242]","[1062002, 10182790, 9553185, 3680492, 9842143]"
1,"[835476, 851057, 872021, 878302, 879948, 90963...","[16769610, 167763, 1260949, 910856, 7409462]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]","[1082185, 981760, 1098066, 1127831, 995242]","[13210419, 9553061, 999146, 10311718, 1071160]"
2,"[920308, 926804, 946489, 1006718, 1017061, 107...","[955818, 308601, 830650, 1756535, 15596134]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[999999, 1082185, 1029743, 6534178, 1127831]","[1082185, 981760, 1098066, 1127831, 995242]","[888104, 962268, 902525, 846978, 1204451]"


In [202]:
pd.DataFrame(
    [
        (model_, round(result.apply(lambda row: precision_at_k(recommended_list=row[str(model_)], bought_list=row['actual']), axis=1).mean(), 5)) for model_ in list(result)[2:]
    ], 
    columns=['metric', 'value']
)

Unnamed: 0,metric,value
0,random_recommendation,0.00029
1,popular_recommendation,0.15524
2,itemitem,0.13692
3,cosine,0.13291
4,tfidf,0.13898
5,own_purchases,0.17969
6,itemitem_top_5000,0.14574
7,weighted_random_recommendation,0.00098


При использование item_id '99999' мы теряем точность в наших прогнозах. Без данного item_id '99999' precision@5 для item-item recommender
повышается с 13.7% до 14.6 %

### Задание 3. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.
- Попробуйте стратегии ансамблирования изученных алгоритмов


### 1. Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров

In [90]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)  # n_sold - число продаж

top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [91]:
data_train_2 = data_train.loc[data_train['item_id'].isin(top_5000), :]

In [92]:
data_train_2.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0


In [93]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""

    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)

    return recs.tolist()

In [101]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""

    popular = data.groupby(by='item_id')['sales_value'].sum().reset_index()
    popular.sort_values(by='sales_value', ascending=False, inplace=True)

    recs = popular.head(n).item_id

    return recs.tolist()

Data Train vs Data Train _2

In [95]:
%%time
items = data_train.item_id.unique()
result['random_recommendation'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))

Wall time: 7.4 s


In [96]:
%%time
items = data_train_2.item_id.unique()
result['random_recommendation_2'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))

Wall time: 732 ms


In [97]:
%%time
popular_recs = popularity_recommendation(data_train, n=5)
result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)

Wall time: 293 ms


In [134]:
%%time
popular_recs = popularity_recommendation(data_train_2, n=5)
result['popular_recommendation_2'] = result['user_id'].apply(lambda x: popular_recs)

Wall time: 167 ms


Top_5000 items vs all items 

In [162]:
result_T = result.head(2).T
id_num = result_T.loc['user_id'][:2] # user id #

result_T.rename(columns={0: f'user_id {id_num[0]}', 1 : f'user_id {id_num[1]}'}, inplace=True)

result_T.loc[['actual','random_recommendation','popular_recommendation',\
              'random_recommendation_2','popular_recommendation_2'],:]

Unnamed: 0,user_id 1,user_id 3
actual,"[821867, 834484, 856942, 865456, 889248, 90795...","[835476, 851057, 872021, 878302, 879948, 90963..."
random_recommendation,"[47302, 6554158, 1506719, 1027208, 921401]","[6906425, 1076202, 2266886, 9883927, 1198257]"
popular_recommendation,"[6534178, 6533889, 1029743, 6534166, 1082185]","[6534178, 6533889, 1029743, 6534166, 1082185]"
random_recommendation_2,"[1053282, 856790, 9552945, 923967, 982393]","[961313, 892048, 856515, 983795, 870780]"
popular_recommendation_2,"[6534178, 6533889, 1029743, 6534166, 1082185]","[6534178, 6533889, 1029743, 6534166, 1082185]"


### Задание 4. Улучшение детерминированных алгоритмов
На семинаре мы рассматривали 



Далее $U \equiv N_i(u) $

$$r_{u,i} =  \frac{1}{S}\sum\limits_{v \in U}\operatorname{sim}(u,v)r_{v, i}$$
$$ S = \sum\limits_{v \in U} \operatorname{sim}(u,v)$$

Предлагается улучшить эту формулу и учесть средние предпочтения всех пользователей

$$r_{u,i} = \mu + \bar{r_u} + \frac{1}{S}\sum\limits_{v \in U}\operatorname{sim}(u,v)(r_{v, i}-\bar{r_{v}} - \mu)$$

Какие смысл имееют $ \mu $ и $ \bar{r_u}$ ?

Реализуйте алгоритм, прогнозирующий рейтинги на основе данной формулы, на numpy (векторизованно!)

В качестве схожести возьмите CosineSimilarity.

Примените к user_item_matrix. В качестве рейтингов возьмите количество или стоимость купленного товара. 
Данный алгоритм предсказывает рейтинги. Как на основании предсказанных рейтингов предсказать факт покупки?

Предложите вариант.
Посчитайте accuracy@5 и сравните с алгоритмами, разобранными на вебинаре.