In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [2]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list
    recommended_list = recommended_list[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision

def recall_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    k_recommended_list = np.array(recommended_list[:k])
    
    flags = np.isin(k_recommended_list, bought_list)
   
    recall = flags.sum() / len(bought_list)
    
    return recall

In [3]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [5]:
result = pd.read_pickle('result.pkl')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[944544, 5584079, 6553298, 882850, 10148814]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[838030, 12946001, 574396, 9526628, 6396571]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]"


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [6]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекомендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
    items = np.array(items_weights.iloc[:, 0])
    weights = np.array(items_weights.iloc[:, 1])
    
    recs = np.random.choice(items, p=weights, size=n, replace=False)
    
    return recs.tolist()

In [7]:
items_weights = data_train.groupby('item_id')['sales_value'].agg(weights='sum').reset_index()
items_weights['weights'] = np.log2(items_weights['weights'] + 1) / np.log2(items_weights['weights'] + 1).sum()
items_weights.head()

Unnamed: 0,item_id,weights
0,25671,1.3e-05
1,26081,3e-06
2,26093,4e-06
3,26190,4e-06
4,26355,5e-06


In [8]:
items_weights['weights'].sum()

0.9999999999999998

In [9]:
%%time

# your_code
result['weighted_random_recommendation'] = result['user_id'].apply(
    lambda x: weighted_random_recommendation(items_weights)
)
result.head(3)

Wall time: 3.78 s


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[944544, 5584079, 6553298, 882850, 10148814]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]","[614196, 479473, 371767, 13092845, 1101273]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[838030, 12946001, 574396, 9526628, 6396571]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]","[14025373, 961175, 851967, 12675405, 15720690]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1065915, 52686, 903416, 1062404, 12757128]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]","[844169, 888650, 1125739, 855109, 911404]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5, Precision@3, Recall@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [10]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[944544, 5584079, 6553298, 882850, 10148814]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]","[614196, 479473, 371767, 13092845, 1101273]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[838030, 12946001, 574396, 9526628, 6396571]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 981760, 1127831, 1098066, 961554]","[1082185, 1029743, 995785, 1004906, 1081177]","[14025373, 961175, 851967, 12675405, 15720690]"


In [11]:
metrics = pd.DataFrame({'Precision@5': [], 
                        'Precision@3': [],
                        'Recall@5': [],
                        })

for col in result.columns[2:]:
    prec5 = result.apply(lambda row: precision_at_k(row[col], row['actual'], k=5), axis=1).mean()
    prec3 = result.apply(lambda row: precision_at_k(row[col], row['actual'], k=3), axis=1).mean()
    rec5 = result.apply(lambda row: recall_at_k(row[col], row['actual'], k=5), axis=1).mean()

    metrics.loc[col] = [prec5, prec3, rec5]
    
metrics

Unnamed: 0,Precision@5,Precision@3,Recall@5
random_recommendation,0.000784,0.000979,5.2e-05
popular_recommendation,0.15524,0.137773,0.024996
itemitem,0.145739,0.174012,0.016218
cosine,0.135455,0.174012,0.014031
tfidf,0.135455,0.174012,0.014031
own_purchases,0.162292,0.219066,0.018309
weighted_random_recommendation,0.001371,0.000653,7.9e-05


По точности топ-5 лучший алгоритм - own_purchases, рядом с ним - popular_recommendation. Для метрики Precision@3 большая часть алгоритмов показывает заметно лучшие результаты - за исключением popular_recommendation и weighted_random_recommendation (дают несколько худшие метрики). По полноте - наилучший результат у popular_recommendation, второе место - у own_purchases. В целом - по метрикам наилучший алгоритмом оказывается own_purchases.

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 (или другое количество) товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

#### улучшение бейзлайнов

In [12]:
# для бейзлайнов
result1 = result[['user_id', 'actual']]
result1

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
...,...,...
2037,2496,[6534178]
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ..."
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972..."
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813..."


In [13]:
def random_recommendation(items, n=5):
    """Случайные рекомендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [14]:
top = [1000, 2500, 5000]

popularity = (data_train.groupby('item_id')['sales_value'].sum()
                                                    .reset_index()
                                                    .sort_values('sales_value', ascending=False))

In [15]:
# random_recommendation и weighted_random_recommendation, 
# т.к. popularity_reccomendation - одинаковые для всех
for i in top:
    top_items = popularity['item_id'][:i].values
    top_items_weights = popularity[:i]
    top_items_weights['weights'] = np.log2(top_items_weights['sales_value'] + 1) / np.log2(top_items_weights['sales_value'] + 1).sum()
    
    
    result1[f'random_recommendation_{i}'] = result1['user_id'].apply(lambda x: random_recommendation(top_items, n=5))
    result1[f'weighted_random_recommendation_{i}'] = result1['user_id'].apply(
                                lambda x: weighted_random_recommendation(top_items_weights[['item_id', 'weights']])
                                )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_items_weights['weights'] = np.log2(top_items_weights['sales_value'] + 1) / np.log2(top_items_weights['sales_value'] + 1).sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result1[f'random_recommendation_{i}'] = result1['user_id'].apply(lambda x: random_recommendation(top_items, n=5))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexin

In [16]:
result1.head(3)

Unnamed: 0,user_id,actual,random_recommendation_1000,weighted_random_recommendation_1000,random_recommendation_2500,weighted_random_recommendation_2500,random_recommendation_5000,weighted_random_recommendation_5000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[953561, 922561, 1127179, 885697, 1136719]","[1132771, 972418, 8290421, 953476, 12810464]","[837107, 1072438, 1031083, 1083043, 961756]","[899624, 1061228, 1102139, 9268695, 959179]","[906883, 1121557, 1112205, 937413, 874255]","[1051323, 1002771, 6464166, 5569792, 7466252]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[882305, 1056005, 1096036, 9487885, 882308]","[929668, 911812, 976998, 1038663, 984680]","[944836, 1113111, 1061220, 1016709, 1059347]","[860490, 7024847, 857215, 1110764, 8020166]","[882305, 1101175, 5995199, 1135468, 866211]","[928263, 1134522, 1103691, 913785, 901062]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[991223, 5585510, 1038663, 1065593, 1055425]","[904236, 12946027, 987724, 892314, 1092026]","[1068719, 5569615, 823641, 12171886, 9527323]","[970866, 1046827, 1118787, 1078717, 1013389]","[993044, 850102, 848412, 1003600, 963683]","[1137771, 825749, 980943, 1026094, 1118533]"


In [17]:
for col in result1.columns[2:]:
    prec5 = result1.apply(lambda row: precision_at_k(row[col], row['actual'], k=5), axis=1).mean()
    prec3 = result1.apply(lambda row: precision_at_k(row[col], row['actual'], k=3), axis=1).mean()
    rec5 = result1.apply(lambda row: recall_at_k(row[col], row['actual'], k=5), axis=1).mean()

    metrics.loc[col] = [prec5, prec3, rec5]

In [18]:
metrics.sort_index()

Unnamed: 0,Precision@5,Precision@3,Recall@5
cosine,0.135455,0.174012,0.014031
itemitem,0.145739,0.174012,0.016218
own_purchases,0.162292,0.219066,0.018309
popular_recommendation,0.15524,0.137773,0.024996
random_recommendation,0.000784,0.000979,5.2e-05
random_recommendation_1000,0.012635,0.013059,0.001303
random_recommendation_2500,0.007052,0.006203,0.000712
random_recommendation_5000,0.005779,0.006693,0.00056
tfidf,0.135455,0.174012,0.014031
weighted_random_recommendation,0.001371,0.000653,7.9e-05


Случайные рекомендации показывают наилучшие метрики на топ-1000 выборке (что ожидаемо: чем меньше топ - тем ближе алгоритм к popular_recommendation). Для weighted_random_recommendation - та же самая закономерность, но чуть более высокие метрики.

#### изменение числа соседей

In [19]:
# для Item-Item
result2 = result[['user_id', 'actual']]

In [20]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [21]:
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', 
                                  columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1
user_item_matrix = user_item_matrix.astype(float)

sparse_user_item = csr_matrix(user_item_matrix).tocsr()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [22]:
userids = user_item_matrix.index.values  # id юзеров
itemids = user_item_matrix.columns.values  # id товаров

matrix_userids = np.arange(len(userids))  # индексы id юзеров
matrix_itemids = np.arange(len(itemids))  # индексы id товаров

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [23]:
K = [2, 3, 9, 15]

for k in K:
    model = ItemItemRecommender(K=k, num_threads=4)
    model.fit(sparse_user_item,
          show_progress=True
         )
    
    result2[f'itemitem_{k}'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]],
                                    recalculate_user=False)[0]])

  0%|          | 0/5001 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result2[f'itemitem_{k}'] = result['user_id'].\


  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [24]:
result2.head(3)

Unnamed: 0,user_id,actual,itemitem_2,itemitem_3,itemitem_9,itemitem_15
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1082185, 995242, 1029743, 840361, 904360]","[1082185, 981760, 995242, 1029743, 840361]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1082185, 995242, 1029743, 840361, 904360]","[1082185, 981760, 995242, 1029743, 840361]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1082185, 995242, 1029743, 840361, 904360]","[1082185, 981760, 995242, 1029743, 840361]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]"


In [25]:
for col in result2.columns[2:]:
    prec5 = result2.apply(lambda row: precision_at_k(row[col], row['actual'], k=5), axis=1).mean()
    prec3 = result2.apply(lambda row: precision_at_k(row[col], row['actual'], k=3), axis=1).mean()
    rec5 = result2.apply(lambda row: recall_at_k(row[col], row['actual'], k=5), axis=1).mean()

    metrics.loc[col] = [prec5, prec3, rec5]

In [26]:
metrics.sort_index()

Unnamed: 0,Precision@5,Precision@3,Recall@5
cosine,0.135455,0.174012,0.014031
itemitem,0.145739,0.174012,0.016218
itemitem_15,0.157003,0.217597,0.01773
itemitem_2,0.162977,0.233758,0.019229
itemitem_3,0.19285,0.217597,0.022719
itemitem_9,0.157003,0.217597,0.01773
own_purchases,0.162292,0.219066,0.018309
popular_recommendation,0.15524,0.137773,0.024996
random_recommendation,0.000784,0.000979,5.2e-05
random_recommendation_1000,0.012635,0.013059,0.001303


Наилучший результат по метрике Precision@5 алгоритм itemitem показывает при k = 3, по метрике Precision@3 - при k = 2, по полноте - снова при k = 3. Т.е. 3 - оптимальное число соседей. Интересно, что 15 соседей (itemitem_15) дают чуть лучший результат, чем 5 (itemitem)