### Import

In [201]:
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [202]:
# Load the raw transaction log; the preview below shows one row per purchased
# item with user, basket, day/week and price columns.
data = pd.read_csv('data/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [203]:
test_size_weeks = 3

# First week number that goes to the hold-out set.
# NOTE(review): the comparison below is inclusive, so week numbers
# test_start_week..max are held out, i.e. test_size_weeks + 1 distinct values
# if week_no is a consecutive integer counter - confirm this is intended.
test_start_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] < test_start_week]
data_test = data[data['week_no'] >= test_start_week]

In [204]:
# One row per distinct user seen in the hold-out period; recommendation columns
# are attached to this frame later.
results = pd.DataFrame({'user_id': data_test['user_id'].unique()})

In [205]:
# Preview the first users of the hold-out set.
results.head(3)

Unnamed: 0,user_id
0,338
1,2120
2,2324


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [206]:
def weighted_random_recommendation(items_weights, n=5, random_state=None):
    """Sample `n` distinct items with probability given by their weights.

    Args:
        items_weights: DataFrame with an 'item_id' column and a 'weight'
            column; the weights are passed to NumPy as the probability vector
            `p`, so they must be non-negative and sum to 1.
        n: number of items to draw (without replacement).
        random_state: optional seed (int) or an object exposing `.choice`
            (e.g. `np.random.RandomState`) for a reproducible draw. With the
            default `None` the global NumPy generator is used, so
            `np.random.seed` still controls the result.

    Returns:
        list: `n` distinct item ids.

    Raises:
        ValueError: raised by NumPy when the weights are not a valid
            probability vector or fewer than `n` items have non-zero weight.
    """
    items = np.asarray(items_weights['item_id'])
    weights = np.asarray(items_weights['weight'], dtype=float)

    if random_state is None:
        rng = np.random  # module-level functions share the global generator
    elif hasattr(random_state, 'choice'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    recs = rng.choice(items, size=n, replace=False, p=weights)

    return recs.tolist()

Генерируем датафрейм с весами.

In [207]:
# Total sales per item over the training period.
item_sales = data_train.groupby('item_id')['sales_value'].sum()

# log() is undefined for non-positive totals: mask them before taking the log
# (so NumPy does not emit a RuntimeWarning) and drop any non-finite result.
log_sales = np.log(item_sales.where(item_sales > 0))
log_sales = log_sales.where(np.isfinite(log_sales))

# Items without a usable log get the smallest finite log value.
weight = log_sales.fillna(log_sales.min())

# Shift so the smallest weight is 0, then normalise to a probability vector.
weight = weight - weight.min()
weight = weight / weight.sum()

# Build the frame in one step instead of mutating columns in place: chained
# `Series.replace(..., inplace=True)` on a column selection does not update the
# parent frame under pandas copy-on-write.
items_weights = pd.DataFrame(
    {'weight': weight.values, 'item_id': weight.index.values},
    columns=['weight', 'item_id'],
)
items_weights = items_weights.sort_values('weight', ascending=False)

# np.random.choice requires the probabilities to sum to 1.
assert np.isclose(items_weights['weight'].sum(), 1.0), 'weights are not normalised'

items_weights.head(3)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,weight,item_id
55470,1.5e-05,6534178
55430,1.4e-05,6533889
28895,1.4e-05,1029743


Сделайте предсказания

In [208]:
%%time

results['weighted_random'] = results['user_id'].\
    apply(lambda param: weighted_random_recommendation(items_weights, 5))

results.head(3)

Wall time: 3.77 s


Unnamed: 0,user_id,weighted_random
0,338,"[2436262, 10148668, 1050376, 891075, 14077523]"
1,2120,"[1049076, 3753633, 1946295, 1137342, 2460114]"
2,2324,"[6424123, 852208, 905239, 13416099, 2320323]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [209]:
# Load the saved per-user predictions of the baseline algorithms.
# NOTE(review): this rebinds `results`, so the frame built earlier in the
# notebook (with the 'weighted_random' column) is no longer reachable by name.
results = pd.read_csv('predictions_basic.csv').reset_index(drop=True)
results.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[2756336, 5995228, 15972849, 15596799, 5582951]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [210]:
def precision_at_k(recommended_list, bought_list, k=5):
    """Share of the top-k recommended items that the user actually bought.

    Note: this definition shadows the `precision_at_k` imported from
    `implicit.evaluation` in the import cell.

    Args:
        recommended_list: recommended item ids, best first; only the first
            `k` entries are evaluated.
        bought_list: item ids the user bought; used in full (not cut to k).
        k: number of top recommendations to evaluate.

    Returns:
        float: hits divided by the number of evaluated recommendations
        (`min(k, len(recommended_list))`); 0.0 when there is nothing to
        evaluate.
    """
    bought_list = np.asarray(bought_list)
    recommended_list = np.asarray(recommended_list)[:k]

    # No recommendations: define precision as 0 instead of dividing 0 by 0.
    if recommended_list.size == 0:
        return 0.0

    # Flag the *recommended* items that were bought. Flagging purchases instead
    # would count a repeated purchase several times and push precision above 1.
    flags = np.isin(recommended_list, bought_list)

    return flags.sum() / len(recommended_list)

In [211]:
# Every column except the user id and the ground-truth purchases is treated as
# one algorithm's recommendations.
non_algorithm_columns = {'user_id', 'actual'}
metrics = set(results.columns).difference(non_algorithm_columns)

metrics

{'cosine',
 'itemitem',
 'own_purchases',
 'popular_recommendation',
 'random_recommendation',
 'tfidf'}

In [212]:
%%time

k = 5

actual = results['actual'].values

for m in metrics:
    precision_results = []
    metrics_predicts = results[m]
    
    for idx in range(results.shape[0]):
        bought_list = actual[idx : idx+1]
        
        bought_list = ' '.join(bought_list[0].replace('\n', '').split()).replace(' ', ',')
        bought_list = json.loads(bought_list.replace('[,', '['))
        recommended_list = json.loads(metrics_predicts[idx : idx+1].values[0])
    
        precision_results.append(precision_at_k(recommended_list=recommended_list,
                                               bought_list=bought_list))
    
    results[f"{m}__precision@{k}"] = precision_results
    print(f'{m} - calculated')

print()

for m in metrics:
    print(f'precision@{k} for {m}: {round(results[f"{m}__precision@{k}"].mean(), 5)}')

print()

itemitem - calculated
popular_recommendation - calculated
cosine - calculated
tfidf - calculated
random_recommendation - calculated
own_purchases - calculated

precision@5 for itemitem: 0.13692
precision@5 for popular_recommendation: 0.15524
precision@5 for cosine: 0.13291
precision@5 for tfidf: 0.13898
precision@5 for random_recommendation: 0.00098
precision@5 for own_purchases: 0.17969

Wall time: 3.96 s


Нагуглил, что apply() можно применять не только к столбцу целиком, но и построчно (axis=1); тогда тот же расчёт записывается компактнее (это не векторизация, а построчный обход):

In [213]:
def str_to_list(text):
    """Parse a whitespace-separated array repr such as "[ 1 2\\n 3]" into a list.

    Args:
        text: string with numbers inside square brackets, separated by spaces
            and/or newlines.

    Returns:
        list: the parsed values, as produced by `json.loads`.

    Raises:
        json.JSONDecodeError: if the normalised text is not valid JSON.
    """
    # split() treats newlines as separators too, so numbers on adjacent lines
    # are never glued together.
    compact = ','.join(text.split())
    # Whitespace next to a bracket leaves a stray comma there; remove it.
    compact = compact.replace('[,', '[').replace(',]', ']')
    return json.loads(compact)

In [215]:
# Turn the stringified purchase arrays into Python lists. Values that are no
# longer strings are left untouched, so re-running the cell is harmless.
results['actual'] = results['actual'].apply(
    lambda value: str_to_list(value) if isinstance(value, str) else value
)

In [224]:
# Decode each algorithm's JSON-encoded recommendations into Python lists.
# Already-decoded values are kept as they are, so the cell can be re-run.
for m in metrics:
    results[m] = results[m].apply(
        lambda value: json.loads(value) if isinstance(value, str) else value
    )

In [225]:
%%time

for m in metrics:
    results[f"{m}__precision@{k}_"] = results.apply(lambda row: precision_at_k(recommended_list=row[m],
                                                                               bought_list=row['actual']), axis=1)

Wall time: 1.48 s


In [226]:
# Mean of every column that describe() summarises, best first. The summary
# also contains the 'user_id' mean (see the output below), which is not a metric.
column_means = results.describe().loc['mean']
column_means.sort_values(ascending=False)

user_id                                 1257.930950
own_purchases__precision@5_                0.179693
own_purchases__precision@5                 0.179693
popular_recommendation__precision@5_       0.155240
popular_recommendation__precision@5        0.155240
tfidf__precision@5_                        0.138981
tfidf__precision@5                         0.138981
itemitem__precision@5_                     0.136925
itemitem__precision@5                      0.136925
cosine__precision@5_                       0.132909
cosine__precision@5                        0.132909
random_recommendation__precision@5_        0.000979
random_recommendation__precision@5         0.000979
Name: mean, dtype: float64

In [186]:
# NOTE(review): the saved output of this cell carries an older execution count
# than the cells above and its numbers differ from their means - re-run it.
for m in metrics:
    mean_precision = results[f"{m}__precision@{k}_"].mean()
    print(f'precision@{k} for {m}: {round(mean_precision, 5)}')

precision@5 for itemitem: 0.13487
precision@5 for popular_recommendation: 0.15524
precision@5 for cosine: 0.11528
precision@5 for tfidf: 0.11528
precision@5 for random_recommendation: 0.0
precision@5 for own_purchases: 0.14956


In [30]:
# Inspect the per-user recommendations next to both sets of precision columns.
# NOTE(review): the saved output has an out-of-order execution count and shows
# identical recommendations for every user - presumably stale; re-run to confirm.
results.head(30)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,itemitem__precision@5,popular_recommendation__precision@5,cosine__precision@5,tfidf__precision@5,random_recommendation__precision@5,own_purchases__precision@5,itemitem__precision@5_,popular_recommendation__precision@5_,cosine__precision@5_,tfidf__precision@5_,random_recommendation__precision@5_,own_purchases__precision@5_
0,1,"[852182, 856345, 923746, 948670, 1018007, 1044...","[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0.4,0.2,0.2,0.2,0.0,0.4,0.4,0.4,0.4,0.4,0.0,0.2
1,3,"[852182, 856345, 923746, 948670, 1018007, 1044...","[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.4,0.4,0.4,0.0,0.2
2,6,"[852182, 856345, 923746, 948670, 1018007, 1044...","[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0.2,0.0,0.0,0.0,0.0,0.0,0.4,0.4,0.4,0.4,0.0,0.2
3,7,"[852182, 856345, 923746, 948670, 1018007, 1044...","[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0.2,0.2,0.4,0.4,0.0,0.2,0.4,0.4,0.4,0.4,0.0,0.2
4,8,"[852182, 856345, 923746, 948670, 1018007, 1044...","[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0.2,0.4,0.2,0.2,0.0,0.4,0.4,0.4,0.4,0.4,0.0,0.2
5,9,"[852182, 856345, 923746, 948670, 1018007, 1044...","[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0.2,0.2,0.0,0.0,0.0,0.2,0.4,0.4,0.4,0.4,0.0,0.2
6,13,"[852182, 856345, 923746, 948670, 1018007, 1044...","[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0.2,0.6,0.2,0.4,0.0,0.6,0.4,0.4,0.4,0.4,0.0,0.2
7,14,"[852182, 856345, 923746, 948670, 1018007, 1044...","[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0.2,0.2,0.2,0.2,0.0,0.2,0.4,0.4,0.4,0.4,0.0,0.2
8,15,"[852182, 856345, 923746, 948670, 1018007, 1044...","[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0.2,0.2,0.2,0.2,0.0,0.2,0.4,0.4,0.4,0.4,0.0,0.2
9,16,"[852182, 856345, 923746, 948670, 1018007, 1044...","[6464159, 1450114, 226906, 13213208, 1149732]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]",0.2,0.2,0.2,0.2,0.0,0.0,0.4,0.4,0.4,0.4,0.0,0.2


Лучшее качество показывает алгоритм own_purchases, т.е. рекомендации на основе предыдущих интеракций пользователя.

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.
- Попробуйте стратегии ансамблирования изученных алгоритмов

In [None]:
# your_code