# ДЗ №2

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [42]:
# Load the retail transaction log and preview its first two rows.
retail_train_path = '../data/retail_train.csv'
data = pd.read_csv(retail_train_path)
data.head(n=2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [43]:
# Hold out the most recent weeks as the test set; everything earlier is train.
test_size_weeks = 3

# First week number that belongs to the test period.
# NOTE(review): with integer week numbers, `>= max - test_size_weeks` keeps
# test_size_weeks + 1 distinct weeks in test (max-3 .. max) — confirm intended.
first_test_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] < first_test_week]
data_test = data[data['week_no'] >= first_test_week]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [81]:
def weighted_random_recommendation(items_weights, n=5):
    """Weighted random recommendations.

    Draws ``n`` distinct items; the chance of picking an item is proportional
    to its weight.

    Input
    -----
    items_weights: pd.DataFrame
        Frame with columns item_id and weight. The weights do not have to sum
        to 1: they are normalised to probabilities inside the function.
    n: int
        Number of items to recommend.

    Returns
    -------
    list
        ``n`` item ids, without repetitions.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice``, e.g. when fewer than ``n`` items
        have a non-zero weight (sampling is done without replacement).
    """

    items = items_weights['item_id']
    # Normalise so the probabilities passed to np.random.choice sum to 1.
    pvalue = items_weights['weight'] / items_weights['weight'].sum()

    # Use the requested n (the sample size used to be hard-coded to 5,
    # so the parameter was silently ignored).
    recs = np.random.choice(items, size=n, replace=False, p=pvalue)

    return recs.tolist()

In [82]:
# Item weights for weighted random sampling: weight = log(total sales_value).
# Computed on the training period only, so purchases from the held-out weeks
# do not leak into the recommender that is later scored against them.
wgh = data_train.groupby('item_id')['sales_value'].sum().reset_index()
wgh.columns = ['item_id', 'weight']

# Clip total sales at 1 before taking the log, so items with sales <= 1 get
# weight 0. Previously such items kept their raw sales value as the weight,
# which ranked e.g. sales of 0.99 above sales of 2.0 (log(2.0) ~ 0.69).
wgh['weight'] = np.log(wgh['weight'].clip(lower=1))

In [86]:
# Sanity check: draw one sample of five weighted random items.
weighted_random_recommendation(wgh, n=5)

[12187910, 1106301, 1045062, 950297, 996188]

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [45]:
# Load the pre-computed baseline predictions and preview the first two rows.
# Note: unpickling executes code from the file, so only load trusted pickles.
predictions_path = '../data/predictions_basic.pkl'
result = pd.read_pickle(predictions_path)
result.head(n=2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1478085, 1671471, 1097497, 31534, 9880041]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[960718, 12731397, 9527303, 949920, 1066594]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [88]:
# One independent weighted random draw of five items per user row.
result['weighted_random'] = result['user_id'].map(
    lambda _user_id: weighted_random_recommendation(wgh, n=5)
)
result.head(n=2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1478085, 1671471, 1097497, 31534, 9880041]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]","[1092815, 13158670, 6380484, 148181, 860221]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[960718, 12731397, 9527303, 949920, 1066594]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]","[16809367, 910608, 6554694, 8069111, 1092972]"


In [89]:
def precision_at_5(row):
    """Add a ``precision_<name>`` entry to ``row`` for every recommender column.

    For each recommender column the value is the number of distinct
    recommended items that also occur in ``row['actual']``, divided by 5.
    The row is modified in place and returned.
    """
    bought = set(row['actual'])
    recommender_columns = (
        'random_recommendation',
        'popular_recommendation',
        'itemitem',
        'cosine',
        'tfidf',
        'own_purchases',
        'weighted_random',
    )
    for column in recommender_columns:
        hits = bought.intersection(row[column])
        row[f'precision_{column}'] = len(hits) / 5
    return row

# Row-wise Precision@5 for every recommender column.
precision = result.apply(precision_at_5, axis=1)

In [90]:
# Summary statistics, one row per metric; the `mean` column is Precision@5.
precision.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,2042.0,1257.93095,718.052041,1.0,648.5,1260.5,1879.75,2500.0
precision_random_recommendation,2042.0,0.000588,0.010828,0.0,0.0,0.0,0.0,0.2
precision_popular_recommendation,2042.0,0.15524,0.174668,0.0,0.0,0.2,0.2,0.8
precision_itemitem,2042.0,0.033595,0.085772,0.0,0.0,0.0,0.0,0.6
precision_cosine,2042.0,0.03526,0.087261,0.0,0.0,0.0,0.0,0.6
precision_tfidf,2042.0,0.036141,0.087462,0.0,0.0,0.0,0.0,0.6
precision_own_purchases,2042.0,0.179628,0.189525,0.0,0.0,0.2,0.2,0.8
precision_weighted_random,2042.0,0.000686,0.011693,0.0,0.0,0.0,0.0,0.2


> **Судя по результатам, лучшее качество показывает own_purchases (Precision@5 ≈ 0.180); popular_recommendation — на втором месте (≈ 0.155).**

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [95]:
# Per-item units sold and revenue over the training period.
popularity = (
    data_train
    .groupby('item_id')[['quantity', 'sales_value']]
    .sum()
    .reset_index()
    .rename(columns={'quantity': 'n_sold'})
)

# Keep the 5000 items with the most units sold. The explicit copy makes
# top5000 an independent frame, so adding a column below is not a chained
# assignment on a slice of `popularity` (no SettingWithCopyWarning).
top5000 = popularity.sort_values('n_sold', ascending=False).head(5000).copy()

# weight = log(sales_value) for items with sales_value > 1, otherwise 0
# (clipping at 1 gives log(1) = 0 for the remaining items).
top5000['weight'] = np.log(top5000['sales_value'].clip(lower=1))

In [100]:
# Random recommendation: five distinct items drawn uniformly from the top-5000.
items = top5000['item_id'].unique()
result['random_recommendation'] = result['user_id'].map(
    lambda _user_id: np.random.choice(items, size=5, replace=False)
)

# Popular recommendation: the same five best-selling items for every user
# (top5000 is already sorted by units sold, descending).
popular_recs = top5000['item_id'].head(5).tolist()
result['popular_recommendation'] = result['user_id'].map(lambda _user_id: popular_recs)

# Weighted random: sampling restricted to the top-5000 items and their weights.
result['weighted_random'] = result['user_id'].map(
    lambda _user_id: weighted_random_recommendation(top5000, n=5)
)

result.head(n=5)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[830795, 845208, 948086, 851146, 1050534]","[6534178, 6533889, 6534166, 6544236, 1404121]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]","[910109, 9707340, 883616, 5565784, 1137775]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[820361, 888476, 9859182, 845715, 10284966]","[6534178, 6533889, 6534166, 6544236, 1404121]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]","[835278, 1128016, 1127825, 875979, 8293343]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[920799, 1052920, 950575, 1133926, 891520]","[6534178, 6533889, 6534166, 6544236, 1404121]","[1098066, 826249, 1106523, 923746, 1058997]","[1098066, 826249, 860776, 854852, 1068719]","[1098066, 826249, 860776, 1068719, 916122]","[999999, 1082185, 1029743, 6534178, 1127831]","[935570, 885586, 827999, 8067128, 1084036]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[9396821, 1010164, 1128464, 15778533, 862010]","[6534178, 6533889, 6534166, 6544236, 1404121]","[981760, 1098066, 840361, 883404, 916122]","[981760, 1098066, 883404, 1004906, 859075]","[981760, 883404, 1098066, 859075, 916122]","[999999, 1082185, 1029743, 1127831, 995785]","[1121694, 1131344, 994928, 10356231, 853317]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[971807, 1128861, 938141, 1128744, 891217]","[6534178, 6533889, 6534166, 6544236, 1404121]","[904360, 13115903, 13189726, 13190294, 15596515]","[904360, 5588666, 1096036, 979707, 1013321]","[904360, 1096036, 5588666, 979707, 1013321]","[999999, 1082185, 1029743, 1098066, 6534178]","[1054030, 913210, 997200, 1106523, 871680]"


In [101]:
# Recompute row-wise Precision@5 after rebuilding the baselines on the top-5000 items.
precision = result.apply(precision_at_5, axis=1)

In [102]:
# Summary statistics, one row per metric; the `mean` column is Precision@5.
precision.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,2042.0,1257.93095,718.052041,1.0,648.5,1260.5,1879.75,2500.0
precision_random_recommendation,2042.0,0.005485,0.032671,0.0,0.0,0.0,0.0,0.2
precision_popular_recommendation,2042.0,0.046131,0.088801,0.0,0.0,0.0,0.0,0.4
precision_itemitem,2042.0,0.033595,0.085772,0.0,0.0,0.0,0.0,0.6
precision_cosine,2042.0,0.03526,0.087261,0.0,0.0,0.0,0.0,0.6
precision_tfidf,2042.0,0.036141,0.087462,0.0,0.0,0.0,0.0,0.6
precision_own_purchases,2042.0,0.179628,0.189525,0.0,0.0,0.2,0.2,0.8
precision_weighted_random,2042.0,0.00666,0.037495,0.0,0.0,0.0,0.0,0.4
