In [1]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.6.1-cp38-cp38-win_amd64.whl (646 kB)
     -------------------------------------- 646.1/646.1 kB 3.1 MB/s eta 0:00:00




Installing collected packages: implicit
Successfully installed implicit-0.6.1


In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [24]:
# Read ./retail_train.csv into `data` and preview its first two rows.
data = pd.read_csv('./retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [25]:
test_size_weeks = 3

# Week number at which the test period starts: rows strictly before it go to
# train, rows at or after it go to test.
# NOTE(review): the boundary week itself lands in test, so the test set spans
# week numbers max - test_size_weeks .. max inclusive — confirm this is the intended window.
test_start_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] < test_start_week]
data_test = data[data['week_no'] >= test_start_week]

In [26]:
# Preview the first rows of the test split.
data_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.0,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,0.0,0.0
2277418,338,41260573635,636,5592737,2,1.58,369,-0.2,112,92,0.0,0.0
2277419,338,41260573635,636,7441679,1,3.69,369,0.0,112,92,0.0,0.0
2277420,338,41260573635,636,7442317,1,2.69,369,0.0,112,92,0.0,0.0


In [27]:
# One row per test user; `actual` holds the unique item ids that user bought
# during the test period.
result = (
    data_test
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [28]:
def weighted_random_recommendation(items_weights, n=5, random_state=None):
    """Sample item ids at random, with probability proportional to their weight.

    Parameters
    ----------
    items_weights : pd.DataFrame
        Frame with the columns ``item_id`` and ``weight``. Weights are
        re-normalised to sum to 1 before sampling, so small rounding errors
        in the caller's normalisation do not matter.
    n : int, default 5
        Number of distinct items to draw. If fewer than ``n`` items have a
        non-zero weight, only that many items are returned.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. With ``None`` the global
        NumPy random state is used, as before.

    Returns
    -------
    list
        Sampled item ids without repetition; empty when no item has a
        non-zero weight.
    """
    items = items_weights['item_id'].to_numpy()
    weights = items_weights['weight'].to_numpy(dtype=float)

    # Sampling without replacement cannot return more items than there are
    # entries with non-zero probability, so cap n instead of letting
    # np.random.choice raise.
    n = min(n, int(np.count_nonzero(weights)))
    if n <= 0:
        return []

    weights = weights / weights.sum()

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    recs = rng.choice(items, size=n, replace=False, p=weights)

    return recs.tolist()

In [29]:
%%time

# Item popularity: total sales value per item.
items_weights = data.groupby('item_id')['sales_value'].sum().reset_index()

# Log of the sales volume. Items whose total sales are below $1 get a log
# value of zero (clipping to 1 first gives log(1) == 0), i.e. zero probability.
items_weights['log_sales_value'] = np.log(items_weights['sales_value'].clip(lower=1))

# Sum of log_sales_value over all items.
log_sales_value_sum = items_weights['log_sales_value'].sum()

# Item weight = its share of the total log sales.
items_weights['weight'] = items_weights['log_sales_value'] / log_sales_value_sum

items_weights.head()

Wall time: 353 ms


Unnamed: 0,item_id,sales_value,log_sales_value,weight
0,25671,20.94,3.041661,1.3e-05
1,26081,0.99,0.0,0.0
2,26093,1.59,0.463734,2e-06
3,26190,1.54,0.431782,2e-06
4,26355,1.98,0.683097,3e-06


In [30]:
%%time

# Popularity weights built from the training period only.
# The chain returns a fresh frame at every step, which avoids assigning into a
# filtered slice (SettingWithCopyWarning) and the in-place drop.
items_weights = (
    data_train
    .groupby('item_id')['sales_value'].sum()
    .reset_index()
    .loc[lambda df: df['sales_value'] > 1]                                # keep items with total sales above 1
    .assign(weight=lambda df: np.log(df['sales_value'] + 0.001))          # log-popularity
    .assign(weight=lambda df: df['weight'] / df['weight'].sum())          # normalise to a probability distribution
    .drop(columns='sales_value')
)

# Fixed seed so the sampled recommendations are the same on every run.
np.random.seed(42)

# One independent draw of 5 items per user; the draw does not depend on the user id.
result['weighted_random_recommendation'] = [
    weighted_random_recommendation(items_weights, n=5) for _ in range(len(result))
]
result.head(5)

Wall time: 15.9 s


Unnamed: 0,user_id,actual,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6427877, 969263, 9677929, 972058, 1042582]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[12262778, 928640, 893168, 13777302, 12386181]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[838050, 1050274, 12301178, 7115628, 847010]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[965612, 12606075, 1003305, 1032766, 920455]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1100601, 12811160, 902644, 9296919, 882980]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [31]:
# Read the saved predictions as-is and preview two rows.
# NOTE: this rebinds `result`, so columns added to the previous `result` frame are no longer part of it.
result = pd.read_csv('./predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [32]:
def p(v):
    """Parse a bracketed, whitespace-separated string such as '[ 1 2 3 ]' into a list of ints.

    The first and last characters (the brackets) are dropped and the rest is
    split on whitespace.
    """
    inner = v[1:-1]
    return list(map(int, inner.split()))

# Columns stored in the CSV as Python list literals, e.g. "[1, 2, 3]".
list_literal_cols = ['random_recommendation', 'popular_recommendation', 'itemitem',
                     'cosine', 'tfidf', 'own_purchases']

# `actual` is stored as a bracketed, space-separated string and is parsed with p().
# NOTE(review): eval executes whatever expression the CSV cell contains, so this is
# only acceptable while the file is trusted; ast.literal_eval looks like a safe
# drop-in for plain list literals — confirm before switching.
converters = {'actual': p}
converters.update({col: eval for col in list_literal_cols})

result = pd.read_csv('predictions_basic.csv', converters=converters)
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [33]:
def precision_at_k(recommended_list, bought_list, k=5):
    """Precision@k: share of the top-k recommended items that were actually bought.

    NOTE: this definition replaces the ``precision_at_k`` imported from
    ``implicit.evaluation`` earlier in the notebook.

    Parameters
    ----------
    recommended_list : sequence
        Recommended item ids, best first. Only the first ``k`` are scored.
    bought_list : sequence
        Item ids the user actually bought.
    k : int, default 5
        Cut-off; also the denominator of the metric.

    Returns
    -------
    float
        Number of top-k recommendations found in ``bought_list``, divided by ``k``.
    """
    bought_list = np.array(bought_list)
    top_k = np.array(recommended_list)[:k]

    # Flag each *recommended* item that was bought. Flagging the bought items
    # instead would count a repeated purchase several times and could push the
    # result above 1.
    hits = np.isin(top_k, bought_list)

    return hits.sum() / k

In [34]:
cols = ['random_recommendation', 'popular_recommendation', 'itemitem', 'cosine', 'tfidf', 'own_purchases']

# Per-user precision for every algorithm: metrics[col][i] is the score of
# algorithm `col` for the i-th row of `result`. Building the lists column by
# column avoids iterrows() and gives every algorithm a key even if `result` is empty.
metrics = {
    col: [
        precision_at_k(recommended, actual)
        for recommended, actual in zip(result[col], result['actual'])
    ]
    for col in cols
}

In [35]:
# Table of per-user scores: one column per algorithm plus the user id they belong to.
mdf = pd.DataFrame(metrics).assign(user_id=result['user_id'])

In [36]:
# Preview the per-user scores for the first five users.
mdf.head(5)

Unnamed: 0,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,user_id
0,0.0,0.2,0.0,0.0,0.0,0.4,1
1,0.0,0.0,0.0,0.0,0.0,0.0,3
2,0.0,0.0,0.0,0.0,0.0,0.0,6
3,0.0,0.2,0.0,0.0,0.0,0.2,7
4,0.0,0.4,0.0,0.2,0.2,0.4,8
