In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

import warnings
warnings.filterwarnings("ignore", category=Warning) 

In [2]:
np.random.seed(42)

In [3]:
data = pd.read_csv('../data/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
d = data[data['coupon_match_disc']<0]
d.head(20)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
134,718,26985360571,1,855325,1,1.1,324,-0.69,1115,1,-0.4,-0.4
154,718,26985360571,1,948756,3,2.12,324,-0.9,1115,1,-0.75,-0.25
210,718,26985360571,1,9368449,1,2.05,324,-0.49,1115,1,-0.55,-0.45
236,2305,26996870743,2,1043128,2,1.53,414,-1.0,1300,1,-0.55,-0.45
594,122,27008905909,3,6514251,3,4.25,330,-1.87,1156,1,-2.25,-0.75
695,315,27008952267,3,1012941,1,1.55,327,-0.89,1707,1,-0.55,-0.45
2087,687,27045765725,6,890979,1,5.49,403,-1.0,1249,2,-0.5,-0.5
2094,687,27045765725,6,990453,1,4.69,403,0.0,1249,2,-0.5,-0.5
2100,687,27045765725,6,1128769,1,2.24,403,0.0,1249,2,-0.25,-0.25
2130,1549,27045791755,6,1108654,1,0.54,313,-0.9,1531,2,-0.25,-0.25


In [5]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [6]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """

    recs = np.random.choice(items_weights['item_id'], size=n, replace=False, p=items_weights['weight'])
    return recs.tolist()

In [7]:
def weighted_records(data_train, count=None):
    weighted_recs = data_train.groupby('item_id')[['sales_value', 'quantity']].sum().reset_index()
    weighted_recs = weighted_recs[weighted_recs['item_id']!=999999]
    weighted_recs['weight'] = weighted_recs['sales_value'] * weighted_recs['quantity']+0.01
    weighted_recs['weight'] = weighted_recs[weighted_recs['weight']>0]
    if count is not None:
         weighted_recs = weighted_recs.sort_values('weight', ascending=False).head(5000)
    weighted_recs = weighted_recs.dropna()
    weighted_recs['weight'] = np.log(weighted_recs['weight'])
    weighted_recs['weight'] = weighted_recs['weight']/weighted_recs['weight'].sum()
    weighted_recs = weighted_recs.drop(['sales_value', 'quantity'], axis=1)
    return weighted_recs

In [8]:
%%time

weighted_recs = weighted_records(data_train)
weighted_recs.head(2)

Wall time: 644 ms


Unnamed: 0,item_id,weight
0,25671,8e-06
1,26081,8e-06


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [9]:
import re

def convert_to_array(value):
    result = list(map(int, re.sub(" +", " ", re.sub(r'[^0-9 ]+', r'', value)).strip().split(' ')))
    return result    

In [10]:
result = pd.read_csv('../predictions/predictions_basic.csv', converters={'actual': convert_to_array, 
                                                                        'random_recommendation': convert_to_array, 
                                                                        'popular_recommendation': convert_to_array, 
                                                                        'itemitem': convert_to_array, 
                                                                         'cosine': convert_to_array, 
                                                                         'tfidf': convert_to_array, 
                                                                         'own_purchases': convert_to_array
                                                                        })
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


Добавим безлайн с учетом веса продаж

In [11]:
result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(weighted_recs, n=5))
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]","[1089069, 15452561, 9553150, 6467736, 905972]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]","[905953, 823626, 12781254, 6533236, 9369967]"


In [12]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)
    return precision

In [13]:
for column in result.columns[2:]:
    precision_value = result.apply(lambda row: precision_at_k(row[column], row['actual'], k=5), axis=1).mean()
    print(f'precision of {column}:{precision_value:0.8f}')

precision of random_recommendation:0.00058766
precision of popular_recommendation:0.15523996
precision of itemitem:0.03359452
precision of cosine:0.03525955
precision of tfidf:0.03614104
precision of own_purchases:0.17998694
precision of weighted_random_recommendation:0.00058766


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров

In [14]:
top_5000 = weighted_records(data_train, 5000)
top_5000.head()

Unnamed: 0,item_id,weight
86864,17829232,0.000202
86863,17827644,0.000202
86862,17383227,0.000201
86861,17382205,0.000201
86860,17381856,0.000201


### Random recommendation

In [15]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [16]:
result['random_recommendation_5000'] = result['user_id'].apply(lambda x: random_recommendation(top_5000['item_id'], n=5))
result['weighted_random_recommendation_5000'] = result['user_id'].apply(lambda x: weighted_random_recommendation(top_5000, n=5))

In [17]:
def recommend(Recommender,data_train, result, count_top=5000, neighbours=5):    
    popularity = data_train.groupby('item_id')['quantity', 'sales_value', 'retail_disc'].sum().reset_index()
    popularity['sold'] = np.log(popularity['sales_value'] *  popularity['quantity']+0.01)

    top_5000 = popularity.sort_values('sold', ascending=False).head(count_top).item_id.tolist()
    
    # Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
    data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999
    
    user_item_matrix = pd.pivot_table(data_train, 
                                      index='user_id', columns='item_id', 
                                      values='quantity',
                                      aggfunc='mean', 
                                      fill_value=0
                                     )

    user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
    user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

    # переведем в формат saprse matrix
    sparse_user_item = csr_matrix(user_item_matrix).tocsr()
    
    userids = user_item_matrix.index.values
    itemids = user_item_matrix.columns.values

    matrix_userids = np.arange(len(userids))
    matrix_itemids = np.arange(len(itemids))

    id_to_itemid = dict(zip(matrix_itemids, itemids))
    id_to_userid = dict(zip(matrix_userids, userids))

    itemid_to_id = dict(zip(itemids, matrix_itemids))
    userid_to_id = dict(zip(userids, matrix_userids))
    
    model = Recommender(K=neighbours, num_threads=4) # K - кол-во билжайших соседей

    model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
              show_progress=False)

    return result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=True, 
                                    filter_items=[999999], 
                                    recalculate_user=False)])   

In [18]:
 result['cosine_5000'] = recommend(CosineRecommender, data_train.copy(), result.copy(), count_top=5000, neighbours=5)

### Сравнение

In [19]:
filter_columns = ['own_purchases', 'popular_recommendation', #для сравнения
                  'random_recommendation',  'random_recommendation_5000',
                 'weighted_random_recommendation', 'weighted_random_recommendation_5000',
                 'cosine',  'cosine_5000']
for column in filter_columns: 
    precision_value = result.apply(lambda row: precision_at_k(row[column], row['actual'], k=5), axis=1).mean()
    print(f'{column}:{precision_value:0.8f}')

own_purchases:0.17998694
popular_recommendation:0.15523996
random_recommendation:0.00058766
random_recommendation_5000:0.00029383
weighted_random_recommendation:0.00058766
weighted_random_recommendation_5000:0.00107738
cosine:0.03525955
cosine_5000:0.03525955


- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

### Recommender

In [20]:
%%time
count_tops = [10, 15, 20, 50, 100, 500]
count_neighbours = [2, 3, 5, 8, 10]
recommender = [CosineRecommender]
best_value = 0
best_top = -1
best_k = -1
best_recomender = None
for z in recommender:
    for j in count_tops:
        for i in count_neighbours:
            name_column = f'{z} {j:3d} {i:2d}'
            result[name_column] = recommend(z, data_train.copy(), result.copy(), count_top=j, neighbours=i)
            precision_value = result.apply(lambda row: precision_at_k(row[name_column], row['actual'], k=5), axis=1).mean()
            if best_recomender is None or best_value<precision_value:
                best_value = precision_value
                best_top = j
                best_k = i
                best_recomender = z
            print(f'precision of {name_column:}:{precision_value:0.8f}')

precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  10  2:0.21263467
precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  10  3:0.19585374
precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  10  5:0.17140059
precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  10  8:0.01234084
precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  10 10:0.01116552
precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  15  2:0.21175318
precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  15  3:0.19495593
precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  15  5:0.16307542
precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  15  8:0.09226249
precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  15 10:0.01312439
precision of <class 'implicit.nearest_neighbours.CosineRecommender'>  20  2:0.19480901
precision of <class 'implicit.nearest_neigh

In [21]:
print(f'Best precision:{best_value} recommender={best_recomender} count_top={best_top} count_neighbours={best_k}')

Best precision:0.2126346718903024 recommender=<class 'implicit.nearest_neighbours.CosineRecommender'> count_top=10 count_neighbours=2


### Вывод:  
Оставив минимум популярных товаров, метрика значительно вырастает. 