In [1]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import implicit
implicit.__version__

'0.4.2'

In [3]:
# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [5]:
#from tqdm.notebook import trange, tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

In [6]:
data = pd.read_csv('retail_train.csv')
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [7]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [8]:
pd.set_option('display.float_format', '{:.3f}'.format)

In [9]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,2396804.0,1271.904,726.564,1.0,655.0,1271.0,1914.0,2500.0
basket_id,2396804.0,33366432676.475,4284798175.929,26984851472.0,30087143433.0,32419978306.0,35145801967.0,41656790510.0
day,2396804.0,363.977,175.939,1.0,216.0,366.0,515.0,663.0
item_id,2396804.0,2827247.092,3732797.659,25671.0,916993.0,1027569.0,1132178.0,18024556.0
quantity,2396804.0,100.376,1152.379,0.0,1.0,1.0,1.0,89638.0
sales_value,2396804.0,3.101,4.21,0.0,1.29,2.0,3.49,840.0
store_id,2396804.0,3048.227,8785.542,1.0,330.0,370.0,422.0,34280.0
retail_disc,2396804.0,-0.54,1.246,-130.02,-0.69,-0.02,0.0,3.99
trans_time,2396804.0,1561.714,401.569,0.0,1307.0,1614.0,1844.0,2359.0
week_no,2396804.0,52.682,25.133,1.0,32.0,53.0,74.0,95.0


### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [10]:
%%time
result = data_train.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result

Wall time: 247 ms


Unnamed: 0,user_id,actual
0,1,"[825123, 831447, 840361, 845307, 852014, 85498..."
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55..."
2,3,"[866211, 878996, 882830, 904360, 921345, 93194..."
3,4,"[836163, 857849, 877523, 878909, 883932, 89142..."
4,5,"[938983, 5980822, 1012352, 825538, 1002499, 69..."
...,...,...
2494,2496,"[840361, 852159, 871756, 886703, 899624, 91612..."
2495,2497,"[838220, 1037840, 1052294, 5569230, 8090537, 1..."
2496,2498,"[824555, 835576, 901776, 904023, 911215, 91749..."
2497,2499,"[838186, 853197, 864143, 883665, 932949, 93383..."


In [11]:
def item_weights(df):
    sales_sum = df.groupby('item_id')['sales_value'].sum().reset_index()
    sales_sum = sales_sum[sales_sum['sales_value'] > 1]
    sales_sum['weight'] = np.log(sales_sum['sales_value'] + 0.000001)
    weight_sum = sales_sum['weight'].sum()
    sales_sum['weight'] = sales_sum['weight'] / weight_sum
    sales_sum.drop('sales_value', axis=1, inplace=True)
    return sales_sum

In [12]:
df_w = item_weights(data_train)
pd.set_option('display.float_format', '{:.6f}'.format)
df_w

Unnamed: 0,item_id,weight
0,25671,0.000013
2,26093,0.000002
3,26190,0.000002
4,26355,0.000003
5,26426,0.000004
...,...,...
86859,17330511,0.000010
86861,17382205,0.000009
86862,17383227,0.000007
86863,17827644,0.000004


In [13]:
round(df_w['weight'].sum(),10)

1.0

In [14]:
df_n = df_w.to_numpy()

In [15]:
def weighted_random_recommendation(items_weights, n=3):
    """Случайные рекоммендации
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    items = items_weights[:,0]
    recs = np.random.choice(items, size=n, replace=False, p=items_weights[:,1])
    
    return [int(x) for x in recs.tolist()]

In [16]:
%time
result['wr_recommend'] = result['user_id'].progress_apply(lambda x: weighted_random_recommendation(df_n, n=5))

Wall time: 0 ns


HBox(children=(FloatProgress(value=0.0, max=2499.0), HTML(value='')))




In [17]:
result.head()

Unnamed: 0,user_id,actual,wr_recommend
0,1,"[825123, 831447, 840361, 845307, 852014, 85498...","[849515, 998049, 1135886, 1136735, 912898]"
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55...","[1049826, 12781817, 12811534, 1077771, 865443]"
2,3,"[866211, 878996, 882830, 904360, 921345, 93194...","[1008792, 13189735, 935755, 6442725, 993680]"
3,4,"[836163, 857849, 877523, 878909, 883932, 89142...","[9677656, 9527603, 1096865, 17179382, 935086]"
4,5,"[938983, 5980822, 1012352, 825538, 1002499, 69...","[9214953, 7169123, 887887, 9552964, 869054]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [18]:
df_pred = pd.read_csv('predictions_basic.csv')
df_pred.head()

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[1014728, 5819183, 47200, 997850, 5570004]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[868689, 918365, 982009, 958473, 5133311]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"
2,6,[ 920308 926804 946489 1006718 1017061 ...,"[947094, 8202909, 13158390, 6704033, 1075170]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[999999, 1082185, 1029743, 6534178, 1127831]"
3,7,[ 840386 889774 898068 909714 929067 ...,"[12325299, 971285, 1486331, 860439, 1009567]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 981760, 999999, 1127831, 961554]","[1082185, 981760, 1127831, 999999, 961554]","[999999, 1082185, 1029743, 1127831, 995785]"
4,8,[ 835098 872137 910439 924610 992977 ...,"[2611874, 982000, 844179, 1263041, 1098060]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 1098066]","[1082185, 981760, 999999, 1098066, 826249]","[1082185, 981760, 999999, 1098066, 826249]","[999999, 1082185, 1029743, 1098066, 6534178]"


In [19]:
df_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2042 entries, 0 to 2041
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   user_id                 2042 non-null   int64 
 1   actual                  2042 non-null   object
 2   random_recommendation   2042 non-null   object
 3   popular_recommendation  2042 non-null   object
 4   itemitem                2042 non-null   object
 5   cosine                  2042 non-null   object
 6   tfidf                   2042 non-null   object
 7   own_purchases           2042 non-null   object
dtypes: int64(1), object(7)
memory usage: 127.8+ KB


In [20]:
df_pred.actual[0]

'[  821867   834484   856942   865456   889248   907957   914190   943316\n   951954   954486   958046   962568   969231   971585   979707   986947\n   990656   995242  1004906  1005186  1042083  1050310  1060819  1062002\n  1064441  1069733  1074612  1082185  1131115  1132771  6534544 13876341\n 15971874 17178953   883616   917704   931860   961554  1002032  1031190\n  8090541  8293439  9297615  9527329 15926712  1049998   861272   869465\n   877373   908213   933913   940947   945809   959316   978974  1031697\n  1041796  1048918  1081189  1101422  1115576  1122428  1132231  1132814\n  5577022  8091601  9296986  9677939 10356149 13417048 15741823 15830875]'

In [21]:
df_pred.random_recommendation[0]

'[1014728, 5819183, 47200, 997850, 5570004]'

In [22]:
def str_to_int(row):
    list_int = []
    for i in row.strip('[]').split():
        list_int.append(int(i.rstrip(',')))
    return list_int

In [23]:
str_to_int(df_pred.random_recommendation[0])

[1014728, 5819183, 47200, 997850, 5570004]

In [24]:
for col in df_pred.select_dtypes(include='object').columns:
    df_pred[col] = df_pred.apply(lambda x: str_to_int(x[col]), axis=1)

In [29]:
%time
df_pred['wr_recommend'] = df_pred['user_id'].progress_apply(lambda x: weighted_random_recommendation(df_n, n=5))

Wall time: 0 ns


HBox(children=(FloatProgress(value=0.0, max=2042.0), HTML(value='')))




In [30]:
df_pred.head()

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,wr_recommend
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1014728, 5819183, 47200, 997850, 5570004]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]","[1064356, 919152, 1228798, 118005, 15511233]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[868689, 918365, 982009, 958473, 5133311]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]","[8379285, 9858664, 1069085, 1037245, 997423]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[947094, 8202909, 13158390, 6704033, 1075170]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[999999, 1082185, 1029743, 6534178, 1127831]","[952478, 828935, 13382395, 1079221, 12384270]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[12325299, 971285, 1486331, 860439, 1009567]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 981760, 999999, 1127831, 961554]","[1082185, 981760, 1127831, 999999, 961554]","[999999, 1082185, 1029743, 1127831, 995785]","[6396169, 905197, 953039, 872568, 15830672]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[2611874, 982000, 844179, 1263041, 1098060]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 1098066]","[1082185, 981760, 999999, 1098066, 826249]","[1082185, 981760, 999999, 1098066, 826249]","[999999, 1082185, 1029743, 1098066, 6534178]","[877238, 5565304, 170997, 1040839, 1080429]"


In [31]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)

    return precision

In [32]:
for col in df_pred.iloc[:,2:]:
    print(f'{col.title():22s} : {df_pred.apply(lambda row: precision_at_k(row[col], row["actual"]), axis=1).mean():.5f}')

Random_Recommendation  : 0.00078
Popular_Recommendation : 0.15524
Itemitem               : 0.13692
Cosine                 : 0.13291
Tfidf                  : 0.13898
Own_Purchases          : 0.17969
Wr_Recommend           : 0.00147


#### Наилучший результат дал метод "Собственные покупки" (own_purchases)

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.
- Попробуйте стратегии ансамблирования изученных алгоритмов

In [33]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [34]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [35]:
df_pred2 = df_pred.copy()

#### Улучшение случайных рекомендаций

In [36]:
def random_recommend_top(items, n=5):
    """Случайные рекоммендации"""
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [37]:
%%time
items = top_5000
df_pred2['random_recommendation'] = df_pred2['user_id'].apply(lambda x: random_recommend_top(items))

Wall time: 720 ms


In [38]:
import warnings
warnings.filterwarnings('ignore')

#### Улучшение Item-Item Recommender

In [39]:
# Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [40]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [50]:
%%time
model = ItemItemRecommender(K=3, num_threads=8)                          # K - кол-во ближайших соседей
model.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=True)    # На вход item-user matrix
recs = model.recommend(userid=userid_to_id[2],                           # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(), # на вход user-item matrix
                        N=5,                                             # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)

HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))


Wall time: 484 ms


In [51]:
%%time
df_pred2['itemitem'] = df_pred2['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

Wall time: 40 ms


#### Улучшение Weights_top Recommender

In [52]:
def item_weights_top(df, top=5000):
    sales_sum = df.groupby('item_id')['sales_value'].sum().reset_index()
    sales_sum = sales_sum[sales_sum['sales_value'] > 1]
    sales_sum.sort_values('sales_value', ascending=False, inplace=True)
    sales_sum = sales_sum.head(top)
        
    sales_sum['weight'] = np.log(sales_sum['sales_value'] + 0.000001)
    weight_sum = sales_sum['weight'].sum()
    sales_sum['weight'] = sales_sum['weight'] / weight_sum
    sales_sum.drop('sales_value', axis=1, inplace=True)
    return sales_sum

In [53]:
df_w2 = item_weights_top(data_train)
df_w2

Unnamed: 0,item_id,weight
2381,999999,0.000497
4346,6534178,0.000434
4339,6533889,0.000354
2757,1029743,0.000350
4345,6534166,0.000344
...,...,...
4977,14043825,0.000096
4979,14050436,0.000096
4978,14043826,0.000094
4975,14043817,0.000093


In [54]:
round(df_w2['weight'].sum(),10)

1.0

In [55]:
df_n2 = df_w2.to_numpy()

In [56]:
%time
df_pred2['wr_recommend'] = df_pred2['user_id'].progress_apply(lambda x: weighted_random_recommendation(df_n2, n=5))

Wall time: 0 ns


HBox(children=(FloatProgress(value=0.0, max=2042.0), HTML(value='')))




#### Было:

In [57]:
for col in df_pred.iloc[:,2:]:
    print(f'{col.title():22s} : {df_pred.apply(lambda row: precision_at_k(row[col], row["actual"]), axis=1).mean():.5f}')

Random_Recommendation  : 0.00078
Popular_Recommendation : 0.15524
Itemitem               : 0.13692
Cosine                 : 0.13291
Tfidf                  : 0.13898
Own_Purchases          : 0.17969
Wr_Recommend           : 0.00147


#### Стало:

In [58]:
for col in df_pred2.iloc[:,2:]:
    print(f'{col.title():22s} : {df_pred2.apply(lambda row: precision_at_k(row[col], row["actual"]), axis=1).mean():.5f}')

Random_Recommendation  : 0.00509
Popular_Recommendation : 0.15524
Itemitem               : 0.18609
Cosine                 : 0.13291
Tfidf                  : 0.13898
Own_Purchases          : 0.17969
Wr_Recommend           : 0.00656


#### Сокращение кол-ва K - ближайших соседей до 3-х дало лучший результат безлайну "ItemItemRecommender"