In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.sparse import csr_matrix, coo_matrix

In [2]:
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [3]:
data = pd.read_csv('retail_train.csv')
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [4]:
test_size_weeks = 3
data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [5]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


### Сравнение результатов с разным количеством товаров в TOP-списке:

### 7000 товаров

In [6]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_7000 = popularity.sort_values('n_sold', ascending=False).head(7000).item_id.tolist()
data_train.loc[~data_train['item_id'].isin(top_7000), 'item_id'] = 999999
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )
user_item_matrix[user_item_matrix > 0] = 1
user_item_matrix = user_item_matrix.astype(float)
sparse_user_item = csr_matrix(user_item_matrix).tocsr()
user_item_matrix.head(3)

userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values
matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))
id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))
itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

### ItemItem

model = ItemItemRecommender(K=5, num_threads=4)
model.fit(sparse_user_item,
          show_progress=True)
recs = model.recommend(userid=userid_to_id[2],
                        user_items=sparse_user_item,
                        N=5,
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['itemitem_7000'] = result['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

### CosineRecommender

model = CosineRecommender(K=5, num_threads=4)
model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)
recs = model.recommend(userid=userid_to_id[1], 
                        user_items=sparse_user_item,
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['cosine_7000'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)[0]])

### TFIDFRecommender

model = TFIDFRecommender(K=5, num_threads=4)
model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)
recs = model.recommend(userid=userid_to_id[1], 
                        user_items=sparse_user_item,
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['tfidf_7000'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)[0]])

  0%|          | 0/7001 [00:00<?, ?it/s]



  0%|          | 0/7001 [00:00<?, ?it/s]



  0%|          | 0/7001 [00:00<?, ?it/s]

### 5000 товаров

In [7]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )
user_item_matrix[user_item_matrix > 0] = 1
user_item_matrix = user_item_matrix.astype(float)
sparse_user_item = csr_matrix(user_item_matrix).tocsr()
user_item_matrix.head(3)

userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values
matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))
id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))
itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

### ItemItem

model = ItemItemRecommender(K=2, num_threads=4)
model.fit(sparse_user_item,
          show_progress=True)
recs = model.recommend(userid=userid_to_id[2],
                        user_items=sparse_user_item,
                        N=5,
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['itemitem_5000_K2'] = result['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

model = ItemItemRecommender(K=5, num_threads=4)
model.fit(sparse_user_item,
          show_progress=True)
recs = model.recommend(userid=userid_to_id[2],
                        user_items=sparse_user_item,
                        N=5,
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['itemitem_5000_K5'] = result['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

model = ItemItemRecommender(K=8, num_threads=4)
model.fit(sparse_user_item,
          show_progress=True)
recs = model.recommend(userid=userid_to_id[2],
                        user_items=sparse_user_item,
                        N=5,
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['itemitem_5000_K8'] = result['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

### CosineRecommender

model = CosineRecommender(K=5, num_threads=4)
model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)
recs = model.recommend(userid=userid_to_id[1], 
                        user_items=sparse_user_item,
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['cosine_5000'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)[0]])

result.head(2)

model = TFIDFRecommender(K=5, num_threads=4)
model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)
recs = model.recommend(userid=userid_to_id[1], 
                        user_items=sparse_user_item,
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['tfidf_5000'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)[0]])

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]



  0%|          | 0/5000 [00:00<?, ?it/s]



  0%|          | 0/5000 [00:00<?, ?it/s]

### 3000 товаров

In [8]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_3000 = popularity.sort_values('n_sold', ascending=False).head(3000).item_id.tolist()
data_train.loc[~data_train['item_id'].isin(top_3000), 'item_id'] = 999999
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )
user_item_matrix[user_item_matrix > 0] = 1
user_item_matrix = user_item_matrix.astype(float)
sparse_user_item = csr_matrix(user_item_matrix).tocsr()
user_item_matrix.head(3)

userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values
matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))
id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))
itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

### ItemItem

model = ItemItemRecommender(K=5, num_threads=4)
model.fit(sparse_user_item,
          show_progress=True)
recs = model.recommend(userid=userid_to_id[2],
                        user_items=sparse_user_item,
                        N=5,
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['itemitem_3000'] = result['user_id'].\
    map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

### CosineRecommender

model = CosineRecommender(K=5, num_threads=4)
model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)
recs = model.recommend(userid=userid_to_id[1], 
                        user_items=sparse_user_item,
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['cosine_3000'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)[0]])

### TFIDFRecommender

model = TFIDFRecommender(K=5, num_threads=4)
model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)
recs = model.recommend(userid=userid_to_id[1], 
                        user_items=sparse_user_item,
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]], 
                        recalculate_user=False)

result['tfidf_3000'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=False)[0]])

  0%|          | 0/3000 [00:00<?, ?it/s]



  0%|          | 0/3000 [00:00<?, ?it/s]



  0%|          | 0/3000 [00:00<?, ?it/s]

### Результирующая таблица рекомендаций по методам:

In [9]:
result.head(3)

Unnamed: 0,user_id,actual,itemitem_7000,cosine_7000,tfidf_7000,itemitem_5000_K2,itemitem_5000_K5,itemitem_5000_K8,cosine_5000,tfidf_5000,itemitem_3000,cosine_3000,tfidf_3000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1098066, 995242, 981760, 1127831, 1082185]","[1098066, 961554, 1127831, 981760, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[904360, 1029743, 840361, 995242, 1082185]","[1098066, 995242, 981760, 1127831, 1082185]","[1127831, 840361, 995242, 1082185, 981760]","[961554, 1098066, 1127831, 981760, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[1098066, 995242, 1127831, 1082185, 981760]","[961554, 1098066, 1127831, 981760, 1082185]","[1098066, 1127831, 961554, 981760, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1098066, 995242, 981760, 1127831, 1082185]","[1098066, 961554, 1127831, 981760, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[904360, 1029743, 840361, 995242, 1082185]","[1098066, 995242, 981760, 1127831, 1082185]","[1127831, 840361, 995242, 1082185, 981760]","[961554, 1098066, 1127831, 981760, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[1098066, 995242, 1127831, 1082185, 981760]","[961554, 1098066, 1127831, 981760, 1082185]","[1098066, 1127831, 961554, 981760, 1082185]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1098066, 995242, 981760, 1127831, 1082185]","[1098066, 961554, 1127831, 981760, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[904360, 1029743, 840361, 995242, 1082185]","[1098066, 995242, 981760, 1127831, 1082185]","[1127831, 840361, 995242, 1082185, 981760]","[961554, 1098066, 1127831, 981760, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[1098066, 995242, 1127831, 1082185, 981760]","[961554, 1098066, 1127831, 981760, 1082185]","[1098066, 1127831, 961554, 981760, 1082185]"


In [10]:
def precision_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    flags = np.isin(recommended_list, bought_list)
    precision = flags.sum() / len(recommended_list) 
    return precision

In [11]:
def recall_at_k(recommended_list, bought_list, k=5):
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    flags = np.isin(bought_list, recommended_list[:k])
    recall = flags.sum() / len(bought_list)
    return recall

### Создание таблицы с результатами метрик:

In [12]:
dict_rez = {'itemitem_7000':[],'cosine_7000':[],'tfidf_7000':[],'itemitem_5000_K2':[],'itemitem_5000_K5':[],'itemitem_5000_K8':[],'cosine_5000':[],'tfidf_5000':[],'itemitem_3000':[],'cosine_3000':[],'tfidf_3000':[]}

In [13]:
dict_rez['itemitem_7000'].append(result.apply(lambda row: precision_at_k(row['itemitem_7000'], row['actual'],5), axis=1).mean())
dict_rez['itemitem_7000'].append(result.apply(lambda row: precision_at_k(row['itemitem_7000'], row['actual'],3), axis=1).mean())
dict_rez['itemitem_7000'].append(result.apply(lambda row: recall_at_k(row['itemitem_7000'], row['actual'],5), axis=1).mean())

dict_rez['cosine_7000'].append(result.apply(lambda row: precision_at_k(row['cosine_7000'], row['actual'],5), axis=1).mean())
dict_rez['cosine_7000'].append(result.apply(lambda row: precision_at_k(row['cosine_7000'], row['actual'],3), axis=1).mean())
dict_rez['cosine_7000'].append(result.apply(lambda row: recall_at_k(row['cosine_7000'], row['actual'],5), axis=1).mean())

dict_rez['tfidf_7000'].append(result.apply(lambda row: precision_at_k(row['tfidf_7000'], row['actual'],5), axis=1).mean())
dict_rez['tfidf_7000'].append(result.apply(lambda row: precision_at_k(row['tfidf_7000'], row['actual'],3), axis=1).mean())
dict_rez['tfidf_7000'].append(result.apply(lambda row: recall_at_k(row['tfidf_7000'], row['actual'],5), axis=1).mean())

dict_rez['itemitem_5000_K2'].append(result.apply(lambda row: precision_at_k(row['itemitem_5000_K2'], row['actual'],5), axis=1).mean())
dict_rez['itemitem_5000_K2'].append(result.apply(lambda row: precision_at_k(row['itemitem_5000_K2'], row['actual'],3), axis=1).mean())
dict_rez['itemitem_5000_K2'].append(result.apply(lambda row: recall_at_k(row['itemitem_5000_K2'], row['actual'],5), axis=1).mean())

dict_rez['itemitem_5000_K5'].append(result.apply(lambda row: precision_at_k(row['itemitem_5000_K5'], row['actual'],5), axis=1).mean())
dict_rez['itemitem_5000_K5'].append(result.apply(lambda row: precision_at_k(row['itemitem_5000_K5'], row['actual'],3), axis=1).mean())
dict_rez['itemitem_5000_K5'].append(result.apply(lambda row: recall_at_k(row['itemitem_5000_K5'], row['actual'],5), axis=1).mean())

dict_rez['itemitem_5000_K8'].append(result.apply(lambda row: precision_at_k(row['itemitem_5000_K8'], row['actual'],5), axis=1).mean())
dict_rez['itemitem_5000_K8'].append(result.apply(lambda row: precision_at_k(row['itemitem_5000_K8'], row['actual'],3), axis=1).mean())
dict_rez['itemitem_5000_K8'].append(result.apply(lambda row: recall_at_k(row['itemitem_5000_K8'], row['actual'],5), axis=1).mean())

dict_rez['cosine_5000'].append(result.apply(lambda row: precision_at_k(row['cosine_5000'], row['actual'],5), axis=1).mean())
dict_rez['cosine_5000'].append(result.apply(lambda row: precision_at_k(row['cosine_5000'], row['actual'],3), axis=1).mean())
dict_rez['cosine_5000'].append(result.apply(lambda row: recall_at_k(row['cosine_5000'], row['actual'],5), axis=1).mean())

dict_rez['tfidf_5000'].append(result.apply(lambda row: precision_at_k(row['tfidf_5000'], row['actual'],5), axis=1).mean())
dict_rez['tfidf_5000'].append(result.apply(lambda row: precision_at_k(row['tfidf_5000'], row['actual'],3), axis=1).mean())
dict_rez['tfidf_5000'].append(result.apply(lambda row: recall_at_k(row['tfidf_5000'], row['actual'],5), axis=1).mean())

dict_rez['itemitem_3000'].append(result.apply(lambda row: precision_at_k(row['itemitem_3000'], row['actual'],5), axis=1).mean())
dict_rez['itemitem_3000'].append(result.apply(lambda row: precision_at_k(row['itemitem_3000'], row['actual'],3), axis=1).mean())
dict_rez['itemitem_3000'].append(result.apply(lambda row: recall_at_k(row['itemitem_3000'], row['actual'],5), axis=1).mean())

dict_rez['cosine_3000'].append(result.apply(lambda row: precision_at_k(row['cosine_3000'], row['actual'],5), axis=1).mean())
dict_rez['cosine_3000'].append(result.apply(lambda row: precision_at_k(row['cosine_3000'], row['actual'],3), axis=1).mean())
dict_rez['cosine_3000'].append(result.apply(lambda row: recall_at_k(row['cosine_3000'], row['actual'],5), axis=1).mean())

dict_rez['tfidf_3000'].append(result.apply(lambda row: precision_at_k(row['tfidf_3000'], row['actual'],5), axis=1).mean())
dict_rez['tfidf_3000'].append(result.apply(lambda row: precision_at_k(row['tfidf_3000'], row['actual'],3), axis=1).mean())
dict_rez['tfidf_3000'].append(result.apply(lambda row: recall_at_k(row['tfidf_3000'], row['actual'],5), axis=1).mean())

In [14]:
df_rez = pd.DataFrame(dict_rez, index=['Precision@5', 'Precision@3', 'Recall@5'])
df_rez

Unnamed: 0,itemitem_7000,cosine_7000,tfidf_7000,itemitem_5000_K2,itemitem_5000_K5,itemitem_5000_K8,cosine_5000,tfidf_5000,itemitem_3000,cosine_3000,tfidf_3000
Precision@5,0.145739,0.135455,0.135455,0.162977,0.145739,0.157003,0.135455,0.135455,0.145739,0.135455,0.135455
Precision@3,0.119654,0.058929,0.058929,0.104799,0.119654,0.094842,0.058929,0.058929,0.076069,0.058929,0.058929
Recall@5,0.016218,0.014031,0.014031,0.019229,0.016218,0.01773,0.014031,0.014031,0.016218,0.014031,0.014031


### Вывод: увеличение или уменьшение количества выборки Top-товаров никак не повлияло на результаты рекомендаций и, соответственно, на качество метрик. Однако изменение К-соседей оказывает влияние на ItemItemRecommender, причем уменьшение значения оказывает лучший эффект на метрики, чем увеличение значения.