In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

# Data preparation

In [100]:
data = pd.read_csv('C:/Users/Вадим/Desktop/GeekBrains/Recommendation-systems/Lectures/Lecture_2/webinar_2/webinar_2/data/retail_train.csv')
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


Time interval

In [101]:
data['week_no'].nunique()

95

Let's determine the number of unique users, products, and interactions.

In [102]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)

# users:  2499
# items:  89051
# interactions:  2396804


In [103]:
popularity = data.groupby('item_id')['sales_value'].sum().reset_index()
popularity.describe()

Unnamed: 0,item_id,sales_value
count,89051.0,89051.0
mean,5115772.0,83.458481
std,5178973.0,1628.715079
min,25671.0,0.0
25%,966583.0,3.5
50%,1448516.0,10.78
75%,9553042.0,46.105
max,18024560.0,467993.62


In [104]:
item_features = pd.read_csv('C:/Users/Вадим/Desktop/GeekBrains/Recommendation-systems/Lectures/Lecture_2/webinar_2/webinar_2/data/product.csv')
item_features.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [105]:
user_features = pd.read_csv('C:/Users/Вадим/Desktop/GeekBrains/Recommendation-systems/Lectures/Lecture_2/webinar_2/webinar_2/data/hh_demographic.csv')
user_features.head()

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


### Train-test split

In recommendation systems, it is more correct to use train-test-split by time, and not by chance  
Let's take the last 3 weeks as a test

In [106]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [107]:
data_train.shape[0], data_test.shape[0]

(2278490, 118314)

In [108]:
popularity_quantity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity_quantity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity_quantity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [109]:
def top_n_popularity(popularity, n):
    return popularity.sort_values('n_sold', ascending=False).head(n).item_id.tolist()

# 1. Baselines

Create a dataframe with user purchases on a test dataset (last 3 weeks)

In [110]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [111]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('There are {} users in the test data set'.format(test_users))
print('There are {} new users in the test data set'.format(new_test_users))

There are 2042 users in the test data set
There are 0 new users in the test data set


### 1.1 Random recommendation

In [112]:
import os, sys
    
from metrics import precision_at_k, recall_at_k

In [113]:
def random_recommendation(items, n=5):
    """Random recommendations"""
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [114]:
%%time

for i in [10000, 5000, 2000, 1000, 500, 100, 50]:
    top_n = top_n_popularity(popularity_quantity, i)
    items = data_train.loc[data_train['item_id'].isin(top_n)].item_id.unique()
    name_col = 'random_rec_top' + str(i)
    result[name_col] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))

CPU times: total: 3.11 s
Wall time: 3.11 s


In [115]:
result.head(2)

Unnamed: 0,user_id,actual,random_rec_top10000,random_rec_top5000,random_rec_top2000,random_rec_top1000,random_rec_top500,random_rec_top100,random_rec_top50
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1007136, 1055915, 969231, 1127511, 9835509]","[1055072, 845868, 15596515, 1048746, 821730]","[993466, 7441558, 917742, 877447, 848029]","[944317, 919644, 867519, 873902, 1038663]","[9526411, 890695, 6533765, 7167218, 1056651]","[862139, 202291, 1070820, 1013321, 981760]","[2690723, 6544236, 1426702, 6534178, 5712216]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1009631, 13511822, 927019, 948953, 1097024]","[9527494, 5568732, 831557, 831628, 1094924]","[10121965, 947798, 976065, 847789, 995211]","[822785, 985119, 905539, 968932, 956609]","[9705473, 823990, 1117514, 1016800, 994928]","[1044078, 1029743, 1065593, 8090521, 1071939]","[202291, 420647, 707683, 6544236, 2690723]"


In [116]:
for name_col in result.columns[1:]:
    print(f"{round(result.apply(lambda row: precision_at_k(row[name_col], row['actual']), axis=1).mean(),4):.4f}:{name_col}")

1.0000:actual
0.0036:random_rec_top10000
0.0073:random_rec_top5000
0.0119:random_rec_top2000
0.0136:random_rec_top1000
0.0223:random_rec_top500
0.0532:random_rec_top100
0.0465:random_rec_top50


Лучший показатель метрики при рекомендации top_50 покупаемых товаров.

In [117]:
for i in [10000, 5000, 2000, 1000, 500, 100]:
    name_col = 'random_rec_top' + str(i)
    result.drop(columns=name_col, inplace=True)

result.head(2)

Unnamed: 0,user_id,actual,random_rec_top50
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[2690723, 6544236, 1426702, 6534178, 5712216]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[202291, 420647, 707683, 6544236, 2690723]"


### 1.2 Popularity-based recommendation

In [118]:
def popularity_recommendation(data, n=5):
    """Top "n" popular products"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [119]:
%%time

# This can be done, since the recommendation does not depend on the user
popular_recs = popularity_recommendation(data_train, n=5)

result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)
result.head(2)

CPU times: total: 219 ms
Wall time: 200 ms


Unnamed: 0,user_id,actual,random_rec_top50,popular_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[2690723, 6544236, 1426702, 6534178, 5712216]","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[202291, 420647, 707683, 6544236, 2690723]","[6534178, 6533889, 1029743, 6534166, 1082185]"


### 1.3 Weighted random recommender

- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [120]:
def items_weight(data):
    """adding weights"""
    
    df = data.copy()
    df['item_weight'] = df['sales_value']/df['sales_value'].sum()
   
    return df

In [121]:
def items_weight_recommendation(data, n=5):
    """Рекомендация по весам"""
    
    data = items_weight(data)
    recs = np.random.choice(data['item_id'], n, p=data['item_weight'], replace=False)
   
    return recs

In [122]:
%%time

for i in [10000, 5000, 2000, 1000, 500, 100, 50]:
    top_n = top_n_popularity(popularity_quantity, i)
    item_weight_recs = items_weight_recommendation(data_train.loc[data_train['item_id'].isin(top_n)], n=5)
    name_col = 'item_weight_rec_top' + str(i)
    result[name_col] = result['user_id'].apply(lambda x: item_weight_recs)

CPU times: total: 1.52 s
Wall time: 1.41 s


In [123]:
for name_col in result.columns[1:]:
    print(f"{round(result.apply(lambda row: precision_at_k(row[name_col], row['actual']), axis=1).mean(),4):.4f}:{name_col}")

1.0000:actual
0.0465:random_rec_top50
0.1552:popular_recommendation
0.0073:item_weight_rec_top10000
0.0129:item_weight_rec_top5000
0.0521:item_weight_rec_top2000
0.1267:item_weight_rec_top1000
0.1259:item_weight_rec_top500
0.1319:item_weight_rec_top100
0.0595:item_weight_rec_top50


После двух прогонов лучший результат по метрике был при top_1000. Оставим его для дальнейшего сравнения.

In [124]:
for i in [10000, 5000, 2000, 500, 100, 50]:
    name_col = 'item_weight_rec_top' + str(i)
    result.drop(columns=name_col, inplace=True)

result.head(2)

Unnamed: 0,user_id,actual,random_rec_top50,popular_recommendation,item_weight_rec_top1000
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[2690723, 6544236, 1426702, 6534178, 5712216]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[202291, 420647, 707683, 6544236, 2690723]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]"


### Conclusions on Baselines
- Fix the basic quality;
- Baselines can be filters;
- Sometimes baselines are better than ML models

# 2. Deterministic algorithms

## 2.1 Item-Item Recommender / ItemKNN

**(!) Important** 

- Item-item algorithms have a lot of complexity ($O(I^2 log(I))$ или $O(I^3)$, depending on the implementation
- If there are a lot of item_ids in the dataset, then item-item models predict for a VERY long time. With all products predict on test ~2 hours
- Let's take from ~90k products only 5k of the most popular

*P.S.*  Taking the top popular ones and recommending only from them is a very popular strategy.

In [125]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [126]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [127]:
data_train.head(100)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,1060,26985040735,1,9553288,1,8.49,315,0.00,1251,1,0.0,0.0
96,1351,26985052379,1,903230,1,0.99,447,-0.30,1955,1,0.0,0.0
97,744,26985165432,1,5978648,0,0.00,31582,0.00,1119,1,0.0,0.0
98,212,26985205886,1,822346,1,1.25,288,-0.34,1341,1,0.0,0.0


In [128]:
# Let's get a fictitious item_id (if the user bought products from the top 5000, then he "bought" such a product)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 6666
data_train.head(100)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,1060,26985040735,1,9553288,1,8.49,315,0.00,1251,1,0.0,0.0
96,1351,26985052379,1,903230,1,0.99,447,-0.30,1955,1,0.0,0.0
97,744,26985165432,1,6666,0,0.00,31582,0.00,1119,1,0.0,0.0
98,212,26985205886,1,822346,1,1.25,288,-0.34,1341,1,0.0,0.0


In [129]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат sparse matrix
sparse_user_item = csr_matrix(user_item_matrix)


## Разряженность матрицы

In [130]:
user_item_matrix.sum().sum() / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) * 100

5.33770796861036

In [131]:
# создаем словари мапинга между id бизнеса к строчному id матрицы

userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

# Fit

In [132]:
for i in range(1,11):
    model = ItemItemRecommender(K=i, num_threads=8) # K - кол-во билжайших соседей
    model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)
    
    recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)
    
    name_col = 'itemitem_neighbour_' + str(i)
    result[name_col] = result['user_id'].apply(lambda user_id: [id_to_itemid[rec[0]]  
                                                              for rec in model.recommend(userid=userid_to_id[user_id], 
                                                                user_items=sparse_user_item,   # на вход user-item matrix
                                                                N=5, 
                                                                filter_already_liked_items=False, 
                                                                filter_items=None, 
                                                                recalculate_user=True)
                                                                             ])

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [133]:
for name_col in result.columns[1:]:
    print(f"{round(result.apply(lambda row: precision_at_k(row[name_col], row['actual']), axis=1).mean(),4):.4f}:{name_col}")

1.0000:actual
0.0465:random_rec_top50
0.1552:popular_recommendation
0.1267:item_weight_rec_top1000
0.1923:itemitem_neighbour_1
0.1920:itemitem_neighbour_2
0.1861:itemitem_neighbour_3
0.1449:itemitem_neighbour_4
0.1368:itemitem_neighbour_5
0.1421:itemitem_neighbour_6
0.1450:itemitem_neighbour_7
0.1472:itemitem_neighbour_8
0.1485:itemitem_neighbour_9
0.1509:itemitem_neighbour_10


Лучший результат по метрике был при neighbour_1. Оставим его для дальнейшего сравнения.

In [134]:
# itemitem_neighbour_
for i in range(2,11):
    name_col = 'itemitem_neighbour_' + str(i)
    result.drop(columns=name_col, inplace=True)

result.head(2)

Unnamed: 0,user_id,actual,random_rec_top50,popular_recommendation,item_weight_rec_top1000,itemitem_neighbour_1
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[2690723, 6544236, 1426702, 6534178, 5712216]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 995242, 1029743, 840361]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[202291, 420647, 707683, 6544236, 2690723]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 1098066, 6534178, 826249]"


### 2.3 Косинусное сходство и CosineRecommender

<img src="cosine_similarity.png">

In [135]:
for i in range(1,11):
    model = CosineRecommender(K=i, num_threads=8) # K - кол-во билжайших соседей
    
    model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)
    
    recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)
    
    name_col = 'cosine_neighbour_' + str(i)
    result[name_col] = result['user_id'].apply(lambda x: [id_to_itemid[rec[0]] for rec in
                                    model.recommend(userid=userid_to_id[x],
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [136]:
for name_col in result.columns[1:]:
    print(f"{round(result.apply(lambda row: precision_at_k(row[name_col], row['actual']), axis=1).mean(),4):.4f}:{name_col}")

1.0000:actual
0.0465:random_rec_top50
0.1552:popular_recommendation
0.1267:item_weight_rec_top1000
0.1923:itemitem_neighbour_1
0.1728:cosine_neighbour_1
0.1498:cosine_neighbour_2
0.1427:cosine_neighbour_3
0.1363:cosine_neighbour_4
0.1329:cosine_neighbour_5
0.1339:cosine_neighbour_6
0.1313:cosine_neighbour_7
0.1307:cosine_neighbour_8
0.1311:cosine_neighbour_9
0.1320:cosine_neighbour_10


Лучший результат по метрике был при neighbour_1. Оставим его для дальнейшего сравнения.

In [137]:
for i in range(2,11):
    name_col = 'cosine_neighbour_' + str(i)
    result.drop(columns=name_col, inplace=True)

result.head(2)

Unnamed: 0,user_id,actual,random_rec_top50,popular_recommendation,item_weight_rec_top1000,itemitem_neighbour_1,cosine_neighbour_1
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[2690723, 6544236, 1426702, 6534178, 5712216]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 995242, 1029743, 840361]","[1082185, 1029743, 1081177, 904360, 6034857]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[202291, 420647, 707683, 6544236, 2690723]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 1098066, 6534178, 826249]","[1082185, 951590, 1044078, 9526410, 904360]"


### 4.3 TF-IDF взвешивание и TFIDFRecommender

<img src='tf_idf.png'>

Если 2 юзера оба купили очень популярный товар, то это еще не значит,что они похожи   
Если 2 юзера оба купили редкий товар, то они похожи

Занижаем вес популярных товаров при расчете расстояний между пользователями

In [138]:
for i in range(1,11):
    model = TFIDFRecommender(K=i, num_threads=8) # K - кол-во билжайших соседей
    
    model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)
    
    recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)
    
    name_col = 'tfidf_neighbour_' + str(i)
    result[name_col] = result['user_id'].apply(lambda x: [id_to_itemid[rec[0]] for rec in
                                    model.recommend(userid=userid_to_id[x],
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=False, 
                                    recalculate_user=False)])

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [139]:
for name_col in result.columns[1:]:
    print(f"{round(result.apply(lambda row: precision_at_k(row[name_col], row['actual']), axis=1).mean(),4):.4f}:{name_col}")

1.0000:actual
0.0465:random_rec_top50
0.1552:popular_recommendation
0.1267:item_weight_rec_top1000
0.1923:itemitem_neighbour_1
0.1728:cosine_neighbour_1
0.1291:tfidf_neighbour_1
0.1560:tfidf_neighbour_2
0.1464:tfidf_neighbour_3
0.1408:tfidf_neighbour_4
0.1390:tfidf_neighbour_5
0.1380:tfidf_neighbour_6
0.1394:tfidf_neighbour_7
0.1382:tfidf_neighbour_8
0.1395:tfidf_neighbour_9
0.1415:tfidf_neighbour_10


Лучший результат по метрике был при neighbour_2. Оставим его для дальнейшего сравнения.

In [140]:
for i in range(1,11):
    if i==2:
        continue
    name_col = 'tfidf_neighbour_' + str(i)
    result.drop(columns=name_col, inplace=True)

result.head(2)

Unnamed: 0,user_id,actual,random_rec_top50,popular_recommendation,item_weight_rec_top1000,itemitem_neighbour_1,cosine_neighbour_1,tfidf_neighbour_2
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[2690723, 6544236, 1426702, 6534178, 5712216]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 995242, 1029743, 840361]","[1082185, 1029743, 1081177, 904360, 6034857]","[1082185, 6666, 961554, 840361, 979707]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[202291, 420647, 707683, 6544236, 2690723]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 1098066, 6534178, 826249]","[1082185, 951590, 1044078, 9526410, 904360]","[1082185, 1098066, 6666, 883404, 826249]"


### 2.5 Трюк

In [141]:
%%time

model = ItemItemRecommender(K=1, num_threads=8) # K - кол-во билжайших соседей


model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: total: 2.55 s
Wall time: 788 ms


In [142]:
%%time

result['own_purchases'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[6666]], 
                                    recalculate_user=True)])

CPU times: total: 531 ms
Wall time: 105 ms


### 2.6 Измерим качество по precision@5

In [143]:
result.head(5)

Unnamed: 0,user_id,actual,random_rec_top50,popular_recommendation,item_weight_rec_top1000,itemitem_neighbour_1,cosine_neighbour_1,tfidf_neighbour_2,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[2690723, 6544236, 1426702, 6534178, 5712216]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 995242, 1029743, 840361]","[1082185, 1029743, 1081177, 904360, 6034857]","[1082185, 6666, 961554, 840361, 979707]","[1082185, 995242, 1029743, 840361, 904360]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[202291, 420647, 707683, 6544236, 2690723]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 1098066, 6534178, 826249]","[1082185, 951590, 1044078, 9526410, 904360]","[1082185, 1098066, 6666, 883404, 826249]","[1082185, 1098066, 6534178, 826249, 1127831]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[916122, 6534178, 951590, 707683, 904360]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 981760, 995242, 1029743]","[1082185, 951590, 994928, 1029743, 1044078]","[1082185, 6666, 1127831, 981760, 1044078]","[1082185, 981760, 995242, 1029743, 840361]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[5716076, 6534178, 826249, 5712216, 1098066]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 995242, 1029743, 826249]","[1082185, 994928, 1044078, 1029743, 6034857]","[1082185, 1127831, 6666, 938700, 1013321]","[1082185, 995242, 1029743, 826249, 1127831]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[5845857, 2690723, 731106, 1029743, 5668996]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1081177, 6534178, 863802, 1082185, 6534166]","[6666, 1082185, 981760, 995242, 1029743]","[1082185, 854852, 951590, 1029743, 1044078]","[1082185, 1098066, 6666, 981760, 1044078]","[1082185, 981760, 995242, 1029743, 840361]"


# Metrics

In [144]:
for name_col in result.columns[1:]:
    print(f"{round(result.apply(lambda row: precision_at_k(row[name_col], row['actual']), axis=1).mean(),4)}:{name_col}")

1.0:actual
0.0465:random_rec_top50
0.1552:popular_recommendation
0.1267:item_weight_rec_top1000
0.1923:itemitem_neighbour_1
0.1728:cosine_neighbour_1
0.156:tfidf_neighbour_2
0.2199:own_purchases


  return flags.sum() / len(recommended_list)
