In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.sparse import csr_matrix, coo_matrix

from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

from tqdm import tqdm

In [4]:
data = pd.read_csv('/Users/mac/Downloads/webinar_2/data/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


В рекомендательных системах корректнее использовать train-test split по времени, а не случайно.

Возмём последние 3 недели в качестве теста.

In [5]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]
data_train

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2282320,222,41297772783,635,1120741,1,0.59,304,0.00,1716,91,0.0,0.0
2282321,462,41297773713,635,993339,1,1.99,304,0.00,2040,91,0.0,0.0
2282322,462,41297773713,635,995242,1,1.00,304,-0.89,2040,91,0.0,0.0
2282323,462,41297773713,635,10180324,1,3.00,304,-0.29,2040,91,0.0,0.0


In [6]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

result['actual'] = result['actual'].apply(lambda x: list(x))

result.head(10)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
5,9,"[864335, 990865, 1029743, 9297474, 10457112, 8..."
6,13,"[6534178, 1104146, 829197, 840361, 862070, 884..."
7,14,"[840601, 867293, 933067, 951590, 952408, 96569..."
8,15,"[910439, 1082185, 959076, 1023958, 1082310, 13..."
9,16,"[1062973, 1082185, 13007710]"


In [7]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


In [8]:
def random_recommendation(items, n=5):

    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [9]:
%%time

items = data_train.item_id.unique()
print(items)

result['rand rec'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
result.head(5)

[ 1004906  1033142  1036325 ... 15722756 17170636 15716393]
CPU times: user 2.6 s, sys: 16.1 ms, total: 2.62 s
Wall time: 2.64 s


Unnamed: 0,user_id,actual,rand rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1049661, 6603362, 12330770, 990866, 5570446]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[9526701, 9195130, 855328, 1066608, 9673099]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[7431134, 725789, 922759, 17209779, 998741]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[6039787, 8025867, 9575554, 1304545, 12457046]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[15716609, 12524188, 890040, 13382040, 12352506]"


In [10]:
hit_rate = 0
for i in range(result.shape[0]):
    flags = np.isin(result.loc[i,'actual'], result.loc[i,'rand rec'])
    hit_rate += (flags.sum() > 0).astype(int)
hit_rate

6

In [11]:
def weighted_random_recommendation(items, p, n=5):

    
    recs = np.random.choice(items, size=n, replace=False, p=p)
    
    return recs.tolist()

In [12]:
def data_prep(df, func):
    items = df.item_id.unique()

    items = np.array(items)

    count_df = df.groupby('item_id').item_id.agg(['count'])

    count_df['weight'] = func(count_df['count'] / count_df['count'].sum())

    items_df = pd.DataFrame(items, columns = ['item_id'])

    probability_df = pd.merge(items_df, count_df, on='item_id', how='left')

    probability_df.fillna(value=0, inplace=True)

    p = np.array(probability_df.to_numpy()[:, 2])

    p /= p.sum()  # normalize
    
    return items, p

In [13]:
%%time

items, p = data_prep(data_train, np.log)

result['weighted log rand rec'] = \
        result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

result.head(3)

CPU times: user 1.79 s, sys: 68.7 ms, total: 1.86 s
Wall time: 1.89 s


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1049661, 6603362, 12330770, 990866, 5570446]","[1894237, 12188393, 1041381, 57314, 1009625]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[9526701, 9195130, 855328, 1066608, 9673099]","[1009851, 1116425, 1052087, 833890, 13910210]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[7431134, 725789, 922759, 17209779, 998741]","[6551741, 1322954, 9396885, 1957362, 922466]"


In [14]:
%%time

items, p = data_prep(data_train, np.sqrt)

result['weighted sqrt rand rec'] = \
            result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

result.head(2)

CPU times: user 1.71 s, sys: 35.4 ms, total: 1.74 s
Wall time: 1.75 s


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1049661, 6603362, 12330770, 990866, 5570446]","[1894237, 12188393, 1041381, 57314, 1009625]","[1065089, 5995423, 920227, 1136257, 1089323]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[9526701, 9195130, 855328, 1066608, 9673099]","[1009851, 1116425, 1052087, 833890, 13910210]","[959179, 848270, 1058322, 5995199, 9885520]"


In [15]:
%%time

items, p = data_prep(data_train, np.square)

result['weighted square rand rec'] = \
            result['user_id'].apply(lambda x: weighted_random_recommendation(items, p, n=5))

result.head(2)

CPU times: user 2 s, sys: 37.9 ms, total: 2.03 s
Wall time: 2.04 s


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1049661, 6603362, 12330770, 990866, 5570446]","[1894237, 12188393, 1041381, 57314, 1009625]","[1065089, 5995423, 920227, 1136257, 1089323]","[1082185, 995785, 860776, 953104, 904360]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[9526701, 9195130, 855328, 1066608, 9673099]","[1009851, 1116425, 1052087, 833890, 13910210]","[959179, 848270, 1058322, 5995199, 9885520]","[1106523, 6534178, 994928, 1044078, 1070820]"


In [16]:
def random_recommendation(items, n=5):

    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [17]:
%%time

items = data_train.item_id.unique()

result['rand rec'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))
result.head(3)

CPU times: user 2.5 s, sys: 14.4 ms, total: 2.52 s
Wall time: 2.53 s


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1117944, 8159536, 1052481, 864586, 14043825]","[1894237, 12188393, 1041381, 57314, 1009625]","[1065089, 5995423, 920227, 1136257, 1089323]","[1082185, 995785, 860776, 953104, 904360]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1010718, 861459, 13133985, 1132248, 869755]","[1009851, 1116425, 1052087, 833890, 13910210]","[959179, 848270, 1058322, 5995199, 9885520]","[1106523, 6534178, 994928, 1044078, 1070820]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[2794351, 1100951, 909955, 6424142, 9655004]","[6551741, 1322954, 9396885, 1957362, 922466]","[1103826, 13008328, 5584027, 15926844, 896666]","[923746, 1082185, 1029743, 1081177, 910032]"


In [18]:
def popularity_recommendation(data, n=5):
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [19]:
%%time

popular_recs = popularity_recommendation(data_train, n=5)

result['popular rec'] = result['user_id'].apply(lambda x: popular_recs)
result.head(3)

CPU times: user 96.2 ms, sys: 21.3 ms, total: 117 ms
Wall time: 122 ms


Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1117944, 8159536, 1052481, 864586, 14043825]","[1894237, 12188393, 1041381, 57314, 1009625]","[1065089, 5995423, 920227, 1136257, 1089323]","[1082185, 995785, 860776, 953104, 904360]","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1010718, 861459, 13133985, 1132248, 869755]","[1009851, 1116425, 1052087, 833890, 13910210]","[959179, 848270, 1058322, 5995199, 9885520]","[1106523, 6534178, 994928, 1044078, 1070820]","[6534178, 6533889, 1029743, 6534166, 1082185]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[2794351, 1100951, 909955, 6424142, 9655004]","[6551741, 1322954, 9396885, 1957362, 922466]","[1103826, 13008328, 5584027, 15926844, 896666]","[923746, 1082185, 1029743, 1081177, 910032]","[6534178, 6533889, 1029743, 6534166, 1082185]"


In [20]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [21]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [22]:
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2497,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2498,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
user_item_matrix[user_item_matrix > 0] = 1 
user_item_matrix = user_item_matrix.astype(float) 

sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)

# users:  2499
# items:  89051
# interactions:  2396804


In [25]:
interactions / (users*items)

0.010770291654185115

In [26]:
user_item_matrix.sum().sum() / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) * 100

5.33770796861036

In [27]:
np.sort(data.item_id.unique())

array([   25671,    26081,    26093, ..., 18000012, 18024155, 18024556])

In [28]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [29]:
itemid_to_id

{202291: 0,
 397896: 1,
 420647: 2,
 480014: 3,
 545926: 4,
 707683: 5,
 731106: 6,
 818980: 7,
 819063: 8,
 819227: 9,
 819255: 10,
 819304: 11,
 819308: 12,
 819330: 13,
 819518: 14,
 819594: 15,
 819643: 16,
 819765: 17,
 819840: 18,
 819845: 19,
 819927: 20,
 819978: 21,
 820082: 22,
 820122: 23,
 820165: 24,
 820291: 25,
 820301: 26,
 820321: 27,
 820361: 28,
 820486: 29,
 820518: 30,
 820560: 31,
 820701: 32,
 820895: 33,
 821025: 34,
 821083: 35,
 821200: 36,
 821209: 37,
 821219: 38,
 821344: 39,
 821464: 40,
 821556: 41,
 821562: 42,
 821695: 43,
 821730: 44,
 821735: 45,
 821787: 46,
 821867: 47,
 821976: 48,
 822049: 49,
 822073: 50,
 822101: 51,
 822140: 52,
 822178: 53,
 822225: 54,
 822241: 55,
 822339: 56,
 822346: 57,
 822407: 58,
 822517: 59,
 822524: 60,
 822646: 61,
 822677: 62,
 822739: 63,
 822785: 64,
 822936: 65,
 822965: 66,
 823099: 67,
 823176: 68,
 823356: 69,
 823704: 70,
 823721: 71,
 823758: 72,
 823775: 73,
 823862: 74,
 823915: 75,
 823990: 76,
 824005: 

In [30]:
%%time

model = ItemItemRecommender(K=5, num_threads=4) 
model.fit(csr_matrix(user_item_matrix).T.tocsr(),  
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  
                        user_items=csr_matrix(user_item_matrix).tocsr(),   
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 1.42 s, sys: 33.8 ms, total: 1.45 s
Wall time: 1.47 s


Рассчитываем для каждого пользователя его рекомендации и добавляем к итоговой таблице сравнений метрик.

In [31]:
%%time

result['itemitem'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

CPU times: user 54.7 ms, sys: 2.58 ms, total: 57.3 ms
Wall time: 57.3 ms


In [32]:
result.head(3)

Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1117944, 8159536, 1052481, 864586, 14043825]","[1894237, 12188393, 1041381, 57314, 1009625]","[1065089, 5995423, 920227, 1136257, 1089323]","[1082185, 995785, 860776, 953104, 904360]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1010718, 861459, 13133985, 1132248, 869755]","[1009851, 1116425, 1052087, 833890, 13910210]","[959179, 848270, 1058322, 5995199, 9885520]","[1106523, 6534178, 994928, 1044078, 1070820]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[2794351, 1100951, 909955, 6424142, 9655004]","[6551741, 1322954, 9396885, 1957362, 922466]","[1103826, 13008328, 5584027, 15926844, 896666]","[923746, 1082185, 1029743, 1081177, 910032]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]"


### Косинусное сходство и CosineRecommender

In [33]:
%%time

model = CosineRecommender(K=5, num_threads=4) 

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 1.42 s, sys: 24.8 ms, total: 1.45 s
Wall time: 1.45 s


In [34]:
%%time

result['cosine'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

CPU times: user 63.9 ms, sys: 1.57 ms, total: 65.5 ms
Wall time: 64.8 ms


In [35]:
result.head(3)

Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1117944, 8159536, 1052481, 864586, 14043825]","[1894237, 12188393, 1041381, 57314, 1009625]","[1065089, 5995423, 920227, 1136257, 1089323]","[1082185, 995785, 860776, 953104, 904360]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1010718, 861459, 13133985, 1132248, 869755]","[1009851, 1116425, 1052087, 833890, 13910210]","[959179, 848270, 1058322, 5995199, 9885520]","[1106523, 6534178, 994928, 1044078, 1070820]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[2794351, 1100951, 909955, 6424142, 9655004]","[6551741, 1322954, 9396885, 1957362, 922466]","[1103826, 13008328, 5584027, 15926844, 896666]","[923746, 1082185, 1029743, 1081177, 910032]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]"


### TF-IDF взвешивание и TFIDFRecommender

In [36]:
%%time

model = TFIDFRecommender(K=5, num_threads=4) 

model.fit(csr_matrix(user_item_matrix).T.tocsr(), 
          show_progress=True)

recs = model.recommend(userid=userid_to_id[1], 
                        user_items=csr_matrix(user_item_matrix).tocsr(),   
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))


CPU times: user 1.48 s, sys: 20.8 ms, total: 1.5 s
Wall time: 1.52 s


In [37]:
%%time

result['tfidf'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=False)])

CPU times: user 65.5 ms, sys: 1.77 ms, total: 67.3 ms
Wall time: 66.5 ms


In [38]:
result.head(3)

Unnamed: 0,user_id,actual,rand rec,weighted log rand rec,weighted sqrt rand rec,weighted square rand rec,popular rec,itemitem,cosine,tfidf
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1117944, 8159536, 1052481, 864586, 14043825]","[1894237, 12188393, 1041381, 57314, 1009625]","[1065089, 5995423, 920227, 1136257, 1089323]","[1082185, 995785, 860776, 953104, 904360]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1010718, 861459, 13133985, 1132248, 869755]","[1009851, 1116425, 1052087, 833890, 13910210]","[959179, 848270, 1058322, 5995199, 9885520]","[1106523, 6534178, 994928, 1044078, 1070820]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[2794351, 1100951, 909955, 6424142, 9655004]","[6551741, 1322954, 9396885, 1957362, 922466]","[1103826, 13008328, 5584027, 15926844, 896666]","[923746, 1082185, 1029743, 1081177, 910032]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]"


### Оценки моделей по precision@5

In [None]:
for column in result.columns[2:]:
    print(column, round(result.apply(lambda x: precision_at_k(x[column], x['actual'],  5), axis=1).mean(), 5))