### Implicit & LightFM


In [70]:
import os
import numpy as np 
import pandas as pd 

import scipy.sparse as sp
from itertools import islice, cycle
from more_itertools import pairwise
from tqdm.auto import tqdm

from src.validation import TimeRangeSplit
from src.metrics import compute_metrics
from src.sparse import get_coo_matrix
from src.implicit import generate_implicit_recs_mapper

from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender

In [42]:
cd D:\GIT_REPOES\Courses-Cheat_Sheets\Courses\Your First RecSys [ODS]\data

D:\GIT_REPOES\Courses-Cheat_Sheets\Courses\Your First RecSys [ODS]\data


In [43]:
# Data Loading
users_df = pd.read_pickle('users_preprocessed.pickle')
items_df = pd.read_pickle('items_preprocessed.pickle')
interactions_df = pd.read_pickle('interactions_preprocessed.pickle')

In [44]:
# Matrices Shape
print('Users: ', users_df.shape)
print('Items: ', items_df.shape)
print('Interatcions: ', interactions_df.shape)

Users:  (142888, 3)
Items:  (59599, 5)
Interatcions:  (1532998, 5)


In [45]:
# Mapping unique users from interactions_df
users_inv_mapping = dict(enumerate(interactions_df['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}
len(users_mapping)

151600

In [46]:
# Mapping unique items from interactions_df
items_inv_mapping = dict(enumerate(interactions_df['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}
len(items_mapping)

59599

In [47]:
titles = pd.Series(items_df['title'].values, items_df['id']).to_dict()
print('Titles Count: ', len(titles))

# Accessing a title using id 
titles[221944]

Titles Count:  59599


'Праздный разговор'

In [48]:
title_items = items_df.groupby('title')['id'].agg(list)
title_items

title
# 20 восьмая                                                     [201623]
# DUO                                                             [72582]
# Me Too. Роман                                                  [171172]
# Партия                                                         [224512]
#1917: Человек из раньшего времени. Библиотека «Проекта 1917»    [230768]
                                                                   ...   
…хоть потоп!                                                      [55093]
№ 12, или История одного прекрасного юноши                        [20979]
伦巴德人的故事                                                          [119226]
地球への旅                                                            [148400]
�Baby blues�                                                      [98635]
Name: id, Length: 57358, dtype: object

In [49]:
title_count = title_items.map(len)
title_count.value_counts()

1     55826
2      1163
3       232
4        71
5        36
6        12
7         7
8         3
9         2
23        1
18        1
47        1
13        1
12        1
11        1
Name: id, dtype: int64

In [50]:
title_items[title_count > 1].tail()

title
Яма                                              [60156, 165785]
Янки из Коннектикута при дворе короля Артура      [14759, 56530]
Японская диета                                   [168986, 74652]
Яр                                                [168761, 5371]
Ящик Пандоры                                    [236465, 158851]
Name: id, dtype: object

In [51]:
items_df[items_df['title'] == 'Ящик Пандоры']

Unnamed: 0,id,title,genres,authors,year
40426,236465,Ящик Пандоры,"Любовно-фантастические романы,Научная фантастика",Филипп Хорват,2017
54854,158851,Ящик Пандоры,"Мистика,Современная зарубежная литература",Бернар Вербер,2018


In [52]:
interactions_df['rating'] = np.array(interactions_df['rating'].values, dtype=np.float32)

In [53]:
interactions_df.loc[interactions_df['item_id'].isin([44681, 162716])].groupby('item_id').agg({
    'progress': np.size,
    'rating': ['mean'],
    'start_date': ['min', 'max']
})

Unnamed: 0_level_0,progress,rating,start_date,start_date
Unnamed: 0_level_1,size,mean,min,max
item_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
44681,353,4.56,2018-01-24,2019-12-20
162716,59,4.8,2018-01-25,2019-12-30


### Validation
Select 7 last days and test sequentially 

In [54]:
last_date = interactions_df['start_date'].max().normalize()
folds = 7
start_date = last_date - pd.Timedelta(days=folds)
start_date, last_date

(Timestamp('2019-12-24 00:00:00'), Timestamp('2019-12-31 00:00:00'))

In [55]:
cv = TimeRangeSplit(start_date=start_date, periods=folds+1)

cv.max_n_splits, cv.get_n_splits(interactions_df, datetime_column='start_date')

(7, 7)

In [56]:
cv.date_range

DatetimeIndex(['2019-12-24', '2019-12-25', '2019-12-26', '2019-12-27',
               '2019-12-28', '2019-12-29', '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', freq='D')

In [57]:
folds_with_stats = list(cv.split(
    interactions_df, 
    user_column='user_id',
    item_column='item_id',
    datetime_column='start_date',
    fold_stats=True
))

folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])

In [58]:
folds_info_with_stats

Unnamed: 0,Start date,End date,Train,New users,New users interactions,New items,New items interactions,Known interactions,Test
0,2019-12-24,2019-12-25,1515946,3,3,0,0,0,2045
1,2019-12-25,2019-12-26,1517994,1,1,0,0,0,2141
2,2019-12-26,2019-12-27,1520136,0,0,0,0,0,2177
3,2019-12-27,2019-12-28,1522313,0,0,0,0,0,2110
4,2019-12-28,2019-12-29,1524423,2,4,0,0,0,2205
5,2019-12-29,2019-12-30,1526632,4,4,0,0,0,2118
6,2019-12-30,2019-12-31,1528754,1,1,0,0,0,2168


In [59]:
fold_dates = [(info['Start date'], info['End date']) for _, _, info in folds_with_stats]
fold_dates

[(Timestamp('2019-12-24 00:00:00', freq='D'),
  Timestamp('2019-12-25 00:00:00', freq='D')),
 (Timestamp('2019-12-25 00:00:00', freq='D'),
  Timestamp('2019-12-26 00:00:00', freq='D')),
 (Timestamp('2019-12-26 00:00:00', freq='D'),
  Timestamp('2019-12-27 00:00:00', freq='D')),
 (Timestamp('2019-12-27 00:00:00', freq='D'),
  Timestamp('2019-12-28 00:00:00', freq='D')),
 (Timestamp('2019-12-28 00:00:00', freq='D'),
  Timestamp('2019-12-29 00:00:00', freq='D')),
 (Timestamp('2019-12-29 00:00:00', freq='D'),
  Timestamp('2019-12-30 00:00:00', freq='D')),
 (Timestamp('2019-12-30 00:00:00', freq='D'),
  Timestamp('2019-12-31 00:00:00', freq='D'))]

### Implicit Library
- Target - Implicit
- Input data - Sparse matrices with float32/float64 datatype

In [60]:
train_idx, test_idx, info = folds_with_stats[0]

train = interactions_df.loc[train_idx]
test = interactions_df.loc[test_idx]


print('Train Shape: ', train.shape)
print('Test Shape: ', test.shape)

Train Shape:  (1515946, 5)
Test Shape:  (2045, 5)


In [61]:
train_sparse = get_coo_matrix(train, users_mapping=users_mapping, items_mapping=items_mapping).tocsr()
train_sparse

<151589x59599 sparse matrix of type '<class 'numpy.float32'>'
	with 1515946 stored elements in Compressed Sparse Row format>

### Item2Item Models
Similar items are defined using `cosine measure` for those objects that has been seen by a user (i.e. interacted)

Models: 
- `CosineRecommender`,
- `BM25Recommender`
- `TFIDFRecommender`

They take in a matrix (**item_user** not user_item), thus we have to pass `train_sparse.T`


In [62]:
# Model Building 
cosine_model = CosineRecommender(K=10)
cosine_model.fit(train_sparse.T) 

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=151589.0), HTML(value='')))




In [63]:
top_N = 10
user_id = test['user_id'].iloc[0]
row_id = users_mapping[user_id]
print(f'Рекомендации для пользователя {user_id}, номер строки - {row_id}')

Рекомендации для пользователя 139942, номер строки - 10346


`K` parameter defines the size of a top-k elements in a final recommendation list. Thus, parameter `N` better to be lower than `K`




In [64]:
recs = cosine_model.recommend(row_id, train_sparse, N=top_N, filter_already_liked_items=True)
recs = pd.DataFrame({'col_id': recs[0] , 'similarity': recs[1]})
recs

Unnamed: 0,col_id,similarity
0,91935,0.333333
1,126579,0.333333
2,137749,0.353553
3,132059,0.353553
4,132029,0.353553
5,148884,0.353553
6,151350,0.377964
7,47006,0.408248
8,5254,0.518545
9,58022,0.377964


In [68]:
# Add item name 
recs['item_id'] = recs['col_id'].map(items_inv_mapping.get)
recs['title'] = recs['item_id'].map(titles.get)
recs

Unnamed: 0,col_id,similarity,item_id,title
0,91935,0.333333,,
1,126579,0.333333,,
2,137749,0.353553,,
3,132059,0.353553,,
4,132029,0.353553,,
5,148884,0.353553,,
6,151350,0.377964,,
7,47006,0.408248,57989.0,39 ключей: Гнездо гадюки. Код императора
8,5254,0.518545,85506.0,Путь к теннисному Олимпу
9,58022,0.377964,148226.0,Известия 08-2019


In [72]:
mapper = generate_implicit_recs_mapper(cosine_model, train_sparse, top_N, users_mapping, items_inv_mapping)
mapper

<function src.implicit.generate_implicit_recs_mapper.<locals>._recs_mapper(user)>

In [76]:
# recs = pd.DataFrame({
#     'user_id': test['user_id'].unique()
# })
# recs['item_id'] = recs['user_id'].map(mapper)
# recs.head()