# Initialization

In [1]:
import logging
import scipy
import sys
import sklearn.preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy
from surprise import NormalPredictor
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity
from catboost import CatBoostClassifier, Pool

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
items = pd.read_parquet("items.parquet")
events = pd.read_parquet("events.parquet")

In [3]:
# Задаем точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()
train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date

# Разбиваем события на train и test
events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]


In [4]:
# Перекодируем идентификаторы пользователей:
# из имеющихся в последовательность 0, 1, 2, ...
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])


In [5]:
# Перекодируем идентификаторы объектов:
# из имеющихся в последовательность 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["item_id"])
items["item_id_enc"] = item_encoder.transform(items["item_id"])
events_train["item_id_enc"] = item_encoder.transform(events_train["item_id"])
events_test["item_id_enc"] = item_encoder.transform(events_test["item_id"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["item_id_enc"] = item_encoder.transform(events_train["item_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["item_id_enc"] = item_encoder.transform(events_test["item_id"])


In [6]:
# Определяем параметры для ALS-модели
factors = 50
iterations = 50
regularization = 0.05
random_state = 0

In [7]:
# создаём sparse-матрицу формата CSR 
user_item_matrix_train = scipy.sparse.csr_matrix((
    events_train["rating"],
    (events_train['user_id_enc'], events_train['item_id_enc'])),
    dtype=np.int8)

In [8]:
# Инициализируем и обучаем ALS-модель
als_model = AlternatingLeastSquares(factors=factors, iterations=iterations, regularization=regularization, random_state=random_state)
als_model.fit(user_item_matrix_train)

  check_blas_config()
100%|██████████| 50/50 [03:09<00:00,  3.80s/it]


In [9]:
# Определяем максимальное количество похожих объектов
max_similar_items = 10

# Получаем энкодированные идентификаторы всех объектов, известных нам из events_train
train_item_ids_enc = events_train['item_id_enc'].unique()

# Получаем списки похожих объектов, используя ранее полученную ALS-модель
# метод similar_items возвращает и сам объект, как наиболее похожий
# этот объект мы позже отфильтруем, но сейчас запросим на 1 больше
similar_items = als_model.similar_items(train_item_ids_enc, N=max_similar_items + 1)

In [10]:
# Выведем несколько первых элементов для проверки формата данных
for i, item in enumerate(similar_items):
    if i < 5:
        print(f"Item {i}: {item}")
    else:
        break

Item 0: [[ 2460  2458   806 ...  6528   231  9519]
 [38691 39575 40111 ... 33291 28606 39756]
 [38867 38023 38951 ...   262 33096 34839]
 ...
 [43151 19601 29207 ... 36847 29109 27863]
 [11649  9021 25953 ... 18027   309 41137]
 [38365 35201 36140 ... 35631 23687 32474]]
Item 1: [[0.9999998  0.92248917 0.8747575  ... 0.8054254  0.80437195 0.8015513 ]
 [0.9999998  0.93434316 0.93062454 ... 0.89426976 0.8928981  0.89064574]
 [0.9999999  0.9388754  0.9345328  ... 0.9074671  0.90322596 0.9018409 ]
 ...
 [0.99999976 0.7168633  0.68175954 ... 0.61550957 0.60553604 0.60014474]
 [1.0000001  0.69465107 0.6693783  ... 0.6611025  0.65485984 0.65391916]
 [0.99999964 0.94169396 0.81464666 ... 0.50771886 0.49635783 0.4834801 ]]


In [11]:
events_train.head(10)

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,user_id_enc,item_id_enc
0,1000000,22034,2015-07-12,2015-07-17,True,5,False,0,2460
1,1000000,22318578,2015-06-07,2015-08-09,True,5,True,0,38691
2,1000000,22551730,2015-06-24,2015-07-11,True,4,True,0,38867
3,1000000,22816087,2015-09-27,2015-11-04,True,5,True,0,39109
5,1000000,17910054,2015-03-04,2015-07-28,True,3,False,0,35638
6,1000000,17306293,2014-07-20,2014-07-22,True,3,False,0,34747
7,1000000,17860739,2014-07-26,2014-09-21,True,4,True,0,35533
8,1000000,5064,2014-04-17,2014-04-27,True,5,True,0,549
9,1000000,13526165,2015-03-10,2015-03-18,True,5,True,0,31909
10,1000000,3977,2013-02-10,2013-02-16,True,4,True,0,429


In [12]:
# преобразуем полученные списки в табличный формат
sim_item_item_ids_enc = similar_items[0]
sim_item_scores = similar_items[1]

In [13]:
similar_items = pd.DataFrame({
    "item_id_enc":train_item_ids_enc.tolist(),
    "sim_item_id_enc": sim_item_item_ids_enc.tolist(), 
    "score": sim_item_scores.tolist()})

In [14]:
similar_items = similar_items.explode(["sim_item_id_enc", "score"], ignore_index=True)

In [15]:
similar_items.head()

Unnamed: 0,item_id_enc,sim_item_id_enc,score
0,2460,2460,1.0
1,2460,2458,0.922489
2,2460,806,0.874758
3,2460,2459,0.873761
4,2460,12528,0.850651


In [16]:
# приводим типы данных
similar_items["sim_item_id_enc"] = similar_items["sim_item_id_enc"].astype("int")
similar_items["score"] = similar_items["score"].astype("float")

In [17]:
# получаем изначальные идентификаторы
similar_items["item_id_1"] = item_encoder.inverse_transform(similar_items["item_id_enc"])
similar_items["item_id_2"] = item_encoder.inverse_transform(similar_items["sim_item_id_enc"])
similar_items = similar_items.drop(columns=["item_id_enc", "sim_item_id_enc"])


In [18]:
# убираем пары с одинаковыми объектами
similar_items = similar_items.query("item_id_1 != item_id_2") 

In [19]:
similar_items.to_parquet("similar_items.parquet") 

In [20]:
similar_items.query("item_id_1 == 7126")

Unnamed: 0,score,item_id_1,item_id_2
25873,0.948722,7126,7190
25874,0.940996,7126,24280
25875,0.930141,7126,1953
25876,0.925065,7126,58696
25877,0.916336,7126,38296
25878,0.916012,7126,2932
25879,0.913947,7126,7184
25880,0.911429,7126,387749
25881,0.909868,7126,7733
25882,0.909452,7126,30597


In [21]:
def print_sim_items(item_id, similar_items):

    item_columns_to_use = ["item_id", "author", "title", "genre_and_votes", "average_rating", "ratings_count"]
    
    item_id_1 = items.query("item_id == @item_id")[item_columns_to_use]
    display(item_id_1)
    
    si = similar_items.query("item_id_1 == @item_id")
    si = si.merge(items[item_columns_to_use].set_index("item_id"), left_on="item_id_2", right_index=True)
    display(si)

In [26]:
print_sim_items(17245, similar_items)

Unnamed: 0,item_id,author,title,genre_and_votes,average_rating,ratings_count
1058909,17245,"Bram Stoker, Nina Auerbach, David J. Skal",Dracula,"{'Classics': 19603, 'Horror': 10601, 'Fiction'...",3.98,636895


Unnamed: 0,score,item_id_1,item_id_2,author,title,genre_and_votes,average_rating,ratings_count
23937,0.928821,17245,480204,"Gaston Leroux, Alexander Teixeira de Mattos",The Phantom of the Opera,"{'Classics': 7010, 'Fiction': 2103, 'Horror': ...",3.97,144859
23938,0.900337,17245,51496,"Robert Louis Stevenson, Vladimir Nabokov, Merv...",The Strange Case of Dr. Jekyll and Mr. Hyde,"{'Classics': 12342, 'Fiction': 4037, 'Horror':...",3.79,229898
23939,0.898938,17245,93261,Washington Irving,The Legend of Sleepy Hollow,"{'Classics': 2594, 'Horror': 1182, 'Fiction': ...",3.74,26776
23940,0.897708,17245,295,Robert Louis Stevenson,Treasure Island,"{'Classics': 11249, 'Fiction': 4405, 'Adventur...",3.82,274424
23941,0.896466,17245,2623,"Charles Dickens, Marisa Sestino",Great Expectations,"{'Classics': 19645, 'Fiction': 6662, 'Literatu...",3.75,468462
23942,0.895993,17245,18254,"Charles Dickens, Philip Horne, Gerald Dickens",Oliver Twist,"{'Classics': 11450, 'Fiction': 3656, 'Historic...",3.85,235560
23943,0.886899,17245,7190,Alexandre Dumas,"The Three Musketeers (The D'Artagnan Romances,...","{'Classics': 9823, 'Fiction': 3256, 'Historica...",4.06,198892
23944,0.881914,17245,24213,"Lewis Carroll, John Tenniel, Martin Gardner",Alice's Adventures in Wonderland & Through the...,"{'Classics': 11568, 'Fantasy': 6184, 'Fiction'...",4.06,344482
23945,0.878391,17245,2932,"Daniel Defoe, Virginia Woolf",Robinson Crusoe,"{'Classics': 7725, 'Fiction': 3305, 'Adventure...",3.66,181415
23946,0.870234,17245,1953,"Charles Dickens, Richard Maxwell",A Tale of Two Cities,"{'Classics': 20021, 'Fiction': 6969, 'Historic...",3.82,646983
