# Initialization

In [1]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

# Загрузка данных

In [3]:
items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

# Разбиение с учётом хронологии

Рекомендательные системы на практике работают с учётом хронологии. Поэтому поток событий для тренировки и валидации полезно делить на то, что уже случилось, и что ещё случится. Это позволяет проводить валидацию на тех же пользователях, на которых тренировались, но на их событиях в будущем.

# === Знакомство: "холодный" старт

In [4]:
# global time split: events started before this date go to train, the rest to test
train_test_global_time_split_date = pd.Timestamp("2017-08-01").date()

train_test_global_time_split_idx = events["started_at"].lt(train_test_global_time_split_date)
events_train = events.loc[train_test_global_time_split_idx]
events_test = events.loc[~train_test_global_time_split_idx]

# distinct users on each side of the split
users_train, users_test = (
    part["user_id"].drop_duplicates() for part in (events_train, events_test)
)
# users that appear in both train and test
common_users = list(set(users_train) & set(users_test))

print(*(len(group) for group in (users_train, users_test, common_users)))

428220 123223 120858


In [5]:
# "cold" users: present in the test period but absent from the training period
cold_users = list(set(users_test) - set(users_train))

# number of cold users
print(len(cold_users))

2365


In [6]:
events_train

Unnamed: 0,user_id,count_read_book,book_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month
0,1000000,29,27774758,2017-01-02,2017-02-02,True,5,False,2017-01-01
1,1000000,29,16101128,2016-09-17,2016-10-05,True,3,False,2016-09-01
2,1000000,29,18798983,2017-03-26,2017-05-12,True,3,False,2017-03-01
3,1000000,29,9969571,2016-08-26,2016-09-15,True,4,False,2016-08-01
4,1000000,29,17851885,2016-08-01,2016-08-09,True,4,False,2016-08-01
...,...,...,...,...,...,...,...,...,...
11751081,1430584,13,18243700,2016-04-29,2016-05-09,True,4,True,2016-04-01
11751082,1430584,13,6614960,2015-11-02,2015-12-25,True,3,False,2015-11-01
11751083,1430584,13,24817626,2015-11-02,2015-11-02,True,1,False,2015-11-01
11751084,1430584,13,7445,2016-04-24,2016-05-18,True,4,True,2016-04-01


In [None]:
from sklearn.preprocessing import MinMaxScaler

# popularity is measured only on events started on or after this date
top_pop_start_date = pd.to_datetime("2015-01-01").date()

# per book: number of distinct users and mean rating
item_popularity = (
    events_train
    .query("started_at >= @top_pop_start_date")
    .groupby("book_id")
    .agg(users=("user_id", "nunique"), avg_rating=("rating", "mean"))
    .reset_index()
)

# min-max scale both statistics so that they share one scale
scaler = MinMaxScaler()
item_popularity["users_norm"], item_popularity["avg_rating_norm"] = scaler.fit_transform(
    item_popularity[["users", "avg_rating"]]
).T

# popularity_score: user reach penalised by a low average rating
item_popularity["popularity_score"] = item_popularity["users_norm"].mul(
    item_popularity["avg_rating_norm"]
)

# best score first
item_popularity = item_popularity.sort_values("popularity_score", ascending=False)

# first 100 items whose avg_rating is at least 4
top_k_pop_items = item_popularity.query("avg_rating >= 4").head(100)

In [None]:
item_popularity

In [None]:
top_k_pop_items

In [None]:
# attach book metadata (author, title, genres, publication year) to the popular items;
# the right-hand frame is indexed by book_id, which is also the join key
top_k_pop_items = top_k_pop_items.merge(
    items.set_index("book_id")[["author", "title", "genre_and_votes", "publication_year"]], on="book_id")

# temporarily raise the row limit so that up to 100 rows are shown
with pd.option_context('display.max_rows', 100):
    display(top_k_pop_items[["book_id", "author", "title", "publication_year", "users", "avg_rating", "popularity_score", "genre_and_votes"]])

In [None]:
# test events of cold users, left-joined with the top popular items
cold_users_events_with_recs = (
    events_test
    .loc[events_test["user_id"].isin(cold_users)]
    .merge(top_k_pop_items, how="left", on="book_id")
)

# a missing avg_rating means the event's book is not among the popular items
cold_user_items_no_avg_rating_idx = cold_users_events_with_recs["avg_rating"].isna()
cold_user_recs = cold_users_events_with_recs.loc[
    ~cold_user_items_no_avg_rating_idx,
    ["user_id", "book_id", "rating", "avg_rating"],
]

In [None]:
events_test

In [None]:
cold_users_events_with_recs['popularity_score'].isna().sum()/cold_users_events_with_recs.shape[0]

In [None]:
# compute the recommendation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

# RMSE is taken as sqrt(MSE): the `squared=False` flag of mean_squared_error
# was deprecated and later removed from scikit-learn, so it is not relied upon
rmse = np.sqrt(mean_squared_error(cold_user_recs["rating"], cold_user_recs["avg_rating"]))
mae = mean_absolute_error(cold_user_recs["rating"], cold_user_recs["avg_rating"])
print(round(rmse, 2), round(mae, 2))

In [None]:
# coverage of cold users by the recommendations

# per user: share of their events whose book matched a popular item
cold_users_hit_ratio = (
    cold_users_events_with_recs
    .groupby("user_id")
    .agg(hits=("avg_rating", lambda ratings: ratings.notna().mean()))
)

print(f"Доля пользователей без релевантных рекомендаций: {cold_users_hit_ratio.eq(0).mean().iat[0]:.2f}")
print(f"Среднее покрытие пользователей: {cold_users_hit_ratio.where(cold_users_hit_ratio.ne(0)).mean().iat[0]:.2f}")

# === Знакомство: первые персональные рекомендации

In [None]:
events[['user_id', 'book_id', 'rating']].shape[0]


In [None]:
events.query('rating == 0')

In [None]:
from surprise import Dataset, Reader, SVD

# Reader carries the rating scale; Dataset.load_from_df converts the events
# (user, item, rating) into the format surprise works with
reader = Reader(rating_scale=(1, 5))
surprise_train_set = (
    Dataset
    .load_from_df(events_train[['user_id', 'book_id', 'rating']], reader)
    .build_full_trainset()
)

# create the model
svd_model = SVD(n_factors=100, random_state=0)

# train the model
svd_model.fit(surprise_train_set)

In [None]:
surprise_test_set = list(events_test[['user_id', 'book_id', 'rating']].itertuples(index=False))

# получаем рекомендации для тестовой выборки
svd_predictions = svd_model.test(surprise_test_set)

In [None]:
from surprise import accuracy

rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)
                     
print(rmse, mae)

In [None]:
from surprise import NormalPredictor

# seed the global NumPy generator so that the same sequence of random numbers
# is produced on every run (for teaching purposes only)
np.random.seed(0)

# baseline model to compare the SVD metrics against
random_model = NormalPredictor()

# fit on the same train set and predict for the same test set as the SVD model
random_model.fit(surprise_train_set)
random_predictions = random_model.test(surprise_test_set)

In [None]:
rmse = accuracy.rmse(random_predictions)
mae = accuracy.mae(random_predictions)
                     
print(rmse, mae) 

In [None]:
events[events['user_id']==1000100]['book_id']

In [None]:
def get_recommendations_svd(user_id, all_items, events, model, include_seen=True, n=10):

    """ Return the n best recommendations for user_id.

    The candidate pool is every distinct book_id found in `events`; the value
    passed as `all_items` is overwritten and never read.
    With include_seen=False, books that user_id already has in `events`
    are removed from the pool.
    `model.predict(user_id, book_id)` must return an object exposing `.iid` and `.est`.
    The result is a DataFrame with columns "book_id" and "score",
    ordered by score, highest first.
    """

    # candidate pool: ids of all books present in the events
    all_items = set(events['book_id'].unique())

    if not include_seen:
        # books the user has already read ("seen") are excluded from the pool
        user_rows = events['user_id'] == user_id
        all_items = all_items - set(events[user_rows]['book_id'].unique())

    # score every candidate
    scored = []
    for book_id in list(all_items):
        scored.append(model.predict(user_id, book_id))

    # highest estimate first, keep the first n
    scored.sort(key=lambda pred: pred.est, reverse=True)
    best = scored[:n]

    return pd.DataFrame([(pred.iid, pred.est) for pred in best], columns=["book_id", "score"])

In [None]:
get_recommendations_svd(1296647, items, events_test, svd_model)

In [None]:
# pick a random user from the training part (the "past")
user_id = events_train['user_id'].sample().iat[0]

print(f"user_id: {user_id}")

print("История (последние события, recent)")
# the user's training events enriched with book metadata
user_history = events_train.query("user_id == @user_id").merge(
    items.set_index("book_id")[["author", "title", "genre_and_votes"]],
    on="book_id",
)
user_history_to_print = user_history.tail(10)[
    ["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]
]
display(user_history_to_print)

print("Рекомендации")
# SVD recommendations for the same user, enriched with book metadata
user_recommendations = get_recommendations_svd(user_id, items, events_train, svd_model).merge(
    items[["book_id", "author", "title", "genre_and_votes"]],
    on="book_id",
)
display(user_recommendations)

# === Базовые подходы: коллаборативная фильтрация

In [None]:
events

In [None]:
events_train = events_train.rename(columns = {'book_id':'item_id'})

In [None]:
events_test = events_test.rename(columns = {'book_id':'item_id'})

In [None]:
items = items.rename(columns = {'book_id':'item_id'})

In [None]:
# importing the submodule binds `scipy` and makes sure `scipy.sparse` is actually loaded,
# instead of depending on another package having imported it as a side effect
import scipy.sparse
import sklearn.preprocessing

# re-encode user ids into the sequence 0, 1, 2, ...
# the encoder is fitted on all events, so train and test users share one encoding
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])

# re-encode item ids into the sequence 0, 1, 2, ...
# the encoder is fitted on the item catalogue
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["item_id"])
items["book_id_enc"] = item_encoder.transform(items["item_id"])
events_train["book_id_enc"] = item_encoder.transform(events_train["item_id"])
events_test["book_id_enc"] = item_encoder.transform(events_test["item_id"])

NameError: name 'events_train' is not defined

In [None]:
events_train['book_id_enc'].max()

In [None]:
a = events['book_id'].nunique() * events['user_id'].nunique() / (1024**3) 
a 

In [None]:
# build the user-item sparse matrix in CSR format.
# The shape is given explicitly: one row per encoded user and one column per encoded item,
# so that ids which never occur in the training events still have a (zero) row/column
# and the matrix can be indexed by any encoded id.
user_item_matrix_train = scipy.sparse.csr_matrix(
    (
        events_train["rating"],
        (events_train["user_id_enc"], events_train["book_id_enc"]),
    ),
    shape=(len(user_encoder.classes_), len(item_encoder.classes_)),
    dtype=np.int8,
)

In [None]:
import sys

# memory held by the CSR buffers (values, column indices, row pointers), in GiB;
# nbytes reports the array storage itself rather than the size of boxed Python scalars
sum(
    buf.nbytes
    for buf in (
        user_item_matrix_train.data,
        user_item_matrix_train.indices,
        user_item_matrix_train.indptr,
    )
) / 1024**3

In [None]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train)

In [None]:
def get_recommendations_als(user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=True, n=5):
    """
    Return ranked recommendations for one user.

    user_id is translated with user_encoder, the model is asked for N=n items
    (items the user already liked are filtered out when include_seen is False),
    and the encoded item ids are translated back with item_encoder.
    The result is a DataFrame with columns book_id_enc, score and book_id.
    """
    encoded_user = user_encoder.transform([user_id])[0]

    raw = model.recommend(
        encoded_user,
        user_item_matrix[encoded_user],
        N=n,
        filter_already_liked_items=not include_seen,
    )

    # raw[0] holds encoded item ids, raw[1] the matching scores
    return pd.DataFrame({"book_id_enc": raw[0], "score": raw[1]}).assign(
        book_id=lambda frame: item_encoder.inverse_transform(frame["book_id_enc"])
    )

In [None]:
# all possible (encoded) user ids: 0 .. number of classes known to user_encoder - 1
user_ids_encoded = range(len(user_encoder.classes_))

# get recommendations for all users at once: 100 items per user,
# items the user already interacted with are not filtered out
als_recommendations = als_model.recommend(
    user_ids_encoded, 
    user_item_matrix_train[user_ids_encoded], 
    filter_already_liked_items=False, N=100)

In [None]:
# convert the raw recommendations into a table
book_ids_enc, als_scores = als_recommendations[0], als_recommendations[1]

# one row per user holding lists, exploded into one row per (user, item) pair,
# then cast to numeric dtypes
als_recommendations = (
    pd.DataFrame({
        "user_id_enc": user_ids_encoded,
        "book_id_enc": book_ids_enc.tolist(),
        "score": als_scores.tolist(),
    })
    .explode(["book_id_enc", "score"], ignore_index=True)
    .astype({"book_id_enc": "int", "score": "float"})
)

# restore the original identifiers and drop the encoded ones
als_recommendations = (
    als_recommendations
    .assign(
        user_id=user_encoder.inverse_transform(als_recommendations["user_id_enc"]),
        book_id=item_encoder.inverse_transform(als_recommendations["book_id_enc"]),
    )
    .drop(columns=["user_id_enc", "book_id_enc"])
)

In [None]:
als_recommendations = als_recommendations[["user_id", "book_id", "score"]]
als_recommendations.to_parquet("als_recommendations.parquet")

In [None]:
# attach the test-period rating (where there is one) to every recommendation.
# events_test may carry the item key as "item_id" (it is renamed earlier in the notebook);
# rename() ignores missing labels, so the key is mapped to "book_id" in either case
als_recommendations = als_recommendations.merge(
    events_test
    .rename(columns={"item_id": "book_id", "rating": "rating_test"})
    [["user_id", "book_id", "rating_test"]],
    on=["user_id", "book_id"],
    how="left",
)

In [None]:
import sklearn.metrics

def compute_ndcg(rating: pd.Series, score: pd.Series, k):

    """ NDCG@k for one group of items
    rating: true ratings
    score: model scores
    k: number of items (by descending score) taken into account, the rest is discarded
    Returns NaN when fewer than 2 items are given (NDCG is undefined then).
    """

    if len(rating) >= 2:
        # ndcg_score expects 2-D inputs: a single row holding this group's items
        y_true = rating.to_numpy().reshape(1, -1)
        y_score = score.to_numpy().reshape(1, -1)
        return sklearn.metrics.ndcg_score(y_true, y_score, k=k)

    return np.nan

In [None]:
# NDCG@5 per user, computed only over recommendations that have a rating in the test period.
# The two needed columns are selected explicitly, so the applied function
# never operates on the grouping column itself.
rating_test_idx = als_recommendations["rating_test"].notna()
ndcg_at_5_scores = (
    als_recommendations[rating_test_idx]
    .groupby("user_id")[["rating_test", "score"]]
    .apply(lambda x: compute_ndcg(x["rating_test"], x["score"], k=5))
)

In [None]:
print(ndcg_at_5_scores.mean())

# === Базовые подходы: контентные рекомендации

In [None]:
import ast

# parse the stringified {genre: votes} dicts.
# ast.literal_eval accepts Python literals only, whereas eval would execute
# any expression found in the data file.
# Values that are not strings (e.g. already parsed) are kept, so the cell can be re-run.
items["genre_and_votes"] = items["genre_and_votes"].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
def get_genres(items):

    """
    Collect every genre found in items["genre_and_votes"] and sum its votes
    over all books.

    items: DataFrame with a "genre_and_votes" column; values that are not
        dicts (e.g. None) are skipped
    Returns a DataFrame with columns "name" and "votes", indexed by "genre_id".
    """

    genres_counter = {}

    for genre_and_votes in items["genre_and_votes"]:
        if not isinstance(genre_and_votes, dict):
            continue
        for genre, votes in genre_and_votes.items():
            # the first occurrence of a genre contributes its votes as well
            genres_counter[genre] = genres_counter.get(genre, 0) + votes

    genres = pd.Series(genres_counter, name="votes")
    genres = genres.to_frame()
    genres = genres.reset_index().rename(columns={"index": "name"})
    genres.index.name = "genre_id"

    return genres
   
genres = get_genres(items)

In [None]:
genres["score"] = genres["votes"] / genres["votes"].sum()
genres.sort_values(by="score", ascending=False).head(10)

In [None]:
def get_item2genre_matrix(genres, items):

    """
    Build a sparse item-by-genre matrix of vote shares.

    genres: frame exposing "genre_id" (as index level or column) and "name"
    items: frame with a "genre_and_votes" column of {genre name: votes} dicts
    Row i of the result corresponds to the i-th row of `items` (by position),
    column j to genre_id j. Rows are L1-normalised; items whose
    "genre_and_votes" is not a dict keep an all-zero row.
    Raises KeyError for a genre name that is missing from `genres`.
    """

    genre_names_to_id = genres.reset_index().set_index("name")["genre_id"].to_dict()

    # parallel lists (values, row positions, column positions) for the CSR constructor
    genres_csr_data = []
    genres_csr_row_idx = []
    genres_csr_col_idx = []

    # iterate over the single needed column instead of materialising every row
    for item_idx, genre_and_votes in enumerate(items["genre_and_votes"]):
        if not isinstance(genre_and_votes, dict):
            continue
        for genre_name, votes in genre_and_votes.items():
            genres_csr_data.append(int(votes))
            genres_csr_row_idx.append(item_idx)
            genres_csr_col_idx.append(genre_names_to_id[genre_name])

    genres_csr = scipy.sparse.csr_matrix(
        (genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)),
        shape=(len(items), len(genres)),
    )
    # normalise so that the genre shares of each item sum to 1
    genres_csr = sklearn.preprocessing.normalize(genres_csr, norm='l1', axis=1)

    return genres_csr

In [None]:
items

In [None]:
items

In [None]:
items = items.sort_values(by="book_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items)

In [None]:
events_train

In [None]:
user_id = 1000010
user_events = events_train.query("user_id == @user_id")[["item_id", "rating"]]
user_items = items[items["item_id"].isin(user_events["item_id"])]

user_items_genres_csr = get_item2genre_matrix(genres, user_items)
user_items_genres_csr

In [None]:
user_events

In [None]:
# the user's affinity to genres: the genre shares of the books they rated,
# weighted by their ratings and averaged over those books

# Ratings as a column vector, divided by 5.
# They are re-ordered to follow the rows of user_items (and therefore of
# user_items_genres_csr): user_items keeps the order of `items`, while user_events
# keeps the order of the events, so the two are not aligned by position.
user_ratings = (
    user_events
    .drop_duplicates(subset="item_id")
    .set_index("item_id")["rating"]
    .reindex(user_items["item_id"])
    .to_numpy() / 5
)
user_ratings = np.expand_dims(user_ratings, axis=1)

# scale every book's genre row by the user's rating of that book
user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

# average over books: one score per genre
user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0))

In [None]:
# genres the user prefers, strongest first

user_genres = (
    genres
    .assign(score=np.ravel(user_genres_scores))
    .query("score > 0")
    .sort_values("score", ascending=False)
)

user_genres.head(5)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine similarity between the user's genre vector and the genre vector of every book
similarity_scores = cosine_similarity(all_items_genres_csr, user_genres_scores)

# flatten to a one-dimensional array (one score per row of all_items_genres_csr)
similarity_scores = similarity_scores.flatten()

# indices of the top-k scores, highest first; these are row positions in all_items_genres_csr
k = 5
top_k_indices = np.argsort(similarity_scores)[::-1][:k]

In [None]:
# top_k_indices are row positions in all_items_genres_csr, i.e. positions in `items`
# in the order it had when that matrix was built; iloc picks exactly those rows and
# keeps them in similarity order (an isin() filter would return them in catalogue order)
selected_items = items.iloc[top_k_indices]

with pd.option_context("max_colwidth", 100):
   display(selected_items[["author", "title", "genre_and_votes"]])

# === Базовые подходы: валидация

In [None]:
def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None, item_col_name='item_id'):

    """
    Label <user_id, item> pairs of the users present in both events_test and recs.

    events_train: events the model was trained on; only its item column is read
    events_test: ground-truth events; the frame is left unmodified
    recs: recommendations with columns user_id, <item_col_name>, score
    top_k: if given, only each user's top_k recommendations (by score) are used
    item_col_name: name of the item id column shared by the three frames

    Returns a DataFrame with one row per pair and the columns user_id,
    <item_col_name>, gt (ground truth), score, pr (prediction), tp, fp, fn.
    """

    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    print(f"Common users: {len(common_users)}")

    # ground truth is limited to items that occur in events_train:
    # the model had no way to recommend items it never saw
    known_items = events_train[item_col_name].unique()
    gt_rows = events_test["user_id"].isin(common_users) & events_test[item_col_name].isin(known_items)
    # assign() builds a new frame, so the caller's events_test does not gain a "gt" column
    events_for_common_users = events_test.loc[gt_rows, ["user_id", item_col_name]].assign(gt=True)

    recs_for_common_users = (
        recs[recs["user_id"].isin(common_users)]
        .sort_values(["user_id", "score"], ascending=[True, False])
    )
    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)

    events_recs_common = events_for_common_users.merge(
        recs_for_common_users[["user_id", item_col_name, "score"]],
        on=["user_id", item_col_name], how="outer")

    # after the outer join gt is either True or missing and score is either a number
    # or missing; notna() turns both into plain boolean columns
    events_recs_common["gt"] = events_recs_common["gt"].notna()
    events_recs_common["pr"] = events_recs_common["score"].notna()

    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common

In [None]:
events_test

In [None]:
# the ALS recommendations carry the item key as "book_id", while the function
# works with its default "item_id"; the key is renamed on the fly for this call only
events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
    events_train,
    events_test,
    als_recommendations.rename(columns={"book_id": "item_id"}),
    top_k=5)

In [None]:
def compute_cls_metrics(events_recs_for_binary_metric):
    """
    Return (precision, recall), each averaged over users.

    The input must have the columns user_id, tp, fp and fn. A per-user ratio
    that is undefined (zero denominator) counts as 0 in the average.
    """
    per_user = events_recs_for_binary_metric.groupby("user_id")[["tp", "fp", "fn"]].sum()
    hits = per_user["tp"]

    # precision = tp / (tp + fp)
    precision = (hits / (hits + per_user["fp"])).fillna(0).mean()

    # recall = tp / (tp + fn)
    recall = (hits / (hits + per_user["fn"])).fillna(0).mean()

    return precision, recall

In [None]:
_,aaa = compute_cls_metrics(events_recs_for_binary_metrics)
print(aaa)

In [None]:
als_recommendations

In [None]:
aaa = als_recommendations[['book_id', 'score']].drop_duplicates()

In [None]:
a = als_recommendations['book_id'].nunique()
a

In [None]:
b = als_recommendations['book_id'][als_recommendations['rating_test']>0].nunique()
b

In [None]:
100/4055

In [None]:
b/a

In [None]:
aaa[aaa['book_id']==15881]

In [None]:
# flag every recommendation with `read`: True when the user has this book in events_train.
# Only the distinct (user_id, book_id) pairs are joined, so no other event columns leak
# into the recommendations and events_train itself is not modified.
# events_train may carry the item key as "item_id" (it is renamed earlier in the notebook);
# rename() ignores missing labels, so the key is mapped to "book_id" in either case.
# Columns left over from a previous run are dropped first so the cell can be re-run.
als_recommendations = als_recommendations.drop(columns=["read", "rank"], errors="ignore").merge(
    events_train
    .rename(columns={"item_id": "book_id"})
    [["user_id", "book_id"]]
    .drop_duplicates()
    .assign(read=True),
    on=["user_id", "book_id"],
    how="left",
)
# after the left join `read` is either True or missing
als_recommendations["read"] = als_recommendations["read"].notna()

# rank the recommendations within each user, best score first
als_recommendations = als_recommendations.sort_values(["user_id", "score"], ascending=[True, False])
als_recommendations["rank"] = als_recommendations.groupby("user_id").cumcount() + 1

# novelty@5 per user: share of the top-5 recommendations the user has not read
novelty_5 = (1-als_recommendations.query("rank <= 5").groupby("user_id")["read"].mean())

# mean novelty over users
mean_novelty_5 = novelty_5.mean()
print(f"Средний novelty@5: {mean_novelty_5:.4f}")

# === Двухстадийный подход: метрики

In [9]:
# cut-off date: test events started before it become labels, the rest form the new test part
split_date_for_labels = pd.Timestamp("2017-09-15").date()

split_date_for_labels_idx = events_test["started_at"].lt(split_date_for_labels)
events_labels = events_test.loc[split_date_for_labels_idx].copy()
events_test_2 = events_test.loc[~split_date_for_labels_idx].copy()

NameError: name 'events_test' is not defined

In [None]:
events_labels['user_id'].nunique()

In [None]:
# load the recommendations produced by the two base generators
als_recommendations = pd.read_parquet("candidates/training/als_recommendations.parquet")
content_recommendations = pd.read_parquet("candidates/training/content_recommendations.parquet")

# outer join: a (user, item) pair proposed by either generator is kept,
# each generator's score goes to its own column
candidates = (
    als_recommendations[["user_id", "item_id", "score"]]
    .rename(columns={"score": "als_score"})
    .merge(
        content_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
        how="outer",
        on=["user_id", "item_id"],
    )
)

In [None]:
candidates.shape

In [None]:
events_labels = events_labels.rename(columns = {'book_id': 'item_id'})

In [None]:
events_labels.head()

In [None]:
events_labels.shape

In [None]:
# add the target to the candidates:
# — 1 for the item_id the user read (present in events_labels)
# — 0 for all the others
# A `target` column left over from a previous run is dropped first so the cell can be
# re-run, and the label pairs are de-duplicated so the join cannot multiply candidate rows.

events_labels["target"] = 1
candidates = candidates.drop(columns=["target"], errors="ignore").merge(
    events_labels[["user_id", "item_id", "target"]].drop_duplicates(),
    on=["user_id", "item_id"], how="left")
candidates["target"] = candidates["target"].fillna(0).astype("int")

# keep only the users that have at least one positive target
candidates_to_sample = candidates.groupby("user_id").filter(lambda x: x["target"].sum() > 0)

# keep at most 4 negative examples per user
# (a user with fewer negatives keeps all of them instead of raising in sample())
negatives_per_user = 4
candidates_for_train = pd.concat([
    candidates_to_sample.query("target == 1"),
    candidates_to_sample.query("target == 0")
        .groupby("user_id")
        .apply(lambda x: x.sample(min(len(x), negatives_per_user), random_state=0))
    ])

In [None]:
candidates["target"].unique()

In [None]:
candidates_for_train.shape

# === Двухстадийный подход: модель

In [None]:
from catboost import CatBoostClassifier, Pool

# names of the feature columns and of the target column
features = ['als_score', 'cnt_score']
target = 'target'

# wrap the training features and labels into a CatBoost Pool
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# create the CatBoostClassifier ranking model (fixed seed, progress logged every 100 iterations)
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0
)

# train the model
cb_model.fit(train_data)

In [None]:
als_recommendations_2 = pd.read_parquet("candidates/inference/als_recommendations.parquet")
content_recommendations_2 = pd.read_parquet("candidates/inference/content_recommendations.parquet")

In [None]:
als_recommendations_2.shape

In [None]:
content_recommendations_2.shape

In [None]:
candidates_to_rank = pd.merge(als_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer")

In [None]:
candidates_to_rank = candidates_to_rank[candidates_to_rank["user_id"].isin(events_test_2["user_id"].drop_duplicates())]
print(len(candidates_to_rank))

In [None]:
# # загружаем рекомендации от двух базовых генераторов
# als_recommendations_2 = pd.read_parquet("candidates/inference/als_recommendations.parquet")
# content_recommendations_2 = pd.read_parquet("candidates/inference/content_recommendations.parquet")

# candidates_to_rank = pd.merge(als_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
#     content_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
#     on=["user_id", "item_id"],
#     how="outer")

# # оставляем только тех пользователей, что есть в тестовой выборке, для экономии ресурсов
# candidates_to_rank = candidates_to_rank[candidates_to_rank["user_id"].isin(events_test_2["user_id"].drop_duplicates())]
# print(len(candidates_to_rank))

In [None]:
# score every candidate with the trained model; the second column of predict_proba is kept
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = predictions[:, 1]

# rank within each user, starting from 1 for the highest cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = candidates_to_rank.groupby("user_id").cumcount().add(1)

# cap the number of recommendations per user
max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.query("rank <= @max_recommendations_per_user")

In [None]:
final_recommendations.shape

In [None]:
events_test_2.head()

In [None]:
final_recommendations = final_recommendations.rename(columns={'item_id':'book_id'})

In [None]:
events_inference

In [None]:
events_inference = pd.concat([events_train, events_labels])

# evaluate with "book_id" as the item key: each frame is renamed to that key for this
# call only; rename() ignores missing labels, so a frame that already has "book_id"
# passes through unchanged
cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference.rename(columns={"item_id": "book_id"}),
    events_test_2.rename(columns={"item_id": "book_id"}),
    final_recommendations.rename(columns={"cb_score": "score", "item_id": "book_id"}),
    top_k=5,
    item_col_name = 'book_id')

In [None]:
events_labels.columns

In [None]:
events_inference = pd.concat([events_train, events_labels])

# final_recommendations may carry the item key as "book_id" (it is renamed earlier in the
# notebook); it is mapped back to "item_id" so that all three frames share the key column.
# rename() ignores missing labels, so a frame that already has "item_id" is unaffected.
cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    events_test_2,
    final_recommendations.rename(columns={"cb_score": "score", "book_id": "item_id"}),
    top_k=5,
    item_col_name = 'item_id')

In [None]:
cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"recall: {cb_recall_5:.3f}")

# === Двухстадийный подход: построение признаков

In [None]:
# global time split: events started before this date go to train, the rest to test
train_test_global_time_split_date = pd.Timestamp("2017-08-01").date()

train_test_global_time_split_idx = events["started_at"].lt(train_test_global_time_split_date)
events_train = events.loc[train_test_global_time_split_idx]
events_test = events.loc[~train_test_global_time_split_idx]

# distinct users on each side of the split
users_train, users_test = (
    part["user_id"].drop_duplicates() for part in (events_train, events_test)
)
# users that appear in both train and test
common_users = list(set(users_train) & set(users_test))

print(*(len(group) for group in (users_train, users_test, common_users)))

428220 123223 120858


In [14]:
items

Unnamed: 0,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,text_reviews_count,publisher,publication_year,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,age
3,6066819,Jennifer Weiner,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,"{'Womens Fiction-Chick Lit': 739, 'Fiction': 442}",368,3.49,51184,3282,Atria Books,2009,US,eng,Hardcover,False,0743294297,9780743294294,"{'Academic': None, 'Academic-Academia': None, ...","Womens Fiction-Chick Lit 739, Fiction 442",9.0
6,378460,Michael Halberstam,The Wanting of Levine,,"{'Politics': 1, 'Humor': 1}",,4.38,12,4,Berkley Publishing Group,1979,US,,Paperback,False,0425040887,9780425040881,"{'Academic': None, 'Academic-Academia': None, ...","Politics 1user, Humor 1user",39.0
15,89375,"Don Piper, Cecil Murphey",90 Minutes in Heaven: A True Story of Death an...,As he is driving home from a minister's confer...,"{'Christian': 395, 'Nonfiction': 392, 'Religio...",,3.91,68157,2885,,,US,,,False,0800759494,9780800759490,"{'Academic': None, 'Academic-Academia': None, ...","Christian 395, Nonfiction 392, Religion 142, S...",
16,89376,Randy Alcorn,Heaven,What is Heaven really going to be like? What w...,"{'Christian': 225, 'Religion-Theology': 154, '...",533,4.26,7345,566,,,US,eng,,False,0842379428,9780842379427,"{'Academic': None, 'Academic-Academia': None, ...","Christian 225, Religion-Theology 154, Nonficti...",
17,89377,Jennifer L. Holm,Penny from Heaven,It's 1953 and 11-year-old Penny dreams of a su...,"{'Historical-Historical Fiction': 284, 'Childr...",288,3.98,6949,615,Random House Books for Young Readers,2006,US,,Hardcover,False,037583687X,9780375836879,"{'Academic': None, 'Academic-Academia': None, ...","Historical-Historical Fiction 284, Childrens-M...",12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2360257,279660,James Goldman,The Lion in Winter,Insecure siblings fighting for their parents' ...,"{'Plays': 294, 'Historical-Historical Fiction'...",103,4.22,8227,100,Random House,2004,US,eng,Paperback,False,0812973356,9780812973358,"{'Academic': None, 'Academic-Academia': None, ...","Plays 294, Historical-Historical Fiction 110, ...",14.0
2360258,7657484,"J. Michael Straczynski, Shane Davis, Sandra Ho...","Superman: Earth One, Volume 1",Forget everything you know about The Man of St...,"{'Sequential Art-Comics': 683, 'Sequential Art...",134,3.90,13221,578,DC Comics,2010,US,eng,Hardcover,False,1401224687,9781401224684,"{'Academic': None, 'Academic-Academia': None, ...","Sequential Art-Comics 683, Sequential Art-Grap...",8.0
2360322,7715664,Larissa Ione,Sin Undone (Demonica #5),HER TOUCH IS DEADLY\nAs the only female Seminu...,"{'Romance-Paranormal Romance': 703, 'Fantasy-P...",400,4.35,23091,819,Grand Central Publishing,2010,US,eng,Mass Market Paperback,False,0446556815,9780446556811,"{'Academic': None, 'Academic-Academia': None, ...","Romance-Paranormal Romance 703, Fantasy-Parano...",8.0
2360395,30367813,J.A. Owenby,The Truth She Knew (The Truth Series #1),"""A bittersweet story of young love, independen...","{'Fiction': 4, 'Romance': 3, 'Business-Amazon'...",238,4.34,111,67,,2016,US,eng,Paperback,False,1533660549,9781533660541,"{'Academic': None, 'Academic-Academia': None, ...","Fiction 4, Romance 3, Business-Amazon 3",2.0


In [None]:
items = items.rename(columns={'book_id':'item_id'})

In [None]:
events_train = events_train.rename(columns={'book_id':'item_id'})

Unnamed: 0,user_id,count_read_book,book_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month
0,1000000,29,27774758,2017-01-02,2017-02-02,True,5,False,2017-01-01
1,1000000,29,16101128,2016-09-17,2016-10-05,True,3,False,2016-09-01
2,1000000,29,18798983,2017-03-26,2017-05-12,True,3,False,2017-03-01
3,1000000,29,9969571,2016-08-26,2016-09-15,True,4,False,2016-08-01
4,1000000,29,17851885,2016-08-01,2016-08-09,True,4,False,2016-08-01
...,...,...,...,...,...,...,...,...,...
11751081,1430584,13,18243700,2016-04-29,2016-05-09,True,4,True,2016-04-01
11751082,1430584,13,6614960,2015-11-02,2015-12-25,True,3,False,2015-11-01
11751083,1430584,13,24817626,2015-11-02,2015-11-02,True,1,False,2015-11-01
11751084,1430584,13,7445,2016-04-24,2016-05-18,True,4,True,2016-04-01


In [None]:
# add the target to the candidates: 1 for the item_id the user read
# (present in events_labels), 0 for all the others.
# A `target` column left over from a previous run is dropped first so the cell can be
# re-run, and the label pairs are de-duplicated so the join cannot multiply candidate rows.
events_labels["target"] = 1
candidates = candidates.drop(columns=["target"], errors="ignore").merge(
    events_labels[["user_id", "item_id", "target"]].drop_duplicates(),
    on=["user_id", "item_id"], how="left")
candidates["target"] = candidates["target"].fillna(0).astype("int")

# keep only the users that have at least one positive target
candidates_to_sample = candidates.groupby("user_id").filter(lambda x: x["target"].sum() > 0)

# keep at most 4 negative examples per user
# (a user with fewer negatives keeps all of them instead of raising in sample())
negatives_per_user = 4
candidates_for_train = pd.concat([
    candidates_to_sample.query("target == 1"),
    candidates_to_sample.query("target == 0")
        .groupby("user_id")
        .apply(lambda x: x.sample(min(len(x), negatives_per_user), random_state=0))
    ])

In [12]:
candidates_for_train.head()

NameError: name 'candidates_for_train' is not defined

In [6]:
items = items.rename(columns={'book_id':'item_id'})

In [7]:
items

Unnamed: 0,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,text_reviews_count,publisher,publication_year,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str
3,6066819,Jennifer Weiner,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,"{'Womens Fiction-Chick Lit': 739, 'Fiction': 442}",368,3.49,51184,3282,Atria Books,2009,US,eng,Hardcover,False,0743294297,9780743294294,"{'Academic': None, 'Academic-Academia': None, ...","Womens Fiction-Chick Lit 739, Fiction 442"
6,378460,Michael Halberstam,The Wanting of Levine,,"{'Politics': 1, 'Humor': 1}",,4.38,12,4,Berkley Publishing Group,1979,US,,Paperback,False,0425040887,9780425040881,"{'Academic': None, 'Academic-Academia': None, ...","Politics 1user, Humor 1user"
15,89375,"Don Piper, Cecil Murphey",90 Minutes in Heaven: A True Story of Death an...,As he is driving home from a minister's confer...,"{'Christian': 395, 'Nonfiction': 392, 'Religio...",,3.91,68157,2885,,,US,,,False,0800759494,9780800759490,"{'Academic': None, 'Academic-Academia': None, ...","Christian 395, Nonfiction 392, Religion 142, S..."
16,89376,Randy Alcorn,Heaven,What is Heaven really going to be like? What w...,"{'Christian': 225, 'Religion-Theology': 154, '...",533,4.26,7345,566,,,US,eng,,False,0842379428,9780842379427,"{'Academic': None, 'Academic-Academia': None, ...","Christian 225, Religion-Theology 154, Nonficti..."
17,89377,Jennifer L. Holm,Penny from Heaven,It's 1953 and 11-year-old Penny dreams of a su...,"{'Historical-Historical Fiction': 284, 'Childr...",288,3.98,6949,615,Random House Books for Young Readers,2006,US,,Hardcover,False,037583687X,9780375836879,"{'Academic': None, 'Academic-Academia': None, ...","Historical-Historical Fiction 284, Childrens-M..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2360257,279660,James Goldman,The Lion in Winter,Insecure siblings fighting for their parents' ...,"{'Plays': 294, 'Historical-Historical Fiction'...",103,4.22,8227,100,Random House,2004,US,eng,Paperback,False,0812973356,9780812973358,"{'Academic': None, 'Academic-Academia': None, ...","Plays 294, Historical-Historical Fiction 110, ..."
2360258,7657484,"J. Michael Straczynski, Shane Davis, Sandra Ho...","Superman: Earth One, Volume 1",Forget everything you know about The Man of St...,"{'Sequential Art-Comics': 683, 'Sequential Art...",134,3.90,13221,578,DC Comics,2010,US,eng,Hardcover,False,1401224687,9781401224684,"{'Academic': None, 'Academic-Academia': None, ...","Sequential Art-Comics 683, Sequential Art-Grap..."
2360322,7715664,Larissa Ione,Sin Undone (Demonica #5),HER TOUCH IS DEADLY\nAs the only female Seminu...,"{'Romance-Paranormal Romance': 703, 'Fantasy-P...",400,4.35,23091,819,Grand Central Publishing,2010,US,eng,Mass Market Paperback,False,0446556815,9780446556811,"{'Academic': None, 'Academic-Academia': None, ...","Romance-Paranormal Romance 703, Fantasy-Parano..."
2360395,30367813,J.A. Owenby,The Truth She Knew (The Truth Series #1),"""A bittersweet story of young love, independen...","{'Fiction': 4, 'Romance': 3, 'Business-Amazon'...",238,4.34,111,67,,2016,US,eng,Paperback,False,1533660549,9781533660541,"{'Academic': None, 'Academic-Academia': None, ...","Fiction 4, Romance 3, Business-Amazon 3"


In [8]:
# Book "age" in years, relative to the fixed reference year 2018.
items["age"] = 2018 - items["publication_year"]
# A negative age means the publication year is after the reference year;
# such values are treated as missing.
invalid_age_idx = items["age"] < 0
items.loc[invalid_age_idx, "age"] = np.nan
items["age"] = items["age"].astype("float")

# Left join, like the other feature merges in this notebook: a candidate whose
# item is absent from `items` is kept (its item features become NaN) instead of
# being silently dropped by the default inner join.
candidates_for_train = candidates_for_train.merge(
    items[['item_id', 'age', 'average_rating']], on='item_id', how='left'
)
candidates_to_rank = candidates_to_rank.merge(
    items[['item_id', 'age', 'average_rating']], on='item_id', how='left'
)

NameError: name 'candidates_for_train' is not defined

In [None]:
candidates_to_rank['age'].median()

In [None]:
events.head()

In [None]:
def get_user_features(events, item_col=None):
    """Compute per-user aggregate features from an events table.

    Args:
        events: DataFrame with `user_id`, `started_at`, `rating` and an item
            identifier column. `started_at` values must support subtraction
            that yields an object with a `.days` attribute.
        item_col: name of the item identifier column. When None (default),
            `item_id` is used if present, otherwise the legacy `book_id`.

    Returns:
        DataFrame indexed by `user_id` with the columns `reading_years`,
        `books_read`, `rating_avg`, `rating_std` and `books_per_year`.

    Raises:
        KeyError: if `item_col` is None and `events` has neither an
            `item_id` nor a `book_id` column.
    """
    if item_col is None:
        # Some frames in this notebook have `book_id` renamed to `item_id`,
        # so accept either name instead of hard-coding one of them.
        item_col = next(
            (col for col in ("item_id", "book_id") if col in events.columns),
            None,
        )
        if item_col is None:
            raise KeyError("events must contain an 'item_id' or 'book_id' column")

    user_features = events.groupby("user_id").agg(
        # span between the first and the last started book, in years
        reading_years=("started_at", lambda x: (x.max()-x.min()).days/365.25),
        books_read=(item_col, "nunique"),
        rating_avg=("rating", "mean"),
        rating_std=("rating", "std"))

    # NOTE: the division is not guarded; reading_years is 0 when all of a
    # user's events share the same `started_at`.
    user_features["books_per_year"] = user_features["books_read"] / user_features["reading_years"]

    return user_features
    
# Training-side user features are computed from the train events.
user_features_for_train = get_user_features(events_train)
candidates_for_train = candidates_for_train.merge(
    user_features_for_train,
    on="user_id",
    how="left",
)

# To save resources, keep only the users that are present in the test events.
events_inference = pd.concat([events_train, events_labels]).loc[
    lambda df: df["user_id"].isin(events_test["user_id"].drop_duplicates())
]

# Ranking-side user features are computed from the train + label events.
user_features_for_ranking = get_user_features(events_inference)
candidates_to_rank = candidates_to_rank.merge(
    user_features_for_ranking,
    on="user_id",
    how="left",
)

In [None]:
candidates_for_train['books_read'].median()

In [None]:
items

In [None]:
items = items.rename(columns = {'book_id': 'item_id'})

In [None]:
import scipy
import sklearn.preprocessing

# Re-encode the item identifiers:
# map the existing ids onto the sequence 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
items["item_id_enc"] = item_encoder.fit_transform(items["item_id"])

In [None]:
items

In [None]:
items

In [None]:
genres

In [None]:

# Build the (items x genres) sparse matrix and L1-normalise every row,
# so that the genre membership scores of each item sum to 1.
genres_csr = sklearn.preprocessing.normalize(
    scipy.sparse.csr_matrix(
        (genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)),
        shape=(len(items), len(genres)),
    ),
    norm='l1',
    axis=1,
)

In [None]:
import scipy

In [None]:
items = items.sort_values(by="item_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items)

In [None]:
items

In [None]:
# Indices of the top-10 genres by votes, and of all the remaining genres.
genres_top_k = 10
genres_top_idx = (
    genres
    .sort_values("votes", ascending=False)
    .head(genres_top_k)
    .index
)
genres_others_idx = list(set(genres.index) - set(genres_top_idx))


In [None]:
genres_top_idx

In [None]:
# One column per top genre, plus a single column aggregating all other genres.
genres_others_column = "genre_others"
genres_top_columns = ["genre_{}".format(genre_idx) for genre_idx in genres_top_idx]
genre_columns = genres_top_columns + [genres_others_column]


In [None]:
genre_columns

In [None]:
aaa = pd.DataFrame(all_items_genres_csr[:, genres_top_idx].todense(), columns=genres_top_columns)

In [None]:
bbb = pd.DataFrame(all_items_genres_csr[:, genres_others_idx].sum(axis=1), columns=[genres_others_column])
bbb

In [None]:
# Build the item-to-genre membership table: one column per top genre and one
# column with the summed share of all the other genres. The positional row
# index of the matrix becomes the `item_id_enc` column.
item_genres = (
    pd.concat(
        [
            pd.DataFrame(
                all_items_genres_csr[:, genres_top_idx].todense(),
                columns=genres_top_columns,
            ),
            # all the remaining genres, summed per item
            pd.DataFrame(
                all_items_genres_csr[:, genres_others_idx].sum(axis=1),
                columns=[genres_others_column],
            ),
        ],
        axis=1,
    )
    .rename_axis("item_id_enc")
    .reset_index()
)

In [None]:
item_genres

In [None]:
# объединяем информацию принадлежности книг к жанрам с основной информацией о книгах
items_new = items.merge(item_genres, on="item_id_enc", how="left")

In [None]:
items_new

In [None]:
def get_user_genres(events, items, item_genre_columns):
    """Average the genre columns of the items each user interacted with.

    Args:
        events: DataFrame with `user_id` and `item_id` columns.
        items: DataFrame with `item_id` and the columns in `item_genre_columns`.
        item_genre_columns: list of genre column names to average.

    Returns:
        DataFrame indexed by `user_id` holding the per-user mean of every
        column in `item_genre_columns`.
    """
    # only the join key and the genre columns are needed from `items`
    item_genre_table = items[["item_id"] + item_genre_columns]
    events_with_genres = events.merge(item_genre_table, on="item_id", how="left")
    return events_with_genres.groupby("user_id")[item_genre_columns].mean()

In [None]:
events

In [None]:
events = events.rename(columns={'book_id':'item_id'})

In [None]:
items.columns

In [None]:
items

In [None]:
events_train = events_train.rename(columns={'book_id':'item_id'})

In [None]:
user_genres_for_train = get_user_genres(events_train, items_new, genre_columns)

In [None]:
candidates_for_train = candidates_for_train.merge(user_genres_for_train, on="user_id", how="left")

In [None]:
events_inference = events_inference.rename(columns={'book_id':'item_id'})

In [None]:
events_inference.columns

In [None]:
events_inference_new = events_inference[['user_id', 'count_read_book', 'item_id', 'started_at', 'read_at',
       'is_read', 'rating', 'is_reviewed', 'started_at_month']]

In [None]:
events_inference_new

In [None]:
user_genres_for_ranking = get_user_genres(events_inference, items_new, genre_columns)

In [None]:
candidates_to_rank = candidates_to_rank.merge(user_genres_for_ranking, on="user_id", how="left") 

In [None]:
genres[genres['name'] == 'Romance']

In [None]:
candidates_for_train.columns

In [None]:
candidates_for_train['genre_34'].median()

In [None]:
from catboost import CatBoostClassifier, Pool

# Names of the feature columns and of the target column.
features = [
    'als_score',
    'cnt_score',
    'age',
    'average_rating',
    'reading_years',
    'books_read',
    'rating_avg',
    'rating_std',
    'books_per_year',
] + genre_columns
target = 'target'

# Wrap the training candidates into a CatBoost Pool.
train_data = Pool(data=candidates_for_train[features], label=candidates_for_train[target])

# Configure the CatBoostClassifier.
cb_model = CatBoostClassifier(
    iterations=1000, learning_rate=0.1, depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0,
)

# Train the model.
cb_model.fit(train_data)

In [None]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

In [None]:
inference_data

In [None]:
predictions

In [None]:
candidates_to_rank["cb_score"] = predictions[:, 1]

In [None]:
candidates_to_rank

In [None]:

# Rank the candidates within each user, starting from 1 for the highest cb_score.
candidates_to_rank = candidates_to_rank.sort_values(
    ["user_id", "cb_score"],
    ascending=[True, False],
)
candidates_to_rank["rank"] = 1 + candidates_to_rank.groupby("user_id").cumcount()

# Keep only the top-ranked candidates of every user.
max_recommendations_per_user = 100
final_recommendations = candidates_to_rank[
    candidates_to_rank["rank"] <= max_recommendations_per_user
]

In [None]:
final_recommendations['user_id'].nunique()

In [None]:
final_recommendations.to_parquet('final_recommendations_feat.parquet', index = False)

In [None]:
# Signature reminder (presumably for process_events_recs_for_binary_metrics — confirm):
#   events_train, events_test, recs, top_k=None, item_col_name='item_id'
# Kept as a comment: as a statement this line is a chained tuple assignment
# that raises ValueError when the cell is executed.

In [None]:
# To save resources, keep only the events of the users
# whose recommendations have to be evaluated.
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test_2["user_id"].drop_duplicates())]

# NOTE(review): the arguments follow the signature
#   (events_train, events_test, recs, top_k=None, item_col_name='item_id')
# — confirm against the definition of process_events_recs_for_binary_metrics.
# The previous call passed the recommendations in the `events_test` position
# and the string "score" in the `recs` position.
cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,  # events known before the evaluation period
    events_test_2,  # events of the evaluation period
    final_recommendations.rename(columns={"cb_score": "score"}),  # recommendations with a `score` column
    top_k=5,
    item_col_name="item_id",  # name of the item identifier column
    )

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}")

# Алгоритм онлайн-рекомендаций

In [1]:
events_train

NameError: name 'events_train' is not defined