# Инициализация

Загружаем библиотеки необходимые для выполнения кода ноутбука.

In [60]:
import logging
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import sys
import sklearn.preprocessing
import sklearn.metrics
import boto3

from dotenv import load_dotenv
from implicit.als import AlternatingLeastSquares
from pandas.tseries.offsets import MonthEnd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy
from surprise import NormalPredictor
from sklearn.metrics.pairwise import cosine_similarity
from catboost import CatBoostClassifier, Pool

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

In [61]:
session = boto3.session.Session()
s3_client = session.client(
    service_name='s3',
    endpoint_url='https://storage.yandexcloud.net'
)

In [58]:
load_dotenv()
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")

# === ЭТАП 1 ===

# Загрузка первичных данных

Загружаем первичные данные из файлов:
- tracks.parquet
- catalog_names.parquet
- interactions.parquet

In [None]:
tracks = pd.read_parquet("./recsys/initial/tracks.parquet")
tracks.info()

In [None]:
catalog_names = pd.read_parquet("./recsys/initial/catalog_names.parquet")
catalog_names.info()

In [None]:
interactions = pd.read_parquet("./recsys/initial/interactions.parquet")
interactions.info()

# Обзор данных

Проверяем данные, есть ли с ними явные проблемы.

In [None]:
tracks.sample(3).T

In [None]:
tracks['track_id'].nunique()

In [None]:
catalog_names.sample(3).T

In [None]:
catalog_names['id'].nunique()

In [None]:
catalog_names['type'].value_counts()

In [None]:
interactions.sample(3).T

In [None]:
interactions['user_id'].value_counts()

In [None]:
interactions["started_at"].dt.to_period('M').value_counts()

In [None]:
interactions["started_at"].dt.to_period('Y').value_counts()

# Выводы

Приведём выводы по первому знакомству с данными:
- есть ли с данными явные проблемы,
- какие корректирующие действия (в целом) были предприняты.

1. Необходимо удалить пользователей с 5 или меньше событиями  
2. Для сокращения размера лучше оставить информацию за последние 2 месяца
3. Необходимо объединить информацию о треках с треками

In [None]:
interactions = interactions[interactions.groupby(['user_id']).transform('size') > 5]
interactions['user_id'].value_counts()

In [None]:
interactions.info()

In [None]:
event_cutoff_date = pd.to_datetime("2022-11-01").date()
interactions = interactions.query("started_at > @event_cutoff_date").copy()

In [None]:
interactions.info()

In [None]:
catalog_names['type'].value_counts()

In [None]:
track_names = catalog_names.loc[catalog_names['type'] == "track"]
track_names = track_names.drop(columns=["type"])
track_names.info()

In [None]:
album_names = catalog_names.loc[catalog_names['type'] == "album"]
album_names = album_names.drop(columns=["type"])
album_names = album_names.rename(columns={"name": "album_name"})
album_names.info()

In [None]:
album_names['id'].nunique()

In [None]:
artist_names = catalog_names.loc[catalog_names['type'] == "artist"]
artist_names = artist_names.drop(columns=["type"])
artist_names = artist_names.rename(columns={"name": "artist_name"})
artist_names.info()

In [None]:
genre_names = catalog_names.loc[catalog_names['type'] == "genre"]
genre_names = genre_names.drop(columns=["type"])
genre_names = genre_names.rename(columns={"name": "genre_name"})
genre_names.info()

In [None]:
tracks = tracks.merge(track_names, left_on='track_id', right_on='id')
tracks.head()

In [25]:
genre_names = genre_names.set_index("id")

In [26]:
tracks['genre_names'] = np.NAN

In [None]:
gnr = genre_names.T.to_dict('list')
gnr

In [None]:
for k,item in tracks['genres'].items():
    arr = []
    for i in range(len(item)):
        if item[i] in gnr.keys():
            arr.append(gnr[item[i]][0])
    tracks['genre_names'][k] = arr        

In [None]:
tracks.head(5)

In [49]:
artist_names = artist_names.set_index("id")

In [None]:
artists = artist_names.T.to_dict('list')
artists

In [None]:
artist_names.head()

In [None]:
tracks['artist_names'] = np.NAN
for k,item in tracks['artists'].items():
    arr = []
    for i in range(len(item)):
        for k_a, i_a in artist_names.items():
            if item[i] in i_a.keys():
                arr.append(i_a[item[i]])
    tracks['artist_names'][k] = arr 

In [None]:
tracks.head()

In [9]:
album_names = album_names.set_index("id")

In [None]:
tracks['album_names'] = np.NAN
for k,item in tracks['albums'].items():
    arr = []
    for i in range(len(item)):
        for k_a, i_a in album_names.items():
            if item[i] in i_a.keys():
                arr.append(i_a[item[i]])
    tracks['album_names'][k] = arr 

In [None]:
tracks.head(5)

In [34]:
tracks = tracks.drop(columns=["id"])

# === ЭТАП 2 ===

# EDA

Распределение количества прослушанных треков.

In [7]:
s1 = interactions \
    .groupby(["track_id"]).agg(popularity=("user_id", "nunique"))

In [None]:
s1.sort_values(by="popularity",ascending=False).head(10)

In [14]:
tracks = tracks.merge(s1, how='left', on=['track_id'])

In [15]:
tracks['popularity'] = tracks['popularity'].fillna(0)

In [None]:
tracks.head()

In [None]:
plt.subplots(figsize=(16, 6))
plt.plot(tracks['track_id'], tracks['popularity'])

Наиболее популярные треки

In [None]:
tracks.sort_values(by="popularity",ascending=False).head()

Наиболее популярные жанры

In [20]:
genre_pop = {}
    
    
for k,item in tracks['genres'].items():
    for i in range(len(item)):
        if item[i] in genre_pop.keys():
            genre_pop[item[i]] += tracks['popularity'][item[i]]
        else:
            genre_pop[item[i]] = tracks['popularity'][item[i]]


In [None]:
genre_pop

In [None]:
genre_popularity = pd.DataFrame.from_dict(genre_pop, orient='index')
genre_popularity

In [None]:
genre_popularity = genre_popularity.reset_index()
genre_popularity

In [None]:
genre_popularity = genre_popularity.merge(genre_names, how="right", on="id")
genre_popularity

In [None]:
genre_popularity = genre_popularity.rename(columns={0:"genre_popularity"})
genre_popularity["genre_popularity"] = genre_popularity["genre_popularity"].fillna(0)
genre_popularity.head()

In [None]:
genre_popularity.sort_values(by='genre_popularity',ascending=False)

Треки, которые никто не прослушал

In [None]:
not_list = tracks[~tracks['track_id'].isin(interactions['track_id'])]
not_list

# Преобразование данных

Преобразуем данные в формат, более пригодный для дальнейшего использования в расчётах рекомендаций.

In [None]:
interactions['started_at'] = interactions['started_at'].dt.date
interactions.head()

In [None]:
interactions['rating'] = interactions.groupby("user_id")['track_seq'].rank(method='dense', ascending=False)
interactions.head()

# Сохранение данных

Сохраним данные в двух файлах в персональном S3-бакете по пути `recsys/data/`:
- `items.parquet` — все данные о музыкальных треках,
- `events.parquet` — все данные о взаимодействиях.

In [32]:
tracks.to_parquet("./recsys/data/items.parquet")
interactions.to_parquet("./recsys/data/events.parquet")
genre_popularity.to_parquet("./recsys/data/genre_popularity.parquet")

In [62]:
s3_client.upload_file('./recsys/data/items.parquet', S3_BUCKET_NAME, 'recsys/data/items.parquet')

In [64]:
s3_client.upload_file('./recsys/data/events.parquet', S3_BUCKET_NAME, 'recsys/data/events.parquet')

# Очистка памяти

Здесь, может понадобится очистка памяти для высвобождения ресурсов для выполнения кода ниже. 

Приведите соответствующие код, комментарии, например:
- код для удаление более ненужных переменных,
- комментарий, что следует перезапустить kernel, выполнить такие-то начальные секции и продолжить с этапа 3.

# === ЭТАП 3 ===

# Загрузка данных

Если необходимо, то загружаем items.parquet, events.parquet.

In [13]:
items = pd.read_parquet("./recsys/data/items.parquet")
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   track_id      1000000 non-null  int64  
 1   albums        1000000 non-null  object 
 2   artists       1000000 non-null  object 
 3   genres        1000000 non-null  object 
 4   name          1000000 non-null  object 
 5   genre_names   1000000 non-null  object 
 6   artist_names  1000000 non-null  object 
 7   album_names   1000000 non-null  object 
 8   popularity    1000000 non-null  float64
dtypes: float64(1), int64(1), object(7)
memory usage: 68.7+ MB


In [3]:
events = pd.read_parquet("./recsys/data/events.parquet")
events.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63918740 entries, 31 to 291
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   user_id     int32  
 1   track_id    int32  
 2   track_seq   int16  
 3   started_at  object 
 4   rating      float64
dtypes: float64(1), int16(1), int32(2), object(1)
memory usage: 2.0+ GB


In [None]:
genre_popularity = pd.read_parquet("./recsys/data/genre_popularity.parquet")
genre_popularity.info()

# Разбиение данных

Разбиваем данные на тренировочную, тестовую выборки.

In [4]:
train_test_global_time_split_date = pd.to_datetime("2022-12-16").date()
train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date

events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]

users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()

common_users = list(set(users_train) & set(users_test))
print(len(users_train), len(users_test), len(common_users)) 

1140855 781881 743264


In [5]:
cold_users = list(set(users_test) - set(common_users))
print(len(cold_users)) 

38617


# Топ популярных

Рассчитаем рекомендации как топ популярных.

In [None]:
items['score'] = (items['popularity']/items['popularity'].sum())*1000
items.sort_values(by="score",ascending=False).head(5) 

In [None]:
top_popular = items.sort_values(by="score",ascending=False).head(100).reset_index(drop=True)
top_popular.head(5) 

In [10]:
top_popular.to_parquet("./recsys/recommendations/top_popular.parquet")

# Персональные

Рассчитаем персональные рекомендации.

In [None]:
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])

item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["track_id"])
items["track_id_enc"] = item_encoder.transform(items["track_id"])
events_train["track_id_enc"] = item_encoder.transform(events_train["track_id"])
events_test["track_id_enc"] = item_encoder.transform(events_test["track_id"])

In [None]:
events_train.head()

In [13]:
user_item_matrix_train = scipy.sparse.csr_matrix((
    events_train["rating"],
    (events_train['user_id_enc'], events_train['track_id_enc'])),
    dtype=np.int8)

In [None]:
als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train) 

In [16]:
user_ids_encoded = range(len(user_encoder.classes_))

als_recommendations = als_model.recommend(
    user_ids_encoded, 
    user_item_matrix_train[user_ids_encoded], 
    filter_already_liked_items=False, N=50)

In [None]:
item_ids_enc = als_recommendations[0]
als_scores = als_recommendations[1]

als_recommendations = pd.DataFrame({
    "user_id_enc": user_ids_encoded,
    "track_id_enc": item_ids_enc.tolist(), 
    "score": als_scores.tolist()})
als_recommendations = als_recommendations.explode(["track_id_enc", "score"], ignore_index=True)

als_recommendations["track_id_enc"] = als_recommendations["track_id_enc"].astype("int")
als_recommendations["score"] = als_recommendations["score"].astype("float")

als_recommendations["user_id"] = user_encoder.inverse_transform(als_recommendations["user_id_enc"])
als_recommendations["item_id"] = item_encoder.inverse_transform(als_recommendations["track_id_enc"])
als_recommendations = als_recommendations.drop(columns=["user_id_enc", "track_id_enc"])

In [None]:
als_recommendations = als_recommendations[["user_id", "item_id", "score"]]
als_recommendations.head(5)

In [None]:
als_recommendations = als_recommendations.rename(columns = {"item_id":"track_id"})
als_recommendations.head(5)

In [23]:
als_recommendations.to_parquet("./recsys/recommendations/personal_als.parquet")

In [20]:
user_item_matrix_test = scipy.sparse.csr_matrix((
    events_test["rating"],
    (events_test['user_id_enc'], events_test['track_id_enc'])),
    dtype=np.int8)

In [None]:
als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_test) 

In [22]:
user_ids_encoded = range(len(user_encoder.classes_))

als_recommendations_test = als_model.recommend(
    user_ids_encoded, 
    user_item_matrix_test[user_ids_encoded], 
    filter_already_liked_items=False, N=50)

In [23]:
item_ids_enc = als_recommendations_test[0]
als_scores = als_recommendations_test[1]

als_recommendations_test = pd.DataFrame({
    "user_id_enc": user_ids_encoded,
    "track_id_enc": item_ids_enc.tolist(), 
    "score": als_scores.tolist()})
als_recommendations_test = als_recommendations_test.explode(["track_id_enc", "score"], ignore_index=True)

als_recommendations_test["track_id_enc"] = als_recommendations_test["track_id_enc"].astype("int")
als_recommendations_test["score"] = als_recommendations_test["score"].astype("float")

als_recommendations_test["user_id"] = user_encoder.inverse_transform(als_recommendations_test["user_id_enc"])
als_recommendations_test["track_id"] = item_encoder.inverse_transform(als_recommendations_test["track_id_enc"])
als_recommendations_test = als_recommendations_test.drop(columns=["user_id_enc", "track_id_enc"])

In [None]:
als_recommendations_test = als_recommendations_test[["user_id", "track_id", "score"]]
als_recommendations_test.head(5)

In [25]:
als_recommendations_test.to_parquet("./recsys/recommendations/personal_als_test.parquet")

# Похожие

Рассчитаем похожие, они позже пригодятся для онлайн-рекомендаций.

In [None]:
train_item_ids_enc = events_train['track_id_enc'].unique()

max_similar_items = 10

similar_items = als_model.similar_items(train_item_ids_enc, N=max_similar_items+1)

sim_item_item_ids_enc = similar_items[0]
sim_item_scores = similar_items[1]

similar_items = pd.DataFrame({
    "track_id_enc": train_item_ids_enc,
    "sim_track_id_enc": sim_item_item_ids_enc.tolist(), 
    "score": sim_item_scores.tolist()})
similar_items

In [None]:
similar_items = similar_items.explode(["sim_track_id_enc", "score"], ignore_index=True)
similar_items.head()

In [None]:
similar_items["sim_track_id_enc"] = similar_items["sim_track_id_enc"].astype("int")
similar_items["score"] = similar_items["score"].astype("float")
similar_items.head()

In [None]:
similar_items["track_id_1"] = item_encoder.inverse_transform(similar_items["track_id_enc"])
similar_items["track_id_2"] = item_encoder.inverse_transform(similar_items["sim_track_id_enc"])
similar_items = similar_items.drop(columns=["track_id_enc", "sim_track_id_enc"])

similar_items = similar_items.query("track_id_1 != track_id_2")
similar_items.head()

In [33]:
similar_items.to_parquet("./recsys/recommendations/similar.parquet")

In [65]:
s3_client.upload_file('./recsys/recommendations/top_popular.parquet', S3_BUCKET_NAME, 'recsys/recommendations/top_popular.parquet')
s3_client.upload_file('./recsys/recommendations/personal_als_test.parquet', S3_BUCKET_NAME, 'recsys/recommendations/personal_als_test.parquet')
s3_client.upload_file('./recsys/recommendations/similar.parquet', S3_BUCKET_NAME, 'recsys/recommendations/similar.parquet')

# Построение признаков

Построим три признака, можно больше, для ранжирующей модели.

In [None]:
personal_als = pd.read_parquet("./recsys/recommendations/personal_als.parquet")
personal_als.info()

In [None]:
personal_als_test = pd.read_parquet("./recsys/recommendations/personal_als_test.parquet")
personal_als_test.info()

In [None]:
user_features = events_train.groupby("user_id").agg(
    count_tracks=("track_id", "count"))
user_features.head()

In [None]:
user_features_test = events_test.groupby("user_id").agg(
    count_tracks=("track_id", "count"))
user_features_test.head()

In [None]:
candidates_for_test = personal_als_test.merge(user_features_test, on="user_id", how="left")
candidates_for_test.head()

In [None]:
candidates_for_train = personal_als.merge(user_features, on="user_id", how="left")
candidates_for_train.head()

In [None]:
genre_popularity['genre_rank'] = genre_popularity['genre_popularity'].rank(method='dense', ascending=True)
genre_popularity.sort_values(by="genre_rank",ascending=False)

In [None]:
genre_features = genre_popularity.drop(columns=['id','genre_popularity','genre_name'])
genre_features.head()

In [None]:
items['genre_rank'] = np.NAN
for k,item in items['genres'].items():
    pop = 0
    for i in range(len(item)):
        for k_a, i_a in genre_features.items():
            if item[i] in i_a.keys():
                if pop <= i_a[item[i]]:
                    pop = i_a[item[i]]
    items['genre_rank'][k] = pop 

In [None]:
candidates_for_train = candidates_for_train.merge(items[['track_id','genre_rank']], on="track_id", how="left")
candidates_for_train.head()

In [None]:
candidates_for_test = candidates_for_test.merge(items[['track_id','genre_rank']], on="track_id", how="left")
candidates_for_test.head()

In [20]:
candidates_for_test.to_parquet("./recsys/recommendations/candidates_for_test.parquet")
candidates_for_train.to_parquet("./recsys/recommendations/candidates_for_train.parquet")

In [5]:
candidates_for_train = pd.read_parquet("./recsys/recommendations/candidates_for_train.parquet")

Признаки: Количество прослушаных треков у пользователя, als_score и пополярность жанра (если у трека два жанра, то выбирается большая)

# Ранжирование рекомендаций

Построим ранжирующую модель, чтобы сделать рекомендации более точными. Отранжируем рекомендации.

In [None]:
events_train["target"] = 1
events_train.head()

In [None]:
candidates_for_train = candidates_for_train.merge(events_train[["user_id", "track_id", "target"]],
                              on=['user_id','track_id'], 
                              how="left")
candidates_for_train.head()

In [14]:
candidates_for_train["target"] = candidates_for_train["target"].fillna(0).astype("int")

In [15]:
candidates_for_train = candidates_for_train.groupby("user_id").filter(lambda x: x["target"].sum() > 0)

In [18]:
candidates_for_train.to_parquet("./recsys/recommendations/candidates_for_train.parquet")

In [3]:
candidates_for_train = pd.read_parquet("./recsys/recommendations/candidates_for_train.parquet")
candidates_for_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51213750 entries, 0 to 58973599
Data columns (total 6 columns):
 #   Column        Dtype  
---  ------        -----  
 0   user_id       int32  
 1   track_id      int64  
 2   score         float64
 3   count_tracks  float64
 4   genre_rank    float64
 5   target        int64  
dtypes: float64(3), int32(1), int64(2)
memory usage: 2.5 GB


In [4]:
features = ['score', 'count_tracks', 'genre_rank']
target = 'target'

train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

cb_model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0,
)

cb_model.fit(train_data) 

0:	learn: 0.5950044	total: 4.37s	remaining: 14m 29s
100:	learn: 0.3219332	total: 6m 9s	remaining: 6m 1s
199:	learn: 0.3215186	total: 11m 57s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f246032a680>

In [5]:
candidates_for_test = pd.read_parquet("./recsys/recommendations/candidates_for_test.parquet")

In [6]:
inference_data = Pool(data=candidates_for_test[features])
predictions = cb_model.predict_proba(inference_data)

candidates_for_test["cb_score"] = predictions[:, 1]

candidates_for_test = candidates_for_test.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_for_test["rank"] = candidates_for_test.groupby("user_id")['cb_score'].rank(method='dense', ascending=False)
candidates_for_test['rank'] = np.where(candidates_for_test['rank'] > 100, 100, candidates_for_test['rank'])

max_recommendations_per_user = 100
final_recommendations = candidates_for_test\
        .groupby("user_id") \
        .head(100)

In [7]:
final_recommendations.to_parquet("./recsys/recommendations/recommendations.parquet")

In [66]:
s3_client.upload_file('./recsys/recommendations/recommendations.parquet', S3_BUCKET_NAME, 'recsys/recommendations/recommendations.parquet')

In [8]:
final_recommendations.head()

Unnamed: 0,user_id,track_id,score,count_tracks,genre_rank,cb_score,rank
0,3,78194999,0.008977,1.0,89.0,0.031185,1.0
4,3,75944934,0.007619,1.0,89.0,0.029039,2.0
7,3,84382282,0.007052,1.0,89.0,0.0277,3.0
13,3,78608850,0.006225,1.0,89.0,0.0277,3.0
16,3,75630144,0.006041,1.0,89.0,0.0277,3.0


# Оценка качества

Проверим оценку качества трёх типов рекомендаций: 

- топ популярных,
- персональных, полученных при помощи ALS,
- итоговых
  
по четырем метрикам: recall, precision, coverage, novelty.

In [9]:
final_recommendations = pd.read_parquet("./recsys/recommendations/recommendations.parquet")

In [11]:
final_recommendations.head()

Unnamed: 0,user_id,track_id,score,count_tracks,genre_rank,cb_score,rank
0,3,78194999,0.008977,1.0,89.0,0.031185,1.0
4,3,75944934,0.007619,1.0,89.0,0.029039,2.0
7,3,84382282,0.007052,1.0,89.0,0.0277,3.0
13,3,78608850,0.006225,1.0,89.0,0.0277,3.0
16,3,75630144,0.006041,1.0,89.0,0.0277,3.0


In [6]:
def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None):


    events_test["gt"] = True
    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    
    events_for_common_users = events_test[events_test["user_id"].isin(common_users)].copy()
    recs_for_common_users = recs[recs["user_id"].isin(common_users)].copy()

    recs_for_common_users = recs_for_common_users.sort_values(["track_id", "cb_score"], ascending=[True, False])
    events_for_common_users = events_for_common_users[events_for_common_users["track_id"].isin(events_train["track_id"].unique())]

    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)
    
    events_recs_common = events_for_common_users[["user_id", "track_id", "gt"]].merge(
        recs_for_common_users[["user_id", "track_id", "cb_score"]], 
        on=["user_id", "track_id"], how="outer")    

    events_recs_common["gt"] = events_recs_common["gt"].fillna(False)
    events_recs_common["pr"] = ~events_recs_common["cb_score"].isnull()
    
    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common

In [7]:
def compute_cls_metrics(events_recs_for_binary_metric):
    
    groupper = events_recs_for_binary_metric.groupby("user_id")

    # precision = tp / (tp + fp)
    precision = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fp"].sum())
    precision = precision.fillna(0).mean()
    
    # recall = tp / (tp + fn)
    recall = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fn"].sum())# ваш код здесь #
    recall = recall.fillna(0).mean()
    return precision, recall

In [12]:
cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_train,
    events_test,
    final_recommendations,
    top_k=5)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


precision: 0.041, recall: 0.006


In [44]:
sm = final_recommendations['track_id'].nunique()
sm


18894

In [45]:
cov_items = sm/items.shape[0]
print(f"coverage: {cov_items:.2f}") 

coverage: 0.02


In [None]:
events_test["listened"] = True
final_recommendations = final_recommendations.merge(events_test[["user_id", "track_id", "listened"]], on=["user_id", "track_id"], how="left")
final_recommendations["listened"] = final_recommendations["listened"].fillna(False).astype("bool")
final_recommendations = final_recommendations.sort_values(by='cb_score', ascending=False)
final_recommendations["rank"] = final_recommendations.groupby("user_id").cumcount() + 1

In [54]:
novelty_5 = (1-final_recommendations.query("rank <= 5").groupby("user_id")["listened"].mean())
print(f"novelty: {novelty_5.mean():.2f}") 

novelty: 1.00


# === Выводы, метрики ===

Основные выводы при работе над расчётом рекомендаций, рассчитанные метрики.

In [55]:
print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}, coverage: {cov_items:.2f}, novelty: {novelty_5.mean():.2f}")

precision: 0.041, recall: 0.006, coverage: 0.02, novelty: 1.00


precision: 0.041, recall: 0.006, coverage: 0.02, novelty: 1.00  
Вывод: Метрики слабые, на это повлияло сокращение events до двух месяцев, что показывает метрика покрытие. Сократил чтобы тратилось меньше времени на обработку рекомендаций, возможно стояло сокращать по количеству пользователей. Но из-за этого предлагаются новые треки, что показывает метрика новизны.