In [None]:
import clearml
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer

from utils.evaluation import evaluate_recommendations, generate_recommendations
from utils.recommender_data_preprocessor import get_movies, get_recommender_data

np.random.seed(42)

In [None]:
task = clearml.Task.init(
    project_name="MoviesGRS_MFDP",
    task_name="MovieEmbeddingsRecommender",
    tags=["MovieEmbeddingsRecommender", "paraphrase-distilroberta-base-v1"],
)

In [3]:
GROUPS_LIST = [f"group{i}" for i in range(1, 8)]

In [4]:
recommender_data: pd.DataFrame = get_recommender_data(GROUPS_LIST)

In [None]:
transformer = SentenceTransformer("paraphrase-distilroberta-base-v1")
movies_data: pd.DataFrame = get_movies()
movies_data["embedding"] = movies_data["plot"].apply(transformer.encode)

In [None]:
users_data: pd.DataFrame = (
    pd.read_parquet("../data/ratings_train.pq")
    .groupby("userId")
    .agg({col: list for col in ["movieId", "rating"]})
    .reset_index()
)
users_data["movieId"] = users_data["movieId"].apply(np.array)
users_data["rating"] = users_data["rating"].apply(np.array)
users_data["embedding"] = users_data.apply(
    lambda row: np.mean(
        movies_data[movies_data.movieId.isin(row["movieId"])].embedding * row["rating"],
        axis=0,
    ),
    axis=1,
)
users_data = users_data[["userId", "embedding"]]

## Average user

In [7]:
def recommend_avg_user(row: pd.Series) -> np.array:
    unwatched_films: pd.DataFrame = movies_data[movies_data.movieId.isin(row["unwatched"])].reset_index(drop=True)
    unwatched_films_embeddings: np.array = np.stack(unwatched_films.embedding.values)

    avg_user_embedding: np.array = np.mean(users_data[users_data.userId.isin(row["userId"])].embedding, axis=0).reshape(
        1, -1
    )

    dist_matrix: np.array = cdist(avg_user_embedding, unwatched_films_embeddings).reshape(-1)
    closest_films: np.array = dist_matrix.argsort()[:10]

    top_movies: np.array = unwatched_films.loc[closest_films, "movieId"].values

    return top_movies

In [None]:
recommends_avg_user: pd.DataFrame = generate_recommendations(recommend_avg_user, recommender_data, GROUPS_LIST)

In [9]:
average_user_results: pd.DataFrame = evaluate_recommendations(recommends_avg_user, GROUPS_LIST)
average_user_results  # pylint: disable=pointless-statement

Unnamed: 0,MAP,NDCG
group1,0.018488,0.155676
group2,0.014617,0.130071
group3,0.013024,0.117984
group4,0.011828,0.110164
group5,0.012067,0.111144
group6,0.011507,0.10665
group7,0.010789,0.100937


## Group Sum

In [10]:
def recommend_group_sum(row: pd.Series) -> np.array:
    unwatched_films: pd.DataFrame = movies_data[movies_data.movieId.isin(row["unwatched"])].reset_index(drop=True)
    unwatched_films_embeddings: np.array = np.stack(unwatched_films.embedding.values)

    group_users: pd.DataFrame = users_data[users_data.userId.isin(row["userId"])]
    group_users_embeddings: np.array = np.stack(group_users.embedding.values)

    dist_matrix = cdist(group_users_embeddings, unwatched_films_embeddings)
    closest_films: np.array = np.sum(dist_matrix, axis=0).argsort()[:10]

    top_movies: np.array = unwatched_films.loc[closest_films, "movieId"].values

    return top_movies

In [None]:
recommends_group_sum: pd.DataFrame = generate_recommendations(recommend_group_sum, recommender_data, GROUPS_LIST)

In [12]:
group_sum_results: pd.DataFrame = evaluate_recommendations(recommends_group_sum, GROUPS_LIST)
group_sum_results  # pylint: disable=pointless-statement

Unnamed: 0,MAP,NDCG
group1,0.018488,0.155676
group2,0.014598,0.129973
group3,0.012994,0.117759
group4,0.011788,0.109953
group5,0.012037,0.110922
group6,0.011465,0.106377
group7,0.010744,0.100582


In [None]:
task.upload_artifact("avg_user_metrics", average_user_results)
task.upload_artifact("group_sum_metrics", group_sum_results)

In [14]:
task.close()