In [None]:
# temporary solution with working directory hoping for correct imports
%cd ".." # pylint: disable=wrong-import-position

import clearml
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer

from utils.dataset_for_evaluation_collector import collect_unwatched_train_watched_test_movies, load_movies_data
from utils.recommendations_evaluation import evaluate_recommendations, generate_recommendations

np.random.seed(42)

%cd "models"  # pylint: disable=wrong-import-position

In [None]:
# Start a ClearML task for this run. The tags record the recommender name and the
# identifier of the sentence-embedding model loaded further down in the notebook.
# The returned handle is used at the end of the notebook to upload the metric
# tables and to close the task.
task = clearml.Task.init(
    project_name="MoviesGRS_MFDP",
    task_name="MovieEmbeddingsRecommender",
    tags=["MovieEmbeddingsRecommender", "paraphrase-distilroberta-base-v1"],
)

In [3]:
# Identifiers of the evaluated groups: "group1" through "group7".
GROUPS_LIST = [f"group{group_number}" for group_number in range(1, 8)]

In [4]:
# Evaluation input for every group in GROUPS_LIST, built by the project helper.
# NOTE(review): the recommender functions in this notebook read row["userId"] and
# row["unwatched"], so rows derived from this frame presumably expose those
# entries — confirm against utils.dataset_for_evaluation_collector.
unwatched_train_watched_test_movies: pd.DataFrame = collect_unwatched_train_watched_test_movies(GROUPS_LIST)

In [None]:
# Sentence-embedding model used to turn each movie plot into a dense vector.
transformer = SentenceTransformer("paraphrase-distilroberta-base-v1")
movies_data: pd.DataFrame = load_movies_data()

# Encode all plots in a single batched call instead of invoking the model once
# per row. `encode` on a list of texts returns a 2-D array with one row per
# input (in input order), which is split back into one vector per movie.
plot_embeddings: np.ndarray = transformer.encode(movies_data["plot"].tolist())
movies_data["embedding"] = pd.Series(list(plot_embeddings), index=movies_data.index)

In [None]:
# Per-user rating history: one row per user holding the rated movie ids and the
# matching ratings as two parallel arrays (same order in both).
users_data: pd.DataFrame = (
    pd.read_parquet("../data/ratings_train.pq")
    .groupby("userId")
    .agg({col: list for col in ["movieId", "rating"]})
    .reset_index()
)
users_data["movieId"] = users_data["movieId"].apply(np.array)
users_data["rating"] = users_data["rating"].apply(np.array)

# movieId -> plot embedding lookup. Selecting through this lookup keeps every
# rating paired with the embedding of the movie it was given to, independent of
# the row order of `movies_data`.
embedding_by_movie: pd.Series = movies_data.set_index("movieId")["embedding"]


def _rating_weighted_embedding(row: pd.Series) -> np.ndarray:
    """Return the mean of a user's rated-movie embeddings, each scaled by its rating.

    Args:
        row: a `users_data` row with parallel ``movieId`` and ``rating`` arrays.

    Returns:
        1-D array with the same dimensionality as the movie embeddings.

    Raises:
        ValueError: if none of the user's rated movies has an embedding.
    """
    movie_ids: np.ndarray = row["movieId"]
    ratings: np.ndarray = row["rating"]

    # Rated movies without an embedding are skipped together with their rating
    # so the two arrays stay the same length and aligned.
    known: np.ndarray = np.isin(movie_ids, embedding_by_movie.index)
    if not known.any():
        raise ValueError(f"user {row['userId']} has no rated movie with a known embedding")

    # `.loc` with a list of labels returns rows in the requested order, i.e. the
    # order of the user's ratings.
    embeddings: np.ndarray = np.stack(embedding_by_movie.loc[movie_ids[known]].to_numpy())
    return (embeddings * ratings[known][:, np.newaxis]).mean(axis=0)


users_data["embedding"] = users_data.apply(_rating_weighted_embedding, axis=1)
users_data = users_data[["userId", "embedding"]]

## Average user

In [8]:
def recommend_avg_user(row: pd.Series, top_k: int = 10) -> np.ndarray:
    """Recommend the unwatched movies closest to the group's averaged user embedding.

    The embeddings of all group members are averaged into one vector and the
    candidate movies are ranked by cosine distance to that vector.

    Args:
        row: record with a list-like ``"userId"`` (group members) and a list-like
            ``"unwatched"`` (candidate movie ids).
        top_k: maximum number of movie ids to return (defaults to 10).

    Returns:
        Array of at most ``top_k`` movie ids, nearest first. Empty when there are
        no candidate movies in ``movies_data`` or no group member in ``users_data``.
    """
    # Fresh 0..n-1 index so positions from argsort map directly onto rows.
    unwatched_films: pd.DataFrame = movies_data[movies_data.movieId.isin(row["unwatched"])].reset_index(drop=True)
    candidate_ids: np.ndarray = unwatched_films["movieId"].to_numpy()
    group_embeddings: pd.Series = users_data.loc[users_data.userId.isin(row["userId"]), "embedding"]

    # np.stack cannot handle an empty sequence; nothing to rank in either case.
    if candidate_ids.size == 0 or group_embeddings.empty:
        return candidate_ids[:0]

    unwatched_films_embeddings: np.ndarray = np.stack(unwatched_films["embedding"].to_numpy())
    # cdist expects 2-D inputs, hence the single-row reshape.
    avg_user_embedding: np.ndarray = np.stack(group_embeddings.to_numpy()).mean(axis=0).reshape(1, -1)

    dist_to_films: np.ndarray = cdist(avg_user_embedding, unwatched_films_embeddings, metric="cosine").reshape(-1)
    closest_films: np.ndarray = dist_to_films.argsort()[:top_k]

    return candidate_ids[closest_films]

In [None]:
# Recommendations of the averaged-user strategy for every group in GROUPS_LIST.
# NOTE(review): generate_recommendations presumably applies recommend_avg_user to
# each row of the evaluation frame — confirm in utils.recommendations_evaluation.
recommends_avg_user: pd.DataFrame = generate_recommendations(
    recommend_avg_user, unwatched_train_watched_test_movies, GROUPS_LIST
)

In [10]:
# Score the averaged-user recommendations; the rendered table reports MAP and
# NDCG for each group. The bare expression on the last line displays the frame.
average_user_results: pd.DataFrame = evaluate_recommendations(recommends_avg_user, GROUPS_LIST)
average_user_results  # pylint: disable=pointless-statement

Unnamed: 0,MAP,NDCG
group1,0.018087,0.152896
group2,0.014279,0.12783
group3,0.012682,0.115725
group4,0.011553,0.108357
group5,0.011819,0.109612
group6,0.011309,0.1054
group7,0.010644,0.099867


## Group Sum

In [11]:
def recommend_group_sum(row: pd.Series, top_k: int = 10) -> np.ndarray:
    """Recommend the unwatched movies with the smallest summed distance to the group.

    The cosine distance from every group member's embedding to every candidate
    movie is computed, the distances are summed per movie, and the movies with
    the smallest totals are returned.

    Args:
        row: record with a list-like ``"userId"`` (group members) and a list-like
            ``"unwatched"`` (candidate movie ids).
        top_k: maximum number of movie ids to return (defaults to 10).

    Returns:
        Array of at most ``top_k`` movie ids, smallest total distance first. Empty
        when there are no candidate movies in ``movies_data`` or no group member
        in ``users_data``.
    """
    # Fresh 0..n-1 index so positions from argsort map directly onto rows.
    unwatched_films: pd.DataFrame = movies_data[movies_data.movieId.isin(row["unwatched"])].reset_index(drop=True)
    candidate_ids: np.ndarray = unwatched_films["movieId"].to_numpy()
    group_users: pd.DataFrame = users_data[users_data.userId.isin(row["userId"])]

    # np.stack cannot handle an empty sequence; nothing to rank in either case.
    if candidate_ids.size == 0 or group_users.empty:
        return candidate_ids[:0]

    unwatched_films_embeddings: np.ndarray = np.stack(unwatched_films["embedding"].to_numpy())
    group_users_embeddings: np.ndarray = np.stack(group_users["embedding"].to_numpy())

    # Rows are group members, columns are candidate movies.
    dist_matrix: np.ndarray = cdist(group_users_embeddings, unwatched_films_embeddings, metric="cosine")
    closest_films: np.ndarray = dist_matrix.sum(axis=0).argsort()[:top_k]

    return candidate_ids[closest_films]

In [None]:
# Recommendations of the summed-distance strategy for every group in GROUPS_LIST.
# NOTE(review): generate_recommendations presumably applies recommend_group_sum to
# each row of the evaluation frame — confirm in utils.recommendations_evaluation.
recommends_group_sum: pd.DataFrame = generate_recommendations(
    recommend_group_sum, unwatched_train_watched_test_movies, GROUPS_LIST
)

In [13]:
# Score the summed-distance recommendations; the rendered table reports MAP and
# NDCG for each group. The bare expression on the last line displays the frame.
group_sum_results: pd.DataFrame = evaluate_recommendations(recommends_group_sum, GROUPS_LIST)
group_sum_results  # pylint: disable=pointless-statement

Unnamed: 0,MAP,NDCG
group1,0.018087,0.152896
group2,0.014245,0.127674
group3,0.012681,0.115756
group4,0.011552,0.108302
group5,0.011827,0.109697
group6,0.01129,0.105105
group7,0.010617,0.099679


In [None]:
# Attach both metric tables to the ClearML task under separate artifact names.
task.upload_artifact("avg_user_metrics", average_user_results)
task.upload_artifact("group_sum_metrics", group_sum_results)

In [15]:
# Close the ClearML task once the metric artifacts have been uploaded.
task.close()