In [1]:
import pandas as pd
import numpy as np

np.random.seed(42)

from evaluation import generate_recommendations, evaluate_recommendations
from recommender_data_preprocessor import get_movies, get_recommender_data

from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer

import clearml

from dotenv import load_dotenv

load_dotenv()

True

In [2]:
task = clearml.Task.init(
    project_name="MoviesGRS_MFDP",
    task_name="MovieEmbeddingsRecommender",
    tags=["MovieEmbeddingsRecommender", "paraphrase-distilroberta-base-v1"],
)

ClearML Task: created new task id=5ad6c347ae84474c9989b749d8417737
2023-06-02 12:23:50,355 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/f3cb8157bfe7443abdc531a44bb15332/experiments/5ad6c347ae84474c9989b749d8417737/output/log


In [3]:
GROUPS_LIST = [f"group{i}" for i in range(1, 8)]

In [4]:
recommender_data = get_recommender_data(GROUPS_LIST)

In [5]:
BigBadModel = SentenceTransformer("paraphrase-distilroberta-base-v1")
movies_data = get_movies()
movies_data["embedding"] = movies_data["plot"].apply(BigBadModel.encode)
movies_data.head(1)

2023-06-02 12:24:09,752 - clearml.model - INFO - Selected model id: edda0fad26674d208e01879b3a5cf727


Unnamed: 0,movieId,tmdb_id,title,year,production_countries,runtime,revenue,tmdb_popularity_score,tmdb_rating_avg,tmdb_votes_count,cast,director,genres,plot,embedding
0,0,862.0,Toy Story,1995,[United States of America],81.0,373554033,73.640445,7.7,5269,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],"[Family, Comedy, Children, Animation, Adventur...","A group of living toys, who assume lifelessnes...","[-0.1895642, 0.39579535, 0.101590574, -0.13271..."


In [6]:
users_data = (
    pd.read_parquet("data/ratings_train.pq")
    .groupby("userId")
    .agg({col: list for col in ["movieId", "rating"]})
    .reset_index()
)
users_data["movieId"] = users_data["movieId"].apply(np.array)
users_data["rating"] = users_data["rating"].apply(np.array)
users_data["embedding"] = users_data.apply(
    lambda row: np.mean(
        movies_data[movies_data.movieId.isin(row["movieId"])].embedding * row["rating"],
        axis=0,
    ),
    axis=1,
)
users_data = users_data[["userId", "embedding"]]
users_data.head(1)

Unnamed: 0,userId,embedding
0,0,"[-0.30371347, 1.0123417, 0.6322011, 0.60798097..."


## Average user

In [7]:
def recommend(row):
    unwatched_films = movies_data[
        movies_data.movieId.isin(row["unwatched"])
    ].reset_index(drop=True)
    unwatched_films_embeddings = np.stack(unwatched_films.embedding.values)

    avg_user_embedding = np.mean(
        users_data[users_data.userId.isin(row["userId"])].embedding, axis=0
    ).reshape(1, -1)

    dist_matrix = cdist(avg_user_embedding, unwatched_films_embeddings).reshape(-1)
    closest_films = dist_matrix.argsort()[:10]

    top_movies = unwatched_films.loc[closest_films, "movieId"].values

    return top_movies

In [8]:
recommends = generate_recommendations(recommend, recommender_data, GROUPS_LIST)
recommends.head(1)

ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


Unnamed: 0,userId,group1,group2,group3,group4,group5,group6,group7,movieId,rating,unwatched,group1_rec,group2_rec,group3_rec,group4_rec,group5_rec,group6_rec,group7_rec
0,1,1,14465,39625,6774,4424,23830,7737,"[613, 176, 734, 114, 270, 485, 352, 201, 571, ...","[5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...","[22, 80, 14, 26, 310, 376, 99, 96, 75, 70, 74,...","[693, 57, 204, 376, 496, 74, 715, 597, 461, 361]","[693, 57, 204, 361, 376, 417, 461, 465, 496, 315]","[57, 315, 461, 408, 417, 95, 571, 796, 251, 715]","[693, 57, 315, 74, 417, 361, 95, 408, 496, 130]","[315, 461, 204, 417, 57, 287, 693, 130, 248, 316]","[204, 57, 74, 693, 417, 95, 130, 465, 361, 283]","[57, 417, 461, 361, 283, 337, 596, 95, 512, 326]"


In [9]:
average_user_results = evaluate_recommendations(recommends, GROUPS_LIST)
average_user_results

Unnamed: 0,MAP,NDCG
group1,0.018488,0.155676
group2,0.014617,0.130071
group3,0.013024,0.117984
group4,0.011828,0.110164
group5,0.012067,0.111144
group6,0.011507,0.10665
group7,0.010789,0.100937


## Group Sum

In [10]:
def recommend(row):
    unwatched_films = movies_data[
        movies_data.movieId.isin(row["unwatched"])
    ].reset_index(drop=True)
    unwatched_films_embeddings = np.stack(unwatched_films.embedding.values)

    group_users = users_data[
        users_data.userId.isin(row["userId"])
    ]
    group_users_embeddings = np.stack(group_users.embedding.values)

    dist_matrix = cdist(group_users_embeddings, unwatched_films_embeddings)
    closest_films = np.sum(dist_matrix, axis=0).argsort()[:10]
    
    top_movies = unwatched_films.loc[closest_films, "movieId"].values

    return top_movies

In [11]:
recommends = generate_recommendations(recommend, recommender_data, GROUPS_LIST)
recommends.head(1)

Unnamed: 0,userId,group1,group2,group3,group4,group5,group6,group7,movieId,rating,unwatched,group1_rec,group2_rec,group3_rec,group4_rec,group5_rec,group6_rec,group7_rec
0,1,1,14465,39625,6774,4424,23830,7737,"[613, 176, 734, 114, 270, 485, 352, 201, 571, ...","[5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...","[22, 80, 14, 26, 310, 376, 99, 96, 75, 70, 74,...","[693, 57, 204, 376, 496, 74, 715, 597, 461, 361]","[693, 57, 204, 361, 376, 417, 461, 465, 496, 315]","[57, 315, 461, 408, 417, 95, 571, 796, 251, 715]","[693, 57, 315, 74, 417, 361, 95, 408, 496, 130]","[315, 461, 204, 417, 57, 693, 287, 130, 248, 316]","[204, 57, 74, 693, 417, 95, 130, 465, 361, 283]","[57, 417, 461, 361, 337, 283, 596, 95, 512, 326]"


In [12]:
group_sum_results = evaluate_recommendations(recommends, GROUPS_LIST)
group_sum_results

Unnamed: 0,MAP,NDCG
group1,0.018488,0.155676
group2,0.014598,0.129973
group3,0.012994,0.117759
group4,0.011788,0.109953
group5,0.012037,0.110922
group6,0.011465,0.106377
group7,0.010744,0.100582


In [13]:
task.upload_artifact("avg_user_metrics", average_user_results)
task.upload_artifact("group_sum_metrics", group_sum_results)

True

In [14]:
task.close()