In [1]:
import os
import pickle
import pandas as pd
import numpy as np
np.random.seed(42)

from evaluation import generate_recommendations, evaluate_recommendations
from recommender_data_preprocessor import get_recommender_data

from surprise import Dataset, Reader, SVD

import clearml

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Register this notebook run as a ClearML task so the metrics computed below
# can be attached to it.
task = clearml.Task.init(
    project_name='MoviesGRS_MFDP',
    task_name='SVDRecommender',
    tags=['SVDRecommender', 'Evaluation', 'TimeSeriesSplit'],
)

ClearML Task: created new task id=bae45d91eb274416b3fad843420adf4b
2023-06-01 22:22:08,367 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/f3cb8157bfe7443abdc531a44bb15332/experiments/bae45d91eb274416b3fad843420adf4b/output/log


In [3]:
# Names of the group columns to evaluate: 'group1' through 'group7'.
GROUPS_LIST = list(map('group{}'.format, range(1, 8)))

In [4]:
# Build the evaluation input for every group in GROUPS_LIST; its structure is
# defined by get_recommender_data in recommender_data_preprocessor.
recommender_data = get_recommender_data(GROUPS_LIST)

In [5]:
# Single source of truth for the cached-model location. The cache is NOT keyed
# by the training data or the hyperparameters below: delete this file to force
# a retrain after changing either.
SVD_MODEL_PATH = "svd_trained.pkl"

if os.path.exists(SVD_MODEL_PATH):
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load a cache file that was written by this notebook.
    with open(SVD_MODEL_PATH, "rb") as f:
        svd = pickle.load(f)
else:
    # Rating scale handed to Surprise's Reader.
    min_rating = 1
    max_rating = 5

    train_data = pd.read_parquet('data/ratings_train.pq')
    reader = Reader(rating_scale=(min_rating, max_rating))
    surprise_train_dataset = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)
    # Train on every available rating; no hold-out is carved off here.
    trainset = surprise_train_dataset.build_full_trainset()

    svd = SVD(n_factors=17, n_epochs=30)
    svd.fit(trainset)

    # Persist the fitted model so later runs take the fast path above.
    with open(SVD_MODEL_PATH, "wb") as f:
        pickle.dump(svd, f)

## Average user

In [6]:
def recommend(row):
    """Recommend up to 10 unwatched movies using an averaged user profile.

    The rows of ``svd.pu`` selected by ``row["userId"]`` are averaged into a
    single factor vector; every unwatched movie is then scored as its item
    bias plus the dot product of its factors with that vector.

    Args:
        row: Mapping-like object with
            * ``"userId"`` - indices into ``svd.pu`` for the group members;
            * ``"unwatched"`` - array of indices into ``svd.bi`` / ``svd.qi``
              (must support NumPy fancy indexing).

    Returns:
        The entries of ``row["unwatched"]`` with the highest scores, best
        first, truncated to 10 items.

    NOTE(review): the model arrays are indexed directly with the ids from the
    row. Surprise keeps these arrays in inner-id order - confirm the ids in
    ``recommender_data`` are already inner ids.
    """
    candidates = row["unwatched"]

    # One "virtual" user: the element-wise mean of the members' factors.
    group_profile = np.mean(svd.pu[row["userId"]], axis=0)

    affinity = svd.qi[candidates] @ group_profile
    scores = svd.bi[candidates] + affinity

    # Negate so that argsort yields a descending order of score.
    ranking = np.argsort(-scores)
    return candidates[ranking][:10]

In [7]:
# Run the "Average user" `recommend` defined in the previous cell. The
# displayed row shows one `<group>_rec` column per entry of GROUPS_LIST.
recommends = generate_recommendations(recommend, recommender_data, GROUPS_LIST)
recommends.head(1)

Unnamed: 0,userId,group1,group2,group3,group4,group5,group6,group7,movieId,rating,unwatched,group1_rec,group2_rec,group3_rec,group4_rec,group5_rec,group6_rec,group7_rec
0,1,1,14465,39625,6774,4424,23830,7737,"[613, 176, 734, 114, 270, 485, 352, 201, 571, ...","[5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...","[22, 80, 14, 26, 310, 376, 99, 96, 75, 70, 74,...","[240, 245, 62, 309, 107, 83, 109, 597, 243, 216]","[245, 240, 83, 110, 107, 109, 303, 92, 238, 216]","[85, 65, 248, 309, 25, 241, 216, 52, 250, 597]","[83, 240, 71, 245, 110, 92, 238, 309, 86, 18]","[240, 243, 62, 92, 245, 83, 126, 308, 341, 246]","[83, 240, 62, 110, 309, 596, 103, 341, 377, 92]","[83, 596, 377, 110, 240, 360, 241, 25, 309, 92]"


In [8]:
# Score the "Average user" recommendations; the displayed table has MAP and
# NDCG columns with one row per group.
average_user_results = evaluate_recommendations(recommends, GROUPS_LIST)
average_user_results

ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


Unnamed: 0,MAP,NDCG
group1,0.013029,0.119228
group2,0.013027,0.117886
group3,0.012804,0.115516
group4,0.012553,0.11302
group5,0.012593,0.113308
group6,0.012281,0.11105
group7,0.012091,0.109125


## Group Sum

In [9]:
def recommend(row):
    """Recommend up to 10 unwatched movies using summed member affinities.

    For every unwatched movie the dot product of its factors with each
    selected row of ``svd.pu`` is computed, the products are summed over the
    members, and the movie's item bias is added.

    Args:
        row: Mapping-like object with
            * ``"userId"`` - array of indices into ``svd.pu``. The ``axis=1``
              reduction needs a 2-D product, so this must select several rows
              (a 1-D index array), even for a single-member group;
            * ``"unwatched"`` - array of indices into ``svd.bi`` / ``svd.qi``
              (must support NumPy fancy indexing).

    Returns:
        The entries of ``row["unwatched"]`` with the highest scores, best
        first, truncated to 10 items.

    NOTE(review): the item bias is added once while the factor dot products
    are summed over all members, so the bias weighs relatively less as the
    group grows - confirm this is the intended "group sum" definition.

    NOTE(review): the model arrays are indexed directly with the ids from the
    row. Surprise keeps these arrays in inner-id order - confirm the ids in
    ``recommender_data`` are already inner ids.
    """
    unwatched = row["unwatched"]
    # qi[unwatched] @ pu[members].T has one row per movie and one column per
    # member; summing along axis 1 collapses the member dimension.
    movie_pseudorating = svd.bi[unwatched] + (
        np.sum(svd.qi[unwatched] @ svd.pu[row["userId"]].T, axis=1)
    )
    # Negate so that argsort yields a descending order of score.
    top_movies = unwatched[np.argsort(-movie_pseudorating)][:10]
    return top_movies

In [10]:
# `recommend` now refers to the "Group Sum" version (the second definition
# shadows the first), and `recommends` from the "Average user" run is
# overwritten here.
recommends = generate_recommendations(recommend, recommender_data, GROUPS_LIST)
recommends.head(1)

Unnamed: 0,userId,group1,group2,group3,group4,group5,group6,group7,movieId,rating,unwatched,group1_rec,group2_rec,group3_rec,group4_rec,group5_rec,group6_rec,group7_rec
0,1,1,14465,39625,6774,4424,23830,7737,"[613, 176, 734, 114, 270, 485, 352, 201, 571, ...","[5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...","[22, 80, 14, 26, 310, 376, 99, 96, 75, 70, 74,...","[240, 245, 62, 309, 107, 83, 109, 597, 243, 216]","[245, 107, 109, 303, 240, 110, 216, 439, 238, ...","[85, 248, 250, 52, 65, 309, 216, 25, 597, 241]","[107, 109, 245, 284, 240, 282, 303, 71, 439, 373]","[240, 245, 107, 243, 308, 283, 109, 92, 303, 62]","[240, 107, 341, 109, 284, 62, 439, 309, 110, 499]","[360, 606, 596, 492, 377, 416, 118, 282, 110, ..."


In [11]:
# Score the "Group Sum" recommendations; the displayed table has MAP and NDCG
# columns with one row per group.
group_sum_results = evaluate_recommendations(recommends, GROUPS_LIST)
group_sum_results

Unnamed: 0,MAP,NDCG
group1,0.013029,0.119228
group2,0.011667,0.109666
group3,0.010804,0.103025
group4,0.010118,0.097659
group5,0.010304,0.098881
group6,0.009949,0.096097
group7,0.009546,0.093057


In [12]:
# Attach both metric tables to the ClearML task as named artifacts.
task.upload_artifact('avg_user_metrics', average_user_results)
task.upload_artifact('group_sum_metrics', group_sum_results)

True

In [13]:
# Close the ClearML task that was opened with Task.init at the top.
task.close()