In [1]:
from functools import reduce
import pickle
import os

import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score, average_precision_score
from surprise import Dataset, Reader, SVD

from clearml import Task

from evaluation import evaluate_recommendations

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
data_path = 'data/'
train_data = pd.read_parquet(data_path + 'ratings_train.pq')
test_data = pd.read_parquet(data_path + 'ratings_test.pq')
groups5 = pd.read_parquet(data_path + 'groups5.pq')
groups6 = pd.read_parquet(data_path + 'groups6.pq')
groups7 = pd.read_parquet(data_path + 'groups7.pq')

for i, group in enumerate([groups5, groups6, groups7]):
    test_data = test_data.merge(group, on='userId').rename(columns={'group': f'group{i+5}'})
del groups5, groups6, groups7
test_data

Unnamed: 0,userId,movieId,rating,group5,group6,group7
0,41988,790,4.0,31361,14281,11298
1,41988,524,3.0,31361,14281,11298
2,41988,608,4.0,31361,14281,11298
3,41988,695,3.0,31361,14281,11298
4,41988,566,4.0,31361,14281,11298
...,...,...,...,...,...,...
3596663,7343,164,5.0,17403,2054,4080
3596664,7343,193,4.0,17403,2054,4080
3596665,7343,253,4.0,17403,2054,4080
3596666,7343,483,2.0,17403,2054,4080


In [3]:
task = Task.init(
    project_name = 'MoviesGRS_MFDP', 
    task_name = 'SVDRecommender',
    tags = ['SVD', 'Evaluation']
)

ClearML Task: created new task id=82ebefb9c402454c9405b5826b522d1a
2023-05-29 19:48:31,709 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/f3cb8157bfe7443abdc531a44bb15332/experiments/82ebefb9c402454c9405b5826b522d1a/output/log


ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


In [4]:
if os.path.exists(f"svd_trained.pkl"):
    with open("svd_trained.pkl", "rb") as f:
        svd = pickle.load(f)
else:
    min_rating = 1
    max_rating = 5
    
    reader = Reader(rating_scale=(min_rating, max_rating))
    surprise_train_dataset = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)
    trainset = surprise_train_dataset.build_full_trainset()
    
    svd = SVD(n_factors=17, n_epochs=30)
    svd.fit(trainset)
    
    with open("svd_trained.pkl", "wb") as f:
        pickle.dump(svd, f)
    
    del trainset

In [5]:
movie_ids = train_data.movieId.unique()

In [6]:
unwatched = (
    train_data
    .groupby(by='userId')
    .agg({'movieId': list})
    .reset_index()
)
unwatched['unwatched'] = (
    unwatched.movieId
    .apply(
        lambda x: movie_ids[
            np.isin(movie_ids, x, invert=True)
        ]
    )
)

unwatched = (
    test_data
    .merge(unwatched[['userId', 'unwatched']], on='userId')
    [['userId', *[f'group{i}' for i in range(5, 8)], 'unwatched']]
    .groupby(by=['userId', *[f'group{i}' for i in range(5, 8)]])
    .unwatched
    .first()
    .reset_index()
)
unwatched

Unnamed: 0,userId,group5,group6,group7,unwatched
0,1,4424,23830,7737,"[459, 310, 18, 108, 57, 228, 16, 291, 15, 191,..."
1,2,18784,2598,13437,"[459, 18, 108, 57, 228, 16, 786, 291, 15, 191,..."
2,3,29761,20828,14279,"[459, 310, 519, 18, 108, 57, 228, 786, 291, 15..."
3,4,12577,17824,10523,"[459, 310, 519, 18, 108, 57, 786, 291, 15, 191..."
4,5,22262,607,10989,"[459, 310, 519, 18, 108, 57, 228, 16, 786, 291..."
...,...,...,...,...,...
128586,128660,32028,27049,10083,"[459, 310, 519, 18, 108, 57, 228, 16, 786, 291..."
128587,128661,22934,22798,20087,"[459, 310, 18, 108, 57, 228, 16, 291, 15, 191,..."
128588,128662,23328,26324,7171,"[459, 519, 18, 108, 57, 228, 16, 786, 291, 15,..."
128589,128663,13396,19445,7989,"[459, 310, 18, 108, 57, 228, 16, 786, 291, 15,..."


In [7]:
users_watch_history_test: pd.DataFrame = (
    test_data
    .sort_values(by='rating', ascending=False)
    .groupby(by='userId')
    .agg(
        {
        **{f'group{i}': 'first' for i in range(5, 8)},
        'movieId': list,
        'rating': list
        }
    )
    .reset_index()
)
users_watch_history_test['movieId'] = users_watch_history_test.movieId.apply(np.array)
users_watch_history_test['rating'] = users_watch_history_test.rating.apply(np.array)

In [8]:
def generate_recommendations(make_recommendations, users_watch_history_test, unwatched):
    for group in [f'group{i}' for i in range(5, 8)]:
        group_unwatched = (
            unwatched
            .groupby(by=group)
            .agg({
                'userId': list,
                'unwatched': lambda x: np.array(reduce(np.intersect1d, x))
            })
            .reset_index()
        )
        group_unwatched['userId'] = group_unwatched.userId.apply(np.array)
        group_unwatched[f'{group}_rec'] = group_unwatched.apply(make_recommendations, axis=1)
        
        users_watch_history_test = users_watch_history_test.merge(group_unwatched[[group, f'{group}_rec']], on=group)
    
    return users_watch_history_test

## Average user

In [21]:
def recommend(row):
    movie_pseudorating = svd.bi[row["unwatched"]] + (
        svd.qi[row["unwatched"]] @ np.mean(svd.pu[row["userId"]], axis=0)
    )
    top_movies = row["unwatched"][np.argsort(-movie_pseudorating)][:10]
    return top_movies

In [22]:
recommends = generate_recommendations(recommend, users_watch_history_test, unwatched)
recommends

Unnamed: 0,userId,group5,group6,group7,movieId,rating,group5_rec,group6_rec,group7_rec
0,1,4424,23830,7737,"[384, 613, 181, 16, 114, 533, 584, 131, 572, 5...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, ...","[377, 700, 308, 206, 239, 422, 171, 672, 2, 11]","[206, 307, 596, 234, 196, 569, 575, 438, 479, ...","[206, 575, 308, 377, 385, 422, 11, 415, 12, 316]"
1,128307,19099,24828,7737,"[99, 142, 33, 244, 97, 80, 181, 223, 176, 157,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...","[206, 377, 7, 243, 308, 306, 385, 307, 596, 192]","[117, 112, 308, 700, 206, 461, 314, 608, 711, ...","[206, 575, 308, 377, 385, 422, 11, 415, 12, 316]"
2,5104,30892,8245,7737,"[198, 8, 85, 290, 206, 172, 208, 112, 184, 88,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[377, 308, 293, 1, 12, 385, 415, 537, 325, 11]","[206, 239, 377, 293, 171, 537, 87, 385, 450, 415]","[206, 575, 308, 377, 385, 422, 11, 415, 12, 316]"
3,47993,16791,1281,7737,"[90, 784, 394, 59, 734, 779, 324, 193, 328, 82...","[5.0, 5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, ...","[206, 377, 117, 7, 308, 307, 297, 414, 575, 629]","[206, 2, 157, 103, 377, 575, 7, 596, 213, 306]","[206, 575, 308, 377, 385, 422, 11, 415, 12, 316]"
4,32850,281,7953,7737,"[384, 730, 55, 394, 548, 90, 793]","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 3.5]","[377, 308, 455, 111, 165, 615, 409, 206, 171, ...","[377, 206, 308, 111, 385, 575, 306, 297, 379, 7]","[206, 575, 308, 377, 385, 422, 11, 415, 12, 316]"
...,...,...,...,...,...,...,...,...,...
128586,90835,5409,12500,20074,"[59, 572, 561, 544, 529, 301, 326, 354, 531]","[5.0, 4.0, 4.0, 3.5, 3.5, 3.0, 2.5, 2.5, 2.0]","[206, 157, 33, 246, 83, 306, 87, 12, 438, 31]","[206, 246, 196, 347, 213, 239, 157, 450, 12, 7]","[72, 206, 575, 422, 321, 46, 62, 410, 694, 601]"
128587,96812,26072,7682,20074,"[332, 367, 267, 100, 274, 329, 259, 153, 344, ...","[5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, ...","[206, 575, 596, 157, 239, 7, 539, 72, 307, 24]","[206, 7, 192, 414, 575, 307, 147, 512, 186, 479]","[72, 206, 575, 422, 321, 46, 62, 410, 694, 601]"
128588,65482,20574,10362,12742,"[90, 135, 175, 51, 177, 188, 220]","[5.0, 4.0, 4.0, 4.0, 3.0, 2.0, 1.0]","[422, 95, 76, 70, 700, 575, 377, 455, 596, 209]","[206, 306, 438, 377, 295, 87, 347, 239, 596, 96]","[422, 95, 76, 700, 615, 537, 780, 496, 354, 377]"
128589,37798,10175,5664,12742,"[59, 22, 64, 96, 97, 26, 14, 79]","[5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0]","[377, 308, 186, 7, 537, 11, 115, 385, 293, 629]","[117, 377, 206, 7, 629, 111, 496, 183, 467, 239]","[422, 95, 76, 700, 615, 537, 780, 496, 354, 377]"


In [23]:
average_user_results = evaluate_recommendations(recommends)
average_user_results

Unnamed: 0,MAP,NDCG
group5,0.01258,0.120895
group6,0.012191,0.117232
group7,0.01147,0.113328


## Group Sum

In [12]:
svd.qi.shape

(802, 17)

In [17]:
def recommend(row):
    movie_pseudorating = svd.bi[row["unwatched"]] + (
        np.sum(svd.qi[row["unwatched"]] @ svd.pu[row["userId"]].T, axis=1)
    )
    movie_pseudorating.sum()
    top_movies = row["unwatched"][np.argsort(-movie_pseudorating)][:10]
    return top_movies

In [18]:
recommends = generate_recommendations(recommend, users_watch_history_test, unwatched)
recommends

Unnamed: 0,userId,group5,group6,group7,movieId,rating,group5_rec,group6_rec,group7_rec
0,1,4424,23830,7737,"[384, 613, 181, 16, 114, 533, 584, 131, 572, 5...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, ...","[163, 672, 492, 700, 62, 422, 377, 216, 403, 322]","[401, 445, 443, 635, 677, 307, 569, 242, 379, ...","[141, 250, 422, 316, 575, 95, 241, 76, 336, 308]"
1,128307,19099,24828,7737,"[99, 142, 33, 244, 97, 80, 181, 223, 176, 157,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...","[243, 117, 147, 192, 183, 298, 393, 572, 307, ...","[477, 461, 662, 524, 214, 112, 117, 443, 233, ...","[141, 250, 422, 316, 575, 95, 241, 76, 336, 308]"
2,5104,30892,8245,7737,"[198, 8, 85, 290, 206, 172, 208, 112, 184, 88,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[1, 293, 268, 425, 12, 336, 361, 295, 586, 494]","[239, 107, 336, 191, 206, 472, 293, 780, 256, ...","[141, 250, 422, 316, 575, 95, 241, 76, 336, 308]"
3,47993,16791,1281,7737,"[90, 784, 394, 59, 734, 779, 324, 193, 328, 82...","[5.0, 5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, ...","[117, 214, 443, 245, 297, 147, 377, 569, 205, ...","[45, 157, 2, 22, 103, 104, 213, 202, 393, 76]","[141, 250, 422, 316, 575, 95, 241, 76, 336, 308]"
4,32850,281,7953,7737,"[384, 730, 55, 394, 548, 90, 793]","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 3.5]","[455, 419, 163, 615, 594, 750, 593, 650, 296, ...","[599, 348, 296, 379, 350, 241, 313, 677, 187, ...","[141, 250, 422, 316, 575, 95, 241, 76, 336, 308]"
...,...,...,...,...,...,...,...,...,...
128586,90835,5409,12500,20074,"[59, 572, 561, 544, 529, 301, 326, 354, 531]","[5.0, 4.0, 4.0, 3.5, 3.5, 3.0, 2.5, 2.5, 2.0]","[23, 83, 157, 133, 284, 342, 33, 246, 737, 211]","[75, 531, 109, 85, 323, 229, 470, 246, 52, 213]","[72, 46, 410, 62, 601, 422, 76, 340, 603, 791]"
128587,96812,26072,7682,20074,"[332, 367, 267, 100, 274, 329, 259, 153, 344, ...","[5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, ...","[222, 539, 40, 525, 383, 24, 601, 29, 157, 575]","[192, 368, 414, 245, 183, 147, 298, 634, 205, ...","[72, 46, 410, 62, 601, 422, 76, 340, 603, 791]"
128588,65482,20574,10362,12742,"[90, 135, 175, 51, 177, 188, 220]","[5.0, 4.0, 4.0, 4.0, 3.0, 2.0, 1.0]","[422, 95, 76, 70, 258, 465, 455, 209, 493, 354]","[618, 23, 715, 323, 801, 429, 211, 135, 667, 229]","[422, 95, 76, 780, 615, 496, 354, 343, 672, 495]"
128589,37798,10175,5664,12742,"[59, 22, 64, 96, 97, 26, 14, 79]","[5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0]","[377, 403, 115, 537, 186, 11, 163, 205, 750, 117]","[117, 183, 393, 496, 214, 116, 60, 377, 485, 121]","[422, 95, 76, 780, 615, 496, 354, 343, 672, 495]"


In [19]:
group_sum_results = evaluate_recommendations(recommends)
group_sum_results

Unnamed: 0,MAP,NDCG
group5,0.014649,0.132971
group6,0.014191,0.129823
group7,0.013505,0.124702


In [24]:
task.upload_artifact('avg_user_metrics', average_user_results)
task.upload_artifact('group_sum_metrics', group_sum_results)

True

In [25]:
task.close()