In [1]:
from functools import reduce

import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score, average_precision_score

import clearml

from evaluation import evaluate_recommendations

from dotenv import load_dotenv
# Read variables from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably this is where the ClearML credentials come from — confirm.
load_dotenv()

True

In [2]:
# Directory holding the parquet inputs (relative to the notebook's working directory).
data_path = 'data/'

# Rating tables from the train and test parquet files.
train_data = pd.read_parquet(f'{data_path}ratings_train.pq')
test_data = pd.read_parquet(f'{data_path}ratings_test.pq')

# Group tables; each is merged onto the test ratings by `userId` further down.
groups5, groups6, groups7 = (
    pd.read_parquet(f'{data_path}groups{suffix}.pq') for suffix in (5, 6, 7)
)

In [3]:
# Attach the `group` value from groups5 / groups6 / groups7 to every test rating
# as the columns group5 / group6 / group7.
#
# This cell rewrites `test_data` from itself, so any group column attached by a
# previous run is dropped first; without that, re-running the cell would produce
# duplicate groupN columns. On a fresh kernel the drop is a no-op.
#
# The merge is pandas' default inner join: test rows whose userId is missing from
# a group table are discarded.
for group_number, group in zip((5, 6, 7), (groups5, groups6, groups7)):
    group_column = f'group{group_number}'
    test_data = (
        test_data
        .drop(columns=group_column, errors='ignore')
        .merge(group, on='userId')
        .rename(columns={'group': group_column})
    )
test_data

Unnamed: 0,userId,movieId,rating,group5,group6,group7
0,41988,790,4.0,31361,14281,11298
1,41988,524,3.0,31361,14281,11298
2,41988,608,4.0,31361,14281,11298
3,41988,695,3.0,31361,14281,11298
4,41988,566,4.0,31361,14281,11298
...,...,...,...,...,...,...
3596663,7343,164,5.0,17403,2054,4080
3596664,7343,193,4.0,17403,2054,4080
3596665,7343,253,4.0,17403,2054,4080
3596666,7343,483,2.0,17403,2054,4080


In [4]:
# Open a ClearML task for this run; the handle is used at the end of the
# notebook to upload the metrics artifact and to close the task.
task = clearml.Task.init(
    project_name='MoviesGRS_MFDP',
    task_name='PopularMoviesRecommender',
    tags=['Baseline', 'PopularMoviesRecommender'],
)

ClearML Task: created new task id=f8a623ed028b415095d9c87196c336ae
2023-05-29 21:47:53,494 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/f3cb8157bfe7443abdc531a44bb15332/experiments/f8a623ed028b415095d9c87196c336ae/output/log


In [5]:
# Popularity ranking: one row per movie with the number of distinct training
# users that rated it (`userCount`), most popular first.
top_popular_movies = (
    train_data
    .groupby('movieId')
    .agg(userCount=('userId', 'nunique'))
    .sort_values('userCount', ascending=False)
    .reset_index()
)
top_popular_movies

Unnamed: 0,movieId,userCount
0,68,53310
1,59,51873
2,55,51843
3,100,48477
4,353,48000
...,...,...
797,335,3915
798,158,3910
799,206,3902
800,430,3900


In [6]:
# Movie ids as an array, in the popularity order of `top_popular_movies`.
movie_ids = top_popular_movies['movieId'].values

In [7]:
# Columns that identify a user together with that user's three group ids.
user_group_keys = ['userId'] + [f'group{i}' for i in range(5, 8)]

# Step 1: per user, the list of movies rated in the training data ...
train_history = (
    train_data
    .groupby(by='userId')
    .agg({'movieId': list})
    .reset_index()
)
# ... and the complement of that list within `movie_ids` (popularity order kept).
train_history['unwatched'] = train_history['movieId'].apply(
    lambda watched: movie_ids[~np.isin(movie_ids, watched)]
)

# Step 2: one row per (user, group5, group6, group7) found in the test data,
# carrying that user's unwatched array. The inner merge drops test users that
# have no training history.
unwatched = (
    test_data
    .merge(train_history[['userId', 'unwatched']], on='userId')
    .loc[:, user_group_keys + ['unwatched']]
    .groupby(by=user_group_keys)['unwatched']
    .first()
    .reset_index()
)
unwatched

Unnamed: 0,userId,group5,group6,group7,unwatched
0,1,4424,23830,7737,"[68, 59, 353, 51, 22, 16, 0, 167, 384, 14, 581..."
1,2,18784,2598,13437,"[68, 59, 100, 353, 82, 90, 22, 97, 394, 16, 38..."
2,3,29761,20828,14279,"[59, 55, 100, 353, 51, 82, 90, 22, 97, 394, 51..."
3,4,12577,17824,10523,"[353, 90, 394, 519, 0, 175, 384, 548, 581, 128..."
4,5,22262,607,10989,"[59, 55, 353, 82, 90, 22, 394, 519, 16, 0, 175..."
...,...,...,...,...,...
128586,128660,32028,27049,10083,"[68, 59, 55, 100, 353, 51, 82, 90, 22, 97, 165..."
128587,128661,22934,22798,20087,"[59, 82, 90, 22, 97, 165, 16, 0, 167, 175, 14,..."
128588,128662,23328,26324,7171,"[68, 59, 55, 100, 353, 51, 82, 90, 97, 394, 51..."
128589,128663,13396,19445,7989,"[68, 59, 55, 100, 353, 51, 82, 90, 22, 97, 394..."


In [8]:
# Per-user aggregation spec: keep the (first) id of each group column and
# gather the test movies and their ratings into lists.
history_aggregations = {f'group{i}': 'first' for i in range(5, 8)}
history_aggregations.update({'movieId': list, 'rating': list})

# Sorting by rating first means each user's lists run from the highest-rated
# test movie to the lowest-rated one.
users_watch_history_test: pd.DataFrame = (
    test_data
    .sort_values(by='rating', ascending=False)
    .groupby(by='userId')
    .agg(history_aggregations)
    .reset_index()
)

# Store the per-user lists as NumPy arrays.
for list_column in ('movieId', 'rating'):
    users_watch_history_test[list_column] = (
        users_watch_history_test[list_column].apply(np.array)
    )

In [9]:
# Display the per-user test history (movies and ratings as arrays, highest rating first).
users_watch_history_test

Unnamed: 0,userId,group5,group6,group7,movieId,rating
0,1,4424,23830,7737,"[384, 613, 181, 16, 114, 533, 584, 131, 572, 5...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, ..."
1,2,18784,2598,13437,"[801, 609, 647, 540, 748, 548, 689, 353, 779, ...","[5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, 4.5, ..."
2,3,29761,20828,14279,"[230, 62, 51, 80, 100, 144, 50, 55, 141, 96, 8...","[5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, ..."
3,4,12577,17824,10523,"[96, 54, 135, 79, 239, 80, 74, 242, 244, 216, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, ..."
4,5,22262,607,10989,"[45, 175, 75, 225, 191, 217, 133, 67, 98, 127,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, ..."
...,...,...,...,...,...,...
128586,128660,32028,27049,10083,"[130, 68, 100, 59, 204, 461, 334, 549, 284, 73...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ..."
128587,128661,22934,22798,20087,"[714, 653, 42, 791, 15, 638, 513, 354, 90, 516...","[4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, ..."
128588,128662,23328,26324,7171,"[251, 134, 167, 186, 68, 154, 157, 82, 496, 495]","[5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 3.0, ..."
128589,128663,13396,19445,7989,"[656, 524, 695, 664, 715, 674, 488, 398, 630, ...","[5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.0, 4.0, ..."


In [10]:
def generate_recommendations(make_recommendations, users_watch_history_test, unwatched, group_columns=None):
    """Add one ``<group>_rec`` recommendation column per group column.

    For every group column, the rows of ``unwatched`` are grouped by that
    column, the members' unwatched arrays are intersected (so only movies that
    no member has seen remain), and ``make_recommendations`` is applied to each
    group row. The result is merged back onto the per-user history so every
    user carries the recommendation produced for their group.

    Parameters
    ----------
    make_recommendations : callable
        Called once per group with a row exposing the group id, ``userId``
        (array of member ids) and ``unwatched`` (array of movie ids unseen by
        all members); its return value is stored as that group's recommendation.
    users_watch_history_test : pandas.DataFrame
        One row per user; must contain every column named in ``group_columns``.
        The frame passed in is not modified.
    unwatched : pandas.DataFrame
        One row per user with ``userId``, the group columns and an
        ``unwatched`` column holding an array of movie ids.
    group_columns : sequence of str, optional
        Group columns to process. Defaults to ``group5``, ``group6`` and
        ``group7``, which was the previously hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``users_watch_history_test`` with an extra ``f'{group}_rec'`` column for
        each processed group column. The merges are inner joins, so users whose
        group id is absent from ``unwatched`` are dropped.
    """
    if group_columns is None:
        group_columns = [f'group{i}' for i in range(5, 8)]

    recommendations = users_watch_history_test
    for group in group_columns:
        rec_column = f'{group}_rec'
        group_unwatched = (
            unwatched
            .groupby(by=group)
            .agg({
                'userId': list,
                # np.intersect1d returns sorted unique ids, so any ordering of the
                # per-user arrays is lost here; make_recommendations must re-rank.
                'unwatched': lambda x: np.array(reduce(np.intersect1d, x))
            })
            .reset_index()
        )
        group_unwatched['userId'] = group_unwatched.userId.apply(np.array)
        group_unwatched[rec_column] = group_unwatched.apply(make_recommendations, axis=1)

        recommendations = recommendations.merge(group_unwatched[[group, rec_column]], on=group)

    return recommendations

In [11]:
def recommend(row, ranked_movies=None, top_n=10):
    """Return the most popular movies that are still unwatched for ``row``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose ``row["unwatched"]``: the movie ids that may be recommended.
    ranked_movies : array-like, optional
        Movie ids ordered from most to least popular. Defaults to the
        notebook-level ``top_popular_movies.movieId``.
    top_n : int, optional
        Maximum number of movies to return (default 10).

    Returns
    -------
    numpy.ndarray
        Up to ``top_n`` movie ids in popularity order. When fewer than
        ``top_n`` candidates exist, the shorter (possibly empty) array is
        returned; the previous implementation fell off the end of its loop and
        returned ``None`` in that case.
    """
    if ranked_movies is None:
        ranked_movies = top_popular_movies.movieId.values
    ranked_movies = np.asarray(ranked_movies)

    # One vectorised membership test; boolean-mask indexing keeps popularity order.
    is_candidate = np.isin(ranked_movies, np.asarray(row["unwatched"]))
    return ranked_movies[is_candidate][:top_n]

In [12]:
# Build the per-user table with one recommendation column per group column,
# using the popularity-based `recommend` as the per-group strategy.
recommends = generate_recommendations(
    make_recommendations=recommend,
    users_watch_history_test=users_watch_history_test,
    unwatched=unwatched,
)
recommends

Unnamed: 0,userId,group5,group6,group7,movieId,rating,group5_rec,group6_rec,group7_rec
0,1,4424,23830,7737,"[384, 613, 181, 16, 114, 533, 584, 131, 572, 5...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, ...","[167, 80, 26, 115, 243, 98, 204, 197, 71, 86]","[68, 59, 353, 51, 22, 167, 384, 581, 80, 26]","[204, 572, 95, 114, 1, 50, 11, 190, 313, 361]"
1,128307,19099,24828,7737,"[99, 142, 33, 244, 97, 80, 181, 223, 176, 157,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...","[100, 165, 519, 0, 384, 548, 376, 310, 441, 70]","[26, 310, 70, 243, 98, 265, 161, 91, 164, 237]","[204, 572, 95, 114, 1, 50, 11, 190, 313, 361]"
2,5104,30892,8245,7737,"[198, 8, 85, 290, 206, 172, 208, 112, 184, 88,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[353, 394, 519, 0, 548, 581, 199, 310, 115, 70]","[167, 488, 96, 75, 689, 142, 511, 156, 263, 584]","[204, 572, 95, 114, 1, 50, 11, 190, 313, 361]"
3,47993,16791,1281,7737,"[90, 784, 394, 59, 734, 779, 324, 193, 328, 82...","[5.0, 5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, ...","[59, 353, 22, 394, 167, 96, 74, 75, 689, 188]","[353, 82, 90, 22, 394, 167, 441, 96, 74, 75]","[204, 572, 95, 114, 1, 50, 11, 190, 313, 361]"
4,32850,281,7953,7737,"[384, 730, 55, 394, 548, 90, 793]","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 3.5]","[55, 82, 22, 394, 165, 175, 128, 199, 310, 441]","[16, 548, 80, 441, 70, 99, 488, 74, 75, 98]","[204, 572, 95, 114, 1, 50, 11, 190, 313, 361]"
...,...,...,...,...,...,...,...,...,...
128586,90835,5409,12500,20074,"[59, 572, 561, 544, 529, 301, 326, 354, 531]","[5.0, 4.0, 4.0, 3.5, 3.5, 3.0, 2.5, 2.5, 2.0]","[68, 100, 90, 167, 14, 128, 199, 115, 75, 689]","[59, 51, 97, 165, 519, 16, 0, 175, 128, 199]","[22, 16, 167, 128, 199, 80, 115, 441, 70, 99]"
128587,96812,26072,7682,20074,"[332, 367, 267, 100, 274, 329, 259, 153, 344, ...","[5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, ...","[167, 199, 115, 99, 488, 96, 481, 75, 188, 204]","[165, 16, 548, 128, 199, 96, 481, 98, 265, 197]","[22, 16, 167, 128, 199, 80, 115, 441, 70, 99]"
128588,65482,20574,10362,12742,"[90, 135, 175, 51, 177, 188, 220]","[5.0, 4.0, 4.0, 4.0, 3.0, 2.0, 1.0]","[68, 55, 90, 22, 519, 0, 384, 548, 581, 199]","[90, 548, 581, 80, 96, 74, 177, 151, 561, 106]","[59, 353, 51, 90, 22, 394, 519, 16, 0, 167]"
128589,37798,10175,5664,12742,"[59, 22, 64, 96, 97, 26, 14, 79]","[5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0]","[97, 0, 199, 115, 689, 265, 197, 166, 177, 71]","[22, 0, 384, 548, 199, 26, 376, 115, 96, 481]","[59, 353, 51, 90, 22, 394, 519, 16, 0, 167]"


In [13]:
# Score the recommendation table; the displayed result has one row per group
# column with MAP and NDCG values.
# NOTE(review): evaluate_recommendations comes from the local `evaluation` module,
# so how it handles short or missing recommendations is not visible here — confirm.
results = evaluate_recommendations(recommends)
results

Unnamed: 0,MAP,NDCG
group5,0.115769,0.457071
group6,0.106048,0.435921
group7,0.095524,0.413474


In [14]:
# Attach the metrics table to the ClearML task as an artifact named 'metrics'.
task.upload_artifact('metrics', results)

True

In [15]:
# Close the ClearML task opened earlier in the notebook.
task.close()

ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start
