In [1]:
from functools import reduce

import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score, average_precision_score

import clearml

from evaluation import generate_recommendations, evaluate_recommendations

from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the ClearML credentials used below — confirm.
load_dotenv()

True

In [2]:
# Directory holding the parquet inputs (relative to the notebook's working directory).
data_path = 'data/'
train_data = pd.read_parquet(data_path + 'ratings_train.pq')
test_data = pd.read_parquet(data_path + 'ratings_test.pq')
# One group table per group size (2..7), read in a single pass instead of
# six copy-pasted calls; the variable names are unchanged for later cells.
groups2, groups3, groups4, groups5, groups6, groups7 = (
    pd.read_parquet(f'{data_path}groups{size}.pq') for size in range(2, 8)
)

In [3]:
# Names of the columns that hold each user's group id, one per group size (2..7).
groups_list = [f'group{i}' for i in range(2, 8)]
group_frames = [groups2, groups3, groups4, groups5, groups6, groups7]
for group_col, group_frame in zip(groups_list, group_frames):
    # Keep the cell idempotent: merging the same table a second time would add
    # a fresh 'group' column and the rename would then create a duplicate
    # column name, breaking every later cell that selects `group_col`.
    if group_col in test_data.columns:
        continue
    # Default merge is an inner join, so ratings of users that are absent from
    # the group table are dropped here (same as before this change).
    test_data = (
        test_data
        .merge(group_frame, on='userId')
        .rename(columns={'group': group_col})
    )
test_data

Unnamed: 0,userId,movieId,rating,group2,group3,group4,group5,group6,group7
0,41988,790,4.0,12988,9525,27113,31361,14281,11298
1,41988,524,3.0,12988,9525,27113,31361,14281,11298
2,41988,608,4.0,12988,9525,27113,31361,14281,11298
3,41988,695,3.0,12988,9525,27113,31361,14281,11298
4,41988,566,4.0,12988,9525,27113,31361,14281,11298
...,...,...,...,...,...,...,...,...,...
3596596,7343,164,5.0,28907,7252,4507,17403,2054,4080
3596597,7343,193,4.0,28907,7252,4507,17403,2054,4080
3596598,7343,253,4.0,28907,7252,4507,17403,2054,4080
3596599,7343,483,2.0,28907,7252,4507,17403,2054,4080


In [4]:
# Register this run as a ClearML task; the handle is used later in the
# notebook to upload the metrics and to close the task.
task = clearml.Task.init(
    project_name='MoviesGRS_MFDP',
    task_name='PopularMoviesRecommender',
    tags=['Baseline', 'PopularMoviesRecommender', 'TimeSeriesSplit'],
)

ClearML Task: created new task id=ba28a1eafb4148a3b6e09cf29506dc3d
2023-05-30 11:58:45,657 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/f3cb8157bfe7443abdc531a44bb15332/experiments/ba28a1eafb4148a3b6e09cf29506dc3d/output/log


In [5]:
# Popularity table: number of distinct users that rated each movie in the
# training split, most-rated movies first.
top_popular_movies = (
    train_data
    .groupby('movieId')
    .agg(userCount=('userId', 'nunique'))
    .sort_values('userCount', ascending=False)
    .reset_index()
)
top_popular_movies

Unnamed: 0,movieId,userCount
0,68,53310
1,59,51873
2,55,51843
3,100,48477
4,353,48000
...,...,...
797,335,3915
798,158,3910
799,206,3902
800,430,3900


In [6]:
# Movie ids as a NumPy array, in the same (popularity) order as the table.
movie_ids = top_popular_movies['movieId'].to_numpy()

In [7]:
group_cols = [f'group{i}' for i in range(2, 8)]

# Per user: the list of movies rated in the training split ...
train_history = (
    train_data
    .groupby('userId')
    .agg({'movieId': list})
    .reset_index()
)
# ... and its complement within `movie_ids`; indexing with a boolean mask
# keeps the order of `movie_ids`.
train_history['unwatched'] = train_history['movieId'].apply(
    lambda watched: movie_ids[~np.isin(movie_ids, watched)]
)

# One row per test user (with the user's group ids) carrying the unwatched
# array; the inner merge keeps only users present in both splits.
test_with_unwatched = test_data.merge(
    train_history[['userId', 'unwatched']], on='userId'
)
unwatched = (
    test_with_unwatched[['userId', *group_cols, 'unwatched']]
    .groupby(by=['userId', *group_cols])['unwatched']
    .first()
    .reset_index()
)
unwatched

Unnamed: 0,userId,group2,group3,group4,group5,group6,group7,unwatched
0,1,14465,39625,6774,4424,23830,7737,"[68, 59, 353, 51, 22, 16, 0, 167, 384, 14, 581..."
1,2,10418,7556,11348,18784,2598,13437,"[68, 59, 100, 353, 82, 90, 22, 97, 394, 16, 38..."
2,3,53801,35849,12281,29761,20828,14279,"[59, 55, 100, 353, 51, 82, 90, 22, 97, 394, 51..."
3,4,3990,24019,11784,12577,17824,10523,"[353, 90, 394, 519, 0, 175, 384, 548, 581, 128..."
4,5,39404,9579,25927,22262,607,10989,"[59, 55, 353, 82, 90, 22, 394, 519, 16, 0, 175..."
...,...,...,...,...,...,...,...,...
128583,128660,22670,42046,12206,32028,27049,10083,"[68, 59, 55, 100, 353, 51, 82, 90, 22, 97, 165..."
128584,128661,30264,32218,16183,22934,22798,20087,"[59, 82, 90, 22, 97, 165, 16, 0, 167, 175, 14,..."
128585,128662,24301,22982,420,23328,26324,7171,"[68, 59, 55, 100, 353, 51, 82, 90, 97, 394, 51..."
128586,128663,55355,18586,11297,13396,19445,7989,"[68, 59, 55, 100, 353, 51, 82, 90, 22, 97, 394..."


In [8]:
# Aggregation spec: the first value of every group column, plus the movies
# and ratings collected as lists.
history_agg = {f'group{i}': 'first' for i in range(2, 8)}
history_agg['movieId'] = list
history_agg['rating'] = list

# Sorting by rating first means each user's lists start with the
# highest-rated movies; the lists are then converted to NumPy arrays.
users_watch_history_test: pd.DataFrame = (
    test_data
    .sort_values('rating', ascending=False)
    .groupby('userId')
    .agg(history_agg)
    .reset_index()
    .assign(
        movieId=lambda frame: frame['movieId'].apply(np.array),
        rating=lambda frame: frame['rating'].apply(np.array),
    )
)

In [9]:
# Display the per-user test history (group ids, movie ids, ratings).
users_watch_history_test

Unnamed: 0,userId,group2,group3,group4,group5,group6,group7,movieId,rating
0,1,14465,39625,6774,4424,23830,7737,"[384, 613, 181, 16, 114, 533, 131, 584, 572, 4...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, ..."
1,2,10418,7556,11348,18784,2598,13437,"[647, 540, 609, 801, 572, 716, 548, 689, 711, ...","[5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, 4.5, ..."
2,3,53801,35849,12281,29761,20828,14279,"[51, 230, 62, 80, 100, 115, 144, 50, 55, 141, ...","[5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, ..."
3,4,3990,24019,11784,12577,17824,10523,"[79, 239, 80, 74, 242, 135, 54, 96, 0, 227, 17...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, ..."
4,5,39404,9579,25927,22262,607,10989,"[225, 67, 133, 217, 191, 45, 75, 175, 98, 142,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, ..."
...,...,...,...,...,...,...,...,...,...
128583,128660,22670,42046,12206,32028,27049,10083,"[100, 59, 204, 461, 334, 68, 130, 284, 549, 73...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ..."
128584,128661,30264,32218,16183,22934,22798,20087,"[90, 516, 354, 653, 714, 513, 15, 638, 42, 791...","[4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, ..."
128585,128662,24301,22982,420,23328,26324,7171,"[167, 186, 68, 134, 251, 82, 154, 157, 496, 495]","[5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 3.0, ..."
128586,128663,55355,18586,11297,13396,19445,7989,"[524, 695, 664, 656, 715, 488, 674, 561, 630, ...","[5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.0, 4.0, ..."


In [10]:
def recommend(row, top_n=10):
    """Return the most popular movies that are listed as unwatched in ``row``.

    Walks ``top_popular_movies.movieId`` in its existing order (the notebook
    sorts that table by descending user count) and keeps every movie that is
    also contained in ``row["unwatched"]``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide an ``"unwatched"`` entry: a collection of movie ids that
        supports the ``in`` operator (a NumPy array in this notebook).
    top_n : int, default 10
        Maximum number of movies to return.

    Returns
    -------
    numpy.ndarray
        At most ``top_n`` movie ids in popularity order. The array is shorter,
        possibly empty, when fewer candidates exist. Previously the function
        returned ``None`` in that case because it only returned from inside
        the loop.
    """
    unwatched = row["unwatched"]
    top_movies = []
    for movie in top_popular_movies.movieId:
        if len(top_movies) >= top_n:
            break
        if movie in unwatched:
            top_movies.append(movie)
    # Pin the dtype so an empty result is not silently created as float64.
    return np.array(top_movies, dtype=top_popular_movies.movieId.dtype)

In [11]:
# Run `recommend` through the project helper `generate_recommendations`.
# The displayed result carries the test-history columns plus one
# `<group>_rec` column per name in `groups_list`.
# NOTE(review): how the helper combines `unwatched` across the members of a
# group is not visible in this notebook — see the `evaluation` module.
recommends = generate_recommendations(recommend, users_watch_history_test, unwatched, groups_list)
recommends

Unnamed: 0,userId,group2,group3,group4,group5,group6,group7,movieId,rating,group2_rec,group3_rec,group4_rec,group5_rec,group6_rec,group7_rec
0,1,14465,39625,6774,4424,23830,7737,"[384, 613, 181, 16, 114, 533, 131, 584, 572, 4...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, ...","[68, 59, 353, 51, 16, 0, 14, 80, 26, 376]","[22, 16, 0, 80, 99, 96, 74, 166, 142, 181]","[167, 115, 70, 99, 96, 243, 98, 161, 267, 164]","[167, 80, 26, 115, 243, 98, 204, 197, 71, 86]","[68, 59, 353, 51, 22, 167, 384, 581, 80, 26]","[204, 572, 95, 114, 1, 50, 11, 190, 313, 361]"
1,5104,48420,12472,25594,30892,8245,7737,"[208, 85, 8, 290, 198, 206, 172, 184, 112, 195...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[51, 82, 22, 165, 0, 167, 175, 14, 581, 80]","[82, 519, 70, 96, 481, 142, 63, 71, 86, 511]","[51, 22, 394, 165, 519, 175, 548, 581, 26, 376]","[353, 394, 519, 0, 548, 581, 199, 310, 115, 70]","[167, 488, 96, 75, 689, 142, 511, 156, 263, 584]","[204, 572, 95, 114, 1, 50, 11, 190, 313, 361]"
2,109704,31819,26528,10792,15512,26177,7737,"[176, 59, 458, 164, 462, 114, 689, 353, 14, 43...","[5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...","[68, 59, 353, 0, 14, 26, 441, 70, 488, 98]","[68, 59, 22, 97, 165, 0, 581, 26, 70, 98]","[22, 97, 519, 548, 581, 441, 70, 488, 188, 265]","[59, 353, 22, 165, 519, 0, 548, 14, 581, 26]","[90, 97, 581, 488, 689, 188, 142, 63, 511, 520]","[204, 572, 95, 114, 1, 50, 11, 190, 313, 361]"
3,128307,463,31895,6380,19099,24828,7737,"[97, 244, 33, 142, 80, 99, 181, 223, 176, 157,...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...","[59, 55, 100, 353, 90, 394, 165, 519, 16, 0]","[59, 353, 97, 394, 519, 548, 581, 26, 376, 310]","[68, 519, 0, 384, 548, 26, 70, 99, 488, 96]","[100, 165, 519, 0, 384, 548, 376, 310, 441, 70]","[26, 310, 70, 243, 98, 265, 161, 91, 164, 237]","[204, 572, 95, 114, 1, 50, 11, 190, 313, 361]"
4,32850,15425,9692,2595,281,7953,7737,"[55, 394, 730, 384, 90, 548, 793]","[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 3.5]","[68, 82, 97, 394, 165, 16, 0, 175, 384, 548]","[55, 22, 394, 16, 384, 14, 128, 80, 115, 441]","[68, 90, 394, 0, 384, 441, 74, 265, 197, 161]","[55, 82, 22, 394, 165, 175, 128, 199, 310, 441]","[16, 548, 80, 441, 70, 99, 488, 74, 75, 98]","[204, 572, 95, 114, 1, 50, 11, 190, 313, 361]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128583,98075,55317,35155,28186,15593,19307,16497,"[51, 353, 707, 0, 625, 199, 786, 510, 167, 644...","[5.0, 4.0, 4.0, 4.0, 4.0, 3.5, 3.5, 3.0, 3.0, ...","[59, 51, 82, 22, 97, 165, 519, 16, 0, 167]","[90, 22, 97, 175, 14, 581, 128, 80, 26, 376]","[90, 519, 70, 99, 74, 197, 161, 166, 267, 91]","[59, 90, 14, 199, 115, 99, 243, 75, 98, 188]","[90, 376, 441, 481, 267, 71, 237, 181, 454, 357]","[51, 82, 22, 97, 167, 548, 199, 80, 26, 115]"
128584,1764,6389,1952,29114,1967,6059,16497,"[584, 462, 55, 519, 695, 181, 662, 68, 166, 62...","[5.0, 4.5, 4.5, 4.5, 4.0, 4.0, 4.0, 4.0, 3.5, ...","[68, 55, 100, 51, 82, 97, 165, 519, 16, 0]","[55, 16, 0, 199, 115, 99, 96, 74, 98, 265]","[100, 82, 22, 394, 519, 16, 0, 384, 548, 14]","[82, 97, 165, 16, 0, 167, 175, 14, 199, 80]","[68, 100, 82, 22, 97, 394, 165, 167, 14, 199]","[51, 82, 22, 97, 167, 548, 199, 80, 26, 115]"
128585,18641,5515,7461,18994,10646,4255,20065,"[376, 199, 68, 99, 310, 572, 535, 75, 548, 443...","[5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.0, 4.0, 4.0, ...","[68, 59, 100, 90, 22, 97, 16, 0, 167, 548]","[68, 100, 90, 97, 0, 167, 14, 199, 80, 26]","[22, 26, 310, 441, 75, 98, 265, 730, 267, 142]","[22, 97, 16, 548, 188, 197, 730, 267, 177, 91]","[59, 167, 441, 481, 243, 204, 161, 566, 166, 267]","[59, 100, 90, 22, 97, 199, 80, 26, 310, 441]"
128586,46254,44498,8562,3795,15078,21003,20065,"[540, 561, 777, 513, 59, 707, 766, 715, 334, 7...","[5.0, 5.0, 5.0, 5.0, 5.0, 4.5, 4.5, 4.5, 4.5, ...","[100, 82, 90, 519, 175, 581, 26, 310, 115, 99]","[59, 82, 90, 22, 97, 519, 167, 175, 128, 80]","[22, 175, 128, 199, 26, 99, 96, 243, 75, 188]","[80, 99, 75, 689, 204, 566, 730, 142, 91, 63]","[97, 548, 581, 128, 199, 243, 98, 265, 204, 197]","[59, 100, 90, 22, 97, 199, 80, 26, 310, 441]"


In [12]:
# Score the recommendations with the project helper; the displayed table
# has one row per group size with MAP and NDCG columns.
results = evaluate_recommendations(recommends, groups_list)
results

ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


Unnamed: 0,MAP,NDCG
group2,0.15038,0.519706
group3,0.130156,0.486931
group4,0.113872,0.455692
group5,0.115771,0.45709
group6,0.106047,0.435922
group7,0.095527,0.413491


In [13]:
# Attach the metrics table to the ClearML task as an artifact named 'metrics'.
task.upload_artifact('metrics', results)

True

In [14]:
# Close the ClearML task that was opened earlier in this notebook.
task.close()