In [1]:
import os
import sys
from typing import List, Dict
from pathlib import Path

sys.path.append(str(Path(__name__).resolve().parents[1]))

import pandas as pd
import numpy as np


from src.loader.movielens import MovieLensLoader
from src.utils.metrics import RecSysMetrics
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")



In [2]:
movielens_loader = MovieLensLoader(
    num_users=1000,
    num_test_items=5,
)

In [10]:
movielens_dataset = movielens_loader.load()

In [11]:
train = movielens_dataset.train
test = movielens_dataset.test
rank_test = movielens_dataset.test_user2item

In [12]:
class AverageRatingModel(object):
    def __init__(self, movielens_train: pd.DataFrame) -> None:
        self.movieid_index = dict(zip(movielens_train.movie_id.unique(), range(len(movielens_train.movie_id.unique()))))
        self.movie_average_rating = movielens_train.groupby("movie_id").agg({"rating": np.mean})
        
    
    def predict(self, movielens_test: pd.DataFrame) -> pd.DataFrame:
        pred_results = []
        
        for i, row in movielens_test.iterrows():
            user_id = row["user_id"]
            movie_id = row["movie_id"]
            
            if movie_id not in self.movieid_index:
                pred_results.append(0)
                continue
            
            movie_index = self.movieid_index[movie_id]
            
            pred_rating = self.movie_average_rating.loc[movie_id, "rating"]
            pred_results.append(pred_rating)
            
        return pred_results

In [13]:
average_model = AverageRatingModel(train)
pred_ratings = average_model.predict(test)

In [32]:
class PopularRankingModel(object):
    def __init__(self, movielens_train:pd.DataFrame, minimum_num_ratings:int = 30):
        
        self.unique_user_ids = movielens_train.user_id.unique()
        
        self.movie_rank = movielens_train.groupby(["movie_id"]).agg({"rating": [np.size, np.mean]})
        self.user_evaluated_movies = movielens_train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()
        minimum_num_indices = self.movie_rank["rating"]["size"] >= minimum_num_ratings
        self.movie_sorted_by_rating = self.movie_rank[minimum_num_indices].sort_values(by=("rating", "mean"), ascending=False).index.tolist()

    def predict(self, top_k:int = 10) -> Dict[int, List[int]]:
        pred_user2items = defaultdict(list)      
        for user_id in self.unique_user_ids:
            for movie_id in self.movie_sorted_by_rating:
                if movie_id not in self.user_evaluated_movies:
                    pred_user2items[user_id].append(movie_id)
                if len(pred_user2items[user_id]) >= top_k:
                    break
        return pred_user2items

In [33]:
popular_rank_model = PopularRankingModel(movielens_train=train, minimum_num_ratings=30)
pred_ranking = popular_rank_model.predict(top_k=10)

In [34]:
print("Test MAE rating", RecSysMetrics().mae(test["rating"], pred_ratings))
print("Test MSE rating", RecSysMetrics().mse(test["rating"], pred_ratings))
print("Test RMSE rating", RecSysMetrics().rmse(test["rating"], pred_ratings))
print(
    "Test Precision@k",
    RecSysMetrics().calc_precision_at_k(
        rank_test, pred_ranking, 5
    ),
)

print(
    "Test Recall@k",
    RecSysMetrics().calc_recall_at_k(
        rank_test, pred_ranking, 5
    ),
)

Test MAE rating 0.8700729642627675
Test MSE rating 1.1846674307496854
Test RMSE rating 1.0884242880190085
Test Precision@k 0.0022085889570552146
Test Recall@k 0.003190184049079754


In [39]:

def average_precision(relevant, retrieved):
    hits = 0
    sum_precisions = 0
    for i, movie in enumerate(retrieved):
        if movie in relevant:
            hits += 1
            sum_precisions += hits / (i + 1)
    return sum_precisions / len(relevant) if relevant else 0

def mean_average_precision(data, predictions):
    map_score = 0
    for user in data:
        map_score += average_precision(data[user], predictions[user])
    return map_score / len(data)

def dcg(relevant, retrieved, k):
    dcg_score = 0
    for i, movie in enumerate(retrieved[:k]):
        if movie in relevant:
            dcg_score += 1 / np.log2(i + 2)
    return dcg_score

def ndcg(relevant, retrieved, k):
    ideal_relevant = sorted(relevant, reverse=True)[:k]
    ideal_dcg = dcg(ideal_relevant, ideal_relevant, k)
    if not ideal_dcg:
        return 0
    return dcg(relevant, retrieved, k) / ideal_dcg

def mean_ndcg(data, predictions, k):
    ndcg_score = 0
    for user in data:
        ndcg_score += ndcg(data[user], predictions[user], k)
    return ndcg_score / len(data)

In [42]:
# Calculate MAP and nDCG:
map_score = mean_average_precision(rank_test, pred_ranking)
ndcg_score = mean_ndcg(rank_test, pred_ranking, 5)

print("MAP:", map_score)
print("nDCG@3:", ndcg_score)

MAP: 0.0012918167948842794
nDCG@3: 0.002439225147425161


In [37]:
pred_ranking

defaultdict(list,
            {196: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             186: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             22: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             244: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             166: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             298: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             115: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             253: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             305: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             62: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             286: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             200: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             210: [963, 1007, 1194, 1142, 1039, 1019, 945, 1

In [26]:
movie_rank = train.groupby(["movie_id", "movie_title"]).agg({"rating": [np.size, np.mean]})

rated_100_indices = movie_rank["rating"]["size"] >= 30

movie_sorted_by_rating = movie_rank[rated_100_indices].sort_values(by=("rating", "mean"), ascending=False)

movie_sorted_by_rating.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean
movie_id,movie_title,Unnamed: 2_level_2,Unnamed: 3_level_2
408,"Close Shave, A (1995)",106,4.5
318,Schindler's List (1993),285,4.470175
169,"Wrong Trousers, The (1993)",116,4.465517
483,Casablanca (1942),234,4.457265
64,"Shawshank Redemption, The (1994)",272,4.455882


In [27]:
movie_average_rating = train.groupby("movie_id").agg({"rating":np.mean})
movie_average_rating.sample(5)

Unnamed: 0_level_0,rating
movie_id,Unnamed: 1_level_1
402,3.438272
863,3.956522
952,3.136364
127,4.286802
1259,2.0


In [28]:
movie_rating_predict = test.merge(movie_average_rating, on="movie_id", how='left', suffixes=("_test", "_pred")).fillna(0) # AveragePrediction

In [29]:

# Popularity
top_K = 10
minimum_num_ratings = 30

pred_user2items = defaultdict(list)

user_evaluated_movies = train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()
# Popular movies

movie_ranking = train.groupby("movie_id").agg({"rating": [np.size, np.mean]})
min_flags = movie_ranking["rating"]["size"] > minimum_num_ratings

popluar_movies = movie_ranking[min_flags].sort_values(by=("rating", "mean"), ascending=False).index.tolist()



In [30]:
for user_id in train.user_id.unique():
    for movie_id in popluar_movies:
        if movie_id not in user_evaluated_movies:
            pred_user2items[user_id].append(movie_id)
        if len(pred_user2items[user_id]) >= 10:
            break

In [31]:
pred_user2items

defaultdict(list,
            {196: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             186: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             22: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             244: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             166: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             298: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             115: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             253: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             305: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             62: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             286: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             200: [963, 1007, 1194, 1142, 1039, 1019, 945, 1020, 969, 1021],
             210: [963, 1007, 1194, 1142, 1039, 1019, 945, 1