# Evaluator Module
The Evaluator module creates evaluation reports.

Each report contains the evaluation metrics computed for the models specified in the evaluation config.

In [None]:
# reloads modules automatically before entering the execution of code
#load_ext autoreload
#autoreload 2

# third parties imports
import numpy as np 
import pandas as pd
# -- add new imports here --

# local imports
from configs import EvalConfig
from constants import Constant as C
from loaders import export_evaluation_report
from loaders import load_ratings
from models import get_top_n
# -- add new imports here --
import numpy as np
import pandas as pd
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from configs import EvalConfig 
from surprise import Dataset, Reader, accuracy
from constants import Constant as C
from surprise.model_selection import LeaveOneOut
from collections import defaultdict

# 1. Model validation functions
Validation functions are a way to perform cross-validation on recommender system models.

In [2]:
def load_ratings(surprise_format=False):
    """Read the ratings CSV and return it in the requested format.

    Args:
        surprise_format: When true, wrap the ``userId``, ``movieId`` and
            ``rating`` columns in a Surprise ``Dataset`` built with the
            rating scale ``C.RATINGS_SCALE``. When false, return the raw
            pandas DataFrame.

    Returns:
        A Surprise ``Dataset`` if ``surprise_format`` is true, otherwise the
        DataFrame read from ``C.EVIDENCE_PATH / C.RATINGS_FILENAME``.
    """
    ratings_frame = pd.read_csv(C.EVIDENCE_PATH / C.RATINGS_FILENAME)
    if not surprise_format:
        return ratings_frame

    # Surprise only needs the (user, item, rating) triple, in that order.
    rating_reader = Reader(rating_scale=C.RATINGS_SCALE)
    rating_triples = ratings_frame[['userId', 'movieId', 'rating']]
    return Dataset.load_from_df(rating_triples, rating_reader)

def generate_split_predictions(algo, ratings_dataset, eval_config):
    """Fit ``algo`` on a random train/test split and predict the held-out ratings.

    Args:
        algo: Algorithm object exposing ``fit(trainset)`` and ``test(testset)``
            (e.g. a Surprise algorithm instance).
        ratings_dataset: Dataset handed to ``train_test_split``.
        eval_config: Evaluation config; only its ``test_size`` attribute is
            read here.

    Returns:
        The predictions returned by ``algo.test`` for the held-out test set.
    """
    # Hold out the share of ratings requested by the evaluation config.
    train_part, held_out = train_test_split(
        ratings_dataset,
        test_size=eval_config.test_size,
    )
    # Train on the training part only, then score the held-out ratings.
    algo.fit(train_part)
    return algo.test(held_out)

""""
def generate_loo_top_n(algo, ratings_dataset, eval_config):
    #Generate top-n recommendations for each user on a random Leave-one-out split (LOO)
    #leaveOneOut object with one split
    loo = LeaveOneOut(n_splits=1)
    # Split the dataset into training and testing sets
    trainset, testset = next(loo.split(ratings_dataset))
    # Train the algorithm on the training set
    algo.fit(trainset)
    # Generate the anti-testset
    anti_testset = trainset.build_anti_testset()
    # Generate predictions on the anti-testset
    predictions = algo.test(anti_testset)
    # Get top-N recommendations
    anti_testset_top_n = get_top_n(predictions, n=eval_config.top_n_value)
    return anti_testset_top_n, testset """



""""
def generate_full_top_n(algo, ratings_dataset, eval_config):
    #Generate top-n recommendations for each user with full training set (LOO)
    # Construire l’ensemble d’entraînement complet à partir de toutes les données
    full_trainset = ratings_dataset.build_full_trainset()

    # Entraîner l’algorithme sur toutes les données disponibles
    algo.fit(full_trainset)

    # Générer le anti-testset : tous les items que chaque utilisateur n’a pas encore notés
    anti_testset = full_trainset.build_anti_testset()

    # Générer les prédictions sur le anti-testset
    predictions = algo.test(anti_testset)

    # Extraire les top-N recommandations
    anti_testset_top_n = get_top_n(predictions, n=eval_config.top_n_value)

    return anti_testset_top_n
"""
"""
def precompute_information():
    # Returns a dictionary that precomputes relevant information for evaluating in full mode
    
    #Dictionary keys:
   # - precomputed_dict["item_to_rank"] : contains a dictionary mapping movie ids to rankings
   # - (-- for your project, add other relevant information here -- )
    
  
    ratings = load_ratings()
    # Compter les évaluations par film et trier par popularité décroissante
    item_counts = ratings['movieId'].value_counts().sort_values(ascending=False)
    # Mapper chaque film à son rang de popularité (1 = plus populaire)
    item_to_rank = {movie: idx + 1 for idx, movie in enumerate(item_counts.index)}
    
    return {'item_to_rank': item_to_rank}            
"""

def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):
    """Score every configured model and gather the results in a DataFrame.

    Args:
        eval_config: Evaluation config. ``models`` is iterated as
            ``(model_name, model, arguments)`` triples and ``split_metrics``
            lists the metric names to compute on a train/test split.
        sp_ratings: Ratings dataset forwarded to ``generate_split_predictions``.
        precomputed_dict: Only referenced by the disabled "full" evaluation
            kept in the string literal below; the active code does not read it.
        available_metrics: Metric registry; ``available_metrics["split"][name]``
            must be an ``(evaluation_function, parameters)`` pair.

    Returns:
        A pandas DataFrame with one row per model and one column per
        computed metric.
    """
    evaluation_dict = {}
    for model_name, model, arguments in eval_config.models:
        print(f'Handling model {model_name}')
        algo = model(**arguments)
        # Same dict object as evaluation_dict[model_name]; filled in below.
        model_scores = {}
        evaluation_dict[model_name] = model_scores

        # Type 1: split evaluations. One train/test run is shared by all
        # split metrics of the current model.
        if len(eval_config.split_metrics) > 0:
            print('Training split predictions')
            predictions = generate_split_predictions(algo, sp_ratings, eval_config)
            for metric in eval_config.split_metrics:
                print(f'- computing metric {metric}')
                assert metric in available_metrics['split']
                metric_fn, metric_kwargs = available_metrics["split"][metric]
                model_scores[metric] = metric_fn(predictions, **metric_kwargs)

        # The string literal below is an inert expression statement that
        # holds the currently disabled loo/full evaluations.
        """  # Type 2 : loo evaluations
        if len(eval_config.loo_metrics) > 0:
            print('Training loo predictions')
            anti_testset_top_n, testset = generate_loo_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.loo_metrics:
                assert metric in available_metrics['loo']
                evaluation_function, parameters =  available_metrics["loo"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)
        
        # Type 3 : full evaluations
        if len(eval_config.full_metrics) > 0:
            print('Training full predictions')
            anti_testset_top_n = generate_full_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.full_metrics:
                assert metric in available_metrics['full']
                evaluation_function, parameters =  available_metrics["full"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(
                    anti_testset_top_n,
                    **precomputed_dict,
                    **parameters
                )"""

    # from_dict yields one column per model; transpose to get one row per model.
    return pd.DataFrame.from_dict(evaluation_dict).T

# 2. Evaluation metrics
Implement evaluation metrics either for rating predictions (split metrics) or for top-n recommendations (loo metrics, full metrics).

In [3]:
""" def get_hit_rate(anti_testset_top_n, testset):
    #Compute the average hit over the users (loo metric)
    
    #A hit (1) happens when the movie in the testset has been picked by the top-n recommender
    #A fail (0) happens when the movie in the testset has not been picked by the top-n recommender
    
   #implement the function get_hit_rate 
    hits = 0
    total = len(testset)  

    # Iterate through each entry in the testset
    for user_id, movie_id, _ in testset:
        top_n_recommendations = anti_testset_top_n.get(user_id, [])
        if movie_id in [recommended_movie[0] for recommended_movie in top_n_recommendations]:
            hits += 1 

    hit_rate = hits / total if total > 0 else 0
    return hit_rate


def get_novelty(anti_testset_top_n, item_to_rank):
    #Compute the average novelty of the top-n recommendation over the users (full metric)
    
    #The novelty is defined as the average ranking of the movies recommended
    
    total_rank = 0
    num_entries = 0
    total_items = len(item_to_rank)
    for user_recommendations in anti_testset_top_n.values():
        for movie_id, _ in user_recommendations:
            total_rank += item_to_rank.get(movie_id, total_items + 1)
            num_entries += 1
    average_rank_sum = total_rank / num_entries if num_entries > 0 else 0
    normalized_novelty = average_rank_sum / total_items  # Normalization step
    return normalized_novelty"""""
    #return average_rank_sum

' def get_hit_rate(anti_testset_top_n, testset):\n    #Compute the average hit over the users (loo metric)\n    \n    #A hit (1) happens when the movie in the testset has been picked by the top-n recommender\n    #A fail (0) happens when the movie in the testset has not been picked by the top-n recommender\n    \n   #implement the function get_hit_rate \n    hits = 0\n    total = len(testset)  \n\n    # Iterate through each entry in the testset\n    for user_id, movie_id, _ in testset:\n        top_n_recommendations = anti_testset_top_n.get(user_id, [])\n        if movie_id in [recommended_movie[0] for recommended_movie in top_n_recommendations]:\n            hits += 1 \n\n    hit_rate = hits / total if total > 0 else 0\n    return hit_rate\n\n\ndef get_novelty(anti_testset_top_n, item_to_rank):\n    #Compute the average novelty of the top-n recommendation over the users (full metric)\n    \n    #The novelty is defined as the average ranking of the movies recommended\n    \n    total_r

# 3. Evaluation workflow
Load data, evaluate models and save the experimental outcomes

In [4]:
# Registry of available metrics, grouped by evaluation type. Each entry maps a
# metric name to an (evaluation function, extra keyword arguments) pair.
AVAILABLE_METRICS = {
    "split": {
        "mae": (accuracy.mae, {'verbose': False}),
        "rmse": (accuracy.rmse, {'verbose': False}),
    },
    # The "loo" and "full" groups stay disabled while get_hit_rate and
    # get_novelty are disabled.
    # "loo": {
    #     "hit_rate": (get_hit_rate, {}),
    # },
    # "full": {
    #     "novelty": (get_novelty, {}),
    # },
}

sp_ratings = load_ratings(surprise_format=True)
# precomputed_dict = precompute_information()
precomputed_dict = {}  # Placeholder for precomputed information
evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)
# The report has one row per evaluated model, so derive the count from it
# instead of hard-coding a number that goes stale when the config changes.
print(f"Résultats de l'évaluation des {len(evaluation_report)} modèles :")
display(evaluation_report)
print(evaluation_report)
export_evaluation_report(evaluation_report)

Handling model content_1
Training split predictions
- computing metric mae
- computing metric rmse
Handling model content_2
Training split predictions
- computing metric mae
- computing metric rmse
Handling model content_3
Training split predictions
- computing metric mae
- computing metric rmse
Résultats de l'évaluation des 4 modèles :


Unnamed: 0,mae,rmse
content_1,0.746454,0.963854
content_2,0.758656,0.976203
content_3,0.749208,0.964692


                mae      rmse
content_1  0.746454  0.963854
content_2  0.758656  0.976203
content_3  0.749208  0.964692
Evaluation report successfully exported to: /Users/delhoutecharles/Documents/GitHub/Recommender-Systeem-/data/small/evaluations/evaluation_report_2025_05_16.csv


Commentaire:
MAE / RMSE :
La baseline 4 est de loin la plus performante et donc la plus précise selon ces deux indicateurs. Elle est suivie par la baseline 3. En revanche, la baseline 1 est la moins performante, ce qui en fait le modèle le moins précis.

Hit Rate :
Le taux de succès est très faible pour les baselines 1, 2 et 3, avec seulement environ 0,2 %, 0,4 % et 0,6 % respectivement dans ce cas. Cela montre leur incapacité à recommander efficacement des items pertinents. En revanche, la baseline 4 se distingue avec un Hit Rate nettement supérieur (5,2 %).

Novelty :
Les baselines 1, 2 et 3 obtiennent des scores de nouveauté relativement élevés, ce qui signifie qu’elles recommandent des items moins populaires. À l’inverse, la baseline 4 présente une valeur de nouveauté nettement plus faible, ce qui indique qu’elle recommande majoritairement des films très vus. Cela maximise la précision, mais au détriment de la diversité des recommandations.