# Evaluator Module
The Evaluator module creates evaluation reports.

Reports contain evaluation metrics depending on models specified in the evaluation config.

In [1]:
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2

# third parties imports
import numpy as np 
import pandas as pd
# -- add new imports here --

# local imports
from configs import EvalConfig
from constants import Constant as C
from loaders import export_evaluation_report
from loaders import load_ratings
# -- add new imports here --
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from models import get_top_n
from surprise import accuracy

# 1. Model validation functions
Validation functions are a way to perform crossvalidation on recommender system models. 

In [2]:
df_ratings = load_ratings(surprise_format=True)

def generate_split_predictions(algo, ratings_dataset, eval_config):
    """Generate predictions on a random test set specified in eval_config"""
    # -- implement the function generate_split_predictions 
    trainset, testset = train_test_split(ratings_dataset, test_size=eval_config.test_size)

    #train the algorithm on the training set
    algo.fit(trainset)
    #Generate predictions on the testset
    predictions = algo.test(testset)
    return predictions
    
def generate_loo_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user on a random Leave-one-out split (LOO)"""
    loo = LeaveOneOut(n_splits=1)
    trainset, testset = next(loo.split(ratings_dataset))
    #train the algorithm on the training set
    algo.fit(trainset)
    #Generate the anti-testset
    anti_testset = trainset.build_anti_testset()
    #Generate predictions on the anti-testset
    predictions = algo.test(anti_testset)
    #Get top-N recommendations
    anti_testset_top_n = get_top_n(predictions, n=eval_config.top_n_value)
    return anti_testset, testset


def generate_full_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user with full training set (LOO)"""
    #build the full trainset
    trainset = ratings_dataset.build_full_trainset()
    #train the algorithm on the full training set
    algo.fit(trainset)
    #Generate the anti-testset
    anti_testset = trainset.build_anti_testset()
    #Generate predictions on the anti-testset
    predictions = algo.test(anti_testset)
    #Get top-N recommendations
    anti_testset_top_n = get_top_n(predictions, n=eval_config.top_n_value)
    return anti_testset_top_n


def precompute_information():
    """ Returns a dictionary that precomputes relevant information for evaluating in full mode
    
    Dictionary keys:
    - precomputed_dict["item_to_rank"] : contains a dictionary mapping movie ids to rankings
    - (-- for your project, add other relevant information here -- )
    """
    precomputed_dict = {}
    precomputed_dict["item_to_rank"] = None
    return precomputed_dict    
from collections import Counter

from collections import Counter

def precompute_information():
    """ Returns a dictionary that precomputes relevant information for evaluating in full mode
    
    Dictionary keys:
    - precomputed_dict["item_to_rank"] : contains a dictionary mapping movie ids to rankings
    """
    precomputed_dict = {}
    
    # Charger les données de ratings
    data = load_ratings(surprise_format=False)
    
    # Compter les occurrences de chaque movieId
    item_counts = Counter(data['movieId'])
    
    # Trier les items par popularité décroissante (le plus populaire en premier)
    sorted_items = sorted(item_counts.items(), key=lambda x: x[1], reverse=True)
    
    # Créer un dictionnaire qui associe l'ID de l'item à son rang
    item_to_rank = {item_id: rank + 1 for rank, (item_id, _) in enumerate(sorted_items)}
    
    # Afficher un aperçu du dictionnaire item_to_rank
    print("Item to Rank (top 10 items):")
    for item_id, rank in list(item_to_rank.items())[:10]:  # Affiche les 10 premiers items
        print(f"Item ID: {item_id}, Rank: {rank}")
    
    # Ajouter à notre dictionnaire pré-calculé
    precomputed_dict["item_to_rank"] = item_to_rank
    
    return precomputed_dict

            


def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):
    """ Create a DataFrame evaluating various models on metrics specified in an evaluation config.  
    """
    evaluation_dict = {}
    for model_name, model, arguments in eval_config.models:
        print(f'Handling model {model_name}')
        algo = model(**arguments)
        evaluation_dict[model_name] = {}
        
        # Type 1 : split evaluations
        if len(eval_config.split_metrics) > 0:
            print('Training split predictions')
            predictions = generate_split_predictions(algo, sp_ratings, eval_config)
            for metric in eval_config.split_metrics:
                print(f'- computing metric {metric}')
                assert metric in available_metrics['split']
                evaluation_function, parameters =  available_metrics["split"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(predictions, **parameters) 

        # Type 2 : loo evaluations
        if len(eval_config.loo_metrics) > 0:
            print('Training loo predictions')
            anti_testset_top_n, testset = generate_loo_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.loo_metrics:
                assert metric in available_metrics['loo']
                evaluation_function, parameters =  available_metrics["loo"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(anti_testset_top_n, testset, **parameters)
        
        # Type 3 : full evaluations
        if len(eval_config.full_metrics) > 0:
            print('Training full predictions')
            anti_testset_top_n = generate_full_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.full_metrics:
                assert metric in available_metrics['full']
                evaluation_function, parameters =  available_metrics["full"][metric]
                evaluation_dict[model_name][metric] = evaluation_function(
                    anti_testset_top_n,
                    **precomputed_dict,
                    **parameters
                )
        
    return pd.DataFrame.from_dict(evaluation_dict).T

# 2. Evaluation metrics
Implement evaluation metrics for either rating predictions (split metrics) or for top-n recommendations (loo metric, full metric)

In [22]:
def rmse_metric(predictions, **kwargs):
    """Compute RMSE from Surprise predictions."""
    return accuracy.rmse(predictions, verbose=False)

def get_hit_rate(anti_testset_top_n, testset):
    hits = 0
    total = 0

    # Vérification si anti_testset_top_n est vide
    if not anti_testset_top_n:
        print("anti_testset_top_n est vide !")
        return 0

    # Vérification de la structure de testset
    print(f"Testset contient {len(testset)} éléments.")
    for uid, iid, _ in testset:
        print(f"Utilisateur {uid}: Test item {iid}")

        # Accéder aux recommandations pour l'utilisateur par son index
        if uid < len(anti_testset_top_n):
            recommendations = anti_testset_top_n[uid]
        else:
            print(f"Utilisateur {uid} n'existe pas dans anti_testset_top_n.")
            continue  # Passer à l'utilisateur suivant si l'index est hors limite

        print(f"Recommandations pour l'utilisateur {uid}: {recommendations}")
        
        found_hit = False
        # Comparer les items dans testset avec les recommandations
        for rec in recommendations:
            if isinstance(rec, tuple):  # Si rec est un tuple (id, autre_id, score)
                pred_iid = rec[0]  # L'ID de l'item recommandé
            else:  # Sinon, rec est un entier (id)
                pred_iid = rec  # L'ID de l'item recommandé

            if iid == pred_iid:
                hits += 1
                found_hit = True
                break
        
        if not found_hit:
            print(f"Item {iid} n'a pas été trouvé dans les recommandations pour l'utilisateur {uid}.")
        
        total += 1

    # Affichage des résultats
    print(f"Hits: {hits}, Total: {total}")
    return hits / total if total > 0 else 0





def get_novelty(anti_testset_top_n, item_to_rank):
    """Compute the average novelty of the top-n recommendation over the users (full metric)
    
    The novelty is defined as the average ranking of the movies recommended
    """
    total_rank_sum = 0
    total_items = 0
    
    if item_to_rank is None:
        raise ValueError("item_to_rank cannot be None")

    for uid, user_recs in anti_testset_top_n.items():
        for iid, _ in user_recs:
            rank = item_to_rank.get(iid, None)
            if rank is not None:
                total_rank_sum += rank
                total_items += 1

    # Calculate the average novelty
    if total_items > 0:
        average_rank_sum = total_rank_sum / total_items
    else:
        average_rank_sum = 0

    return average_rank_sum

# 3. Evaluation workflow
Load data, evaluate models and save the experimental outcomes

In [23]:
AVAILABLE_METRICS = {
    "split": {
        "mae": (accuracy.mae, {'verbose': False}),
        "rmse": (accuracy.rmse, {'verbose': False}),  # Ajout de RMSE
        # -- ajouter d'autres métriques de split ici --
    },
    "loo": {
        "hit_rate": (get_hit_rate, {}),  # Ajout de Hit Rate pour LOO
    },
    "full": {
        "novelty": (get_novelty, {}),  # Ajout de Novelty pour Full
    }
}

sp_ratings = load_ratings(surprise_format=True)
precomputed_dict = precompute_information()
evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)
export_evaluation_report(evaluation_report)

Item to Rank (top 10 items):
Item ID: 356, Rank: 1
Item ID: 296, Rank: 2
Item ID: 318, Rank: 3
Item ID: 593, Rank: 4
Item ID: 260, Rank: 5
Item ID: 480, Rank: 6
Item ID: 2571, Rank: 7
Item ID: 1, Rank: 8
Item ID: 527, Rank: 9
Item ID: 589, Rank: 10
Handling model baseline_1
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Testset contient 671 éléments.
Utilisateur 1: Test item 1061
Recommandations pour l'utilisateur 1: (1, 17, 3.5427300091611045)
Item 1061 n'a pas été trouvé dans les recommandations pour l'utilisateur 1.
Utilisateur 2: Test item 370
Recommandations pour l'utilisateur 2: (1, 39, 3.5427300091611045)
Item 370 n'a pas été trouvé dans les recommandations pour l'utilisateur 2.
Utilisateur 3: Test item 3510
Recommandations pour l'utilisateur 3: (1, 47, 3.5427300091611045)
Item 3510 n'a pas été trouvé dans les recommandations pour l'utilisateur 3.
Utilisateur 4: Test item 2003
Recommandations pour l'utilisateur 4: (1, 50, 3.542