In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `recsys` package used below) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Evaluating Recommender System Metrics

This notebook demonstrates how to evaluate a recommender system using various metrics including:

| Metric    | Description |
|-----------|------|
| RMSE      | Root Mean Squared Error. Lower values mean better accuracy. |
| MAE       |  Mean Absolute Error. Lower values mean better accuracy. |
| HR        |   Hit Rate; how often we are able to recommend a left-out rating. Higher is better. |
| cHR       |  Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better. |
| ARHR      | Average Reciprocal Hit Rank; hit rate that takes the ranking into account. Higher is better. |
| Coverage  | Ratio of users for whom recommendations above a certain threshold exist. Higher is better. |
| Diversity | 1-S, where S is the average similarity score between every possible pair of recommendations for a given user. Higher means more diverse. |
| Novelty   |  Average popularity rank of recommended items. Higher means more novel. |

### Import Required Libraries

In [32]:
from recsys.MovieLens import MovieLens
from surprise import SVD
from surprise import KNNBaseline
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from recsys.RecommenderMetrics import RecommenderMetrics
import pandas as pd
import numpy as np

### Preview the ratings data

In [None]:
# Explicit 32-bit dtype for every column of the ratings CSV.
data_types = {
    'userId': np.int32,
    'movieId': np.int32,
    'rating': np.float32,
    'timestamp': np.int32,
}
ratings_df = pd.read_csv(
    '../../src/recsys/data/ratings.csv',
    dtype=data_types,
)

# Last expression of the cell, so the first three rows are rendered as output.
ratings_df.head(3)

### Load and Prepare Data

In [None]:
# Load the MovieLens data; `load()` returns three values that are unpacked here.
lens, ratings_data, rankings = MovieLens.load()

# Build a training set from every available rating and fit an item-based
# (user_based=False) KNN model with Pearson-baseline similarity on it.
# The fitted model supplies the item similarities needed for the diversity metric later.
full_trainset = ratings_data.build_full_trainset()
knn_options = dict(name='pearson_baseline', user_based=False)
similarities_model = KNNBaseline(
    sim_options=knn_options,
    verbose=False,
)
similarities_model.fit(full_trainset)

### Train-Test Split and Model Training

In [35]:
# Hold out 25% of the ratings for testing; the fixed seed keeps the split repeatable.
trainset, testset = train_test_split(
    ratings_data,
    test_size=0.25,
    random_state=1,
)

# Fit a seeded SVD model on the training portion only.
model = SVD(random_state=10, verbose=False)
model.fit(trainset)

# Predict the held-out ratings; these predictions feed the accuracy metrics below.
predictions = model.test(testset)

### Evaluate Model Accuracy

In [None]:
# Accuracy of the held-out predictions; each metric is computed and then
# reported before the next one is evaluated.
rmse_value = RecommenderMetrics.rmse(predictions)
print(f"RMSE: {rmse_value}")

mae_value = RecommenderMetrics.mae(predictions)
print(f"MAE: {mae_value}")

### Evaluate Top-N Recommendations using Leave-One-Out Cross Validation

In [None]:
# Top-N evaluation with leave-one-out cross validation: one rating per user is
# set aside, the model is re-fitted without it, and the hit-rate metrics compare
# the left-out ratings against each user's top-N recommendations.

# Number of recommendations kept per user.
top_n_size = 10
# Rating threshold handed to hit_rate_metrics for the cumulative hit rate
# (reported below as "rating >= 4").
liked_rating_threshold = 4.0

# Set aside one rating per user for testing
loo_iterator = LeaveOneOut(n_splits=1, random_state=1)

# Dedicated loop names keep the `trainset` / `testset` split created in the
# previous section intact instead of silently overwriting it.
for loo_trainset, loo_testset in loo_iterator.split(ratings_data):
    # Re-fit the shared `model` without the left-out ratings
    model.fit(loo_trainset)

    # Predict ratings for the left-out ratings only
    loo_predictions = model.test(loo_testset)

    # Create predictions for all ratings not in the training set
    anti_testset = loo_trainset.build_anti_testset()
    all_predictions = model.test(anti_testset)

    # Calculate the top-N recommendations for each user
    top_n_predictions = RecommenderMetrics.get_top_n(all_predictions, n=top_n_size)

    # hit_rate_metrics returns four values, in this order:
    #   hit_rate                    - how often we recommended a movie the user actually rated
    #   cumulative_hit_rate         - how often we recommended a movie the user actually liked
    #   average_reciprocal_hit_rank - ARHR
    #   rating_hit_rate             - hit rate by rating value
    (
        hit_rate,
        cumulative_hit_rate,
        average_reciprocal_hit_rank,
        rating_hit_rate,
    ) = RecommenderMetrics.hit_rate_metrics(
        top_n_predictions, loo_predictions, liked_rating_threshold
    )

    print(f"Hit Rate: {hit_rate:.5f}")
    print("rHR (Hit Rate by rating value):")
    for rating, rate in rating_hit_rate:
        print(f"\t{rating}: {rate:.5f}")
    print(f"cHR (Cumulative Hit Rate, rating >= 4): {cumulative_hit_rate:.5f}")
    print(f"ARHR (Average Reciprocal Hit Rank): {average_reciprocal_hit_rank:.5f}")

### Evaluate Complete Recommendations

In [None]:
# Rating threshold used for the user-coverage metric.
minimum_rating = 4.0

# Re-fit on the full training set, then predict every rating that is not part
# of it and keep the ten best predictions per user.
model.fit(full_trainset)
anti_testset = full_trainset.build_anti_testset()
all_predictions = model.test(anti_testset)
top_n_predictions = RecommenderMetrics.get_top_n(all_predictions, n=10)

# Coverage is computed relative to the number of users in the full training set.
user_coverage = RecommenderMetrics.user_coverage(
    top_n_predictions,
    full_trainset.n_users,
    minimum_rating=minimum_rating,
)
# Diversity uses the item-similarity model fitted earlier; novelty uses the
# popularity rankings returned by the data loader.
diversity = RecommenderMetrics.diversity(top_n_predictions, similarities_model)
novelty = RecommenderMetrics.novelty(top_n_predictions, rankings)

print(f"User coverage, rating >= {minimum_rating}: {user_coverage:.5f}")
print(f"Diversity: {diversity:.5f}")
print(f"Novelty (average popularity rank): {novelty:.5f}")