In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules (such as
# the local `recsys` package used below) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

# Evaluating Recommender System Metrics

This notebook demonstrates how to evaluate a recommender system using various metrics including:

| Metric    | Description |
|-----------|------|
| RMSE      | Root Mean Squared Error. Lower values mean better accuracy. |
| MAE       |  Mean Absolute Error. Lower values mean better accuracy. |
| HR        | Hit Rate; how often a left-out rating shows up in the user's top-N recommendations. Higher is better. |
| cHR       |  Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better. |
| ARHR      | Average Reciprocal Hit Rank; hit rate that takes the ranking into account, so hits nearer the top of the list count more. Higher is better. |
| Coverage  | Ratio of users for whom recommendations above a certain threshold exist. Higher is better. |
| Diversity | 1-S, where S is the average similarity score between every possible pair of recommendations for a given user. Higher means more diverse. |
| Novelty   |  Average popularity rank of recommended items. Higher means more novel. |

### Import Required Libraries

In [2]:
from recsys.MovieLens import MovieLens
from surprise import SVD
from surprise import KNNBaseline
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from recsys.RecommenderMetrics import RecommenderMetrics
import pandas as pd
import numpy as np

### Preview the ratings data

In [3]:
# Read the raw ratings file with explicit 32-bit column dtypes, then preview
# the first three rows.
data_types = dict(
    userId=np.int32,
    movieId=np.int32,
    rating=np.float32,
    timestamp=np.int32,
)
ratings_df = pd.read_csv(
    '../../src/recsys/data/ratings.csv',
    dtype=data_types,
)

ratings_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


### Load and Prepare Data

In [4]:
# Load the MovieLens ratings dataset together with the loader object and the
# rankings that the novelty metric consumes later in the notebook.
lens, ratings_data, rankings = MovieLens.load()

# Fit an item-based (user_based=False) KNN model on every available rating;
# the fitted model is handed to the diversity metric further down.
full_trainset = ratings_data.build_full_trainset()
knn_options = dict(name='pearson_baseline', user_based=False)
similarities_model = KNNBaseline(verbose=False, sim_options=knn_options)
similarities_model.fit(full_trainset)

<surprise.prediction_algorithms.knns.KNNBaseline at 0x77934e90bfe0>

### Train-Test Split and Model Training

In [5]:
# Hold out 25% of the ratings for testing; both the split and the SVD model
# are seeded so the numbers below are reproducible.
trainset, testset = train_test_split(ratings_data, random_state=1, test_size=0.25)

# Train on the remaining 75%, then predict every held-out rating.
algo = SVD(verbose=False, random_state=10)
predictions = algo.fit(trainset).test(testset)

### Evaluate Model Accuracy

In [6]:
# Score the held-out predictions with both accuracy metrics, then report them.
rmse_score = RecommenderMetrics.rmse(predictions)
mae_score = RecommenderMetrics.mae(predictions)

print(f"RMSE: {rmse_score}")
print(f"MAE: {mae_score}")

RMSE: 0.9033701087151801
MAE: 0.6977882196132263


### Evaluate Top-N Recommendations using Leave-One-Out Cross Validation

In [7]:
# Evaluation settings, hoisted out of the loop because they do not change
# between splits.
n = 10                # size of each user's top-N recommendation list
minimum_rating = 4.0  # rating threshold passed to the cumulative hit rate (cHR)

# Set aside one rating per user for testing
loo_iterator = LeaveOneOut(n_splits=1, random_state=1)

for trainset, testset in loo_iterator.split(ratings_data):
    # Train model without left-out ratings
    algo.fit(trainset)

    # Predict ratings for the left-out ratings only
    loo_predictions = algo.test(testset)

    # Create predictions for all ratings not in the training set
    anti_testset = trainset.build_anti_testset()
    all_predictions = algo.test(anti_testset)

    # Calculate top n recommendations for each user
    top_n_predictions = RecommenderMetrics.get_top_n(all_predictions, n=n)

    # hit_rate: how often we recommended a movie the user actually rated
    # cumulative_hit_rate: how often we recommended a movie the user actually liked
    # average_reciprocal_hit_rank: hit rate weighted by the rank of each hit
    hit_rate, cumulative_hit_rate, average_reciprocal_hit_rank = RecommenderMetrics.hit_rate_metrics(
        top_n_predictions, loo_predictions, minimum_rating
    )

    # Hit Rate by rating value
    rating_hit_rate = RecommenderMetrics.rating_hit_rate(top_n_predictions, loo_predictions)

    print(f"Hit Rate: {hit_rate:.5f}")
    print("rHR (Hit Rate by rating value):")
    for rating, rate in rating_hit_rate:
        print(f"\t{rating}: {rate:.5f}")
    # The label is derived from the threshold actually used, so the two cannot
    # drift apart; `:g` renders 4.0 as "4", keeping the printed text unchanged.
    print(f"cHR (Cumulative Hit Rate, rating >= {minimum_rating:g}): {cumulative_hit_rate:.5f}")
    print(f"ARHR (Average Reciprocal Hit Rank): {average_reciprocal_hit_rank:.5f}")

Hit Rate: 0.02981
rHR (Hit Rate by rating value):
	3.5: 0.01724
	4.0: 0.04255
	4.5: 0.02083
	5.0: 0.06803
cHR (Cumulative Hit Rate, rating >= 4): 0.02832
ARHR (Average Reciprocal Hit Rank): 0.01116


### Evaluate Complete Recommendations

In [8]:
# Threshold handed to the user-coverage metric and echoed in its label.
minimum_rating = 4.0

# Refit on every rating, then predict all ratings that are not in the full
# training set and keep each user's ten best.
algo.fit(full_trainset)
anti_testset = full_trainset.build_anti_testset()
all_predictions = algo.test(anti_testset)
top_n_predictions = RecommenderMetrics.get_top_n(all_predictions, n=10)

# List-level quality metrics computed from the complete top-N lists.
user_coverage = RecommenderMetrics.user_coverage(
    top_n_predictions,
    full_trainset.n_users,
    minimum_rating=minimum_rating,
)
diversity = RecommenderMetrics.diversity(top_n_predictions, similarities_model)
novelty = RecommenderMetrics.novelty(top_n_predictions, rankings)

print(
    f"User coverage, rating >= {minimum_rating}: {user_coverage:.5f}",
    f"Diversity: {diversity:.5f}",
    f"Novelty (average popularity rank): {novelty:.5f}",
    sep="\n",
)

User coverage, rating >= 4.0: 0.95529
Diversity: 0.96652
Novelty (average popularity rank): 491.57678
