In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the ratings used for training and the movie catalog. Later cells read
# the "userId"/"movieId" columns of train_df and the "movieId"/"genres"
# columns of movies. Paths are relative to the notebook's working directory.
train_df = pd.read_csv("../data/processed/train_ratings.csv")
movies = pd.read_csv("../data/raw/movies.csv")

print("Data loaded for diversity metrics ")


Data loaded for diversity metrics 


In [13]:
import pickle

# Restore the previously trained model object that hybrid_recommend queries
# through svd.predict(...).
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load model files that come from a trusted source.
with open("../models/svd_model.pkl", "rb") as f:
    svd = pickle.load(f)

print("SVD model loaded ")


SVD model loaded 


In [14]:
# Turn the pipe-separated genre labels into lowercase, space-separated text.
# regex=False makes "|" an explicit literal separator, so the result does not
# depend on the pandas-version-specific default of the `regex` argument
# ("|" is the alternation operator when interpreted as a regular expression).
movies["genres_clean"] = movies["genres"].str.lower().str.replace("|", " ", regex=False)

# One TF-IDF row per row of `movies`, in the same order.
# NOTE(review): with the vectorizer's default tokenization, hyphenated labels
# are presumably split into several tokens (e.g. "sci-fi" -> "sci", "fi");
# confirm this is acceptable for the genre similarity.
tfidf = TfidfVectorizer()
genre_matrix = tfidf.fit_transform(movies["genres_clean"])


In [27]:
# (cosine_similarity is already imported in the first cell; this repeats it.)
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of genre_matrix, i.e. between
# every pair of movies. The result has one row and one column per movie, so
# its size grows quadratically with the catalog.
genre_sim = cosine_similarity(genre_matrix)

print("genre_sim created ")


genre_sim created 


In [28]:
# --- Helper function (define again, safe) ---
def get_unseen_movies(user_id, train_df, movies_df):
    """Return the catalog movie ids that `user_id` has no rating for in `train_df`.

    Parameters
    ----------
    user_id
        Value matched against the "userId" column of `train_df`.
    train_df : pandas.DataFrame
        Ratings with "userId" and "movieId" columns.
    movies_df : pandas.DataFrame
        Catalog with a "movieId" column.

    Returns
    -------
    list
        Ids present in `movies_df` but not rated by the user; the order is
        whatever the underlying set yields.
    """
    rated_by_user = train_df["userId"] == user_id
    already_rated = train_df.loc[rated_by_user, "movieId"].unique()
    catalog_ids = movies_df["movieId"].unique()
    candidates = set(catalog_ids) - set(already_rated)
    return list(candidates)


# --- Recreate hybrid recommender here ---
def hybrid_recommend(user_id, k=10, alpha=0.7):
    """Rank the user's unseen movies by a blend of SVD and genre scores.

    Reads the notebook globals `train_df`, `movies`, `svd` and `genre_sim`.

    Parameters
    ----------
    user_id
        User to recommend for.
    k : int, default 10
        Number of (movie_id, score) pairs to return.
    alpha : float, default 0.7
        Weight of the SVD estimate; the genre score gets ``1 - alpha``.

    Returns
    -------
    list of (movie_id, score)
        The `k` highest-scoring unseen movies, best first.

    Notes
    -----
    NOTE(review): the genre score is the mean of the movie's row in
    `genre_sim` (its average similarity to the whole catalog), so it does not
    depend on the user, and it is presumably on a different scale than the
    SVD estimate -- confirm that this blend is intended.
    """
    unseen_movies = get_unseen_movies(user_id, train_df, movies)

    # Rows of genre_sim follow the row order of `movies`, so look movies up by
    # row *position* rather than by index label (labels only coincide with
    # positions for a default RangeIndex). Building the map once also avoids a
    # full scan of `movies` for every candidate. The first occurrence of an id
    # wins, matching the previous `.index[0]` lookup.
    row_of = {}
    for position, catalog_id in enumerate(movies["movieId"].to_numpy()):
        row_of.setdefault(catalog_id, position)

    scores = []
    for movie_id in unseen_movies:
        svd_score = svd.predict(user_id, movie_id).est
        genre_score = genre_sim[row_of[movie_id]].mean()

        final_score = alpha * svd_score + (1 - alpha) * genre_score
        scores.append((movie_id, final_score))

    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:k]


In [29]:
def intra_list_diversity(movie_ids, movies_df, genre_matrix):
    """Average pairwise genre dissimilarity (1 - cosine similarity) of a list.

    Parameters
    ----------
    movie_ids : list-like
        Movie ids of the recommendation list. Ids that do not occur in
        `movies_df["movieId"]` are ignored.
    movies_df : pandas.DataFrame
        Catalog with a "movieId" column; row i is assumed to describe the same
        movie as row i of `genre_matrix`.
    genre_matrix
        Row-indexable feature matrix accepted by `cosine_similarity`.

    Returns
    -------
    float
        0.0 when fewer than two listed movies are found in the catalog,
        otherwise the sum of off-diagonal dissimilarities divided by
        ``n * (n - 1)``.
    """
    # Use row positions, not index labels: genre_matrix is addressed
    # positionally, and labels only match positions for a default RangeIndex.
    positions = np.flatnonzero(movies_df["movieId"].isin(movie_ids).to_numpy())

    # Guard before computing similarities: cosine_similarity rejects an empty
    # input, and a single item has no pairs to compare.
    n = len(positions)
    if n <= 1:
        return 0.0

    sims = cosine_similarity(genre_matrix[positions])
    diversity = 1 - sims
    # Only distinct pairs count. Without this, an all-zero feature row (whose
    # self-similarity is 0) would add a spurious 1 to the total.
    np.fill_diagonal(diversity, 0.0)
    return diversity.sum() / (n * (n - 1))


In [38]:
# Movie popularity = number of ratings per movie
# (a Series indexed by movieId; movies with no row in train_df are absent,
# which is why novelty_score looks ids up with a default value).
movie_popularity = train_df.groupby("movieId").size()

print("movie_popularity created ")


movie_popularity created 


In [39]:
def catalog_coverage(recommended_movie_ids, total_movies):
    """Share of the catalog that shows up at least once among the recommendations.

    Duplicated ids are counted once; the distinct count is divided by
    `total_movies`.
    """
    distinct_count = len({*recommended_movie_ids})
    return distinct_count / total_movies


In [40]:
def novelty_score(movie_ids, popularity_series):
    """Mean inverse log-popularity of the given movies (higher = more novel).

    Parameters
    ----------
    movie_ids : iterable
        Movie ids to score.
    popularity_series
        Mapping with a ``.get(key, default)`` method (e.g. a pandas Series)
        from movie id to its number of ratings.

    Returns
    -------
    float
        Mean of ``1 / log(1 + popularity)`` over `movie_ids`, or 0.0 when
        `movie_ids` is empty.
    """
    scores = [
        # Ids missing from the popularity data default to 1, and counts below
        # 1 are clamped to 1, because log(1 + 0) == 0 would divide by zero.
        1 / np.log(1 + max(popularity_series.get(m, 1), 1))
        for m in movie_ids
    ]

    # The mean of an empty list is NaN (with a RuntimeWarning); report 0.0
    # instead, as intra_list_diversity does for degenerate lists.
    if not scores:
        return 0.0
    return np.mean(scores)

In [41]:
# Example run for a single user.
# NOTE: despite the name `hybrid_top10`, the call requests k=5, so the list
# holds five (movie_id, score) pairs.
user_id = 10
hybrid_top10 = hybrid_recommend(user_id, k=5)

hybrid_top10


[(1233, 3.2276026219016147),
 (142488, 2.9942184172281188),
 (1387, 2.9803696131317814),
 (3683, 2.954373162475259),
 (106100, 2.9483136988697556)]

In [42]:
# Keep only the ids from the (movie_id, score) pairs.
movie_ids = [mid for mid, _ in hybrid_top10]

# All three metrics are computed for this one user's list only; coverage is
# therefore the list's distinct ids divided by the number of rows in `movies`.
ild = intra_list_diversity(movie_ids, movies, genre_matrix)
coverage = catalog_coverage(movie_ids, movies.shape[0])
novelty = novelty_score(movie_ids, movie_popularity)

ild, coverage, novelty


(0.907163964512866, 0.0005132416341613632, 0.30080531870218347)