Content-Based & Hybrid Recommendation

In [1]:
import os
import random
import joblib
import numpy as np
import pandas as pd
from collections import defaultdict
from surprise import SVD, Dataset, Reader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy import sparse

In [None]:
# Fix Random Seeds for Determinism 
random.seed(42)
np.random.seed(42)

In [None]:
# Paths 
path = "../data"
movies = pd.read_csv(f"{path}/processed/movies.csv")
ratings = pd.read_csv(f"{path}/processed/ratings.csv")

In [None]:
# Ensure movieId and userId dtypes are consistent
movies['movieId'] = movies['movieId'].astype(int)
ratings['movieId'] = ratings['movieId'].astype(int)
ratings['userId'] = ratings['userId'].astype(int)

In [None]:
# Content-Based Recommendation 
genre_cols = ["unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
              "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
              "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

# defensive: if any genre_cols missing, create them with 0
for g in genre_cols:
    if g not in movies.columns:
        movies[g] = 0

# Combine title + genres into a content string
movies['genre_str'] = movies[genre_cols].apply(
    lambda x: ' '.join([g.replace("'", "") for g,val in zip(genre_cols, x) if int(val) == 1]), axis=1
)
movies['content'] = movies['title'].fillna('') + " " + movies['genre_str']

In [None]:
# TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['content'])   # sparse matrix (n_movies, n_features)

# Compute cosine similarity 
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)  # shape (n_movies, n_movies)



In [7]:
# Mapping: movieId -> index (int -> int)
movie_idx_map = pd.Series(movies.index, index=movies['movieId']).to_dict()
# and reverse map:
idx_movie_map = pd.Series(movies['movieId'].values, index=movies.index).to_dict()

def recommend_content(movie_id, top_n=10):
    """
    Return top_n similar movies by TF-IDF cosine similarity.
    Safe: returns empty DataFrame if movie_id missing.
    Deterministic: sorts by (-score, index).
    """
    if movie_id not in movie_idx_map:
        print(f"[warn] movie_id {movie_id} not found in movie_idx_map")
        return pd.DataFrame(columns=['title','year'])
    idx = movie_idx_map[movie_id]
    sim_scores = list(enumerate(cosine_sim[idx]))
    # sort by similarity desc then index asc to be deterministic
    sim_scores = sorted(sim_scores, key=lambda x: (-x[1], x[0]))[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices][['title', 'year']].reset_index(drop=True)

In [None]:
# Collaborative Filtering (SVD)
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader)
trainset = data.build_full_trainset()

# SVD deterministic with random_state
svd_model = SVD(random_state=42, n_factors=100, n_epochs=20, verbose=False)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d994aab210>

In [None]:
# Hybrid Recommendation (vectorized similarity) 
# Precompute a matrix view for quick index lookups
# For vectorized similarity we will use numpy indexing
cosine_sim_np = np.asarray(cosine_sim)

def hybrid_recommend(user_id, top_n=10, alpha=0.7, top_k_cf=50):
    """
    Hybrid: alpha * CF_pred + (1-alpha) * content_similarity_scaled
    - If user has no ratings, fallback to top CF predictions
    - Vectorized similarity calculation for speed
    """
    # rated movies by user
    rated_movies = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    # all candidate movieIds (unrated)
    all_movie_ids = movies['movieId'].tolist()
    unrated_mask = np.isin(all_movie_ids, rated_movies, invert=True)
    candidate_ids = np.array(all_movie_ids)[unrated_mask]

    # If no candidates (user rated everything) -> return empty
    if len(candidate_ids) == 0:
        return pd.DataFrame(columns=['title','year'])

    # CF predictions for candidates
    cf_preds = []
    for mid in candidate_ids:
        pred = svd_model.predict(user_id, int(mid)).est
        cf_preds.append(pred)
    cf_preds = np.array(cf_preds)

    # take top_k_cf by CF score (deterministic tie-break by movieId)
    candidate_with_preds = list(zip(candidate_ids.astype(int), cf_preds))
    candidate_with_preds.sort(key=lambda x: (-x[1], int(x[0])))
    top_collab = candidate_with_preds[:top_k_cf]

    top_ids = np.array([m for m,_ in top_collab], dtype=int)
    top_cf_scores = np.array([s for _,s in top_collab], dtype=float)

    # compute content similarity score vectorized
    if len(rated_movies) == 0:
        sim_scores = np.zeros(len(top_ids), dtype=float)
    else:
        # get indices in cosine_sim for top_ids and rated_movies
        top_idx = np.array([movie_idx_map[mid] for mid in top_ids], dtype=int)
        rated_idx = np.array([movie_idx_map[mid] for mid in rated_movies if mid in movie_idx_map], dtype=int)
        if rated_idx.size == 0:
            sim_scores = np.zeros(len(top_idx), dtype=float)
        else:
            # For each top movie, average similarity to all rated movies
            sims_to_rated = cosine_sim_np[top_idx][:, rated_idx]   
            sim_scores = sims_to_rated.mean(axis=1)               # average similarity

    # Rescale sim_scores (0..1) to rating scale (0..5) to mix with CF ratings
    sim_scores_scaled = sim_scores * 5.0

    # hybrid score
    hybrid_scores = alpha * top_cf_scores + (1 - alpha) * sim_scores_scaled

    # prepare deterministic sorting (score desc, movieId asc)
    movie_and_score = list(zip(top_ids.astype(int), hybrid_scores))
    movie_and_score.sort(key=lambda x: (-x[1], int(x[0])))

    top_movies = [m for m,_ in movie_and_score[:top_n]]
    return movies[movies['movieId'].isin(top_movies)][['title','year']].reset_index(drop=True)

In [None]:
print("Content-Based Recommendations for 'Toy Story (1995)':")
print(recommend_content(1, top_n=10))

🎬 Content-Based Recommendations for 'Toy Story (1995)':
                               title  year
0  Pyromaniac's Love Story, A (1995)  1995
1                       Balto (1995)  1995
2              Goofy Movie, A (1995)  1995
3  NeverEnding Story III, The (1994)  1994
4                  Pocahontas (1995)  1995
5     FairyTale: A True Story (1997)  1997
6     Philadelphia Story, The (1940)  1940
7       Story of Xinghua, The (1993)  1993
8            Gumby: The Movie (1995)  1995
9                     Aladdin (1992)  1992


In [11]:
user_id_example = 19
print(f"🤝 Hybrid Recommendations for User {user_id_example}:")
print(hybrid_recommend(user_id=user_example if (user_example:=user_id_example) else user_id_example, top_n=5, alpha=0.7))


🤝 Hybrid Recommendations for User 19:
                                               title  year
0                                12 Angry Men (1957)  1957
1                              Close Shave, A (1995)  1995
2  Dr. Strangelove or: How I Learned to Stop Worr...  1963
3                                  Casablanca (1942)  1942
4                                 Brassed Off (1996)  1996


In [None]:
# Save Models
os.makedirs("../models", exist_ok=True)
joblib.dump(svd_model, "../models/svd_model.pkl")
joblib.dump(tfidf, "../models/tfidf_vectorizer.pkl")
# Save sparse TF-IDF matrix with scipy
sparse.save_npz("../models/tfidf_matrix.npz", tfidf_matrix)
joblib.dump(movie_idx_map, "../models/movie_idx_map.pkl")

print("Models saved successfully in '../models' folder.")

Models saved successfully in '../models' folder.
