In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD
from collections import defaultdict
import time

# --- Constants ---
# Input CSV files, given as bare file names and passed to pd.read_csv by
# load_data().
# Ratings are read with the columns 'userId', 'movieId' and 'rating'.
RATINGS_FILE = 'RS-A2_A3_Filtered_Ratings.csv'
# Movies are read with the columns 'movieId', 'title' and 'genres'.
MOVIES_FILE = 'RS-A2_A3_movie.csv'
# Tags are read with the columns 'movieId' and 'tag'.
TAGS_FILE = 'RS-A2_A3_tag.csv'


# Blend weight passed as `alpha` to get_hybrid_recommendations():
# hybrid = alpha * CF + (1 - alpha) * CB.
HYBRID_ALPHA = 0.7  # 70% CF, 30% CB

def load_data():
    """Read the ratings, movies and tags CSV files into DataFrames.

    The files are located through the module-level constants RATINGS_FILE,
    MOVIES_FILE and TAGS_FILE. A stray 'Unnamed: 0' column in the ratings
    file is removed when present.

    Returns:
        A ``(ratings_df, movies_df, tags_df)`` tuple. When any file cannot
        be found, an error is printed and ``(None, None, None)`` is
        returned instead.
    """
    try:
        # Read in the same order as the returned tuple.
        frames = [pd.read_csv(path) for path in (RATINGS_FILE, MOVIES_FILE, TAGS_FILE)]
    except FileNotFoundError as e:
        print(f"Error: {e}. Please make sure all CSV files are in the same directory.")
        return None, None, None

    ratings_df, movies_df, tags_df = frames

    leftover_column = 'Unnamed: 0'
    if leftover_column in ratings_df.columns:
        ratings_df = ratings_df.drop(columns=[leftover_column])

    return ratings_df, movies_df, tags_df



def preprocess_content_data(movies_df, tags_df):
    """Build the content-based similarity data from movie genres and tags.

    Each movie gets one text document made of its genres (with '|'
    separators replaced by spaces and the '(no genres listed)' placeholder
    removed) followed by all of its lower-cased tags. The documents are
    vectorized with TF-IDF (English stop words removed) and compared
    pairwise with cosine similarity.

    The input frames are not modified; the extra columns are added to the
    returned copy only.

    Args:
        movies_df: DataFrame with at least 'movieId' and 'genres' columns.
        tags_df: DataFrame with at least 'movieId' and 'tag' columns.

    Returns:
        A ``(cosine_sim, indices, movies_df)`` tuple where
        - cosine_sim is the movie-by-movie similarity matrix returned by
          ``cosine_similarity`` (one row/column per movie, so its size grows
          quadratically with the number of movies),
        - indices is a Series mapping each movieId to its row position in
          cosine_sim (first occurrence wins if a movieId is repeated),
        - movies_df is the merged frame with the added 'genres_cleaned',
          'tags_content' and 'content' columns.
    """
    print("Starting content pre-processing...")

    genres_cleaned = (
        movies_df['genres']
        .str.replace('|', ' ', regex=False)
        .fillna("")
        .str.replace('(no genres listed)', '', regex=False)
    )
    # assign() returns a new frame, so the caller's movies_df stays untouched.
    movies_df = movies_df.assign(genres_cleaned=genres_cleaned)

    # Drop missing tags before the string conversion: astype(str) would turn
    # NaN into the literal word "nan" and inject it into the vocabulary.
    valid_tags = tags_df.dropna(subset=['tag'])
    tag_text = valid_tags['tag'].astype(str).str.lower()

    # One space-joined tag document per movie.
    tag_docs_df = (
        tag_text.groupby(valid_tags['movieId'])
        .agg(' '.join)
        .rename('tags_content')
        .reset_index()
    )

    # Left merge keeps movies without any tag; they get an empty tag document.
    movies_df = pd.merge(movies_df, tag_docs_df, on='movieId', how='left')
    movies_df['tags_content'] = movies_df['tags_content'].fillna("")

    movies_df['content'] = movies_df['genres_cleaned'] + ' ' + movies_df['tags_content']

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies_df['content'])

    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Map movieId -> row position. De-duplicate on the movieId labels (not on
    # the positions, which are always unique) so a lookup returns a scalar.
    indices = pd.Series(movies_df.index, index=movies_df['movieId'])
    indices = indices[~indices.index.duplicated(keep='first')]

    print("Content pre-processing complete.")
    return cosine_sim, indices, movies_df


def train_cf_model(ratings_df):
    """Fit a Surprise SVD model on every rating in ``ratings_df``.

    The rating scale handed to Surprise is taken from the smallest and
    largest value found in the 'rating' column.

    Args:
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating' columns.

    Returns:
        A ``(svd, min_rating, max_rating)`` tuple: the fitted model and the
        detected bounds of the rating scale.
    """
    print("Training Collaborative Filtering (SVD) model...")

    rating_column = ratings_df['rating']
    min_rating, max_rating = rating_column.min(), rating_column.max()
    print(f"Rating scale detected: {min_rating} to {max_rating}")

    # Surprise expects the columns in (user, item, rating) order.
    dataset = Dataset.load_from_df(
        ratings_df[['userId', 'movieId', 'rating']],
        Reader(rating_scale=(min_rating, max_rating)),
    )
    # No hold-out split: the model is trained on all available ratings.
    full_trainset = dataset.build_full_trainset()

    svd = SVD(n_factors=100, n_epochs=20, random_state=42, verbose=False)

    # Only the fit itself is timed.
    fit_started = time.time()
    svd.fit(full_trainset)
    fit_finished = time.time()

    print(f"SVD model training complete. Time taken: {fit_finished - fit_started:.2f} seconds.")
    return svd, min_rating, max_rating


def get_hybrid_recommendations(user_id, movies_df, ratings_df, indices, cosine_sim, svd, min_rating, max_rating, alpha=0.7, n_recs=10, n_candidates=100):
    """Recommend unseen movies by blending CF predictions with content similarity.

    Steps:
      1. Predict a rating with ``svd`` for every movie the user has not rated
         and keep the ``n_candidates`` highest predictions.
      2. Score each candidate by its mean cosine similarity to the user's
         top-rated movies (ratings at or above the lower of the user's 80th
         percentile and 4.0).
      3. Blend: ``alpha * normalized_cf + (1 - alpha) * content_score`` and
         return the ``n_recs`` best.
    If none of the user's top-rated movies has content data, the ranking
    falls back to the CF prediction alone.

    Args:
        user_id: User id as it appears in ``ratings_df['userId']``.
        movies_df: DataFrame with 'movieId' and 'title' columns.
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating' columns.
        indices: Series mapping movieId -> row/column position in cosine_sim.
        cosine_sim: 2-D similarity array indexed by those positions.
        svd: Fitted model exposing ``predict(user_id, movie_id)`` whose
            result has an ``est`` attribute.
        min_rating: Lower bound used to min-max scale the CF prediction.
        max_rating: Upper bound used to min-max scale the CF prediction.
        alpha: Weight of the CF part; ``1 - alpha`` goes to the content part.
        n_recs: Number of recommendations to return.
        n_candidates: Number of top CF candidates that are re-ranked.

    Returns:
        A list of ``(title, movieId)`` tuples, best first. The list is empty
        when the user has no ratings or a required ratings column is missing.
    """
    try:
        user_rows = ratings_df[ratings_df['userId'] == user_id]
        rated_movies = user_rows['movieId'].unique()
    except KeyError as exc:
        # A KeyError here means a column is absent; an unknown user simply
        # yields an empty selection and is handled below.
        print(f"Error: ratings data is missing required column {exc}.")
        return []

    user_ratings = user_rows['rating']
    if user_ratings.empty:
        print(f"User {user_id} has no ratings. Cannot generate recommendations.")
        return []

    # "Liked" movies: at or above the user's 80th percentile, capped at 4.0
    # so generous raters still contribute more than just their 5-star items.
    rating_threshold = min(np.percentile(user_ratings, 80), 4.0)
    top_rated_movies = user_rows.loc[user_ratings >= rating_threshold, 'movieId'].tolist()

    top_movie_indices = [indices[movie_id] for movie_id in top_rated_movies if movie_id in indices]

    # Vectorized membership test instead of a linear ndarray scan per movie.
    all_movies = np.asarray(movies_df['movieId'].unique())
    unseen_movies = all_movies[~np.isin(all_movies, rated_movies)]

    print(f"Predicting ratings for {len(unseen_movies)} unseen movies...")
    cf_candidates = [
        (movie_id, svd.predict(user_id, movie_id).est)
        for movie_id in unseen_movies
    ]

    # list.sort is stable, so equally-scored movies keep their movies_df order.
    cf_candidates.sort(key=lambda x: x[1], reverse=True)
    top_n_cf = cf_candidates[:n_candidates]

    if not top_movie_indices:
        print("User has no top-rated movies for content matching. Falling back to pure CF.")
        # The score is just the (unnormalized) CF prediction.
        hybrid_recs = list(top_n_cf)
    else:
        print(f"Re-ranking top {n_candidates} candidates using content data...")
        # Guard against a degenerate scale (all ratings identical), which
        # would otherwise divide by zero in the normalization below.
        rating_span = max_rating - min_rating

        hybrid_recs = []
        for movie_id, cf_score in top_n_cf:
            if movie_id not in indices:
                continue  # Movie has no content data

            candidate_idx = indices[movie_id]

            # Mean similarity between the candidate and every liked movie.
            cb_score = cosine_sim[candidate_idx, top_movie_indices].mean()

            # Min-max scale the prediction so it is comparable with cb_score.
            # With a zero-width scale the CF part carries no information.
            norm_cf = (cf_score - min_rating) / rating_span if rating_span else 0.0

            hybrid_score = (alpha * norm_cf) + ((1 - alpha) * cb_score)
            hybrid_recs.append((movie_id, hybrid_score))

    hybrid_recs.sort(key=lambda x: x[1], reverse=True)

    movie_id_to_title = pd.Series(movies_df['title'].values, index=movies_df['movieId']).to_dict()

    return [
        (movie_id_to_title.get(mid, "Unknown Movie"), mid)
        for mid, _ in hybrid_recs[:n_recs]
    ]


def main():
    """Run the end-to-end demo for the user with the most ratings.

    Loads the CSV files, builds the content-similarity data, trains the SVD
    model, prints that user's five best-rated movies for context and then
    prints their hybrid recommendations together with timing information.
    Returns early (after printing a message) when the data cannot be loaded
    or contains no users.
    """
    print("Loading data...")
    ratings_df, movies_df, tags_df = load_data()
    if ratings_df is None:
        return
    print(f"Loaded {len(ratings_df)} ratings, {len(movies_df)} movies, {len(tags_df)} tags.")

    # Copies are handed over so the loaded frames stay as they were read.
    cosine_sim, indices, movies_df_processed = preprocess_content_data(movies_df.copy(), tags_df.copy())
    svd, min_r, max_r = train_cf_model(ratings_df.copy())

    # value_counts() sorts by frequency, most active user first.
    ratings_per_user = ratings_df['userId'].value_counts()
    if ratings_per_user.empty:
        print("No users found in ratings data.")
        return
    example_user_id = ratings_per_user.index[0]  # User with the most ratings

    print(f"\n--- DEMONSTRATION: User {example_user_id} ---")

    history = ratings_df[ratings_df['userId'] == example_user_id].merge(
        movies_df_processed, on='movieId', how='left'
    )
    favourites = history.sort_values('rating', ascending=False).head(5)

    print(f"\nUser {example_user_id}'s Top 5 Rated Movies (for context):")
    for title, rating in zip(favourites['title'], favourites['rating']):
        print(f"  - {title} (Rating: {rating})")

    print("\nCalculating hybrid recommendations...")
    started = time.time()
    recommendations = get_hybrid_recommendations(
        user_id=example_user_id,
        movies_df=movies_df_processed,
        ratings_df=ratings_df,
        indices=indices,
        cosine_sim=cosine_sim,
        svd=svd,
        min_rating=min_r,
        max_rating=max_r,
        alpha=HYBRID_ALPHA,
        n_recs=10,
        n_candidates=100,
    )
    finished = time.time()
    print(f"Recommendation generation took {finished - started:.2f} seconds.")

    print(f"\nTop 10 Hybrid Recommendations for User {example_user_id} (Alpha={HYBRID_ALPHA}):")
    if not recommendations:
        print("No recommendations could be generated.")
        return

    for rank, (title, mid) in enumerate(recommendations, start=1):
        print(f"  {rank:2}. {title} (MovieID: {mid})")


if __name__ == "__main__":
    main()

Loading data...
Loaded 10000 ratings, 27278 movies, 465564 tags.
Starting content pre-processing...
Content pre-processing complete.
Training Collaborative Filtering (SVD) model...
Rating scale detected: 1.0 to 5.0
SVD model training complete. Time taken: 0.16 seconds.

--- DEMONSTRATION: User 45989 ---

User 45989's Top 5 Rated Movies (for context):
  - Casino (1995) (Rating: 5.0)
  - Rob Roy (1995) (Rating: 5.0)
  - Dances with Wolves (1990) (Rating: 5.0)
  - Braveheart (1995) (Rating: 5.0)
  - Clueless (1995) (Rating: 5.0)

Calculating hybrid recommendations...
Predicting ratings for 27156 unseen movies...
Re-ranking top 100 candidates using content data...
Recommendation generation took 0.50 seconds.

Top 10 Hybrid Recommendations for User 45989 (Alpha=0.7):
   1. Sense and Sensibility (1995) (MovieID: 17)
   2. Schindler's List (1993) (MovieID: 527)
   3. Piano, The (1993) (MovieID: 509)
   4. American President, The (1995) (MovieID: 11)
   5. Like Water for Chocolate (Como agua p

In [None]:
# ---------------------------------------------------------------------
# Simple-language explanation & theory (copy-paste into Jupyter cell)
# ---------------------------------------------------------------------
#
# OVERVIEW (what this notebook does)
# ---------------------------------
# This notebook builds a **hybrid recommendation system** that combines:
#   1) Collaborative Filtering (CF) via matrix factorization (Surprise's SVD)
#   2) Content-Based (CB) recommendations using movie tags / text and TF-IDF + cosine similarity
#
# The final recommendations are a weighted blend of CF and CB scores using a parameter (HYBRID_ALPHA).
# - HYBRID_ALPHA = 0.7 means 70% weight to CF (SVD) and 30% weight to CB (TF-IDF cosine similarity).
#
# WHY A HYBRID?
# -------------
# - CF recommends items by learning user/item interaction patterns (good for personalization).
# - CB recommends items similar in content to what the user liked (helps with new items or cold-start items).
# - Combining them reduces each approach's weaknesses and improves recommendation coverage and quality.
#
# KEY STEPS IN THE CODE
# ---------------------
# 1) IMPORTS
#    - pandas, numpy for data handling
#    - sklearn.feature_extraction.text.TfidfVectorizer to turn text into numeric features
#    - sklearn.metrics.pairwise.cosine_similarity to measure similarity between item vectors
#    - surprise.Reader/Dataset and surprise.SVD to train a matrix-factorization CF model
#
# 2) DATA LOADING
#    - Read ratings, movies, and tags CSVs into DataFrames.
#    - Clean trivial columns like 'Unnamed: 0' if present.
#
# 3) CONTENT REPRESENTATION (TF-IDF)
#    - Use TfidfVectorizer to convert each movie's combined genre and tag text into a TF-IDF matrix
#      (this notebook does not use plot descriptions).
#    - Each movie becomes a high-dimensional vector (one dimension per token).
#
#    Purpose of TfidfVectorizer:
#    - TF-IDF = term-frequency * inverse-document-frequency.
#    - It increases weight for tokens that are frequent in a particular document but rare across documents,
#      which helps highlight distinctive words for each movie.
#
#    Basic TF-IDF formula (common version):
#      tfidf(t, d) = tf(t, d) * idf(t)
#      idf(t) = log( N / (1 + df(t)) )
#      where:
#        - tf(t,d) = frequency of term t in document d (often normalized)
#        - N = total number of documents (movies)
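#        - df(t) = number of documents containing term t
#      Note: scikit-learn's TfidfVectorizer, with its default settings, uses a smoothed variant,
#        idf(t) = ln( (1 + N) / (1 + df(t)) ) + 1,
#      with tf(t, d) as the raw term count, and then L2-normalizes each document vector.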
#
# 4) CONTENT-BASED SIMILARITY (Cosine similarity)
#    - Compute pairwise cosine similarity between TF-IDF vectors to get how "alike" two movies are.
#
#    Cosine similarity formula:
#      cosine_similarity(a, b) = (a · b) / (||a|| * ||b||)
#      where:
#        - a · b is the dot product of vectors a and b
#        - ||a|| is the Euclidean norm (length) of vector a
#    - Cosine ranges from -1 to 1 (for TF-IDF nonnegative vectors, range is [0,1]).
#
# 5) COLLABORATIVE FILTERING (Surprise SVD)
#    - Use the Surprise library's SVD implementation to learn latent factors for users and items from ratings.
#    - The model approximates the rating matrix R (users × items) by factorizing into low-rank matrices.
#
#    Matrix factorization / SVD idea (intuitive math):
#      Given rating matrix R, approximate:
#         R ≈ U Σ V^T
#      In recommender context we often learn:
#         R_ij ≈ p_i^T q_j + b_i + b_j + μ
#      where:
#         - p_i is the user i latent vector
#         - q_j is the item j latent vector
#         - b_i and b_j are user/item biases, μ is global mean
#      The dot product p_i^T q_j gives the predicted affinity of user i for item j.
#
#    - Surprise's SVD learns these latent vectors and biases with stochastic gradient descent (SGD).
#
# 6) HYBRID SCORING
#    - For a candidate movie, compute:
#        score_cf  = predicted rating from SVD for (user, movie)
#        score_cb  = similarity score from cosine similarity between candidate and user's liked items
#    - Combine them:
#        hybrid_score = alpha * normalized(score_cf) + (1 - alpha) * normalized(score_cb)
#      where alpha is HYBRID_ALPHA (e.g., 0.7).
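#    - Normalization brings both scores onto the same [0, 1] scale before blending:
#      the CF prediction is min-max scaled, (score_cf - min_rating) / (max_rating - min_rating),
#      while the cosine similarity of TF-IDF vectors already lies in [0, 1] and is used as-is.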
#
# 7) RECOMMENDATION GENERATION
#    - For a target user:
#      - Predict a CF score for every movie the user has not rated and keep the top n_candidates
#        as the candidate set (this keeps the content re-ranking step cheap).
#      - Compute the CB score for each candidate, normalize, blend, then return the top n_recs.
#

# ------------------ TF-IDF (Theory + Intuition + Example) ------------------
#
# PURPOSE:
# --------
# TF-IDF stands for **Term Frequency–Inverse Document Frequency**.
# It is a statistical method used to convert text data (like movie tags, genres, or descriptions)
# into **numerical feature vectors** that can be used by machine learning models.
#
# WHY WE USE IT:
# --------------
# - In a recommendation system, we need a numeric representation of text content (e.g., movie tags or summaries).
# - TF-IDF helps highlight *important and distinctive* words for each item (movie).
# - It gives more weight to words that appear often in one movie but not across all movies.
# - It reduces the influence of very common words like "the", "movie", "film", etc.
#
# CORE IDEA:
# -----------
# TF-IDF = Term Frequency (TF) × Inverse Document Frequency (IDF)
#
# Mathematically:
#   tfidf(t, d) = tf(t, d) * idf(t)
#
# where:
#   tf(t, d)   = (Number of times term t appears in document d) / (Total terms in d)
#   idf(t)     = log( N / (1 + df(t)) )
#   N          = total number of documents (e.g., total movies)
#   df(t)      = number of documents that contain term t
#
# EXPLANATION:
# ------------
# - TF (Term Frequency): Measures how frequently a word occurs in a single document.
#   → Higher TF means the term is important for that document.
#
# - IDF (Inverse Document Frequency): Measures how rare a word is across all documents.
#   → A term appearing in many documents is less useful for distinguishing them.
#   → The log ensures smoother scaling.
#
# - Multiplying them (TF × IDF):
#   → High score if the term is frequent in one document but rare overall.
#   → Low score if the term is common across all documents.
#
# IN CODE:
# --------
# - Each movie becomes a vector of TF-IDF weights (one dimension per word).
# - This TF-IDF matrix is then used with cosine similarity to find movies with similar content.

# ------------------ COSINE SIMILARITY (theory + example) ------------------
#
# What it measures (intuition):
# - Cosine similarity measures the angle between two vectors in high-dimensional space.
# - It tells us how similar the *direction* of two vectors is, ignoring their magnitudes.
# - For text (TF-IDF) vectors: two documents with similar words have a small angle -> cosine near 1.
#
# Formula (compact):
#   cosine(a, b) = (a · b) / (||a|| * ||b||)
#   where
#     - a · b = sum_i (a_i * b_i)  (dot product)
#     - ||a|| = sqrt(sum_i a_i^2)  (Euclidean norm)
#
# Properties:
# - Range: for TF-IDF (non-negative) vectors cosine ∈ [0, 1] (0 = orthogonal/unrelated, 1 = identical direction).
# - Insensitive to scale: if you multiply a vector by a positive constant, cosine doesn't change.
#
# Step-by-step numeric example:
#   a = [1, 2, 3]
#   b = [4, 5, 6]
#   dot = 1*4 + 2*5 + 3*6 = 32
#   ||a|| = sqrt(1^2 + 2^2 + 3^2) = sqrt(14) ≈ 3.7417
#   ||b|| = sqrt(4^2 + 5^2 + 6^2) = sqrt(77) ≈ 8.7750
#   cosine = 32 / (3.7417 * 8.7750) ≈ 0.9746  (very similar)
#
# How it's used in this notebook:
# - Compute TF-IDF vectors for movies (using their genres and tags).
# - Cosine similarity between movie vectors => content-similarity matrix.
# - The content score of a candidate is the mean of its similarities to the user's top-rated movies.

# ------------------ SVD / MATRIX FACTORIZATION (theory + training) ------------------
#
# Goal and intuition:
# - Collaborative Filtering via matrix factorization tries to explain the user-item rating matrix R
#   with low-dimensional latent factors. Each user and each item get a k-dimensional vector.
# - The predicted rating is roughly the dot product between user and item vectors (plus biases).
#
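# Mathematical view (full SVD vs learned MF):
# - Full SVD (linear algebra): R = U Σ V^T  (requires a fully observed matrix)
# - Learned MF (what Surprise's SVD does): fit the user/item vectors and biases on the observed
#   ratings only, by minimizing the regularized squared prediction error.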

# Practical hyperparameters:
# - n_factors (k): dimensionality of latent space (common: 20–200).
# - n_epochs: number of passes over training data.
# - lr_all (γ): learning rate for SGD — controls the update size.
# - reg_all (λ): regularization strength — prevents overfitting.