In [1]:
# Hybrid Movie Recommendation System
# Using TF-IDF (Content-Based) + SVD (Collaborative)

import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, pairwise_distances
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
import numpy as np

# --- Step 1: Load and clean dataset ---
movies = pd.read_csv('RS-A2_A3_movie.csv')
movies = movies[['title', 'genres']]
movies.dropna(inplace=True)

# Convert JSON-like or pipe-separated 'genres' column to plain text
# (handles both ['Action', 'Drama'] or Action|Drama)
def clean_genres(x):
    try:
        if isinstance(x, str):
            if x.startswith('['):
                return ' '.join([i['name'] if isinstance(i, dict) else i for i in ast.literal_eval(x)])
            elif '|' in x:
                return x.replace('|', ' ')
            else:
                return x
        return ''
    except Exception:
        return ''

movies['genres'] = movies['genres'].apply(clean_genres)
movies['content'] = movies['genres']  

# --- Step 2: Content-based TF-IDF similarity ---
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['content'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# --- Step 3: Create sample user-rating matrix for collaborative filtering 
np.random.seed(42)
user_ids = [1, 2, 3, 4, 5]
sample_movies = movies['title'].sample(10, random_state=42).tolist()

ratings_data = []
for user in user_ids:
    for title in np.random.choice(sample_movies, 5, replace=False):
        ratings_data.append({
            'user_id': user,
            'title': title,
            'rating': np.random.randint(3, 6)
        })

ratings_df = pd.DataFrame(ratings_data)

# --- Step 4: Collaborative filtering using SVD ---
user_item_matrix = ratings_df.pivot(index='user_id', columns='title', values='rating').fillna(0)
user_item_sparse = csr_matrix(user_item_matrix.values)

svd = TruncatedSVD(n_components=2)
latent_matrix = svd.fit_transform(user_item_sparse)

# --- Step 5: Recommendation functions ---
def get_content_based_recommendations(title, cosine_sim=cosine_sim):
    """Recommend similar movies based on TF-IDF cosine similarity."""
    if title not in movies['title'].values:
        return []
    idx = movies.index[movies['title'] == title][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # skip same movie, top 5 similar
    return [movies['title'].iloc[i[0]] for i in sim_scores]

def get_collaborative_recommendations(user_id):
    """Recommend movies based on user similarity using latent features."""
    if user_id not in user_item_matrix.index:
        return []
    user_idx = list(user_item_matrix.index).index(user_id)
    similar_users = pairwise_distances(
        latent_matrix[user_idx].reshape(1, -1), latent_matrix, metric='cosine'
    )[0]
    similar_users_indices = np.argsort(similar_users)[:3]
    recommended_movies = []
    for idx in similar_users_indices:
        uid = list(user_item_matrix.index)[idx]
        recommended_movies.extend(ratings_df[ratings_df['user_id'] == uid]['title'].tolist())
    return list(set(recommended_movies))

def hybrid_recommendations(user_id, title):
    """Combine content-based and collaborative filtering results."""
    content_based = get_content_based_recommendations(title)
    collaborative_based = get_collaborative_recommendations(user_id)
    combined = list(set(content_based + collaborative_based))
    return combined

# --- Step 6: Example Usage ---
user_id = 1
movie_title = movies['title'].iloc[0]  # pick any movie from dataset
recommended_movies = hybrid_recommendations(user_id, movie_title)

print(f"Hybrid Recommendations for user {user_id} and movie '{movie_title}':")
print(recommended_movies)


Hybrid Recommendations for user 1 and movie 'Toy Story (1995)':
[np.str_("Devil's Own, The (1997)"), np.str_('G-Force (2009)'), np.str_('Chaos (2005)'), np.str_('Royal Scandal, The (2001)'), np.str_('Other Man, The (2008)'), np.str_('Black Caesar (1973)'), np.str_('Raze (2013)'), 'Antz (1998)', 'Adventures of Rocky and Bullwinkle, The (2000)', 'Monsters, Inc. (2001)', np.str_('Descent (2007)'), 'Toy Story 2 (1999)', "Emperor's New Groove, The (2000)"]


In [None]:
# Overview (high level)
# - This notebook builds a HYBRID recommendation system that combines:
#     1) Content-based filtering using TF-IDF on movie metadata (genres/text).
#     2) Collaborative filtering using a low-rank approximation (Truncated SVD)
#        of the user-item matrix to capture latent preferences.
# - The hybrid approach merges the two sets of recommendations to get more
#   robust suggestions (covers cold-start with content info and personalized
#   tastes from collaborative signals).
#
# Code flow (what each major block does)
# 1) Data loading & cleaning
#    - Load the CSV of movies (title, genres). Drop missing rows.
#    - Prepare/parse any text fields so they can be vectorized (e.g., convert
#      genre lists or strings into a single text field if needed).
#
# 2) Content-based (TF-IDF) part
#    - TF-IDF vectorizer (sklearn.feature_extraction.text.TfidfVectorizer)
#      transforms each movie's text (genres/description) into a numeric vector.
#    - Purpose of TF-IDF:
#        * TF (term frequency): measures how often a token appears in a doc.
#        * IDF (inverse document frequency): downweights terms that appear
#          in many documents (common words) and boosts rare but informative terms.
#        * Formula (common form):
#            tfidf(t, d) = tf(t, d) * log( N / (1 + df(t)) )
#          where:
#            - tf(t,d) is the term frequency of token t in document d
#            - N is total number of documents
#            - df(t) is number of documents containing token t
#    - The result is a matrix: M_content (num_movies × num_features).
#
# 3) Similarity measure (cosine similarity)
#    - Cosine similarity is used to measure closeness between two vectors.
#    - Formula:
#            cos(θ) = (A · B) / (||A|| * ||B||)
#      where (A · B) is dot product and ||A|| is Euclidean norm of A.
#    - In code:
#        * sklearn.metrics.pairwise.linear_kernel(X, Y) returns dot products.
#        * If vectors are TF-IDF (which are often L2-normalized), the dot product
#          equals cosine similarity.
#    - We use cosine similarity to find movies with similar TF-IDF vectors
#      (i.e., similar genre/description fingerprints).
#
# 4) Collaborative part (Truncated SVD on user-item matrix)
#    - Build a user-item matrix R (users × items) where entries are ratings
#      or implicit feedback (e.g., play counts). This matrix is typically sparse.
#    - Truncated SVD (sklearn.decomposition.TruncatedSVD) factorizes R into:
#            R ≈ U_k Σ_k V_k^T
#      where:
#        - U_k (users × k) are user latent factors,
#        - Σ_k (k × k) is diagonal with top k singular values,
#        - V_k^T (k × items) are item latent factors.
#    - TruncatedSVD computes a low-rank approximation capturing the main
#      latent patterns (e.g., "action-lovers" vs "romcom-lovers" axes).
#    - After decomposition, each user gets a k-dimensional representation.
#      We can compute distances or similarities between users in this latent
#      space to find similar users and recommend items they liked.
#    - Note: TruncatedSVD works well for sparse matrices and is similar in
#      spirit to PCA but works directly on sparse inputs.
#
# 5) Pairwise distances / neighbor selection
#    - pairwise_distances or cosine distances on the user-factor matrix
#      are used to find nearest neighbors in latent space.
#    - Typical approach:
#        * find top similar users to a target user,
#        * aggregate their liked items that the target user hasn't seen,
#        * rank and return those items.
#
# 6) Combining / hybrid logic
#    - The function hybrid_recommendations(user_id, title) does:
#        * get_content_based_recommendations(title) -> list of movies similar
#          to the seed movie (by TF-IDF + cosine similarity).
#        * get_collaborative_recommendations(user_id) -> list of movies from
#          collaborative filtering (via SVD and nearest neighbors).
#        * combine the two lists (e.g., set union or weighted merge) to return
#          the final recommendations.
#    - Combining strategies (common options):
#        * union of top-N from both methods,
#        * weighted scoring: score = α * content_score + (1-α) * collab_score,
#        * fallback: if few collab recs exist (cold-start), rely more on content.
#
# Practical/function notes (mapping to code pieces you likely have)
# - TfidfVectorizer(...):
#     * common options: max_features to limit vocabulary, ngram_range, stop_words,
#       norm='l2' to normalize vectors (makes linear_kernel ~ cosine similarity).
# - linear_kernel(tfidf_matrix, tfidf_matrix) computes matrix of dot products
#   (fast implementation). When tf-idf vectors are L2-normalized, linear_kernel
#   = cosine similarity.
# - csr_matrix: efficient sparse representation for the user-item matrix.
# - TruncatedSVD(n_components=k).fit_transform(R) gives a low-dim embedding.
# - pairwise_distances(embeddings, metric='cosine' or 'euclidean') finds
#   user-to-user or item-to-item distances.
#
# Why hybrid helps (intuition)
# - Content-based strengths: works without user history (cold-start items),
#   explains why an item was recommended (shared genres/keywords).
# - Collaborative strengths: captures taste patterns users don't articulate in
#   metadata (latent features like "dark humour" vs "feel-good pacing").
# - Hybrid mitigates the weaknesses of each when combined.
#
# Practical tips / improvements
# - Normalize scores before combining (scale content and collaborative scores
#   to [0,1] using MinMaxScaler or similar).
# - Use more metadata (cast, director, plot summary) to enrich TF-IDF.
# - Use implicit-feedback weighting (e.g., popularity, recency) for collaborative
#   signals to improve recommendations.
# - Experiment with the number of SVD components (k) — too small loses detail,
#   too large keeps noise.
#
# Quick math summary (cheat-sheet)
# - TF-IDF:
#     tfidf(t, d) = tf(t, d) * log( N / (1 + df(t)) )
# - Cosine similarity between vectors a and b:
#     cos(a, b) = (a · b) / (||a|| * ||b||)
# - SVD (matrix M):
#     M = U Σ V^T  (full SVD)
#     Truncated SVD keeps top-k: M ≈ U_k Σ_k V_k^T


In [None]:
# ------------------ TF-IDF (Theory + Intuition + Example) ------------------
#
# PURPOSE:
# --------
# TF-IDF stands for **Term Frequency–Inverse Document Frequency**.
# It is a statistical method used to convert text data (like movie tags, genres, or descriptions)
# into **numerical feature vectors** that can be used by machine learning models.
#
# WHY WE USE IT:
# --------------
# - In a recommendation system, we need a numeric representation of text content (e.g., movie tags or summaries).
# - TF-IDF helps highlight *important and distinctive* words for each item (movie).
# - It gives more weight to words that appear often in one movie but not across all movies.
# - It reduces the influence of very common words like "the", "movie", "film", etc.
#
# CORE IDEA:
# -----------
# TF-IDF = Term Frequency (TF) × Inverse Document Frequency (IDF)
#
# Mathematically:
#   tfidf(t, d) = tf(t, d) * idf(t)
#
# where:
#   tf(t, d)   = (Number of times term t appears in document d) / (Total terms in d)
#   idf(t)     = log( N / (1 + df(t)) )
#   N          = total number of documents (e.g., total movies)
#   df(t)      = number of documents that contain term t
#
# EXPLANATION:
# ------------
# - TF (Term Frequency): Measures how frequently a word occurs in a single document.
#   → Higher TF means the term is important for that document.
#
# - IDF (Inverse Document Frequency): Measures how rare a word is across all documents.
#   → A term appearing in many documents is less useful for distinguishing them.
#   → The log ensures smoother scaling.
#
# - Multiplying them (TF × IDF):
#   → High score if the term is frequent in one document but rare overall.
#   → Low score if the term is common across all documents.
#IN CODE:
# - Each movie becomes a vector of TF-IDF weights (one dimension per word).
# - This TF-IDF matrix is then used with cosine similarity to find movies with similar content.

# ------------------ COSINE SIMILARITY (theory + example) ------------------
#
# What it measures (intuition):
# - Cosine similarity measures the angle between two vectors in high-dimensional space.
# - It tells us how similar the *direction* of two vectors is, ignoring their magnitudes.
# - For text (TF-IDF) vectors: two documents with similar words have a small angle -> cosine near 1.
#
# Formula (compact):
#   cosine(a, b) = (a · b) / (||a|| * ||b||)
#   where
#     - a · b = sum_i (a_i * b_i)  (dot product)
#     - ||a|| = sqrt(sum_i a_i^2)  (Euclidean norm)
#
# Properties:
# - Range: for TF-IDF (non-negative) vectors cosine ∈ [0, 1] (0 = orthogonal/unrelated, 1 = identical direction).
# - Insensitive to scale: if you multiply a vector by a positive constant, cosine doesn't change.
#
# Step-by-step numeric example:
#   a = [1, 2, 3]
#   b = [4, 5, 6]
#   dot = 1*4 + 2*5 + 3*6 = 32
#   ||a|| = sqrt(1^2 + 2^2 + 3^2) = sqrt(14) ≈ 3.7417
#   ||b|| = sqrt(4^2 + 5^2 + 6^2) = sqrt(77) ≈ 8.7750
#   cosine = 32 / (3.7417 * 8.7750) ≈ 0.9746  (very similar)
#
# How it's used in this notebook:
# - Compute TF-IDF vectors for movies (using tags/genres/descriptions).
# - Cosine similarity between movie vectors => content-similarity matrix.
# - For a user's liked movies, content-score of a candidate can be average or weighted sum of similarities.

# ------------------ SVD / MATRIX FACTORIZATION (theory + training) ------------------
#
# Goal and intuition:
# - Collaborative Filtering via matrix factorization tries to explain the user-item rating matrix R
#   with low-dimensional latent factors. Each user and each item get a k-dimensional vector.
# - The predicted rating is roughly the dot product between user and item vectors (plus biases).
#
# Mathematical view (full SVD vs learned MF):
# - Full SVD (linear algebra): R = U Σ V^T  (requires a fully observed matrix)

# Practical hyperparameters:
# - n_factors (k): dimensionality of latent space (common: 20–200).
# - n_epochs: number of passes over training data.
# - lr (γ): learning rate for SGD — control the update size.
# - reg (λ): regularization strength — prevents overfitting.