<a href="https://colab.research.google.com/github/Obi-chuks/Recommendation-System/blob/main/Movie_Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================
# HYBRID RECOMMENDATION SYSTEM
# Final Evaluation Score: 24.18%
# Strategy: SVD + Temporal Decay + Popularity Penalty
# ==========================================

import os
import logging
import cloudpickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from collections import defaultdict

# Console logging: INFO and above, rendered as "LEVEL: message", so training
# milestones are visible while the script runs.
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


# ==========================================
# 1. TRAINING ENGINE
# ==========================================
class HybridSVD:
    """
    Recommender built on a truncated SVD of the user x item rating matrix.

    Ratings are centered per user and scaled by a recency weight before the
    factorization. At ranking time each item's score is reduced by a
    multiple of its share of all interactions (popularity penalty), and
    items the user already interacted with are excluded.
    """
    # Columns fit() reads. "timestamp" is required too: the recency weights
    # are derived from it.
    REQUIRED_COLS = {"userId", "movieId", "rating", "timestamp"}

    def __init__(self, n_components=60):
        """
        Args:
            n_components: Requested number of latent factors. fit() may
                lower it when the data has too few users or items.
        """
        # n_components=60: The number of 'hidden' features the model extracts.
        # Too low = misses nuances; Too high = overfits to noise.
        self.n_components = n_components
        self.model        = TruncatedSVD(n_components=n_components, random_state=42)
        self.user_factors = None  # (n_users, n_components) after fit()
        self.item_factors = None  # (n_components, n_items) after fit()
        self.u_map        = {}  # Maps internal indices back to original UserIDs
        self.i_map        = {}  # Maps internal indices back to original MovieIDs
        self.u_inv_map    = {}  # Original UserID -> internal row index
        self.pop_counts   = {}  # MovieID -> share of all training interactions
        self._sparse_mx   = None  # Weighted, centered rating matrix (CSR)
        self._penalty_vec = None  # pop_counts laid out in internal item order

    def fit(self, interactions):
        """
        Train on an interactions frame.

        Centers ratings per user, multiplies them by an exponential recency
        weight, builds a sparse user x item matrix and factorizes it.

        Args:
            interactions: DataFrame with the columns in REQUIRED_COLS.
                "timestamp" must be datetime-like (the ``.dt`` accessor is
                used on timestamp differences).

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: If a required column is missing, or if there are too
                few users/items to keep at least one latent factor.
        """
        missing = self.REQUIRED_COLS - set(interactions.columns)
        if missing:
            raise ValueError(f"Missing columns: {missing}")

        # Work on a copy so the caller's frame does not gain helper columns.
        interactions = interactions.copy()
        logger.info("Training SVD (n_components=%d) on %d rows...",
                    self.n_components, len(interactions))

        # --- STEP A: USER CENTERING ---
        # Subtract each user's mean rating so generous and harsh raters are
        # measured on the same relative scale.
        user_means = interactions.groupby("userId")["rating"].mean()
        interactions["centered_rating"] = (
            interactions["rating"] - interactions["userId"].map(user_means)
        )

        # --- STEP B: RECENCY WEIGHTING (TEMPORAL DECAY) ---
        # weight = exp(-days_ago / decay_days). Note this is an e-folding
        # constant, not a half-life: an interaction 365 days older than the
        # newest one gets weight 1/e (~0.37), not 0.5.
        most_recent = interactions["timestamp"].max()
        days_ago    = (most_recent - interactions["timestamp"]).dt.days.clip(lower=0)
        decay_days  = 365
        recency_w   = np.exp(-days_ago.values / decay_days).astype("float32")

        # Apply the weight to the ratings
        interactions["centered_rating"] = (
            interactions["centered_rating"] * recency_w
        )

        # Map User/Item IDs to categorical codes for matrix construction
        user_cat = interactions["userId"].astype("category")
        item_cat = interactions["movieId"].astype("category")

        self.u_map     = dict(enumerate(user_cat.cat.categories))
        self.i_map     = dict(enumerate(item_cat.cat.categories))
        self.u_inv_map = {v: k for k, v in self.u_map.items()}

        n_users, n_items = len(self.u_map), len(self.i_map)

        # Never ask for more factors than the matrix dimensions allow.
        n_comp = min(self.n_components, n_users - 1, n_items - 1)
        if n_comp < 1:
            # Fail early with a readable message instead of deep inside the
            # decomposition.
            raise ValueError(
                f"Cannot fit: n_components resolves to {n_comp} "
                f"(n_users={n_users}, n_items={n_items}); at least 1 is required."
            )
        if n_comp != self.n_components:
            self.n_components = n_comp
            self.model = TruncatedSVD(n_components=n_comp, random_state=42)

        # Build Sparse Matrix (CSR format) to save memory (RAM)
        self._sparse_mx = csr_matrix(
            (interactions["centered_rating"].values,
             (user_cat.cat.codes, item_cat.cat.codes)),
            shape=(n_users, n_items),
        )

        # Extract latent features
        self.user_factors = self.model.fit_transform(self._sparse_mx).astype("float32")
        self.item_factors = self.model.components_.astype("float32")

        # Normalize each user's factor row. Only the user side is normalized;
        # item factors keep their scale.
        self.user_factors = normalize(self.user_factors, axis=1)

        # Popularity penalty: each movie's share of all interactions, laid
        # out in internal item order.
        self.pop_counts   = interactions["movieId"].value_counts(normalize=True).to_dict()
        self._penalty_vec = np.array(
            [self.pop_counts.get(self.i_map[i], 0.0) for i in range(n_items)],
            dtype="float32",
        )

        logger.info("Training done. %d users x %d items.", n_users, n_items)
        return self

    def recommend_all(self, n_recommendations=10, penalty=0.15, batch_size=512):
        """
        Produce a ranked list of unseen items for every trained user.

        Users are scored in batches of ``batch_size`` rows so only one
        (batch, n_items) score matrix is held at a time.

        Args:
            n_recommendations: Maximum list length per user (capped at 100
                and at the number of items).
            penalty: Multiplier applied to each item's popularity share
                before it is subtracted from the score.
            batch_size: Number of users scored per matrix multiplication.

        Returns:
            dict mapping original user id -> list of movie ids (as int),
            best first. A list is shorter than requested when the user has
            fewer unseen items than that.

        Raises:
            RuntimeError: If called before fit().
        """
        if self.user_factors is None or self.item_factors is None:
            raise RuntimeError("Model is not fitted; call fit() first.")

        n_users = self.user_factors.shape[0]
        n_items = self.item_factors.shape[1]
        candidate_k = min(100, n_items)
        k           = min(n_recommendations, candidate_k)
        pv          = self._penalty_vec * penalty

        recs = {}
        for start in range(0, n_users, batch_size):
            end = min(start + batch_size, n_users)

            # THE CORE CALCULATION: Matrix Multiplication - Popularity Penalty
            scores = (self.user_factors[start:end] @ self.item_factors) - pv

            # Seen filter: mask every stored entry of this batch's rows in
            # one shot (row index repeated once per stored column index).
            # NOTE(review): assumes every interaction remains a stored entry
            # of the matrix, including centered ratings of exactly 0 - confirm.
            batch     = self._sparse_mx[start:end]
            seen_rows = np.repeat(np.arange(end - start), np.diff(batch.indptr))
            scores[seen_rows, batch.indices] = -np.inf

            # Get top indices
            top = self._topk(scores, candidate_k)
            for li, ranked in enumerate(top):
                # Masked (seen) items sort last; drop them so a user with few
                # unseen items gets a shorter list, not seen items.
                ranked = ranked[scores[li, ranked] != -np.inf][:k]
                recs[self.u_map[start + li]] = [int(self.i_map[m]) for m in ranked]

        return recs

    @staticmethod
    def _topk(scores, k):
        """
        Return, per row, the column indices of the ``k`` highest scores,
        ordered best first.

        Args:
            scores: 2-D array of shape (n_rows, n_cols).
            k: Number of indices wanted per row; clamped to n_cols.

        Returns:
            Integer array of shape (n_rows, min(k, n_cols)).
        """
        nr, nc = scores.shape
        k = min(k, nc)
        if k <= 0:
            return np.empty((nr, 0), dtype=np.intp)
        neg = -scores
        # kth = k - 1 stays in bounds when k == n_cols (kth = k would raise)
        # and still places the k smallest negated scores in the first k slots.
        part = np.argpartition(neg, k - 1, axis=1)[:, :k]
        rows = np.arange(nr)[:, None]
        # The partition is unordered inside; sort just those k candidates.
        return part[rows, np.argsort(neg[rows, part], axis=1)]


# ==========================================
# 2. LOCAL EVALUATOR (Metrics)
# ==========================================
def dcg_at_k(relevances, k=10):
    """
    Discounted cumulative gain of the first ``k`` relevance values.

    The value at 1-based rank ``p`` contributes ``value / log2(p + 1)``.
    Returns 0.0 when there is nothing to score.
    """
    gains = np.array(relevances[:k])
    if len(gains) == 0:
        return 0.0
    # Ranks 1..n shifted by one, i.e. the log2 arguments 2..n+1.
    discounts = np.log2(np.arange(2, len(gains) + 2))
    return np.sum(gains / discounts)


def evaluate(model_recs: dict, test_interactions: pd.DataFrame,
             all_interactions: pd.DataFrame,
             n_recs: int = 10, relevance_threshold: float = 3.5):
    """
    Score recommendation lists against held-out interactions.

    A test item counts as relevant for a user when its rating is at least
    ``relevance_threshold``. Only users that have recommendations and at
    least one relevant test item are evaluated.

    Returns:
        None when no user can be evaluated; otherwise a dict of percentages
        (NDCG, Precision, Recall, HitRate, Coverage, weighted "Combined")
        plus "_n_eval_users", the number of evaluated users.
    """
    known_users = set(model_recs)

    # Relevant test items per user, limited to users we have lists for.
    liked_mask  = test_interactions["rating"] >= relevance_threshold
    known_mask  = test_interactions["userId"].isin(known_users)
    liked_rows  = test_interactions[liked_mask & known_mask]
    relevant    = liked_rows.groupby("userId")["movieId"].apply(set).to_dict()

    if not relevant:
        return None

    per_user = {"ndcg": [], "precision": [], "recall": [], "hit": []}
    recommended_pool = set()

    for user_id, liked_items in relevant.items():
        top_n = model_recs.get(user_id, [])[:n_recs]
        if not top_n:
            # No list for this user: every metric is zero.
            for series in per_user.values():
                series.append(0.0)
            continue

        flags       = [1 if item in liked_items else 0 for item in top_n]
        hit_count   = sum(flags)
        liked_count = len(liked_items)

        # Best achievable DCG: all relevant items ranked first.
        best_dcg = dcg_at_k([1] * min(liked_count, n_recs), n_recs)
        if best_dcg > 0:
            user_ndcg = dcg_at_k(flags, n_recs) / best_dcg
        else:
            user_ndcg = 0.0

        per_user["ndcg"].append(user_ndcg)
        per_user["precision"].append(hit_count / n_recs)
        per_user["recall"].append(hit_count / liked_count if liked_count > 0 else 0.0)
        per_user["hit"].append(1.0 if hit_count > 0 else 0.0)
        recommended_pool.update(top_n)

    # Coverage: distinct recommended items over distinct items in all data.
    catalog_size = all_interactions["movieId"].nunique()
    coverage     = len(recommended_pool) / catalog_size if catalog_size > 0 else 0.0

    mean_ndcg = np.mean(per_user["ndcg"])
    mean_prec = np.mean(per_user["precision"])
    mean_rec  = np.mean(per_user["recall"])
    mean_hit  = np.mean(per_user["hit"])

    combined = (0.25 * mean_ndcg + 0.25 * mean_prec +
                0.20 * mean_rec + 0.15 * mean_hit +
                0.15 * coverage)

    return {
        "NDCG@10":      round(mean_ndcg * 100, 2),
        "Precision@10": round(mean_prec * 100, 2),
        "Recall@10":    round(mean_rec * 100, 2),
        "HitRate@10":   round(mean_hit * 100, 2),
        "Coverage":     round(coverage * 100, 2),
        "Combined":     round(combined * 100, 2),
        "_n_eval_users": len(relevant)
    }


# ==========================================
# 3. CLOSURE FACTORY (Deployment Wrapper)
# ==========================================
def make_model(precomputed_lists: dict, default_list: list):
    """
    Build a lookup-only recommender from precomputed results.

    The returned callable closes over the two arguments and needs nothing
    else at call time. It is also reachable as ``model.recommend(uid)``
    because the function is attached to itself under that attribute.
    """
    def recommend(user_id, n_recommendations=10):
        limit    = int(n_recommendations)
        personal = precomputed_lists.get(user_id)

        # Unknown user: cold-start fallback to the default list.
        if personal is None:
            return default_list[:limit]

        if len(personal) >= limit:
            return personal[:limit]

        # List too short: top up with default items not already present.
        already = set(personal)
        padding = [item for item in default_list if item not in already]
        return (personal + padding)[:limit]

    recommend.recommend = recommend
    return recommend


# ==========================================
# 4. DATA LOADING & SPLITTING
# ==========================================
def load_data(path):
    """
    Read the interactions CSV at ``path`` and return it ordered by time.

    Timestamps ending in "-0" (e.g. '2003-0') are rewritten to end in
    "-01-01" before parsing. Rows whose timestamp still cannot be parsed
    are dropped.
    """
    frame = pd.read_csv(path)
    # Fix '2003-0' formatting errors in CSV
    as_text  = frame["timestamp"].astype(str)
    repaired = as_text.str.replace(r"-0$", "-01-01", regex=True)
    frame["timestamp"] = pd.to_datetime(repaired, errors="coerce")
    parsed_only = frame.dropna(subset=["timestamp"])
    return parsed_only.sort_values("timestamp")

def temporal_split(df, train_ratio=0.8):
    """
    Cut ``df`` by row position: the first ``train_ratio`` share of rows is
    the training set, the rest is the test set.

    The frame is not sorted here. A warning is logged if any test row is
    older than the newest training row.
    """
    split_at = int(len(df) * train_ratio)
    train = df.iloc[:split_at].copy()
    test  = df.iloc[split_at:].copy()

    # Leakage check: no test timestamp may precede the latest training one.
    newest_train = train["timestamp"].max()
    leaked = (test["timestamp"] < newest_train).any()
    if leaked:
        logger.warning("⚠️ Temporal leakage detected!")
    else:
        logger.info("✅ No temporal leakage detected.")
    return train, test


# ==========================================
# 5. EXECUTION
# ==========================================
if __name__ == "__main__":
    raw_df = load_data("interactions_train.csv")

    # Local simulation: train on the earlier 80% of rows, score on the rest.
    train_val, test_val = temporal_split(raw_df, train_ratio=0.8)
    h_val = HybridSVD(n_components=60).fit(train_val)
    recs_val = h_val.recommend_all(n_recommendations=10, penalty=0.15, batch_size=256)
    scores = evaluate(recs_val, test_val, raw_df)

    # evaluate() returns None when no user can be evaluated; subscripting
    # that result would raise a TypeError.
    if scores is None:
        print("\nEstimate unavailable: no test user with relevant items overlaps the trained users.")
    else:
        print(f"\nEstimate: {scores['Combined']}% | NDCG: {scores['NDCG@10']}% | Coverage: {scores['Coverage']}%")