<a href="https://colab.research.google.com/github/XLingTong/movielens-recommender_uts2025/blob/main/02_cbf_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook 02: Content-Based Filtering using TF-IDF on Genres
This notebook builds a content-based recommender using TF-IDF vectors of movie genres.

In [2]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

## Load Item and Ratings Data

In [4]:
# Read the movie metadata table and the user ratings table from the
# project's GitHub repository.
ITEM_URL = "https://raw.githubusercontent.com/XLingTong/movielens-recommender_uts2025/refs/heads/main/u_item.csv"
RATINGS_URL = "https://raw.githubusercontent.com/XLingTong/movielens-recommender_uts2025/refs/heads/main/u_data.csv"

item_df = pd.read_csv(ITEM_URL, encoding="latin-1")

# Cast the three columns used downstream to int and keep only those columns.
ratings_df = (
    pd.read_csv(RATINGS_URL, sep=",", header=0)
    .astype({"userID": int, "itemID": int, "rating": int})
    .loc[:, ["userID", "itemID", "rating"]]
)

## Prepare Genre Strings

In [5]:
# Every column from position 5 onward is treated as a genre indicator.
# NOTE(review): assumes the first five columns of u_item.csv are non-genre
# metadata and the rest are 0/1 flags - confirm against the CSV header.
genre_cols = item_df.columns[5:]


def _genres_of(flags):
    """Return the space-separated names of the genre columns whose flag equals 1."""
    names = []
    for name, flag in zip(genre_cols, flags):
        if flag == 1:
            names.append(name)
    return " ".join(names)


# One genre string per movie, e.g. the names of all columns set to 1 in that row.
item_df["genres"] = item_df[genre_cols].apply(_genres_of, axis=1)

## Generate TF-IDF Matrix for Items

In [7]:
# Turn each movie's genre string into a TF-IDF vector (one row per movie).
#
# token_pattern=r"\S+" keeps every whitespace-separated genre name as ONE
# token. The default pattern splits on punctuation and discards
# one-character tokens, so a hyphenated or apostrophised genre name (e.g.
# "Sci-Fi", if present in the header) would be broken into unrelated
# features such as "sci" and "fi". For names made only of letters/digits the
# result is identical to the default.
tfidf = TfidfVectorizer(token_pattern=r"\S+")
item_profiles = tfidf.fit_transform(item_df["genres"])

# Create the 'models' directory if it doesn't exist
os.makedirs("models", exist_ok=True)

# NOTE: despite the ".npz" suffix this file is a joblib pickle, not a
# scipy.sparse .npz archive - read it back with joblib.load(). The path is
# left unchanged so existing consumers keep working.
joblib.dump(item_profiles, "models/tfidf_item_profiles.npz")
print("TF-IDF item profiles saved.")

TF-IDF item profiles saved.


## Build User Profiles

In [8]:
# Build one content profile per user: the mean TF-IDF vector of the items
# that user liked. Users with no liked (and resolvable) items get no entry.
#
# NOTE(review): these profiles are built from EVERY rating in ratings_df, so
# they must not be used to score ratings held out from that same frame -
# doing so leaks test data into the model.
LIKE_THRESHOLD = 4  # a rating >= this value counts as "liked"
n_items = item_profiles.shape[0]

# Filter and group the liked ratings once, instead of re-scanning the whole
# ratings frame for every user.
liked_ratings = ratings_df.loc[ratings_df["rating"] >= LIKE_THRESHOLD, ["userID", "itemID"]]
liked_items_by_user = {
    uid: items.tolist() for uid, items in liked_ratings.groupby("userID")["itemID"]
}

user_profiles = {}
for user_id in ratings_df["userID"].unique():
    liked_items = liked_items_by_user.get(user_id, [])
    # itemID is used as a 1-based row number into item_profiles.
    # The lower bound matters: without it an itemID <= 0 would become a
    # negative index and silently select a row from the END of the matrix.
    liked_indices = [i - 1 for i in liked_items if 0 <= i - 1 < n_items]
    if liked_indices:
        liked_matrix = item_profiles[liked_indices]
        # mean(axis=0) collapses the liked rows into a single row; reshape
        # guarantees a (1, n_features) array for cosine_similarity.
        user_profiles[user_id] = np.asarray(liked_matrix.mean(axis=0)).reshape(1, -1)

joblib.dump(user_profiles, "models/user_profiles_cbf.pkl")
print("User profiles saved.")

User profiles saved.


## Predict Ratings using CBF

In [9]:
# Predict a CBF score for every user-item pair in the held-out test set.
# NOTE(review): test_size/random_state presumably reproduce the split used by
# the other notebooks ("the original test set") - confirm they match.
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

LIKE_THRESHOLD = 4  # a rating >= this value counts as "liked"
n_items = item_profiles.shape[0]

# Build the profiles used for evaluation from the TRAINING ratings only.
# Scoring test pairs with profiles that were built from all ratings would
# leak the held-out ratings into the model and inflate the results.
train_liked = train_df.loc[train_df["rating"] >= LIKE_THRESHOLD]
eval_profiles = {}
for uid, liked_items in train_liked.groupby("userID")["itemID"]:
    # itemID is used as a 1-based row number; reject out-of-range values on
    # both sides so a non-positive ID cannot wrap around to the last rows.
    liked_indices = [i - 1 for i in liked_items.tolist() if 0 <= i - 1 < n_items]
    if liked_indices:
        eval_profiles[uid] = np.asarray(
            item_profiles[liked_indices].mean(axis=0)
        ).reshape(1, -1)

# Pairs whose user has no training profile, or whose item falls outside the
# item matrix, are skipped, so the output can have fewer rows than test_df.
cbf_preds = []
for uid, iid in zip(test_df["userID"].tolist(), test_df["itemID"].tolist()):
    if uid in eval_profiles and 0 <= iid - 1 < n_items:
        score = cosine_similarity(eval_profiles[uid], item_profiles[iid - 1])[0, 0]
        # Cosine similarity is multiplied by 5 to put it on the rating scale.
        cbf_preds.append({"userID": uid, "itemID": iid, "cbf_pred": round(score * 5, 4)})

# Explicit columns keep the CSV header intact even if no pair could be scored.
cbf_df = pd.DataFrame(cbf_preds, columns=["userID", "itemID", "cbf_pred"])
cbf_df.to_csv("models/cbf_predictions.csv", index=False)
print("Saved CBF predictions to models/cbf_predictions.csv")

Saved CBF predictions to models/cbf_predictions.csv
