# Content-Based Filtering

### Tag-Based Content Filtering (using the genome-scores and genome-tags datasets)


- Load and verify dataset
- Build and normalize the movie-tag dense matrix
- Split the ratings dataframe into training and testing sets per user (training set: 80%, testing set: 20%)
- Build a user profile for each user
- Evaluate the model (user profiles) on the testing set and calculate RMSE and MAE
- Define a simple recommender function that recommends movies for a new user's input (using cosine similarity)

In [4]:
# Importing all necessary libraries and dependencies 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [5]:
# Read the four CSV inputs: the ratings/movies subsets and the genome score/tag files under ./ml-20m
ratings, movies = map(pd.read_csv, ('./subset_ratings.csv', './subset_movies.csv'))
scores, tags = map(pd.read_csv, ('./ml-20m/genome-scores.csv', './ml-20m/genome-tags.csv'))

In [6]:
# Report the (rows, columns) shape of every loaded dataframe, one line per frame
print("\n".join(
    f"{label} {frame.shape}"
    for label, frame in (
        ("ratings dataframe stucture:", ratings),
        ("movies dataframe stucture: ", movies),
        ("scores dataframe stucture: ", scores),
        ("tags dataframe stucture:   ", tags),
    )
))

ratings dataframe stucture: (2078625, 4)
movies dataframe stucture:  (7049, 3)
scores dataframe stucture:  (11709768, 3)
tags dataframe stucture:    (1128, 2)


In [7]:
# Preview the first rows (and thereby the column layout) of each dataframe,
# one cell per frame.

# ratings dataframe
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,19,1,5.0,855176628
1,19,3,4.0,855176684
2,19,6,5.0,855176684
3,19,7,5.0,855176684
4,19,14,4.0,855176723


In [8]:
# movies dataframe: first five rows
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# scores dataframe (per-movie, per-tag relevance): first five rows
scores.head(n=5)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [10]:
# tags dataframe (tagId -> tag name): first five rows
tags.head(n=5)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [11]:
# Build the movie–tag matrix: one row per movie, one column per tag name,
# cell value = genome relevance score (0 where a movie/tag pair is missing).
#
# The movies dataframe is a subset of the original movie dataset, so restrict the
# genome scores to those movies *before* joining the tag names. The resulting
# matrix is the same as filtering after the merge, but the join no longer has to
# materialise a row for every score of every movie in the full genome file.
tagged = scores[scores['movieId'].isin(movies['movieId'])].merge(tags, on='tagId')
movie_tag = tagged.pivot_table(
    index='movieId',
    columns='tag',
    values='relevance',
    fill_value=0
)

movie_tag.head()

tag,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.025,0.025,0.05775,0.09675,0.14675,0.217,0.067,0.26275,0.262,0.032,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
2,0.03975,0.04375,0.03775,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,0.0205,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
3,0.0435,0.05475,0.028,0.077,0.054,0.0685,0.056,0.185,0.04925,0.02675,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185
4,0.03725,0.0395,0.03675,0.031,0.06825,0.0405,0.02325,0.087,0.05125,0.03025,...,0.0575,0.03375,0.02275,0.03975,0.18525,0.05925,0.015,0.01525,0.0645,0.013
5,0.042,0.05275,0.05925,0.03675,0.07525,0.12525,0.0285,0.085,0.0295,0.02875,...,0.0425,0.02825,0.0215,0.026,0.14275,0.02075,0.0165,0.01675,0.1075,0.01825


In [12]:
# Scale every movie's tag vector in movie_tag to unit L2 length.
# Row-wise Euclidean norm of the raw relevance vectors:
norms = np.linalg.norm(movie_tag.values, axis=1)
# Divide each row by its own norm; a zero norm is swapped for 1 so an
# all-zero row stays all-zero instead of triggering a division by zero.
movie_norm = movie_tag.div(np.where(norms == 0, 1, norms), axis=0)

print("movie_norm shape:", movie_norm.shape)
# Sanity check: the squared entries of a normalized row should sum to ~1
print("First normalized row sum of squares value:",
      np.square(movie_norm.iloc[0].to_numpy()).sum())
# Show the first rows of the normalized matrix
movie_norm.head(n=5)

movie_norm shape: (7008, 1128)
First normalized row sum of squares value: 0.999999999999998


tag,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.002919,0.002919,0.006742,0.011295,0.017132,0.025333,0.007822,0.030674,0.030586,0.003736,...,0.004611,0.002101,0.005341,0.003823,0.014593,0.004845,0.002247,0.004232,0.009077,0.002685
2,0.005989,0.006591,0.005687,0.007232,0.01661,0.010923,0.007194,0.016535,0.014953,0.003088,...,0.00629,0.0029,0.002599,0.003653,0.018907,0.00339,0.002335,0.002222,0.013597,0.002825
3,0.008897,0.011198,0.005727,0.015748,0.011044,0.01401,0.011453,0.037837,0.010073,0.005471,...,0.008488,0.005471,0.005676,0.007005,0.031803,0.007516,0.003477,0.003988,0.019839,0.003784
4,0.007856,0.008331,0.007751,0.006538,0.014394,0.008541,0.004903,0.018348,0.010809,0.00638,...,0.012127,0.007118,0.004798,0.008383,0.039069,0.012496,0.003164,0.003216,0.013603,0.002742
5,0.0074,0.009294,0.01044,0.006475,0.013259,0.022068,0.005022,0.014977,0.005198,0.005066,...,0.007488,0.004978,0.003788,0.004581,0.025152,0.003656,0.002907,0.002951,0.018941,0.003216


In [13]:
# Training and testing set.
# Split per user so that no user ends up with all of their ratings in only the
# training set or only the testing set; this gives a fairer picture of how well
# the per-user model performs.
def split_per_user(df, test_frac=0.2, seed=7):
    """Split a ratings frame into train/test parts, user by user.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with at least a ``userId`` column.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size`` for every user.
    seed : int, default 7
        Passed to ``train_test_split`` as ``random_state`` for every user.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Users with fewer than two ratings contribute only to ``train``.
        When a side receives no rows (including when ``df`` itself is empty)
        it is an empty frame with the columns and dtypes of ``df``.
    """
    train_parts, test_parts = [], []
    for _, user_df in df.groupby('userId'):
        if len(user_df) < 2:
            # A single rating cannot be split; keep it for training.
            train_parts.append(user_df)
            continue
        tr, te = train_test_split(user_df, test_size=test_frac, random_state=seed)
        train_parts.append(tr)
        test_parts.append(te)

    # pd.concat raises ValueError on an empty list, so fall back to a zero-row
    # slice of df, which also preserves the column dtypes.
    empty = df.iloc[0:0]
    train = pd.concat(train_parts) if train_parts else empty
    test = pd.concat(test_parts) if test_parts else empty.copy()
    return train, test

# Drop the timestamp column in the ratings dataframe since it is not used below.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')

train_df, test_df = split_per_user(ratings)

print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)
print("\nTraining set:")
print(train_df.head())
print("\nTesting set:")
print(test_df.head())

Training set shape: (1658920, 3)
Testing set shape: (419705, 3)

Training set:
    userId  movieId  rating
31      19      661     3.0
45      19      802     4.0
33      19      707     3.0
18      19      100     4.0
17      19       95     3.0

Testing set:
    userId  movieId  rating
13      19       79     3.0
15      19       88     4.0
22      19      141     5.0
27      19      628     4.0
37      19      725     4.0


In [14]:
# Restrict both splits to movies that have a row in the normalized tag matrix;
# ratings for any other movie cannot be turned into tag features.
valid_ids = set(movie_norm.index)

train_df = train_df.loc[train_df['movieId'].isin(valid_ids)].reset_index(drop=True)
test_df = test_df.loc[test_df['movieId'].isin(valid_ids)].reset_index(drop=True)

print("Filtered training set shape:", train_df.shape)
print("Filtered testing set  shape:", test_df.shape)

Filtered training set shape: (1656986, 3)
Filtered testing set  shape: (419142, 3)


In [15]:
# Build one tag-space taste profile per user from the training ratings
min_r, max_r = train_df.rating.min(), train_df.rating.max()
user_profiles = {}

for uid, user_ratings in train_df.groupby('userId'):
    # Normalized tag vectors of the movies this user rated (one row per rating)
    tag_rows = movie_norm.loc[user_ratings['movieId']].to_numpy()

    # Min-max scale the ratings to [0,1]; the lowest possible rating gets weight 0
    rating_w = (user_ratings['rating'] - min_r) / (max_r - min_r)
    w_total = rating_w.sum()

    if w_total > 0:
        # Rating-weighted mean of the movie vectors
        taste = (tag_rows * rating_w.to_numpy()[:, np.newaxis]).sum(axis=0) / w_total
    else:
        # Every rating sat at the minimum: no positive signal, use a zero profile
        taste = np.zeros(tag_rows.shape[1])

    # Rescale to unit length so a dot product with a movie row is a cosine similarity
    length = np.linalg.norm(taste)
    if length > 0:
        taste = taste / length

    user_profiles[uid] = taste

print(f"Built profiles for {len(user_profiles)} users.")


Built profiles for 10000 users.


In [16]:
# Take the first user stored in user_profiles and inspect the profile built for them
example_user = next(iter(user_profiles.keys()))
profile = user_profiles[example_user]

# Label each profile component with its tag name
profile_series = pd.Series(data=profile, index=movie_norm.columns)

# Ten highest-weighted tags (“likes”) followed by the ten lowest-weighted (“dislikes”)
print(f"User {example_user} top tags:")
print(profile_series.sort_values(ascending=False).iloc[:10])

print(f"\nUser {example_user} bottom tags:")
print(profile_series.sort_values(ascending=True).iloc[:10])

# A unit-length profile should give a sum of squares of ~1
print("\nSum of squares for this user:", np.square(profile).sum())

User 19 top tags:
tag
original        0.149046
mentor          0.114536
good            0.112136
great           0.103878
dialogue        0.102693
catastrophe     0.101782
story           0.097532
great ending    0.096382
chase           0.092687
fun movie       0.092292
dtype: float64

User 19 bottom tags:
tag
hannibal lecter    0.000597
beatles            0.000697
swedish            0.000711
aardman            0.000879
kurosawa           0.000992
miyazaki           0.000997
ballet             0.001097
marx brothers      0.001109
batman             0.001152
studio ghibli      0.001164
dtype: float64

Sum of squares for this user: 0.9999999999999998


In [17]:
# Predict ratings on the testing set and compute RMSE/MAE
# to see how well the user profiles predict the held-out ratings.
#
# Work one user at a time instead of one row at a time: DataFrame.iterrows()
# builds a Series per row (upcasting the integer ids to float alongside the
# rating) and needs a separate .loc lookup per rating. Grouping by user lets a
# single matrix-vector product score all of that user's held-out movies.

y_true, y_pred = [], []

for u, user_rows in test_df.groupby('userId'):
    prof = user_profiles.get(u)
    # Skip users for whom no profile was built
    if prof is None:
        continue

    # Skip ratings whose movie has no tag feature vector
    known = user_rows[user_rows['movieId'].isin(movie_norm.index)]
    if known.empty:
        continue

    # Cosine similarity between the (unit-length) profile and each (unit-length) movie row
    sims = movie_norm.loc[known['movieId']].to_numpy() @ prof

    # Map similarity [0,1] back to the original rating scale [min_r, max_r]
    y_pred.extend(sims * (max_r - min_r) + min_r)
    y_true.extend(known['rating'])

# Compute and print metrics
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae  = mean_absolute_error(y_true, y_pred)

print(f"Content‑based testing set root mean squared error (RMSE): {rmse}")
print(f"Content‑based testing set mean absolute error (MAE) : {mae}")

Content‑based testing set root mean squared error (RMSE): 1.1655463674468387
Content‑based testing set mean absolute error (MAE) : 0.8854863699811888


### Simple Analysis:
RMSE ≈ 1.1655: the square root of the mean squared prediction error is about 1.17 stars (on a 0.5–5.0 scale). Because errors are squared before averaging, RMSE penalizes large misses more heavily than MAE does.

MAE ≈ 0.8855: on average, our predictions are off by about 0.89 stars.

In [18]:
# Simple Movie Recommender
# Builds a profile for a new user from the ratings passed in and returns the top_n
# movies that user might like, using the same content-based method as above.
def recommend(user_input, top_n=8):
    """Recommend movies for an ad-hoc list of rated titles.

    Parameters
    ----------
    user_input : list of dict (or anything ``pd.DataFrame`` accepts)
        Records with a ``'title'`` and a ``'rating'`` field. Titles are matched
        exactly against ``movies['title']``; unmatched titles are ignored.
    top_n : int, default 8
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        ``title`` and ``genres`` of the recommended movies, indexed by
        ``movieId`` and ordered from most to least similar. Empty when none of
        the supplied titles can be mapped to a movie with tag features.

    Notes
    -----
    Reads the notebook-level ``movies``, ``movie_norm``, ``min_r`` and ``max_r``.
    """
    catalog = movies.set_index('movieId')[['title', 'genres']]

    ui = pd.DataFrame(user_input)
    if ui.empty:
        # Nothing was rated, so there is no 'title' column to merge on.
        return catalog.iloc[0:0]

    ui = pd.merge(ui, movies[['movieId', 'title']], on='title', how='inner')
    # The movie list can contain titles without a row in the tag matrix;
    # looking those up in movie_norm would raise KeyError, so drop them.
    ui = ui[ui['movieId'].isin(movie_norm.index)]
    if ui.empty:
        return catalog.iloc[0:0]

    # Build this new user's taste profile
    feats = movie_norm.loc[ui['movieId']].values
    weights = (ui['rating'] - min_r) / (max_r - min_r)
    weighted = weights.values[:, None] * feats
    if weights.sum() > 0:
        profile = weighted.sum(axis=0) / weights.sum()
    else:
        profile = np.zeros(feats.shape[1])
    normp = np.linalg.norm(profile)
    if normp > 0:
        profile /= normp

    # Score all movies by cosine similarity
    scores_all = movie_norm.dot(profile)

    # Exclude already-watched and pick top_n
    watched = set(ui['movieId'])
    recs = scores_all.drop(index=list(watched)).nlargest(top_n)

    return catalog.loc[recs.index]

# Example: describe a new user by three rated titles and ask the recommender above for suggestions
new_user = [
    dict(title='Breakfast Club, The (1985)', rating=1),
    dict(title='Jumanji (1995)', rating=4),
    dict(title='Akira (1988)', rating=3),
]
print("Top 10 recommendations:\n", recommend(new_user, top_n=10))

Top 10 recommendations:
                                    title  \
movieId                                    
6517                    Babe, The (1992)   
2987     Who Framed Roger Rabbit? (1988)   
2054     Honey, I Shrunk the Kids (1989)   
2294                         Antz (1998)   
2414        Young Sherlock Holmes (1985)   
2139          Secret of NIMH, The (1982)   
480                 Jurassic Park (1993)   
7781                      Twister (1990)   
4371                     Baby Boy (2001)   
4619              Little Monsters (1989)   

                                                    genres  
movieId                                                     
6517                                                 Drama  
2987     Adventure|Animation|Children|Comedy|Crime|Fant...  
2054              Adventure|Children|Comedy|Fantasy|Sci-Fi  
2294           Adventure|Animation|Children|Comedy|Fantasy  
2414     Action|Adventure|Children|Fantasy|Mystery|Thri...  
2139               

In [19]:
def rank_cbf(user_id, top_n=10):
    """Return a list of top-N movie IDs recommended for the given user using pure CBF.

    Scores every movie in ``movie_norm`` by its dot product with the user's
    stored profile, removes the movies the user already rated in ``train_df``,
    and returns the ``top_n`` highest-scoring movie IDs (best first).
    Returns ``[]`` when no profile exists for ``user_id``.

    Reads the notebook-level ``user_profiles``, ``movie_norm`` and ``train_df``.
    """
    prof = user_profiles.get(user_id)
    if prof is None:
        return []

    sims = movie_norm.dot(prof)
    watched = set(train_df[train_df.userId == user_id].movieId)
    # errors='ignore': train_df may hold movie IDs with no row in movie_norm
    # (e.g. when it was reloaded without the tag filter); those have no score
    # to drop, so they must not raise KeyError.
    unseen = sims.drop(index=list(watched), errors='ignore')
    return unseen.nlargest(top_n).index.tolist()


In [20]:
import json
from pathlib import Path
from tqdm import tqdm

K = 10
N = 1000  # number of users to evaluate

# Reproducible sample of N distinct user ids that appear in the testing set
subset_user_ids = test_df['userId'].drop_duplicates().sample(n=N, random_state=42)

# Top-K movie ids per sampled user, keyed by the user id as a plain int
preds_cbf = {}
for u in tqdm(subset_user_ids, desc="Generating CBF Top-10"):
    preds_cbf[int(u)] = rank_cbf(u, top_n=K)

# Write the rankings to predictions/, creating the folder on first use
Path("predictions").mkdir(exist_ok=True)
with open("predictions/cbf_top10_subset.json", "w") as f:
    json.dump(preds_cbf, f)


Generating CBF Top-10:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating CBF Top-10: 100%|██████████| 1000/1000 [00:04<00:00, 228.03it/s]


In [27]:
scenario = "ITEM"   # one of "STANDARD", "USER" or "ITEM"; selects which train/test files are loaded below


In [28]:
# Load the train/test ratings for the selected evaluation scenario.
#   STANDARD : re-split the full ratings subset per user
#   USER     : pre-built split read from evaluation/user_cold_*.csv
#   ITEM     : pre-built split read from evaluation/item_cold_*.csv
if scenario == "STANDARD":
    ratings  = pd.read_csv("subset_ratings.csv", usecols=["userId","movieId","rating"])
    train_df, test_df = split_per_user(ratings)
elif scenario == "USER":
    train_df = pd.read_csv("evaluation/user_cold_train.csv", usecols=["userId","movieId","rating"])
    test_df  = pd.read_csv("evaluation/user_cold_test.csv",  usecols=["userId","movieId","rating"])
elif scenario == "ITEM":
    train_df = pd.read_csv("evaluation/item_cold_train.csv", usecols=["userId","movieId","rating"])
    test_df  = pd.read_csv("evaluation/item_cold_test.csv",  usecols=["userId","movieId","rating"])
else:
    # Fail loudly instead of silently treating a typo (e.g. "item") as ITEM.
    raise ValueError(
        f"Unknown scenario {scenario!r}; expected 'STANDARD', 'USER' or 'ITEM'"
    )

In [29]:
# ─────────────────────────────────────────────────────────────────────────────
# 1) FILTER OUT ANY MOVIES NOT IN YOUR TAG FEATURES MATRIX
# ─────────────────────────────────────────────────────────────────────────────
train_df = train_df[train_df.movieId.isin(movie_norm.index)].reset_index(drop=True)

# ─────────────────────────────────────────────────────────────────────────────
# 2) REBUILD user_profiles FROM THIS train_df
#    (and remember, per user, which movies they already rated)
# ─────────────────────────────────────────────────────────────────────────────
min_r, max_r = train_df.rating.min(), train_df.rating.max()
user_profiles = {}
watched_by_user = {}   # userId -> set of movieIds rated in train_df

for user, grp in train_df.groupby("userId"):
    # Collect the watched set here, while the user's rows are already grouped,
    # instead of re-scanning all of train_df once per user in step 3.
    watched_by_user[user] = set(grp.movieId)

    feats   = movie_norm.loc[grp.movieId].values                    # (n_movies_rated, n_tags)
    weights = (grp.rating - min_r) / (max_r - min_r)                # normalized to [0,1]
    weighted = weights.values[:, None] * feats                      # weight each tag vector

    if weights.sum() > 0:
        profile = weighted.sum(axis=0) / weights.sum()             # weighted average
    else:
        profile = np.zeros(feats.shape[1])                         # no positive signal

    normp = np.linalg.norm(profile)
    if normp > 0:
        profile /= normp                                           # unit length

    user_profiles[user] = profile

# ─────────────────────────────────────────────────────────────────────────────
# 3) RANK Top-K MOVIES FOR EVERY USER IN test_df
# ─────────────────────────────────────────────────────────────────────────────
K = 10
preds_cbf = {}
test_users = test_df["userId"].unique().tolist()

for u in tqdm(test_users, desc=f"CBF Top-{K} ({scenario})"):
    prof = user_profiles.get(u)
    if prof is None:
        # User has no training ratings (hence no profile): nothing to rank
        preds_cbf[int(u)] = []
        continue

    # cosine similarity to every movie’s tag vector
    sims = movie_norm.dot(prof)
    # Every user with a profile also has a watched set (both are filled in step 2)
    watched = watched_by_user[u]
    recs    = sims.drop(index=list(watched)).nlargest(K).index.tolist()
    preds_cbf[int(u)] = recs

# ─────────────────────────────────────────────────────────────────────────────
# 4) DUMP TO A SCENARIO-NAMED JSON
# ─────────────────────────────────────────────────────────────────────────────
outfn = Path("coldstart_pred") / f"cbf_{scenario.lower()}_top{K}.json"
outfn.parent.mkdir(exist_ok=True)

with open(outfn, "w") as f:
    json.dump(preds_cbf, f, indent=2)

print(f"✅  Saved {len(preds_cbf)} users → {outfn}")

CBF Top-10 (ITEM): 100%|██████████| 7982/7982 [00:35<00:00, 226.64it/s]


✅  Saved 7982 users → coldstart_pred\cbf_item_top10.json
