In [12]:
import pandas as pd
import numpy as np
from collections import defaultdict

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy import sparse
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")


In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# 1. LOAD YOUR DATASET
print("1. LOADING DATASET...")

1. LOADING DATASET...


In [15]:
#Familiarize yourself with the content of your downloaded dataset zip file
import zipfile

zip_path = "/content/archive (11).zip"

# List contents
with zipfile.ZipFile(zip_path, 'r') as z:
    print("Files in zip:\n")
    for name in z.namelist():
        print(name)

Files in zip:

ml-latest-small/README.txt
ml-latest-small/links.csv
ml-latest-small/movies.csv
ml-latest-small/ratings.csv
ml-latest-small/tags.csv


In [16]:
import zipfile
import os


zip_path = "/content/archive (11).zip"
extract_path = "/content/movie_recommendation"

# How To Create Extraction Folder if not exists
os.makedirs(extract_path, exist_ok=True)

# Unzip the dataset
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"✅ Dataset extracted to: {extract_path}")


for root, dirs, files in os.walk(extract_path):
    for name in files[:20]:  # show only first 20 files
        print(os.path.join(root, name))


✅ Dataset extracted to: /content/movie_recommendation
/content/movie_recommendation/ml-latest-small/tags.csv
/content/movie_recommendation/ml-latest-small/README.txt
/content/movie_recommendation/ml-latest-small/movies.csv
/content/movie_recommendation/ml-latest-small/ratings.csv
/content/movie_recommendation/ml-latest-small/links.csv


In [17]:
ratings = pd.read_csv("/content/movie_recommendation/ml-latest-small/ratings.csv")
movies  = pd.read_csv("/content/movie_recommendation/ml-latest-small/movies.csv")
links   = pd.read_csv("/content/movie_recommendation/ml-latest-small/links.csv")
tags    = pd.read_csv("/content/movie_recommendation/ml-latest-small/tags.csv")

print(ratings.shape, movies.shape, links.shape, tags.shape)


(100836, 4) (9742, 3) (9742, 3) (3683, 4)


In [18]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Train/Test Split by User & Utility Mappings

In [19]:
# Cell 3: Split each user's ratings into train/test; build ID maps and helper functions

RATING_THRESHOLD = 4.0      # Ratings >= threshold are considered "relevant" for Precision@K
TEST_FRACTION = 0.2          # Fraction of each user's ratings held out for test
RANDOM_STATE = 42
MIN_RATINGS_PER_USER = 5     # Only users with >= this many ratings are evaluated

rng = np.random.default_rng(RANDOM_STATE)

# Keep only users with sufficient ratings
user_counts = ratings['userId'].value_counts()
eligible_users = set(user_counts[user_counts >= MIN_RATINGS_PER_USER].index)
ratings_eligible = ratings[ratings['userId'].isin(eligible_users)].copy()

# Split per user
def train_test_split_per_user(df, test_fraction=0.2, seed=42):
    rng_local = np.random.default_rng(seed)
    train_rows = []
    test_rows  = []
    for uid, grp in df.groupby('userId'):
        idx = grp.index.to_numpy()
        n   = len(idx)
        if n == 1:
            train_rows.extend(idx)
            continue
        test_size = max(1, int(round(n * test_fraction)))
        test_idx = rng_local.choice(idx, size=test_size, replace=False)
        train_idx = np.setdiff1d(idx, test_idx, assume_unique=True)
        train_rows.extend(train_idx)
        test_rows.extend(test_idx)
    return df.loc[train_rows].reset_index(drop=True), df.loc[test_rows].reset_index(drop=True)

train_ratings, test_ratings = train_test_split_per_user(ratings_eligible, TEST_FRACTION, RANDOM_STATE)

# Build ID mappings using the union of users/items that appear in *train* (model fits on train only).
unique_users = np.sort(train_ratings['userId'].unique())
unique_items = np.sort(train_ratings['movieId'].unique())

user2idx = {u:i for i,u in enumerate(unique_users)}
idx2user = {i:u for u,i in user2idx.items()}
item2idx = {m:i for i,m in enumerate(unique_items)}
idx2item = {i:m for m,i in item2idx.items()}

n_users = len(unique_users)
n_items = len(unique_items)

print(f"Train users/items: {n_users}/{n_items}")

# Build sparse user-item matrix from train
def build_user_item_csr(df, user2idx, item2idx, n_users, n_items):
    rows = []
    cols = []
    data = []
    for r in df.itertuples(index=False):
        if (r.userId in user2idx) and (r.movieId in item2idx):
            rows.append(user2idx[r.userId])
            cols.append(item2idx[r.movieId])
            data.append(r.rating)
    mat = sparse.coo_matrix((data, (rows, cols)), shape=(n_users, n_items))
    return mat.tocsr()

train_csr = build_user_item_csr(train_ratings, user2idx, item2idx, n_users, n_items)

# Helper: get a user's seen items in train
def items_seen_in_train(user_csr_row):
    return set(user_csr_row.indices.tolist())

# Helper: movieId -> title
title_map = dict(zip(movies['movieId'], movies['title']))


Train users/items: 610/8977


User-Based CF: Similarity, Prediction, Recommendation

In [20]:
# Cell 4: User-based collaborative filtering with cosine similarity

# Optionally mean-center users (helps quality)
user_means = np.zeros(n_users)
train_csr_centered = train_csr.astype(float).copy()
for u in range(n_users):
    row = train_csr[u]
    if row.nnz > 0:
        mean_u = row.data.mean()
        user_means[u] = mean_u
        train_csr_centered[u, row.indices] = row.data - mean_u

# Compute user-user cosine similarity on centered ratings
# Normalize each user's vector first (safer for cosine on sparse)
train_csr_centered_norm = normalize(train_csr_centered, norm='l2', axis=1, copy=True)
user_sim = cosine_similarity(train_csr_centered_norm, dense_output=False)  # sparse result possible

def predict_user_based_scores(user_idx, topn_neighbors=25):
    """
    Predict scores for all items for a given user using top-N most similar neighbors.
    Returns a 1D numpy array of length n_items with predicted ratings (filled only for unseen items).
    """
    # Similarities for this user
    sim_row = user_sim[user_idx].toarray().ravel() if sparse.issparse(user_sim) else user_sim[user_idx]
    sim_row[user_idx] = 0.0  # ignore self

    # Top-N neighbors
    if topn_neighbors is not None and topn_neighbors > 0:
        nn_idx = np.argpartition(-sim_row, topn_neighbors)[:topn_neighbors]
        sims = sim_row[nn_idx]
        neighbor_matrix = train_csr_centered[nn_idx]  # centered ratings
    else:
        nn_idx = np.where(sim_row > 0)[0]
        sims = sim_row[nn_idx]
        neighbor_matrix = train_csr_centered[nn_idx]

    # Weighted sum of neighbors' centered ratings
    if len(nn_idx) == 0 or np.all(sims == 0):
        # fallback to user mean if no neighbors
        pred = np.full(n_items, user_means[user_idx])
        return pred

    weights = sims.reshape(-1, 1)  # (neighbors, 1)
    # neighbor_matrix is (neighbors, n_items); do weighted sum
    num = weights.T.dot(neighbor_matrix.toarray()).ravel()  # (n_items,)
    den = np.abs(sims).sum() + 1e-8

    centered_pred = num / den
    pred = centered_pred + user_means[user_idx]
    return pred

def recommend_user_based(user_id, k=10, topn_neighbors=25):
    if user_id not in user2idx:
        raise ValueError("User not found in training set.")
    uid = user2idx[user_id]
    preds = predict_user_based_scores(uid, topn_neighbors=topn_neighbors)
    seen = items_seen_in_train(train_csr[uid])
    # Exclude seen items
    mask = np.ones(n_items, dtype=bool)
    if len(seen) > 0:
        mask[list(seen)] = False
    preds_unseen = np.where(mask, preds, -np.inf)
    topk_idx = np.argpartition(-preds_unseen, k)[:k]
    topk_idx = topk_idx[np.argsort(-preds_unseen[topk_idx])]

    recs = []
    for ii in topk_idx:
        movie_id = idx2item[ii]
        recs.append((movie_id, title_map.get(movie_id, str(movie_id)), preds_unseen[ii]))
    return recs

print("User-based CF ready.")


User-based CF ready.


Precision@K Evaluation (User-Based CF)

In [21]:
# Cell 5: Evaluate Precision@K for User-Based CF

def build_user_test_relevant(test_df, threshold=RATING_THRESHOLD):
    """
    Returns: dict[userId] -> set(movieId) that are relevant in test (rating >= threshold).
    Only consider items that exist in training's item universe (so we can recommend them).
    """
    relevant = defaultdict(set)
    for r in test_df.itertuples(index=False):
        if (r.userId in user2idx) and (r.movieId in item2idx):
            if r.rating >= threshold:
                relevant[r.userId].add(r.movieId)
    return relevant

user_test_rel = build_user_test_relevant(test_ratings, RATING_THRESHOLD)

def precision_at_k_usercf(k=10, topn_neighbors=25, max_users=None):
    users_to_eval = [u for u in unique_users if len(user_test_rel.get(u, set())) > 0]
    if max_users is not None:
        users_to_eval = users_to_eval[:max_users]

    precisions = []
    for u in tqdm(users_to_eval, desc=f"Evaluating UserCF P@{k}"):
        recs = recommend_user_based(u, k=k, topn_neighbors=topn_neighbors)
        rec_items = {mid for (mid, _, _) in recs}
        hits = len(rec_items & user_test_rel[u])
        precisions.append(hits / k)
    return float(np.mean(precisions)) if len(precisions) > 0 else np.nan

P_at_10_user = precision_at_k_usercf(k=10, topn_neighbors=25)
print(f"User-Based CF Precision@10: {P_at_10_user:.4f}")


Evaluating UserCF P@10: 100%|██████████| 602/602 [00:00<00:00, 1316.43it/s]

User-Based CF Precision@10: 0.1477





Item-Based CF: Similarity, Prediction, Recommendation & Evaluation

In [22]:
# Cell 6: Item-based collaborative filtering with cosine similarity

# Build item-centered matrix
item_means = np.zeros(n_items)
train_csr_T = train_csr.transpose().tocsr()  # (n_items, n_users)
train_item_centered = train_csr_T.astype(float).copy()
for i in range(n_items):
    row = train_csr_T[i]
    if row.nnz > 0:
        mean_i = row.data.mean()
        item_means[i] = mean_i
        train_item_centered[i, row.indices] = row.data - mean_i

# Normalize rows for cosine
train_item_centered_norm = normalize(train_item_centered, norm='l2', axis=1, copy=True)
item_sim = cosine_similarity(train_item_centered_norm, dense_output=False)  # (n_items, n_items)

def predict_item_based_for_user(user_idx, topn_neighbors=50):
    """
    For the given user, predict scores for all items using item-item similarity and the user's known ratings.
    """
    user_row = train_csr[user_idx]  # (1, n_items)
    rated_items = user_row.indices
    rated_vals  = user_row.data

    if len(rated_items) == 0:
        # No history; fallback to global item means
        return item_means.copy()

    # Build a weighted sum of similarities * (user rating - item mean) + item mean
    preds = np.zeros(n_items, dtype=float)
    denom = np.zeros(n_items, dtype=float)

    for idx, r_ui in zip(rated_items, rated_vals):
        # Similarities of current rated item to all others
        sim_row = item_sim[idx].toarray().ravel() if sparse.issparse(item_sim) else item_sim[idx]
        sim_row[idx] = 0.0  # exclude self
        if topn_neighbors is not None and topn_neighbors > 0:
            nn = np.argpartition(-sim_row, topn_neighbors)[:topn_neighbors]
            s  = sim_row[nn]
            preds[nn] += s * (r_ui - item_means[idx])
            denom[nn] += np.abs(s)
        else:
            preds += sim_row * (r_ui - item_means[idx])
            denom += np.abs(sim_row)

    with np.errstate(divide='ignore', invalid='ignore'):
        centered = np.divide(preds, denom, out=np.zeros_like(preds), where=denom>0)
    final_pred = centered + item_means
    return final_pred

def recommend_item_based(user_id, k=10, topn_neighbors=50):
    if user_id not in user2idx:
        raise ValueError("User not found in training set.")
    uid = user2idx[user_id]
    preds = predict_item_based_for_user(uid, topn_neighbors=topn_neighbors)
    seen = items_seen_in_train(train_csr[uid])
    mask = np.ones(n_items, dtype=bool)
    if len(seen) > 0:
        mask[list(seen)] = False
    preds_unseen = np.where(mask, preds, -np.inf)
    topk_idx = np.argpartition(-preds_unseen, k)[:k]
    topk_idx = topk_idx[np.argsort(-preds_unseen[topk_idx])]

    recs = []
    for ii in topk_idx:
        movie_id = idx2item[ii]
        recs.append((movie_id, title_map.get(movie_id, str(movie_id)), preds_unseen[ii]))
    return recs

def precision_at_k_itemcf(k=10, topn_neighbors=50, max_users=None):
    users_to_eval = [u for u in unique_users if len(user_test_rel.get(u, set())) > 0]
    if max_users is not None:
        users_to_eval = users_to_eval[:max_users]

    precisions = []
    for u in tqdm(users_to_eval, desc=f"Evaluating ItemCF P@{k}"):
        recs = recommend_item_based(u, k=k, topn_neighbors=topn_neighbors)
        rec_items = {mid for (mid, _, _) in recs}
        hits = len(rec_items & user_test_rel[u])
        precisions.append(hits / k)
    return float(np.mean(precisions)) if len(precisions) > 0 else np.nan

P_at_10_item = precision_at_k_itemcf(k=10, topn_neighbors=50)
print(f"Item-Based CF Precision@10: {P_at_10_item:.4f}")


Evaluating ItemCF P@10: 100%|██████████| 602/602 [00:13<00:00, 44.15it/s]

Item-Based CF Precision@10: 0.0035





SVD implementation using scikit-learn instead of Surprise & Precision@K

In [23]:
# Cell 7: Alternative SVD implementation using scikit-learn instead of Surprise
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict
from tqdm import tqdm

# Create user-item matrix from training data
def create_user_item_matrix_svd(train_ratings, user2idx_svd, item2idx_svd):
    """Create sparse user-item rating matrix for SVD"""
    n_users = len(user2idx_svd)
    n_items = len(item2idx_svd)

    # Map to indices
    user_indices = train_ratings['userId'].map(user2idx_svd)
    item_indices = train_ratings['movieId'].map(item2idx_svd)
    ratings = train_ratings['rating'].values

    # Create sparse matrix
    user_item_matrix = csr_matrix(
        (ratings, (user_indices, item_indices)),
        shape=(n_users, n_items)
    )

    return user_item_matrix

# Create separate mappings for SVD (to avoid conflicts with your existing mappings)
unique_users_svd = train_ratings['userId'].unique()
unique_items_svd = train_ratings['movieId'].unique()
user2idx_svd = {user: idx for idx, user in enumerate(unique_users_svd)}
item2idx_svd = {item: idx for idx, item in enumerate(unique_items_svd)}
idx2user_svd = {idx: user for user, idx in user2idx_svd.items()}
idx2item_svd = {idx: item for item, idx in item2idx_svd.items()}

print(f"Creating user-item matrix with {len(unique_users_svd)} users and {len(unique_items_svd)} items")

# Create the user-item matrix
user_item_matrix = create_user_item_matrix_svd(train_ratings, user2idx_svd, item2idx_svd)

# Apply SVD
print("Fitting SVD model...")
n_components = 50  # Same as n_factors in Surprise
svd_model = TruncatedSVD(n_components=n_components, random_state=RANDOM_STATE)

# Fit SVD on the user-item matrix
user_factors = svd_model.fit_transform(user_item_matrix)
item_factors = svd_model.components_.T

# Get global mean rating for bias
global_mean = train_ratings['rating'].mean()

# Calculate user and item biases
user_means = np.array([
    train_ratings[train_ratings['userId'] == idx2user_svd[i]]['rating'].mean()
    if i < len(idx2user_svd) else global_mean
    for i in range(len(unique_users_svd))
])
user_biases = user_means - global_mean

item_means = np.array([
    train_ratings[train_ratings['movieId'] == idx2item_svd[i]]['rating'].mean()
    if i < len(idx2item_svd) else global_mean
    for i in range(len(unique_items_svd))
])
item_biases = item_means - global_mean

def predict_rating(user_id, item_id):
    """Predict rating for user-item pair"""
    if user_id not in user2idx_svd or item_id not in item2idx_svd:
        return global_mean

    user_idx = user2idx_svd[user_id]
    item_idx = item2idx_svd[item_id]

    # Biased SVD prediction: global_mean + user_bias + item_bias + dot_product
    prediction = (global_mean +
                 user_biases[user_idx] +
                 item_biases[item_idx] +
                 np.dot(user_factors[user_idx], item_factors[item_idx]))

    # Clip to rating scale
    min_rating = train_ratings['rating'].min()
    max_rating = train_ratings['rating'].max()
    return np.clip(prediction, min_rating, max_rating)

# Build train user-item sets for filtering
train_user_items = defaultdict(set)
for r in train_ratings.itertuples(index=False):
    train_user_items[r.userId].add(r.movieId)

all_train_items_svd = set(unique_items_svd)

def svd_topk_for_user(user_id, k=10):
    """Recommend top-K items not seen in train for this user using SVD predictions."""
    if user_id not in user2idx_svd:
        return []

    # Get unseen items
    unseen = list(all_train_items_svd - train_user_items[user_id])

    # Predict ratings for unseen items
    predictions = []
    for item_id in unseen:
        pred_rating = predict_rating(user_id, item_id)
        predictions.append((item_id, pred_rating))

    # Sort by predicted rating (descending)
    predictions.sort(key=lambda x: -x[1])

    # Return top-K with movie titles
    topk = predictions[:k]
    return [(mid, title_map.get(mid, str(mid)), rating) for (mid, rating) in topk]

def precision_at_k_svd(k=10, max_users=None):
    """Calculate Precision@K for SVD model"""
    users_to_eval = [u for u in unique_users_svd if len(user_test_rel.get(u, set())) > 0]
    if max_users is not None:
        users_to_eval = users_to_eval[:max_users]

    precisions = []
    for u in tqdm(users_to_eval, desc=f"Evaluating SVD P@{k}"):
        recs = svd_topk_for_user(u, k=k)
        rec_items = {mid for (mid, _, _) in recs}
        hits = len(rec_items & user_test_rel[u])
        precisions.append(hits / k)

    return float(np.mean(precisions)) if len(precisions) > 0 else np.nan

# Calculate Precision@10
print("Calculating Precision@10...")
P_at_10_svd = precision_at_k_svd(k=10)
print(f"SVD (MF) Precision@10: {P_at_10_svd:.4f}")

# Show some sample recommendations
print("\nSample recommendations for first user:")
sample_user = unique_users_svd[0]
sample_recs = svd_topk_for_user(sample_user, k=5)
for i, (movie_id, title, pred_rating) in enumerate(sample_recs, 1):
    print(f"{i}. {title} (Predicted rating: {pred_rating:.2f})")

Creating user-item matrix with 610 users and 8977 items
Fitting SVD model...
Calculating Precision@10...


Evaluating SVD P@10: 100%|██████████| 602/602 [33:32<00:00,  3.34s/it]


SVD (MF) Precision@10: 0.0654

Sample recommendations for first user:
1. Casino (1995) (Predicted rating: 5.00)
2. Persuasion (1995) (Predicted rating: 5.00)
3. City of Lost Children, The (Cité des enfants perdus, La) (1995) (Predicted rating: 5.00)
4. Twelve Monkeys (a.k.a. 12 Monkeys) (1995) (Predicted rating: 5.00)
5. The Brain (1969) (Predicted rating: 5.00)


Demo: Show Top Recommendations for a Sample User

In [24]:
# Cell 8: Demo recommendations for one sample user across methods

# Pick a user with at least one relevant test item and who exists in train
candidate_users = [u for u in unique_users if len(user_test_rel.get(u, set())) > 0]
demo_user = int(candidate_users[0]) if len(candidate_users) else int(unique_users[0])

print(f"Demo user: {demo_user}")

print("\nUser-Based CF top-10:")
for mid, title, score in recommend_user_based(demo_user, k=10, topn_neighbors=25):
    print(f"{title}  (pred: {score:.3f})")

print("\nItem-Based CF top-10:")
for mid, title, score in recommend_item_based(demo_user, k=10, topn_neighbors=50):
    print(f"{title}  (pred: {score:.3f})")

print("\nSVD (MF) top-10:")
for mid, title, score in svd_topk_for_user(demo_user, k=10):
    print(f"{title}  (pred: {score:.3f})")


Demo user: 1

User-Based CF top-10:
Star Wars: Episode IV - A New Hope (1977)  (pred: 5.026)
Forrest Gump (1994)  (pred: 4.897)
Shawshank Redemption, The (1994)  (pred: 4.894)
Pulp Fiction (1994)  (pred: 4.866)
Blade Runner (1982)  (pred: 4.826)
Godfather, The (1972)  (pred: 4.813)
One Flew Over the Cuckoo's Nest (1975)  (pred: 4.803)
Fight Club (1999)  (pred: 4.734)
Reservoir Dogs (1992)  (pred: 4.715)
Casablanca (1942)  (pred: 4.695)

Item-Based CF top-10:
Final Fantasy VII: Advent Children (2004)  (pred: 7.800)
Pacific Rim: Uprising (2018)  (pred: 7.583)
Pirates of Silicon Valley (1999)  (pred: 7.500)
Kinky Boots (2005)  (pred: 7.446)
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)  (pred: 7.375)
Friday the 13th Part VI: Jason Lives (1986)  (pred: 7.361)
Seed of Chucky (Child's Play 5) (2004)  (pred: 7.342)
Smokey and the Bandit II (1980)  (pred: 7.322)
Professional, The (Le professionnel) (1981)  (pred: 7.300)
Birthday Girl (2001)  (pred: 7.300)

SVD (MF) top-10:
Casino (1995)

THE END!!!