# K‑Nearest Neighbors (KNN) Collaborative Filtering

- Load the `userId`, `movieId` and `rating` columns of **subset_ratings.csv**  
- Wrap the ratings in a Surprise `Dataset` (no dense **user × movie** matrix is materialized)  
- Compute **item–item cosine similarity** and use up to k = 40 nearest neighbors  
- Predict unknown ratings by a **mean‑centered, similarity‑weighted average** (`KNNWithMeans`)  
- Randomly split the ratings (80 % train / 20 % test) and report **RMSE / MAE**  
- Output **Top‑K recommendations** (K = 10) for 1 000 sampled users to JSON  

In [1]:
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import train_test_split
import numpy as np
import random, json
from tqdm import tqdm
from pathlib import Path

In [2]:
# Read only the three columns the recommender uses from the ratings subset.
RATINGS_CSV = "subset_ratings.csv"
RATING_COLUMNS = ["userId", "movieId", "rating"]

ratings = pd.read_csv(RATINGS_CSV, usecols=RATING_COLUMNS)
print("Loaded subset:", ratings.shape)

Loaded subset: (2079356, 3)


In [3]:
# The rating scale is taken from the observed minimum / maximum rating.
reader = Reader(rating_scale=(ratings["rating"].min(), ratings["rating"].max()))

# load_from_df interprets columns by POSITION (user, item, rating), while
# read_csv(usecols=...) keeps the CSV's own column order rather than the order
# of the usecols list. Select the columns explicitly so the mapping cannot be
# silently scrambled by a differently ordered file.
full_data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

# Random 80/20 split over individual ratings; the fixed random_state keeps
# the split identical across runs.
trainset, testset = train_test_split(full_data, test_size=0.2, random_state=42)
print("train / test ratings:", trainset.n_ratings, len(testset))

train / test ratings: 1663484 415872


In [4]:
# Item-based KNN with cosine similarity: "user_based": False makes the
# similarities item–item, and k caps the number of neighbours per prediction.
NEIGHBOUR_COUNT = 40
item_cosine_options = {"name": "cosine", "user_based": False}

knn_algo = KNNWithMeans(k=NEIGHBOUR_COUNT, sim_options=item_cosine_options, verbose=False)
knn_algo.fit(trainset)

<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1aaffebb460>

In [5]:
# Score the held-out ratings and compute both error metrics quietly
# (verbose=False) so that only the single summary line below is printed.
preds = knn_algo.test(testset)
rmse, mae = (metric(preds, verbose=False) for metric in (accuracy.rmse, accuracy.mae))
print(f"KNN  RMSE = {rmse:.4f}   MAE = {mae:.4f}")

# Persist the same two numbers, one KEY=value pair per line.
metrics_report = f"RMSE={rmse:.4f}\nMAE={mae:.4f}\n"
with open("knn_metrics.txt", "w") as metrics_file:
    metrics_file.write(metrics_report)

KNN  RMSE = 0.8402   MAE = 0.6412


In [6]:
# Raw-id -> inner-id lookup tables, built through Trainset's public accessors
# (all_users / all_items / to_raw_uid / to_raw_iid) instead of its
# underscore-prefixed internal dicts, which are not part of the public API.
trainset_user_inner = {trainset.to_raw_uid(inner): inner for inner in trainset.all_users()}
trainset_item_inner = {trainset.to_raw_iid(inner): inner for inner in trainset.all_items()}

# Inner item id -> raw movie id as a plain int.
trainset_item_outer = {inner: int(raw) for raw, inner in trainset_item_inner.items()}

# Raw ids of every user that has at least one rating in the training split.
all_user_ids = list(trainset_user_inner)

In [7]:
K, N = 10, 1_000    # K = recommendations per user, N = number of users to sample
SAMPLE_SEED = 42

# A seeded, local RNG keeps the sampled users identical across
# Restart & Run All without touching the global `random` state.
sample_users = random.Random(SAMPLE_SEED).sample(all_user_ids, min(N, len(all_user_ids)))

# Collect every sampled user's already-rated movies in one pass over the
# ratings frame instead of re-filtering all rows once per user.
sampled_user_ids = [int(uid) for uid in sample_users]
seen_by_user = (
    ratings[ratings.userId.isin(sampled_user_ids)]
    .groupby("userId")["movieId"]
    .agg(set)
    .to_dict()
)

# Candidate items, as the exact raw ids the trainset stores.
candidate_raw_iids = [trainset.to_raw_iid(inner_iid) for inner_iid in trainset.all_items()]

recs = {}
for uid_raw in tqdm(sample_users, desc=f"Top‑{K} recs"):
    seen_items = seen_by_user.get(int(uid_raw), set())

    # Predict for every training item the user has not rated yet.
    scores = {}
    for raw_iid in candidate_raw_iids:
        movie_id = int(raw_iid)
        if movie_id in seen_items:
            continue
        # predict() must receive the raw id in the form the trainset knows.
        # A stringified id is treated as an unknown item, so every estimate
        # collapses to the same default value and the ranking is meaningless.
        scores[movie_id] = knn_algo.predict(uid_raw, raw_iid).est

    if scores:
        top_items = sorted(scores, key=scores.get, reverse=True)[:K]
        recs[int(uid_raw)] = top_items

# Name the file after K so it always matches its contents
# (for K = 10 this is the original predictions/knn_top10_subset.json).
out_file = f"predictions/knn_top{K}_subset.json"
Path("predictions").mkdir(exist_ok=True)
with open(out_file, "w") as fp:
    # The int user-id keys are written as strings, as JSON requires.
    json.dump(recs, fp)

print(f"✅  Saved {len(recs)} users → {out_file}")

Top‑10 recs: 100%|██████████| 1000/1000 [00:19<00:00, 52.22it/s]

✅  Saved 1000 users → predictions/knn_top10_subset.json



