In [1]:
import json
from pathlib import Path
import random

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Ratings table (columns used later in this notebook: userId, movieId, rating)
# and movie metadata (columns used later: movieId, title, genres).
ratings_small = pd.read_csv("subset_ratings.csv")
movies_small = pd.read_csv("subset_movies.csv")

# Movie x tag matrix; the first CSV column becomes the row index.
movie_tag_matrix = pd.read_csv("subset_movie_tag_matrix.csv", index_col=0)


In [3]:
# Cast the row labels to int so they can be matched against the movieId
# values of the ratings table in later cells.
movie_tag_matrix.index = movie_tag_matrix.index.astype(int)
tag_matrix_shape = movie_tag_matrix.shape
print("Loaded tag‑matrix shape:", tag_matrix_shape)

Loaded tag‑matrix shape: (6977, 1128)


In [4]:
# L2 norm of every movie's tag vector (one value per row).
row_norms = np.linalg.norm(movie_tag_matrix.to_numpy(), axis=1)

# Swap zero norms for 1 so all-zero rows are divided by 1 (and stay zero)
# instead of producing NaN.
safe_norms = pd.Series(row_norms, index=movie_tag_matrix.index).replace(0, 1)

# Row-wise division: each movie's tag vector is scaled by its own norm.
movie_norm = movie_tag_matrix.div(safe_norms, axis=0)


In [5]:
def split_per_user(df: pd.DataFrame, test_fraction: float = 0.20, seed: int = 7):
    """Split ratings into train/test independently for every user.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table; must contain a ``userId`` column.
    test_fraction : float, default 0.20
        Share of each user's rows that goes to the test set (passed to
        ``train_test_split`` as ``test_size``).
    seed : int, default 7
        ``random_state`` used for every per-user split.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Row-wise concatenation of the per-user pieces, keeping the original
        row labels. Users with a single rating are placed entirely in the
        training set. When one side has no rows at all, an empty frame with
        the same columns as ``df`` is returned for it.
    """
    train_parts, test_parts = [], []
    for _uid, grp in df.groupby("userId"):
        if len(grp) == 1:
            # A single rating cannot be split; keep it for training.
            train_parts.append(grp)
            continue
        tr, te = train_test_split(grp, test_size=test_fraction, random_state=seed)
        train_parts.append(tr)
        test_parts.append(te)

    # pd.concat raises ValueError on an empty list (e.g. every user has one
    # rating, or df itself is empty), so fall back to a zero-row slice of df.
    empty = df.iloc[0:0]
    train_df = pd.concat(train_parts) if train_parts else empty
    test_df = pd.concat(test_parts) if test_parts else empty
    return train_df, test_df

train_ratings, test_ratings = split_per_user(ratings_small, 0.2)

# Keep only ratings whose movie has a row in the normalised tag matrix.
valid_movie_ids = set(movie_norm.index)
train_ratings = (
    train_ratings
    .loc[train_ratings["movieId"].isin(valid_movie_ids)]
    .reset_index(drop=True)
)
test_ratings = (
    test_ratings
    .loc[test_ratings["movieId"].isin(valid_movie_ids)]
    .reset_index(drop=True)
)

In [6]:
# Build one content profile per user: a rating-weighted sum of the normalised
# tag vectors of the movies the user rated in the training set, scaled to
# unit L2 length.
min_rating, max_rating = train_ratings.rating.min(), train_ratings.rating.max()
rating_range = max_rating - min_rating

user_profiles = {}
for u, grp in train_ratings.groupby("userId"):
    feats = movie_norm.loc[grp.movieId].values
    if rating_range > 0:
        # Min-max scale ratings to [0, 1]; movies rated at the global minimum
        # therefore contribute nothing to the profile.
        weights = ((grp.rating - min_rating) / rating_range).values[:, None]
    else:
        # Every training rating is identical: 0/0 would yield NaN weights,
        # so weight all of the user's movies equally instead.
        weights = np.ones((len(grp), 1))
    vec = (weights * feats).sum(axis=0)
    # Normalise on the vector's length (not the sum of its entries) and leave
    # an all-zero profile untouched to avoid dividing by zero.
    norm = np.linalg.norm(vec)
    if norm > 0:
        vec = vec / norm
    user_profiles[u] = vec

In [7]:
# Wide user x movie rating table; missing (user, movie) pairs become 0.
pivot = (
    train_ratings
    .pivot(index="userId", columns="movieId", values="rating")
    .fillna(0)
)
# Row / column labels as arrays, in the same order as the matrix axes.
user_ids = pivot.index.to_numpy()                  # length = U
movie_ids = pivot.columns.to_numpy().astype(int)   # length = M

In [8]:
# Dense rating matrix, shape (U, M).
user_vecs = pivot.values
# Pairwise cosine similarity between user rating rows, shape (U, U).
sim_users = cosine_similarity(user_vecs)
# Per-user normaliser for the CF score: sum of absolute similarities along
# each row. The row includes the user's similarity with themself.
abs_sim_u = np.abs(sim_users).sum(axis=1)

In [10]:
# Normalised tag vectors as a plain array, rows reordered to match movie_ids
# (the column order of the rating pivot).
movie_norm_arr = movie_norm.loc[movie_ids].to_numpy()


def recommend_hybrid_fast_uCF(user_id: int, K: int = 10, a: float = 0.7):
    """Return up to K unseen movies ranked by a user-user CF + content blend.

    Parameters
    ----------
    user_id : int
        User to recommend for; must be a row label of ``pivot``.
    K : int, default 10
        Maximum number of recommendations to return.
    a : float, default 0.7
        Weight of the collaborative score; the content-based score is
        weighted by ``1 - a``.

    Returns
    -------
    pandas.DataFrame
        Indexed by movieId with columns ``title``, ``genres`` and ``score``,
        sorted by descending score. Movies the user rated in
        ``train_ratings`` are never returned, so fewer than K rows may come
        back. A frame with only the three columns is returned when the user
        is not in ``pivot`` or ``K <= 0``.

    Notes
    -----
    Reads the module-level objects ``pivot``, ``user_ids``, ``movie_ids``,
    ``sim_users``, ``user_vecs``, ``abs_sim_u``, ``user_profiles``,
    ``movie_norm_arr``, ``train_ratings`` and ``movies_small``. The final
    ``.loc`` lookup raises ``KeyError`` if a recommended movie id is missing
    from ``movies_small``.
    """
    if K <= 0 or user_id not in pivot.index:
        return pd.DataFrame(columns=["title", "genres", "score"])

    u_idx = np.where(user_ids == user_id)[0][0]

    # Collaborative score: similarity-weighted sum of all users' ratings,
    # divided by the user's total absolute similarity (0 if that total is 0).
    cf_num = sim_users[u_idx].dot(user_vecs)
    cf = np.divide(cf_num,
                   abs_sim_u[u_idx],
                   out=np.zeros_like(cf_num),
                   where=abs_sim_u[u_idx] > 0)

    # Content score: dot product of each movie's tag vector with the user's
    # profile (a zero profile is used when the user has none).
    p_u = user_profiles.get(user_id, np.zeros(movie_norm_arr.shape[1]))
    cbf = movie_norm_arr.dot(p_u)                # shape (M,)

    hyb = a * cf + (1.0 - a) * cbf

    # Exclude already-rated movies. Multiplying their score by 0 would leave
    # them tied with unseen movies that score 0, so push them to -inf and
    # drop any that still land in the selected slice.
    watched = train_ratings.loc[train_ratings.userId == user_id, "movieId"].to_numpy()
    seen = np.isin(movie_ids, watched)
    hyb[seen] = -np.inf

    if K < len(hyb):
        # Partial selection of the K largest scores (unordered).
        idx = np.argpartition(-hyb, K)[:K]
    else:
        idx = np.arange(len(hyb))
    idx = idx[~seen[idx]]

    order = np.argsort(-hyb[idx])
    top_ids = movie_ids[idx][order]
    top_scores = hyb[idx][order]

    df = movies_small.set_index("movieId").loc[top_ids, ["title", "genres"]].copy()
    df["score"] = top_scores
    return df


In [11]:
K = 10            # recommendations per user
N = 1000          # maximum number of users to sample
SAMPLE_SEED = 7   # fixed seed so the same users are sampled on every run

# Sort before sampling: set iteration order is not a stable basis for a
# reproducible draw. A private Random instance leaves the global RNG alone.
valid_users = sorted(set(pivot.index) | set(user_profiles.keys()))
subset_user_ids = random.Random(SAMPLE_SEED).sample(valid_users, min(N, len(valid_users)))

# Map each sampled user to their ranked list of recommended movie ids;
# users for whom the recommender returns nothing are left out.
preds = {}
for u in tqdm(subset_user_ids, desc=f"Generating Top-{K} Hybrid Recs (userCF)"):
    recs_df = recommend_hybrid_fast_uCF(u, K=K)
    if not recs_df.empty:
        preds[int(u)] = recs_df.index.tolist()

# Persist the predictions; json.dump writes the integer user ids as string keys.
Path("predictions").mkdir(exist_ok=True)
with open("predictions/hybrid_userCF_top10_subset.json", "w") as f:
    json.dump(preds, f)

print(f"Generated recommendations for {len(preds)} users out of {len(subset_user_ids)} sampled.")


Generating Top-10 Hybrid Recs (userCF): 100%|██████████| 1000/1000 [00:17<00:00, 57.93it/s]


Generated recommendations for 1000 users out of 1000 sampled.
