In [1]:
import json
from pathlib import Path
import random

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Tag matrix: the first CSV column is used as the row index.
movie_tag_matrix = pd.read_csv("subset_movie_tag_matrix.csv", index_col=0)

# Ratings and movie metadata are read with pandas' default settings.
ratings_small, movies_small = (
    pd.read_csv(csv_name) for csv_name in ("subset_ratings.csv", "subset_movies.csv")
)


In [3]:
# Row labels are cast to int so later `.loc` lookups by integer movie id match.
_int_index = movie_tag_matrix.index.astype(int)
movie_tag_matrix.index = _int_index

_n_rows, _n_cols = movie_tag_matrix.shape
print("Loaded tag‑matrix shape:", (_n_rows, _n_cols))

Loaded tag‑matrix shape: (7008, 1128)


In [4]:
# Euclidean length of every row of the tag matrix.
row_norms = np.linalg.norm(movie_tag_matrix.values, axis=1)

# A zero length is swapped for 1 so an all-zero row stays all-zero instead of
# turning into NaN when divided.
_safe_norms = pd.Series(row_norms, index=movie_tag_matrix.index).replace(0, 1)

# Row-wise division: every non-zero row of `movie_norm` has unit L2 norm.
movie_norm = movie_tag_matrix.div(_safe_norms, axis=0)


In [5]:
def split_per_user(df: pd.DataFrame, test_fraction: float = 0.20, seed: int = 7):
    """Split a ratings frame into train/test parts independently for each user.

    Parameters
    ----------
    df : pd.DataFrame
        Ratings; must contain a ``userId`` column, which is used for grouping.
    test_fraction : float, default 0.20
        Passed to ``train_test_split`` as ``test_size`` for every user group.
    seed : int, default 7
        Passed to ``train_test_split`` as ``random_state`` (same seed for every user).

    Returns
    -------
    (train_df, test_df) : tuple of pd.DataFrame
        Concatenation of the per-user parts. A user with a single rating goes
        entirely to train. If no rows end up on one side (for example every
        user has exactly one rating, or ``df`` is empty), that side is an empty
        frame with the same columns as ``df`` instead of raising.
    """
    train_parts, test_parts = [], []
    for _uid, grp in df.groupby("userId"):
        if len(grp) == 1:
            # A single row cannot be split; keep it for training.
            train_parts.append(grp)
            continue
        tr, te = train_test_split(grp, test_size=test_fraction, random_state=seed)
        train_parts.append(tr)
        test_parts.append(te)

    # pd.concat([]) raises ValueError, so fall back to an empty slice of df
    # (which preserves the column names and dtypes).
    train_df = pd.concat(train_parts) if train_parts else df.iloc[0:0].copy()
    test_df = pd.concat(test_parts) if test_parts else df.iloc[0:0].copy()
    return train_df, test_df

# Per-user split of the full ratings table (20 % of each user's rows go to test).
_train_all, _test_all = split_per_user(ratings_small, test_fraction=0.2)

# Only movies that have a row in the normalised tag matrix are kept, because
# the later cells look movies up in `movie_norm` by id.
valid_movie_ids = set(movie_norm.index)
train_ratings = _train_all.loc[_train_all["movieId"].isin(valid_movie_ids)].reset_index(drop=True)
test_ratings = _test_all.loc[_test_all["movieId"].isin(valid_movie_ids)].reset_index(drop=True)

In [6]:
# Bounds of the training ratings; used to min-max scale a rating into a weight.
min_rating, max_rating = train_ratings.rating.min(), train_ratings.rating.max()
_rating_span = max_rating - min_rating

# user id -> rating-weighted sum of the tag vectors of the movies the user
# rated in train, rescaled to unit L2 norm whenever the sum is non-zero.
user_profiles = {}
for u, grp in train_ratings.groupby("userId"):
    feats = movie_norm.loc[grp.movieId].values
    if _rating_span > 0:
        weights = ((grp.rating - min_rating) / _rating_span).values[:, None]
    else:
        # All training ratings are identical: min-max scaling would be 0/0,
        # so every rated movie simply gets the same weight.
        weights = np.ones((len(grp), 1))
    vec = (weights * feats).sum(axis=0)
    # Guard on the norm itself (the actual divisor) rather than on the sum of
    # the components, so the division is skipped exactly when it is undefined.
    norm = np.linalg.norm(vec)
    if norm > 0:
        vec = vec / norm
    user_profiles[u] = vec

In [7]:
# Wide user x movie table of training ratings; missing pairs become 0.
_ratings_wide = train_ratings.pivot(index="userId", columns="movieId", values="rating")
pivot = _ratings_wide.fillna(0)

# Row / column labels of the pivot as plain arrays (lengths U and M).
user_ids, movie_ids = pivot.index.values, pivot.columns.values.astype(int)

In [8]:
# Dense rating matrix, one row per user: shape (U, M).
user_vecs = pivot.values

# Pairwise cosine similarity between user rows: shape (U, U).
sim_users = cosine_similarity(user_vecs)

# Per-user sum of absolute similarities; the recommender divides by it.
abs_sim_u = np.abs(sim_users).sum(axis=1)

In [16]:
# Rows of the normalised tag matrix selected in `movie_ids` order, so that
# row i of this array corresponds to movie_ids[i] (the pivot's column order).
movie_norm_arr = movie_norm.loc[movie_ids].values  # align order


def recommend_hybrid_fast_uCF(user_id: int, K: int = 10, a: float = 0.7):
    """Return up to K movie recommendations from a user-user CF + CBF blend.

    Reads the notebook-level objects ``pivot``, ``user_ids``, ``movie_ids``,
    ``user_vecs``, ``sim_users``, ``abs_sim_u``, ``movie_norm_arr``,
    ``user_profiles``, ``train_ratings`` and ``movies_small``.

    Parameters
    ----------
    user_id : int
        User to recommend for.
    K : int, default 10
        Maximum number of recommendations.
    a : float, default 0.7
        Weight of the collaborative score; the content score gets ``1 - a``.

    Returns
    -------
    pd.DataFrame
        Indexed by movieId with columns ``title``, ``genres`` and ``score``,
        best first. Movies the user rated in ``train_ratings`` are never
        returned, so fewer than K rows come back when fewer unseen movies
        exist. An empty frame with those three columns is returned for a user
        that is not present in the CF structures.

    Raises
    ------
    KeyError
        If a selected movie id is missing from ``movies_small``.
    """
    no_recs = pd.DataFrame(columns=["title", "genres", "score"])
    if user_id not in pivot.index:
        return no_recs

    # `user_ids` is a separately stored copy of the pivot index; guard against
    # the two having drifted apart instead of failing with an IndexError.
    hits = np.flatnonzero(user_ids == user_id)
    if hits.size == 0:
        return no_recs
    u_idx = hits[0]

    # Collaborative part: similarity-weighted average of all users' rating rows.
    cf_num = sim_users[u_idx].dot(user_vecs)            # shape (M,)
    denom = abs_sim_u[u_idx]
    cf = cf_num / denom if denom > 0 else np.zeros_like(cf_num)

    # Content part: dot product of every movie's tag vector with the user's profile.
    p_u = user_profiles.get(user_id, np.zeros(movie_norm_arr.shape[1]))
    cbf = movie_norm_arr.dot(p_u)                        # shape (M,)

    # NOTE(review): cf is an average of the values stored in user_vecs while
    # cbf is a dot product of tag vectors; if those are on different scales,
    # `a` does not weight the two parts evenly - confirm this is intended.
    hyb = a * cf + (1.0 - a) * cbf

    # Exclude already-rated movies with -inf. Multiplying their score by 0
    # would leave them tied with genuine zero-score candidates, so they could
    # still be picked.
    watched = train_ratings.loc[train_ratings.userId == user_id, "movieId"].unique()
    hyb[np.isin(movie_ids, watched)] = -np.inf

    # Rank only real candidates (this also drops NaN scores).
    candidates = np.flatnonzero(np.isfinite(hyb))
    k = max(int(K), 0)
    if k < candidates.size:
        top = candidates[np.argpartition(-hyb[candidates], k)[:k]]
    else:
        top = candidates
    top = top[np.argsort(-hyb[top])]

    top_ids = movie_ids[top]
    top_scores = hyb[top]

    df = movies_small.set_index("movieId").loc[top_ids, ["title", "genres"]].copy()
    df["score"] = top_scores
    return df


In [17]:
import numpy as np

# Movies that appear as columns of the current CF pivot.
train_movies = pivot.columns.values.astype(int)       # (M',)

# Every movie id in the normalised tag matrix, ascending, plus the tag rows
# arranged in that same order.
all_movies = np.array(sorted(movie_norm.index), dtype=int)
movie_norm_full = movie_norm.loc[all_movies].values   # (M_full × T)

# Slot of each pivot movie inside the sorted `all_movies` array.
# NOTE(review): this is a snapshot of `pivot` as it exists when this cell
# runs; it has to be recomputed whenever `pivot` / `train_movies` are rebuilt.
pos_train_in_all = all_movies.searchsorted(train_movies)


def recommend_hybrid_item(
    user_id:int,
    K:int=10,
    a:float=0.7
) -> pd.DataFrame:
    """
    Hybrid recommender for the ITEM-cold scenario.

    Scores every movie in ``all_movies`` as ``a * CF + (1 - a) * CBF``, then
    bans every movie that is a column of ``pivot`` (the "warm" movies) and
    every movie the user rated in ``train_ratings``, so the Top-K is drawn
    from the cold movies only.

    Reads the notebook-level objects ``pivot``, ``sim_matrix``,
    ``abs_sim_sum``, ``all_movies`` (sorted ascending), ``movie_norm_full``,
    ``user_profiles``, ``train_ratings`` and ``movies_small``. Warm positions
    are derived from ``pivot.columns`` on every call, so they cannot go stale
    when ``pivot`` is rebuilt. Every pivot column is assumed to be present in
    ``all_movies``.

    Parameters
    ----------
    user_id : int
        User to recommend for.
    K : int, default 10
        Maximum number of recommendations.
    a : float, default 0.7
        Weight of the item-item CF score; the content score gets ``1 - a``.

    Returns
    -------
    pd.DataFrame
        Indexed by movieId with columns ``title``, ``genres`` and ``score``,
        best first. Banned movies are never returned, so fewer than K rows
        come back when fewer candidates exist. An empty frame with those
        three columns is returned when ``user_id`` is not in ``pivot``.

    Raises
    ------
    KeyError
        If a selected movie id is missing from ``movies_small``.
    """
    if user_id not in pivot.index:
        return pd.DataFrame(columns=['title','genres','score'])

    # --- CF part on the warm movies (columns of pivot) ---
    r_u    = pivot.loc[user_id].values                  # (M',)
    cf_num = sim_matrix.dot(r_u)                        # (M',)
    # A movie whose similarity row sums to zero gets CF score 0 rather than NaN.
    cf_train = np.divide(
        cf_num,
        abs_sim_sum,
        out=np.zeros(cf_num.shape, dtype=float),
        where=abs_sim_sum > 0,
    )

    # Scatter the CF scores into the full movie list in one vectorised step.
    warm_movies = pivot.columns.values.astype(int)
    warm_pos    = np.searchsorted(all_movies, warm_movies)
    cf_full     = np.zeros(len(all_movies))
    cf_full[warm_pos] = cf_train

    # --- CBF part on all movies ---
    prof     = user_profiles.get(user_id, np.zeros(movie_norm_full.shape[1]))
    cbf_full = movie_norm_full.dot(prof)                # (M_full,)

    # --- blend ---
    # cf_full is non-zero only at warm positions, which are banned just below,
    # so the surviving (cold) scores are effectively (1 - a) * cbf_full.
    hyb = a*cf_full + (1-a)*cbf_full

    # 1) ban every warm movie (so Top-K is drawn only from the cold set)
    hyb[warm_pos] = -np.inf

    # 2) still remove anything the user actually rated in train
    seen = train_ratings.loc[train_ratings.userId == user_id, "movieId"].unique()
    hyb[np.isin(all_movies, seen)] = -np.inf

    # --- top-K selection among the remaining candidates ---
    # Ranking only finite scores keeps banned movies out of the result and
    # avoids argpartition failing when K is not smaller than the array.
    candidates = np.flatnonzero(np.isfinite(hyb))
    k = max(int(K), 0)
    if k < candidates.size:
        top = candidates[np.argpartition(-hyb[candidates], k)[:k]]
    else:
        top = candidates
    ordered = top[np.argsort(-hyb[top])]
    picks   = all_movies[ordered]
    scores  = hyb[ordered]

    df = movies_small.set_index('movieId').loc[picks, ['title','genres']].copy()
    df['score'] = scores
    return df


In [11]:
# K      = 10
# N      = 1000
# valid_users = list(set(pivot.index) | set(user_profiles.keys()))
# subset_user_ids = random.sample(valid_users, min(N, len(valid_users)))

# preds = {}
# for u in tqdm(subset_user_ids, desc=f"Generating Top-{K} Hybrid Recs (userCF)"):
#     recs_df = recommend_hybrid_fast_uCF(u, K=K)
#     if not recs_df.empty:
#         preds[int(u)] = recs_df.index.tolist()

# Path("predictions").mkdir(exist_ok=True)
# with open("predictions/hybrid_userCF_top10_subset.json", "w") as f:
#     json.dump(preds, f)

# print(f"Generated recommendations for {len(preds)} users out of {len(subset_user_ids)} sampled.")


Generating Top-10 Hybrid Recs (userCF): 100%|██████████| 1000/1000 [00:43<00:00, 23.21it/s]


Generated recommendations for 1000 users out of 1000 sampled.


# ColdStart

In [12]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# ─────────────────────────────────────────────────────────────────────────────
# 0) PICK SCENARIO: "STANDARD", "USER" (new-user cold), or "ITEM" (new-item cold)
# ─────────────────────────────────────────────────────────────────────────────
scenario = "ITEM"   # alternatives: "STANDARD" or "USER"

In [13]:


_RATING_COLS = ["userId", "movieId", "rating"]

# (train csv, test csv) for the two pre-split cold-start scenarios.
_COLD_SPLIT_FILES = {
    "USER": ("evaluation/user_cold_train.csv", "evaluation/user_cold_test.csv"),
    "ITEM": ("evaluation/item_cold_train.csv", "evaluation/item_cold_test.csv"),
}

if scenario == "STANDARD":
    # Standard scenario: reload the ratings and split them per user here.
    ratings_small = pd.read_csv("subset_ratings.csv", usecols=_RATING_COLS)
    train_ratings, test_ratings = split_per_user(ratings_small)
else:
    # Any value other than "STANDARD" / "USER" falls through to the ITEM files.
    _train_csv, _test_csv = _COLD_SPLIT_FILES.get(scenario, _COLD_SPLIT_FILES["ITEM"])
    train_ratings = pd.read_csv(_train_csv, usecols=_RATING_COLS)
    test_ratings = pd.read_csv(_test_csv, usecols=_RATING_COLS)

In [14]:


# # ─────────────────────────────────────────────────────────────────────────────
# # 1) FILTER to movies for which we have tags
# # ─────────────────────────────────────────────────────────────────────────────
# valid_movies   = set(movie_norm.index)
# train_ratings  = train_ratings [train_ratings .movieId.isin(valid_movies)].reset_index(drop=True)
# test_ratings   = test_ratings  [test_ratings  .movieId.isin(valid_movies)].reset_index(drop=True)

# # ─────────────────────────────────────────────────────────────────────────────
# # 2) REBUILD user_profiles from train_ratings
# # ─────────────────────────────────────────────────────────────────────────────
# min_r, max_r = train_ratings.rating.min(), train_ratings.rating.max()
# user_profiles = {}
# for u, grp in train_ratings.groupby("userId"):
#     feats   = movie_norm.loc[grp.movieId].values
#     weights = ((grp.rating - min_r) / (max_r - min_r)).values[:, None]
#     vec     = (weights * feats).sum(axis=0)
#     if vec.sum() > 0:
#         vec /= np.linalg.norm(vec)
#     user_profiles[u] = vec

# # ─────────────────────────────────────────────────────────────────────────────
# # 3) REBUILD CF structures (pivot, sim_users, abs_sim_u)
# # ─────────────────────────────────────────────────────────────────────────────
# pivot      = train_ratings.pivot(index="userId", columns="movieId", values="rating").fillna(0)
# user_vecs  = pivot.values                           # U×M
# user_ids   = pivot.index.values                     # length U
# movie_ids  = pivot.columns.values.astype(int)       # length M

# sim_users  = cosine_similarity(user_vecs)            # U×U
# abs_sim_u  = np.sum(np.abs(sim_users), axis=1)      # length U

# # align content‐based matrix to the pivot’s movie order
# movie_norm_arr = movie_norm.loc[movie_ids].values   # M×T

# # ─────────────────────────────────────────────────────────────────────────────
# # 4) GENERATE Top-K for every user in test_ratings
# # ─────────────────────────────────────────────────────────────────────────────
# K = 10
# preds_ucf = {}

# for u in tqdm(test_ratings.userId.unique(), desc=f"Hybrid-uCF Top-{K} ({scenario})"):
#     df_rec = recommend_hybrid_fast_uCF(u, K=K, a=0.7)
#     preds_ucf[int(u)] = df_rec.index.tolist()  # even empty list if no recs


# # ─────────────────────────────────────────────────────────────────────────────
# # 5) DUMP to JSON
# # ─────────────────────────────────────────────────────────────────────────────
# outfn = Path("coldstart_pred") / f"hybrid_userCF_{scenario.lower()}_top{K}.json"
# outfn.parent.mkdir(exist_ok=True)
# with open(outfn, "w") as f:
#     json.dump(preds_ucf, f, indent=2)

# print(f"✅  Saved {len(preds_ucf)} users → {outfn}")


In [18]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# ───────────────────────────────────────────────────────────────────────
# ITEM-cold scenario
#
# Rebuilds every notebook-level structure that recommend_hybrid_item reads
# (pivot, sim_matrix, abs_sim_sum, train_movies, all_movies, movie_norm_full,
# pos_train_in_all, user_profiles) from the item-cold training split, then
# writes Top-10 predictions for every user in the item-cold test split.
# ───────────────────────────────────────────────────────────────────────
scenario = "ITEM"
if scenario == "ITEM":
    # 4a) load the pre-split cold-item CSVs
    train_ratings = pd.read_csv("evaluation/item_cold_train.csv", usecols=["userId","movieId","rating"])
    test_ratings  = pd.read_csv("evaluation/item_cold_test.csv",  usecols=["userId","movieId","rating"])

    # 4b) keep only movies that have a row in the normalised tag matrix
    valid = set(movie_norm.index)
    train_ratings = train_ratings[train_ratings.movieId.isin(valid)].reset_index(drop=True)
    test_ratings  = test_ratings [test_ratings.movieId.isin(valid)].reset_index(drop=True)

    # 4c) rebuild CF structures on train_ratings
    pivot = train_ratings.pivot(
        index="userId", columns="movieId", values="rating"
    ).fillna(0)
    # MinMaxScaler rescales each column (movie) to [0, 1] independently.
    scaled = MinMaxScaler().fit_transform(pivot)
    pivot  = pd.DataFrame(scaled, index=pivot.index, columns=pivot.columns)

    # item–item similarity among the train-set movies
    train_movies = pivot.columns.values.astype(int)   # M' movies
    item_vecs    = pivot.values.T                      # M'×U
    sim_matrix   = cosine_similarity(item_vecs)        # M'×M'
    abs_sim_sum  = np.sum(np.abs(sim_matrix), axis=1)  # length M'

    # full, sorted list of tagged movies and their tag rows in the same order
    all_movies      = np.array(sorted(movie_norm.index), dtype=int)  # M_full
    movie_norm_full = movie_norm.loc[all_movies].values             # M_full×T

    # Positions of the train movies inside all_movies. This must be refreshed
    # here: a value computed from an earlier pivot would make the recommender
    # ban the wrong set of movies.
    pos_train_in_all = np.searchsorted(all_movies, train_movies)

    # rebuild user_profiles from the item-cold training ratings
    min_r, max_r = train_ratings.rating.min(), train_ratings.rating.max()
    rating_span  = max_r - min_r
    user_profiles = {}
    for u, grp in train_ratings.groupby("userId"):
        feats = movie_norm.loc[grp.movieId].values
        if rating_span > 0:
            wts = ((grp.rating - min_r) / rating_span).values[:,None]
        else:
            # identical ratings everywhere: min-max scaling would be 0/0
            wts = np.ones((len(grp), 1))
        prof = (wts * feats).sum(0)
        # guard on the norm itself, which is the actual divisor
        prof_norm = np.linalg.norm(prof)
        if prof_norm > 0:
            prof = prof / prof_norm
        user_profiles[u] = prof

    # 4d) generate Top-K with the item-cold hybrid
    preds_item = {}
    for u in tqdm(test_ratings.userId.unique(), desc="Hybrid Top-10 (ITEM)"):
        df = recommend_hybrid_item(u, K=10, a=0.7)
        preds_item[int(u)] = df.index.tolist()

    # 4e) dump to JSON
    outfn = Path("coldstart_pred")/"hybrid_item_top10.json"
    outfn.parent.mkdir(exist_ok=True)
    with open(outfn,"w") as fp:
        json.dump(preds_item, fp, indent=2)
    print("✅ saved", len(preds_item), "users →", outfn)


Hybrid Top-10 (ITEM): 100%|██████████| 9787/9787 [05:17<00:00, 30.83it/s]

✅ saved 9787 users → coldstart_pred\hybrid_item_top10.json



