In [1]:
import numpy as np
import pandas as pd


In [None]:
# Tag-score matrix: the first CSV column becomes the row index, every other column is a tag.
movie_tag_matrix = pd.read_csv('subset_movie_tag_matrix.csv', index_col=0)
# Ratings and movie metadata are read with pandas' default settings.
ratings_small, movies_small = (
    pd.read_csv(csv_name) for csv_name in ('subset_ratings.csv', 'subset_movies.csv')
)


In [3]:
# Preview the first rows to sanity-check the layout (movieId index, one column per tag).
movie_tag_matrix.head()

Unnamed: 0_level_0,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.025,0.025,0.05775,0.09675,0.14675,0.217,0.067,0.26275,0.262,0.032,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
2,0.03975,0.04375,0.03775,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,0.0205,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
3,0.0435,0.05475,0.028,0.077,0.054,0.0685,0.056,0.185,0.04925,0.02675,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185
4,0.03725,0.0395,0.03675,0.031,0.06825,0.0405,0.02325,0.087,0.05125,0.03025,...,0.0575,0.03375,0.02275,0.03975,0.18525,0.05925,0.015,0.01525,0.0645,0.013
5,0.042,0.05275,0.05925,0.03675,0.07525,0.12525,0.0285,0.085,0.0295,0.02875,...,0.0425,0.02825,0.0215,0.026,0.14275,0.02075,0.0165,0.01675,0.1075,0.01825


In [4]:
# Cast the movieId index to int so later membership tests and .loc lookups use integer ids.
movie_tag_matrix.index = movie_tag_matrix.index.astype(int)


In [5]:
# Sanity check: overall dimensions, then the first ten tag scores of the first movie.
print("Loaded tag matrix shape:", movie_tag_matrix.shape)
print("Example row:\n", movie_tag_matrix.iloc[0, :10])

Loaded tag matrix shape: (7008, 1128)
Example row:
 007             0.02500
007 (series)    0.02500
18th century    0.05775
1920s           0.09675
1930s           0.14675
1950s           0.21700
1960s           0.06700
1970s           0.26275
1980s           0.26200
19th century    0.03200
Name: 1, dtype: float64


In [6]:
# Coverage check: how many movies have tag vectors, how many have metadata,
# and how many appear in both.
print("Movies in tag matrix:", movie_tag_matrix.shape[0])
print("Movies in metadata :", movies_small['movieId'].unique().size)
print("Intersection size   :", len(set(movie_tag_matrix.index).intersection(movies_small['movieId'])))


Movies in tag matrix: 7008
Movies in metadata : 7049
Intersection size   : 7008


In [7]:
# L2-normalise every movie's tag vector so dot products between rows are cosine similarities.
# Rows whose norm is exactly 0 are divided by 1 instead, which leaves them all-zero.
row_norms = np.linalg.norm(movie_tag_matrix.to_numpy(), axis=1)
movie_norm = movie_tag_matrix.div(np.where(row_norms == 0, 1.0, row_norms), axis=0)

In [8]:
# Split ratings into per-user train/test sets (user profiles are built further down in this cell)
from sklearn.model_selection import train_test_split

def split_per_user(df, test_fraction=0.2, seed=7):
    """Split a ratings frame into train/test parts independently for each user.

    Every user with more than one row has their rows split by
    ``train_test_split`` using ``test_fraction`` and ``seed``. Users with a
    single row contribute that row to the training part only.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``userId`` column.
    test_fraction : float
        Passed to ``train_test_split`` as ``test_size``.
    seed : int
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Either part is an empty frame with ``df``'s columns when no rows
        fall into it (instead of ``pd.concat`` raising on an empty list).
    """
    train_parts, test_parts = [], []
    for _, user_rows in df.groupby('userId'):
        if len(user_rows) > 1:
            train_rows, test_rows = train_test_split(
                user_rows, test_size=test_fraction, random_state=seed
            )
            test_parts.append(test_rows)
        else:
            # A single rating cannot be split; keep it for training.
            train_rows = user_rows
        train_parts.append(train_rows)

    # pd.concat([]) raises ValueError, so fall back to an empty frame that
    # keeps the input's columns and dtypes.
    empty = df.iloc[0:0]
    train = pd.concat(train_parts) if train_parts else empty.copy()
    test = pd.concat(test_parts) if test_parts else empty.copy()
    return train, test

# Only movies that have a normalised tag vector can be scored by the content model.
valid_movie_ids = set(movie_norm.index)

# Split per user, then drop ratings of movies without tag vectors from both parts.
train_ratings, test_ratings = (
    part[part.movieId.isin(valid_movie_ids)].reset_index(drop=True)
    for part in split_per_user(ratings_small, test_fraction=0.2)
)

# Observed rating bounds in the training part.
min_rating, max_rating = train_ratings.rating.agg(['min', 'max'])

# Build one content profile per user: the rating-weighted sum of the
# normalised tag vectors of the movies they rated, rescaled to unit length.
rating_span = max_rating - min_rating
user_profiles = {}
for u, grp in train_ratings.groupby('userId'):
    valid_rows = grp[grp.movieId.isin(movie_norm.index)]
    if valid_rows.empty:
        continue
    feats = movie_norm.loc[valid_rows.movieId].values
    if rating_span > 0:
        # Min-max scale ratings so the lowest observed rating has weight 0
        # and the highest has weight 1.
        weights = ((valid_rows.rating - min_rating) / rating_span).values[:, None]
    else:
        # All training ratings are identical: 0/0 would give NaN weights,
        # so weight every rated movie equally instead.
        weights = np.ones((len(valid_rows), 1))
    vec = (weights * feats).sum(axis=0)
    # Normalise only when the vector is non-zero (checked via its norm
    # rather than its element sum).
    norm = np.linalg.norm(vec)
    if norm > 0:
        vec = vec / norm
    user_profiles[u] = vec

In [9]:
from sklearn.preprocessing import MinMaxScaler

# User x movie rating table built from the training split; missing ratings become 0.
pivot = train_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# MinMaxScaler works per column, so each movie's ratings are scaled to [0, 1]
# across users (not each user's row). Row and column labels are cast to int.
collab_matrix = pd.DataFrame(
    MinMaxScaler().fit_transform(pivot),
    index=pivot.index.astype(int),
    columns=pivot.columns.astype(int),
)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
# Transpose the user x movie pivot so each row is one movie's vector of user ratings.
item_vectors = pivot.T
import numpy as np

def cf_predict(user_id, movie_id, k=None):
    """
    Item-item collaborative-filtering estimate of user_id's rating for movie_id.

    The target movie is compared (cosine similarity over the rows of
    ``item_vectors``) with every movie the user has a rating > 0 for in
    ``pivot``; the prediction is the similarity-weighted average of those
    ratings. When ``k`` is given and smaller than the number of rated
    movies, only the ``k`` most similar ones are used.

    Returns 0.0 when the user or the movie is unknown, when the user has no
    positive ratings, or when the similarities sum to zero.
    """
    known_user = user_id in pivot.index
    known_movie = movie_id in item_vectors.index
    if not (known_user and known_movie):
        return 0.0

    # Ratings stored as 0 in the pivot mean "not rated".
    user_row = pivot.loc[user_id]
    rated = user_row[user_row > 0]
    if rated.empty:
        return 0.0

    # Similarity between the target movie and each movie the user rated.
    target_vec = item_vectors.loc[movie_id].values.reshape(1, -1)
    neighbor_vecs = item_vectors.loc[rated.index].values
    sims = cosine_similarity(target_vec, neighbor_vecs).flatten()
    ratings = rated.values

    # Optionally restrict to the k most similar neighbours.
    if k is not None and k < len(sims):
        keep = np.argsort(sims)[-k:]
        sims, ratings = sims[keep], ratings[keep]

    if sims.sum() == 0:
        return 0.0
    return np.dot(sims, ratings) / np.sum(np.abs(sims))


In [11]:
def hybrid_score(user_id, movie_id, a=0.7):
    """Return hybrid score in [0,1] for given (u,m).

    Blends a collaborative-filtering (CF) score with a content-based score:

    * CF: ``cf_predict(user_id, movie_id, k=50)`` returns a value on the raw
      rating scale, so it is min-max rescaled to [0, 1] with the training
      bounds ``min_rating`` / ``max_rating`` before blending. A prediction
      of 0 from ``cf_predict`` means "no CF signal".
    * Content: dot product between the user's profile vector and the
      movie's normalised tag vector.

    Parameters
    ----------
    user_id, movie_id : identifiers looked up in ``collab_matrix``,
        ``user_profiles`` and ``movie_norm``.
    a : float
        Weight of the CF score; the content score gets ``1 - a``.

    Returns
    -------
    float
        ``a * cf + (1 - a) * content`` when both signals exist, the single
        available signal when only one exists, and 0.0 when neither does.
    """
    # Collaborative part (only for users/movies present in the CF matrix).
    collab, has_collab = 0.0, False
    if user_id in collab_matrix.index and movie_id in collab_matrix.columns:
        raw = cf_predict(user_id, movie_id, k=50)
        if raw > 0:
            has_collab = True
            span = max_rating - min_rating
            # With a zero rating range there is nothing to rescale against.
            scaled = (raw - min_rating) / span if span > 0 else 1.0
            collab = float(min(1.0, max(0.0, scaled)))

    # Content-based part: needs both a user profile and a movie tag vector.
    content, has_content = 0.0, False
    prof = user_profiles.get(user_id)
    if prof is not None and movie_id in movie_norm.index:
        has_content = True
        content = float(prof.dot(movie_norm.loc[movie_id].values))

    # Fallback rules: never blend with a signal that does not exist.
    if not has_collab and not has_content:
        return 0.0
    if not has_collab:
        return content
    if not has_content:
        return collab
    return a * collab + (1 - a) * content

In [12]:
# def recommend_hybrid(user_id, top_n=10, a=0.7):
#     """Return a DataFrame of top-N movie recommendations."""
#     scores = {
#         m: hybrid_score(user_id, m, a)
#         for m in movie_norm.index
#     }
#     # Remove already‑watched movies
#     watched = set(train_ratings[train_ratings.userId == user_id].movieId)
#     ranked  = sorted(
#         ((m, s) for m, s in scores.items() if m not in watched),
#         key=lambda x: x[1],
#         reverse=True
#     )[:top_n]
#     rec_ids = [m for m, _ in ranked]

#     movie_info = movies_small.set_index('movieId')
#     valid_ids = [m for m in rec_ids if m in movie_info.index]

#     return movie_info.loc[valid_ids][['title', 'genres']].assign(
#         score=[s for m, s in ranked if m in movie_info.index]
#     )
#     # return movies_small.set_index('movieId').loc[rec_ids][['title', 'genres']]\
#     #        .assign(score=[s for _, s in ranked])

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# User x movie rating table from the training split (0 = not rated).
pivot = train_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# (a) item vectors: one row per movie, one column per user
item_vecs     = pivot.T.values                   # shape (M, U)
movie_ids     = pivot.columns.values             # shape (M,)
user_ids      = pivot.index.values               # shape (U,)

# (b) item–item similarity matrix (cosine)
sim_matrix    = cosine_similarity(item_vecs)     # shape (M, M)

# (c) denominator for weighted average (sum of abs similarities)
abs_sim_sum   = np.sum(np.abs(sim_matrix), axis=1)  # shape (M,)

# (d) content matrix aligned row-for-row with movie_ids.
# movie_norm holds every movie in the tag matrix, in tag-matrix order, while
# the CF arrays only cover movies present in the training pivot. Reindexing
# makes row i of movie_norm_arr describe movie_ids[i]; a movie without a tag
# vector would get an all-zero row.
movie_norm_arr = movie_norm.reindex(movie_ids).fillna(0.0).values   # shape (M, T)


In [19]:
def recommend_hybrid_fast(user_id, K=10, a=0.7):
    """Return up to K hybrid recommendations for ``user_id`` as a DataFrame.

    Scores every movie in ``movie_ids`` at once:

    * CF score: ``sim_matrix @ r_u`` divided by ``abs_sim_sum``, where
      ``r_u`` is the user's row of ``pivot``.
    * Content score: ``movie_norm_arr @ profile`` (a zero profile is used
      when the user has none in ``user_profiles``).
    * Hybrid score: ``a * cf + (1 - a) * content``.

    Movies the user rated in ``train_ratings`` are excluded outright, so
    fewer than K rows are returned when fewer than K unrated movies exist.

    Parameters
    ----------
    user_id : identifier looked up in ``pivot.index``.
    K : int
        Maximum number of recommendations.
    a : float
        Weight of the CF score in the blend.

    Returns
    -------
    pandas.DataFrame
        Indexed by movieId with columns ``title``, ``genres`` and ``score``,
        sorted by descending score. Empty (same columns) for unknown users,
        non-positive K, or when nothing is left to recommend.

    Raises
    ------
    KeyError
        If a selected movie id is missing from ``movies_small``.
    """
    empty = pd.DataFrame(columns=['title', 'genres', 'score'])
    if user_id not in pivot.index or K <= 0:
        return empty

    # 1) the user's rating vector over all movies
    r_u = pivot.loc[user_id].values

    # 2) item-item CF score for every movie; rows with a zero denominator stay 0
    num = sim_matrix.dot(r_u)
    cf = np.divide(num, abs_sim_sum,
                   out=np.zeros_like(num, dtype=float), where=abs_sim_sum > 0)

    # 3) content-based score for every movie
    p_u = user_profiles.get(user_id)
    if p_u is None:
        p_u = np.zeros(movie_norm_arr.shape[1])
    cbf = movie_norm_arr.dot(p_u)

    # 4) blend
    hyb = a * cf + (1 - a) * cbf

    # 5) exclude already-rated movies. Setting them to -inf (rather than
    #    multiplying by 0) keeps them from tying with zero-score candidates.
    watched = train_ratings.loc[train_ratings.userId == user_id, 'movieId'].unique()
    hyb[np.isin(movie_ids, watched)] = -np.inf

    # 6) top-k selection; k is capped so argpartition never gets an
    #    out-of-range kth and excluded movies are never selected.
    n_candidates = int(np.isfinite(hyb).sum())
    k = min(int(K), n_candidates)
    if k == 0:
        return empty
    if k < hyb.size:
        idx = np.argpartition(-hyb, k - 1)[:k]
    else:
        idx = np.arange(hyb.size)
    idx = idx[np.argsort(-hyb[idx], kind='stable')]
    top_ids, top_sc = movie_ids[idx], hyb[idx]

    # 7) attach titles/genres
    df = movies_small.set_index('movieId').loc[top_ids, ['title', 'genres']].copy()
    df['score'] = top_sc
    return df


In [20]:
# Users the content model knows (they have a profile) and users the CF model knows
# (they have a row in the collaborative matrix).
cbf_users = set(user_profiles)
cf_users  = set(collab_matrix.index)

# The hybrid is evaluated over the union of both groups.
valid_users = list(cbf_users | cf_users)
print("Total users in hybrid:", len(valid_users))

Total users in hybrid: 10000


In [21]:
import random

N = 1000  # number of users to process
SAMPLE_SEED = 7  # fixed seed so the same user subset is drawn on every run
# Sample from a sorted list with a dedicated seeded generator: the subset is
# then reproducible and independent of the global `random` state.
subset_user_ids = random.Random(SAMPLE_SEED).sample(
    sorted(valid_users), min(N, len(valid_users))
)


In [22]:
import json
from pathlib import Path
from tqdm import tqdm

K = 10
preds = {}
skipped_users = []  # (user id, error text) for users whose lookup raised KeyError

for u in tqdm(subset_user_ids, desc=f"Generating Top-{K} Hybrid Recs"):
    try:
        recs = recommend_hybrid_fast(u, K=K).index.tolist()
    except KeyError as err:
        # Best effort: keep going, but record the user instead of dropping them silently.
        skipped_users.append((int(u), str(err)))
        continue
    # Plain ints keep the payload JSON-serialisable.
    preds[int(u)] = [int(m) for m in recs]

# Save; the file name follows K so it always matches the list length requested.
out_dir = Path("predictions")
out_dir.mkdir(exist_ok=True)
with open(out_dir / f"hybrid_top{K}_subset.json", "w") as f:
    json.dump(preds, f)

print(f"Generated recommendations for {len(preds)} users out of {len(subset_user_ids)} sampled.")
if skipped_users:
    print(f"Skipped {len(skipped_users)} users because of KeyError; first few: {skipped_users[:5]}")


Generating Top-10 Hybrid Recs: 100%|██████████| 1000/1000 [00:17<00:00, 55.75it/s]

Generated recommendations for 1000 users out of 1000 sampled.



