In [12]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae
from collections import defaultdict
from tqdm import tqdm


In [42]:
ratings = pd.read_csv("data/ratings.csv")
movies  = pd.read_csv("data/movies.csv")
tags    = pd.read_csv("data/tags.csv")
links   = pd.read_csv("data/links.csv")
movie_genre = pd.read_csv("data/movie_genre.csv")
genre = pd.read_csv("data/genre.csv")
print("ratings")
print(ratings.head())
print("movies")
print(movies.head())
print("tags")
print(tags.head())
print("links")
print(links.head())
print("movie_genre")
print(movie_genre.head())
print("genre")
print(genre.head())



ratings
   userId  movieId  rating        date      time
0       1        1     4.0  2000-07-30  18:45:03
1       1        3     4.0  2000-07-30  18:20:47
2       1        6     4.0  2000-07-30  18:37:04
3       1       47     5.0  2000-07-30  19:03:35
4       1       50     5.0  2000-07-30  18:48:51
movies
   movieId                              title 
0        1                    Toy Story (1995)
1        2                      Jumanji (1995)
2        3             Grumpier Old Men (1995)
3        4            Waiting to Exhale (1995)
4        5  Father of the Bride Part II (1995)
tags
   userId  movieId              tag        date      time
0       2    60756            funny  2015-10-24  19:29:54
1       2    60756  Highly quotable  2015-10-24  19:29:56
2       2    60756     will ferrell  2015-10-24  19:29:52
3       2    89774     Boxing story  2015-10-24  19:33:27
4       2    89774              MMA  2015-10-24  19:33:20
links
   movieId  imdbId  tmdbId
0        1  114709     

In [22]:
ratings = ratings.sort_values(['userId', 'date', 'time'])

N = 4  # number of last ratings per user for test

# test = last N per user
test  = ratings.groupby('userId', group_keys=False).tail(N)

# train = all other ratings
train = ratings.drop(test.index)

In [23]:

reader = Reader(rating_scale=(0.5, 5.0))
train_data = Dataset.load_from_df(train[['userId','movieId','rating']], reader)
trainset = train_data.build_full_trainset()

algo = SVD(n_factors=50, reg_all=0.02, lr_all=0.005, random_state=42)
algo.fit(trainset)
pred = algo.predict(uid=1, iid=2)  # userId=1, movieId=2
print(pred.est)  # predicted rating


4.100063219341552


In [24]:

# convert test DataFrame to surprise format
testset = list(test[['userId','movieId','rating']].itertuples(index=False, name=None))
predictions = algo.test(testset)

rmse(predictions)
mae(predictions)


RMSE: 0.9302
MAE:  0.7047


0.704687826849841

In [26]:

# create a binary genre matrix: movieId x genre
genre_list = genre['genre'].tolist()
movie_vec_df = movie_genre.pivot(index='movieId', columns='genre', values='genreId').notna().astype(int)
movie_vec_df = movie_vec_df.reindex(columns=genre_list, fill_value=0)  # ensure all genres
movie_vec = {mid: movie_vec_df.loc[mid].to_numpy() for mid in movie_vec_df.index}


In [27]:
def content_score(uid, iid):
    # liked movies: rating >= 4.0
    liked = train[(train.userId==uid) & (train.rating>=4.0)]['movieId'].tolist()
    if not liked or iid not in movie_vec:
        return 0.0
    user_vec = np.mean([movie_vec[m] for m in liked if m in movie_vec], axis=0)
    return float(np.dot(user_vec, movie_vec.get(iid, np.zeros_like(user_vec))))


In [28]:
def hybrid_score(uid, iid, alpha=0.7):
    svd_score = algo.predict(uid, iid).est
    c_score = content_score(uid, iid)
    return alpha * svd_score + (1-alpha) * c_score


In [35]:
# REPLACE your prior block with this one
import numpy as np
import math
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD

# --- assumptions (same as before) ---
# train, test: DataFrames with columns userId, movieId, rating
# movie_ids: np.array of all movieIds in consistent order with genre_matrix
# genre_matrix: np.array shape (num_movies, num_genres) matching movie_ids order
# user_genre_vec: dict {userId: user_genre_vector} (can be empty)
# train_seen: dict {userId: set(movieId)} already prepared
# alpha: hybrid weight (e.g. 0.7)
# K: Top-K (e.g. 10)

# ---------- 1) retrain SVD with best params ----------
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

# Best params you found
best_params = {'n_factors': 20, 'lr_all': 0.005, 'reg_all': 0.05}
algo = SVD(n_factors=best_params['n_factors'],
           lr_all=best_params['lr_all'],
           reg_all=best_params['reg_all'],
           random_state=42)
algo.fit(trainset)

# ---------- 2) prepare mappings and SVD internals for vectorized scoring ----------
trainset = algo.trainset  # surprise internal trainset reference
global_mean = trainset.global_mean

# map raw item id -> inner id (only items in trainset)
raw_to_inner = {}
for inner_i in range(trainset.n_items):
    raw_i = trainset.to_raw_iid(inner_i)
    try:
        raw_to_inner[int(raw_i)] = inner_i
    except:
        raw_to_inner[raw_i] = inner_i

qi = algo.qi            # item-factor matrix shape (n_items, n_factors)
bi = algo.bi            # item biases shape (n_items,)
# algo.pu, algo.bu are user factors / biases for users in trainset

# make movie_index_map for genre_matrix lookups (movie_ids -> row index)
movie_index_map = {mid: i for i, mid in enumerate(movie_ids)}

# default user genre vector
default_user_vec = np.mean(list(user_genre_vec.values()), axis=0) if user_genre_vec else np.zeros(genre_matrix.shape[1])

# ---------- helper: compute vectorized SVD scores for a user (only for items in trainset) ----------
def svd_scores_for_user(raw_uid):
    """
    Returns a dict: raw_item_id -> svd_score for items that exist in trainset.
    If user not in trainset, returns item scores = global_mean + bi (no user factors).
    """
    scores = {}
    try:
        inner_uid = trainset.to_inner_uid(str(raw_uid))  # convert raw -> inner uid
        pu = algo.pu[inner_uid]      # user factors
        bu = algo.bu[inner_uid]      # user bias
        # vectorized score for all inner items: global + bu + bi + qi.dot(pu)
        vec_scores = global_mean + bu + bi + qi.dot(pu)   # shape (n_items,)
    except ValueError:
        # cold user (no factors in trainset) -> use global_mean + bi (item bias only)
        vec_scores = global_mean + bi

    # map inner indices back to raw ids and return dict for quick lookup
    for inner_i in range(trainset.n_items):
        raw_i = trainset.to_raw_iid(inner_i)
        try:
            raw_id = int(raw_i)
        except:
            raw_id = raw_i
        scores[raw_id] = float(vec_scores[inner_i])
    return scores

# ---------- content score function (vectorized lookup) ----------
def content_score(uid, iid):
    u_vec = user_genre_vec.get(uid, default_user_vec)
    idx = movie_index_map.get(iid, None)
    return float(genre_matrix[idx] @ u_vec) if idx is not None else 0.0

# ---------- Top-K generator using vectorized SVD scores per user ----------
def get_topk_vectorized(users, mode='svd', K=10):
    """
    mode: 'svd' or 'hybrid'
    returns: dict userId -> list of top-K raw movieIds
    """
    recs = {}
    for uid in tqdm(users, desc=f"Generating Top-K Recs ({mode})"):
        seen = train_seen.get(uid, set())
        # candidate movie ids (raw ids)
        candidates = [mid for mid in movie_ids if mid not in seen]
        if not candidates:
            recs[uid] = []
            continue

        # get vectorized SVD scores dict for this user (fast)
        svd_dict = svd_scores_for_user(uid)

        # build score arrays
        svd_scores = np.array([svd_dict.get(int(mid), svd_dict.get(mid, global_mean)) for mid in candidates], dtype=float)
        if mode == 'svd':
            combined_scores = svd_scores
        else:
            # compute content scores vectorized
            u_vec = user_genre_vec.get(uid, default_user_vec)
            # map candidates to genre_matrix rows
            cand_idxs = [movie_index_map.get(mid, None) for mid in candidates]
            # handle missing indices
            valid = [i for i,ci in enumerate(cand_idxs) if ci is not None]
            c_scores = np.zeros(len(candidates), dtype=float)
            if valid:
                rows = genre_matrix[[cand_idxs[i] for i in valid]]
                c_scores_valid = rows @ u_vec
                for vi, val in zip(valid, c_scores_valid):
                    c_scores[vi] = float(val)
            combined_scores = alpha * svd_scores + (1 - alpha) * c_scores

        top_idx = np.argsort(combined_scores)[-K:][::-1]
        recs[uid] = [candidates[i] for i in top_idx]
    return recs

# ---------- metrics (unchanged) ----------
def precision_recall_at_k(recs, test_gt, K=10):
    precisions, recalls = [], []
    for uid, pred in recs.items():
        gt = test_gt.get(uid, set())
        if not gt: continue
        hits = sum(1 for i in pred if i in gt)
        precisions.append(hits/K)
        recalls.append(hits/min(len(gt), K))
    return np.mean(precisions) if precisions else 0.0, np.mean(recalls) if recalls else 0.0

def apk(actual_set, pred_list, K=10):
    if not actual_set: return 0.0
    score, hits = 0.0, 0
    for i, p in enumerate(pred_list[:K], start=1):
        if p in actual_set:
            hits += 1
            score += hits/i
    return score / min(len(actual_set), K)

def map_at_k(recs, test_gt, K=10):
    return np.mean([apk(test_gt.get(uid, set()), pred, K) for uid, pred in recs.items()]) if recs else 0.0

def ndcg_at_k(recs, test_gt, K=10):
    ndcgs = []
    for uid, pred in recs.items():
        gt = test_gt.get(uid, set())
        if not gt: continue
        dcg = sum((1 if p in gt else 0)/math.log2(i+1) for i,p in enumerate(pred[:K], start=1))
        idcg = sum(1/math.log2(i+1) for i in range(1, min(len(gt), K)+1))
        ndcgs.append(dcg/idcg if idcg>0 else 0.0)
    return np.mean(ndcgs) if ndcgs else 0.0

def catalog_coverage(recs, all_items): return len(set(x for r in recs.values() for x in r)) / len(all_items)

def ild(recs, genre_matrix, movie_index_map):
    vals=[]
    for rec in recs.values():
        if len(rec)<2: continue
        idxs = [movie_index_map[i] for i in rec if i in movie_index_map]
        if len(idxs) < 2: continue
        sims = cosine_similarity(genre_matrix[idxs])
        pairs = [1-sims[i,j] for i in range(len(sims)) for j in range(i+1,len(sims))]
        if pairs: vals.append(np.mean(pairs))
    return np.mean(vals) if vals else 0.0

def novelty(recs, train):
    pop = train['movieId'].value_counts().to_dict()
    total = len(train)
    scores=[]
    for rec in recs.values():
        if not rec: continue
        scores.append(np.mean([-math.log(max(pop.get(mid,1e-9)/total,1e-9)) for mid in rec]))
    return np.mean(scores) if scores else 0.0

# ---------- prepare test_gt and users ----------
test_gt = test.groupby('userId')['movieId'].apply(set).to_dict()
users = list(test_gt.keys())

# ---------- run evaluation ----------
svd_recs = get_topk_vectorized(users, mode='svd', K=K)
hybrid_recs = get_topk_vectorized(users, mode='hybrid', K=K)

for name, recs in [("SVD", svd_recs), ("Hybrid", hybrid_recs)]:
    p,r = precision_recall_at_k(recs, test_gt, K)
    ndcg = ndcg_at_k(recs, test_gt, K)
    mAP = map_at_k(recs, test_gt, K)
    cov = catalog_coverage(recs, movie_ids)
    div = ild(recs, genre_matrix, movie_index_map)
    nov = novelty(recs, train)
    print(f"{name} — Prec@{K}: {p:.6f}, Rec@{K}: {r:.6f}, NDCG@{K}: {ndcg:.6f}, MAP@{K}: {mAP:.6f}, Coverage: {cov:.3f}, ILD: {div:.3f}, Novelty: {nov:.3f}")


Generating Top-K Recs (svd): 100%|██████████| 610/610 [00:24<00:00, 24.68it/s]
Generating Top-K Recs (hybrid): 100%|██████████| 610/610 [00:36<00:00, 16.63it/s]


SVD — Prec@10: 0.007213, Rec@10: 0.018033, NDCG@10: 0.016093, MAP@10: 0.008246, Coverage: 0.005, ILD: 0.783, Novelty: 7.320
Hybrid — Prec@10: 0.007213, Rec@10: 0.018033, NDCG@10: 0.013244, MAP@10: 0.006002, Coverage: 0.013, ILD: 0.452, Novelty: 7.335


In [32]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV

reader = Reader(rating_scale=(0.5,5.0))
data = Dataset.load_from_df(train[['userId','movieId','rating']], reader)

param_grid = {
  'n_factors': [20,50,100],
  'lr_all': [0.002, 0.005],
  'reg_all':[0.02, 0.05, 0.1]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=3, n_jobs=4)
gs.fit(data)
print(gs.best_params['rmse'])
# then train final algo with gs.best_params['rmse']


{'n_factors': 20, 'lr_all': 0.005, 'reg_all': 0.05}


In [34]:
import numpy as np
trainset = algo.trainset
global_mean = trainset.global_mean

def vectorized_svd_scores_for_user(raw_uid):
    try:
        iu = trainset.to_inner_uid(str(raw_uid))
    except ValueError:
        return None  # cold user
    pu = algo.pu[iu]          # user factors
    bu = algo.bu[iu]          # user bias
    # item factors/ biases for all items in trainset
    qi = algo.qi              # shape (n_items, n_factors)
    bi = algo.bi              # shape (n_items,)
    scores = global_mean + bu + bi + qi.dot(pu)
    # map inner item ids -> raw ids
    raw_items = np.array([trainset.to_raw_iid(i) for i in range(trainset.n_items)], dtype=int)
    return raw_items, scores


In [36]:
import numpy as np

# Range of alpha values to test (SVD weight)
alpha_values = np.linspace(0, 1, 11)  # 0.0, 0.1, ..., 1.0
results = []

print("Grid search over alpha for hybrid weighting:")

for a in alpha_values:
    alpha = a  # update global alpha used in hybrid_score
    hybrid_recs = get_topk(users, hybrid_score, K)
    p,r = precision_recall_at_k(hybrid_recs, test_gt, K)
    ndcg = ndcg_at_k(hybrid_recs, test_gt, K)
    results.append((alpha, p, r, ndcg))
    print(f"alpha={alpha:.1f} — Prec@{K}: {p:.6f}, Rec@{K}: {r:.6f}, NDCG@{K}: {ndcg:.6f}")

# Find best alpha based on Precision@K
best_alpha, best_prec, best_rec, best_ndcg = max(results, key=lambda x: x[1])
print(f"\nBest alpha: {best_alpha:.1f} — Prec@{K}: {best_prec:.6f}, Rec@{K}: {best_rec:.6f}, NDCG@{K}: {best_ndcg:.6f}")


Grid search over alpha for hybrid weighting:


Generating Top-K Recs: 100%|██████████| 610/610 [02:59<00:00,  3.40it/s]


alpha=0.0 — Prec@10: 0.001639, Rec@10: 0.004098, NDCG@10: 0.003428


Generating Top-K Recs: 100%|██████████| 610/610 [03:06<00:00,  3.27it/s]


alpha=0.1 — Prec@10: 0.001639, Rec@10: 0.004098, NDCG@10: 0.003803


Generating Top-K Recs: 100%|██████████| 610/610 [02:54<00:00,  3.50it/s]


alpha=0.2 — Prec@10: 0.002295, Rec@10: 0.005738, NDCG@10: 0.004947


Generating Top-K Recs: 100%|██████████| 610/610 [02:58<00:00,  3.42it/s]


alpha=0.3 — Prec@10: 0.003607, Rec@10: 0.009016, NDCG@10: 0.008153


Generating Top-K Recs: 100%|██████████| 610/610 [03:10<00:00,  3.20it/s]


alpha=0.4 — Prec@10: 0.005082, Rec@10: 0.012705, NDCG@10: 0.011213


Generating Top-K Recs: 100%|██████████| 610/610 [03:13<00:00,  3.15it/s]


alpha=0.5 — Prec@10: 0.006066, Rec@10: 0.015164, NDCG@10: 0.013294


Generating Top-K Recs: 100%|██████████| 610/610 [03:15<00:00,  3.12it/s]


alpha=0.6 — Prec@10: 0.007213, Rec@10: 0.018033, NDCG@10: 0.013719


Generating Top-K Recs: 100%|██████████| 610/610 [13:38:25<00:00, 80.50s/it]        


alpha=0.7 — Prec@10: 0.007213, Rec@10: 0.018033, NDCG@10: 0.013076


Generating Top-K Recs: 100%|██████████| 610/610 [02:17<00:00,  4.45it/s]


alpha=0.8 — Prec@10: 0.007213, Rec@10: 0.018033, NDCG@10: 0.013646


Generating Top-K Recs: 100%|██████████| 610/610 [02:44<00:00,  3.70it/s]


alpha=0.9 — Prec@10: 0.006885, Rec@10: 0.017213, NDCG@10: 0.013115


Generating Top-K Recs: 100%|██████████| 610/610 [02:43<00:00,  3.74it/s]

alpha=1.0 — Prec@10: 0.006885, Rec@10: 0.017213, NDCG@10: 0.013412

Best alpha: 0.6 — Prec@10: 0.007213, Rec@10: 0.018033, NDCG@10: 0.013719



