In [1]:
import os, numpy as np, pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm, trange        #  ← tqdm is now in play!

# --------------------------------------------------------------------
# 1. Load data --------------------------------------------------------
# --------------------------------------------------------------------
# All input CSVs sit in one folder. They are read in a fixed order and
# unpacked straight into the four DataFrames defined by this cell.
clean_folder = "clean"
books_df, book_tags_df, tags_df, ratings_df = (
    pd.read_csv(os.path.join(clean_folder, csv_name))
    for csv_name in ("books.csv", "book_tags.csv", "tags.csv", "ratings.csv")
)



In [2]:
# --------------------------------------------------------------------
# 2. Build the user-item matrix (CSR) --------------------------------
# --------------------------------------------------------------------
# Ids from ratings.csv are shifted down by one to serve as 0-based matrix
# coordinates (i.e. the ids are treated as 1-based).
row = ratings_df["user_id"].values - 1
col = ratings_df["book_id"].values - 1
data = ratings_df["rating"].values

# The matrix is sized to the largest user / book coordinate that occurs.
n_users = row.max() + 1
n_items = col.max() + 1

# Assemble in coordinate form, then convert to CSR (rows = users,
# columns = books, values = ratings).
R_csr = sparse.coo_matrix(
    (data, (row, col)),
    shape=(n_users, n_items),
).tocsr()



In [None]:
# --------------------------------------------------------------------
# 3. Mean-centre items & compute similarity --------------------------
# --------------------------------------------------------------------
# Per-item mean over the *stored, non-zero* ratings only.  Items without any
# rating get a mean of 0.0 instead of the NaN a bare 0/0 division would yield.
rating_sums = np.asarray(R_csr.sum(axis=0), dtype=float).ravel()
rating_counts = np.asarray((R_csr != 0).sum(axis=0)).ravel()
item_means = np.divide(
    rating_sums,
    rating_counts,
    out=np.zeros_like(rating_sums),
    where=rating_counts > 0,
).reshape(1, -1)          # kept 2-D (1, n_items) so `item_means[0, j]` works

# Subtract each item's mean from its stored ratings only.  Writing
# `R_csr - item_means` would broadcast against a dense row: it materialises a
# full n_users x n_items matrix and turns every *unrated* cell into `-mean`.
R_centered = R_csr.astype(float)      # astype() returns a copy; R_csr is untouched
R_centered.eliminate_zeros()          # ignore explicit zeros, as the counts above do
R_centered.data -= item_means[0, R_centered.indices]
R_centered.eliminate_zeros()          # drop entries that became exactly 0 after centring

In [None]:
# Number of neighbours kept per item when the similarity matrix is pruned.
k = 40

# Item-item cosine similarity: items are the columns of R_centered, hence the
# transpose.  Force CSR because the pruning step walks `sim.indptr` row by row.
sim = cosine_similarity(R_centered.T, dense_output=False).tocsr()

# Remove self-similarity.  The diagonal would otherwise always occupy one of
# the k neighbour slots, although an item is never used as its own neighbour
# at prediction time.
sim = (sim - sparse.diags(sim.diagonal())).tocsr()
sim.eliminate_zeros()


In [None]:
# Keep only the k largest similarities in each row of `sim`; a progress bar
# tracks the pass over the rows.
for item in trange(sim.shape[0], desc="Pruning neighbours"):
    lo, hi = sim.indptr[item], sim.indptr[item + 1]
    if lo == hi:
        continue                         # row stores nothing: no neighbours

    row_sims = sim.data[lo:hi]           # a view, so writes land in sim.data
    keep = row_sims.argsort()[:-k-1:-1]  # positions of the k largest values

    # Flag every position of the row, then un-flag the ones to keep.
    drop = np.ones(hi - lo, dtype=bool)
    drop[keep] = False
    row_sims[drop] = 0

# Physically remove the entries that were zeroed above.
sim.eliminate_zeros()

In [None]:
# --------------------------------------------------------------------
# 4. Predict & recommend — progress bar optional ---------------------
# --------------------------------------------------------------------
def _predict_user_item(uid, iid, R, S, means):
    """Return predicted rating for user uid (0-idx) & item iid (0-idx)."""
    user_row   = R.getrow(uid)
    rated_iids = user_row.indices
    if iid in rated_iids:
        return user_row[0, iid]

    # neighbours user has rated
    neigh_mask = np.in1d(rated_iids, S[iid].indices)
    neigh_iids = rated_iids[neigh_mask]
    if len(neigh_iids) == 0:
        return means[0, iid]

    sims    = S[iid, neigh_iids].A1
    ratings = user_row[0, neigh_iids].A1 - means[0, neigh_iids]
    return means[0, iid] + (sims @ ratings) / (np.abs(sims).sum() + 1e-9)

def recommend(uid, top_n=10, show_tqdm=False):
    """Return a DataFrame of the ``top_n`` book recommendations for a user.

    Uses the module-level ``R_csr``, ``sim``, ``item_means``, ``n_items`` and
    ``books_df``.

    Parameters
    ----------
    uid : int
        1-indexed user id (row ``uid - 1`` of ``R_csr``).
    top_n : int, default 10
        Maximum number of books to return.
    show_tqdm : bool, default False
        Show a progress bar while every item is scored.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``book_id`` with columns ``title``, ``authors`` and
        ``score``, best prediction first.  Books the user has already rated
        and items whose prediction is NaN are excluded.

    Raises
    ------
    IndexError
        If ``uid`` is outside ``1..R_csr.shape[0]``.
    """
    # Reject uid <= 0 explicitly: uid - 1 would otherwise be a negative index
    # and could silently select a different user.
    if not 1 <= uid <= R_csr.shape[0]:
        raise IndexError(f"uid must be in 1..{R_csr.shape[0]}, got {uid}")

    # range() accepts no `desc` keyword, so the tqdm wrapper is only built
    # when a progress bar was requested.
    if show_tqdm:
        item_iter = trange(n_items, desc=f"Scoring user {uid}")
    else:
        item_iter = range(n_items)

    preds = np.array(
        [_predict_user_item(uid - 1, j, R_csr, sim, item_means) for j in item_iter],
        dtype=float,
    )

    already = set(R_csr.getrow(uid - 1).indices)
    # argsort() places NaN last, so after reversing they would lead the
    # ranking; skip them so an undefined score can never be recommended.
    ranked = [
        j for j in preds.argsort()[::-1]
        if j not in already and not np.isnan(preds[j])
    ][:top_n]

    # Matrix columns are 0-based; book ids are shifted back by +1 for lookup.
    return (books_df
            .set_index("book_id")
            .loc[[j + 1 for j in ranked], ["title", "authors"]]
            .assign(score=preds[ranked]))

In [None]:
# Demo: print up to ten recommendations for user 1, with the scoring
# progress bar switched on.
print(
    recommend(
        uid=1,
        top_n=10,
        show_tqdm=True,
    )
)