In [9]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.neighbors import NearestNeighbors

class UserBasedCFOptimized:
    def __init__(self, clean_folder="clean", k=40):
        # 1) Load ratings & books
        ratings = pd.read_csv(os.path.join(clean_folder, "ratings.csv"))
        self.books = pd.read_csv(os.path.join(clean_folder, "books.csv"))

        # 2) Build user×item CSR  CSR is faster for sparse matrix operations
        #    than COO, which is faster for building the matrix
        row = ratings.user_id.values - 1
        col = ratings.book_id.values - 1
        data = ratings.rating.values
        self.n_users = row.max() + 1
        self.n_items = col.max() + 1
        self.R_csr   = sparse.csr_matrix(
            (data, (row, col)),
            shape=(self.n_users, self.n_items)
        )

        # 3) Compute each user’s mean μ_u
        nz_counts     = (self.R_csr != 0).sum(1).A1    # ratings per user
        self.mu       = (self.R_csr.sum(1).A1 / nz_counts)

        # 4) Demean only nonzeros via COO
        R_coo         = self.R_csr.tocoo(copy=True)
        R_coo.data    = R_coo.data.astype(np.float64)
        R_coo.data   -= self.mu[R_coo.row]
        self.Rd      = R_coo.tocsr()
        self.Rd.eliminate_zeros()

        # 5) Build sparse kNN graph on users
        knn_graph = NearestNeighbors(
            n_neighbors=k+1,    # include self
            metric="cosine",
            algorithm="brute",
            n_jobs=-1
        ).fit(self.Rd).kneighbors_graph(
            self.Rd,
            mode="distance"
        )
        knn_graph.setdiag(0)
        knn_graph.eliminate_zeros()
        knn_graph.data = 1.0 - knn_graph.data  # distance → similarity
        self.S = knn_graph.tocsr()             # shape: (n_users × n_users)

    def recommend(self, user_id, top_n=10):
        u = user_id - 1

        # 6) Extract this user’s demeaned ratings
        u_vec = self.Rd[u, :].toarray().ravel()   # Δr_{u,i}
        seen  = u_vec != 0

        # cold-start: no ratings → most popular books
        if not seen.any():
            pop = self.R_csr.sum(0).A1
            idx = np.argsort(pop)[::-1][:top_n]
            return (
                self.books.set_index("book_id")
                          .loc[idx+1, ["title","authors"]]
                          .assign(score=pop[idx])
                          .reset_index()
            )

        # 7) Vectorized prediction
        #   numerator = S[u,:] ⋅ Δr vectors  → shape (n_items,)
        #   denominator = sum |S[u,:]|      → scalar
        s_u   = self.S.getrow(u)                # sparse row of length n_users
        num   = s_u.dot(self.Rd).toarray().ravel()
        den   = np.abs(s_u.data).sum() + 1e-9
        preds = self.mu[u] + num/den            # shape (n_items,)

        # 8) Mask out already-seen & pick top_n
        preds[seen] = -np.inf
        top_idx  = np.argpartition(preds, -top_n)[-top_n:]
        top_idx  = top_idx[np.argsort(preds[top_idx])[::-1]]

        return (
            self.books.set_index("book_id")
                      .loc[top_idx+1, ["title","authors"]]
                      .assign(score=preds[top_idx])
                      .reset_index()
        )

# ─── USAGE ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    model = UserBasedCFOptimized(clean_folder="clean", k=40)
    print(model.recommend(user_id=4, top_n=10))

KeyboardInterrupt: 

In [None]:
import os
import math
import pandas as pd
from tqdm import tqdm

def evaluate_dcg_user_cf(model, to_read_test_path, top_n=1000):
    """
    Evaluate average DCG@top_n for a user-based CF model against a wishlist.

    Parameters
    ----------
    model : UserBasedCFOptimized
        Fitted CF model with .recommend(user_id, top_n) → DataFrame including 'book_id'
    to_read_test_path : str
        Path to 'to_read_test.csv' containing ['user_id','book_id']
    top_n : int
        Number of recommendations to score per user

    Returns
    -------
    float
        Mean DCG@top_n over all users in the test set
    """
    to_read = pd.read_csv(to_read_test_path)
    users   = to_read['user_id'].unique()
    
    total_dcg = 0.0
    count     = 0

    for uid in tqdm(users, desc="Evaluating DCG (user-based)"):
        # ground-truth wishlist for this user
        wish = to_read.loc[to_read['user_id'] == uid, 'book_id'].tolist()
        if not wish:
            continue

        # get CF recommendations
        recs    = model.recommend(user_id=uid, top_n=top_n)
        rec_ids = recs['book_id'].tolist()

        # compute DCG: 1/log2(rank+2) for each hit
        dcg = 0.0
        for rank, bid in enumerate(rec_ids):
            if bid in wish:
                dcg += 1.0 / math.log2(rank + 2)

        total_dcg += dcg
        count     += 1

    return (total_dcg / count) if count > 0 else 0.0


if __name__ == "__main__":
    # 1) import or define your user-based model

    # 2) instantiate & fit
    model = UserBasedCFOptimized(clean_folder="clean", k=40)

    # 3) evaluate on the test wishlist
    test_path = os.path.join("clean", "to_read_test.csv")
    avg_dcg   = evaluate_dcg_user_cf(model, test_path, top_n=10)

    print(f"Average DCG@10 (user-based): {avg_dcg:.4f}")

Evaluating DCG (user-based): 100%|██████████| 7479/7479 [00:19<00:00, 378.23it/s]

Average DCG@10 (user-based): 1.2930





In [11]:
import os
import math
import pandas as pd
from tqdm import tqdm

def evaluate_ndcg_user_cf(model, to_read_test_path, top_n=10):
    """
    Evaluate average NDCG@top_n for a user-based CF model.
    """
    to_read = pd.read_csv(to_read_test_path)
    users   = to_read['user_id'].unique()
    
    total_ndcg = 0.0
    count      = 0

    for uid in tqdm(users, desc="Evaluating NDCG (user-based)"):
        wish = to_read.loc[to_read['user_id'] == uid, 'book_id'].tolist()[:8]
        if not wish:
            continue

        recs     = model.recommend(user_id=uid, top_n=top_n)
        rec_ids  = recs['book_id'].tolist()

        dcg = sum(
            1.0 / math.log2(rank + 2)
            for rank, bid in enumerate(rec_ids)
            if bid in wish
        )

        idcg = sum(
            1.0 / math.log2(i + 2)
            for i in range(min(len(wish), top_n))
        )

        ndcg = (dcg / idcg) if idcg > 0 else 0.0
        total_ndcg += ndcg
        count      += 1


    return (total_ndcg / count) if count > 0 else 0.0


if __name__ == "__main__":
    model     = UserBasedCFOptimized(clean_folder="clean", k=40)
    test_path = os.path.join("clean", "to_read_test.csv")
    avg_ndcg  = evaluate_ndcg_user_cf(model, test_path, top_n=100)
    print(f"Average NDCG@10 (user-based): {avg_ndcg:.4f}")


Evaluating NDCG (user-based): 100%|██████████| 7479/7479 [00:18<00:00, 398.85it/s]

Average NDCG@10 (user-based): 0.0662



