In [None]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import trange

class ItemCFRecommender:
    def __init__(self, clean_folder="clean", k=40):
        self.clean = clean_folder
        self.k     = k
        self._load_data()
        self._build_matrix()
        self._mean_center()
        self._compute_similarity()

    def _load_data(self):
        self.books_df   = pd.read_csv(os.path.join(self.clean, "books.csv"))
        self.ratings_df = pd.read_csv(os.path.join(self.clean, "ratings.csv"))

    def _build_matrix(self):
        row = self.ratings_df["user_id"].values - 1
        col = self.ratings_df["book_id"].values - 1
        data = self.ratings_df["rating"].values
        self.n_users = row.max() + 1
        self.n_items = col.max() + 1
        self.R_csr   = sparse.csr_matrix((data, (row, col)),
                                         shape=(self.n_users, self.n_items))

    def _mean_center(self):
        # subtract each item’s mean to turn cosine into Pearson-like
        self.item_means = (self.R_csr.sum(0) /
                           (self.R_csr != 0).sum(0)).A1
        self.R_centered = sparse.csr_matrix(self.R_csr - self.item_means)
        self.R_centered.eliminate_zeros()

    def _compute_similarity(self):
        # full item×item cosine on centered data
        sim = cosine_similarity(self.R_centered.T, dense_output=False)
        # prune to top‐k neighbors per item
        for i in trange(sim.shape[0], desc="Pruning neighbours"):
            start, end = sim.indptr[i], sim.indptr[i+1]
            if end - start <= self.k:
                continue
            block = sim.data[start:end]
            topk_idx = np.argpartition(block, -self.k)[-self.k:]
            mask_idx = np.setdiff1d(
                np.arange(start, end),
                start + topk_idx,
                assume_unique=True
            )
            sim.data[mask_idx] = 0
        sim.eliminate_zeros()
        self.sim = sim

    def _predict(self, u, i):
        # u, i are zero‐indexed
        user_row = self.R_csr.getrow(u)
        rated    = user_row.indices
        if i in rated:
            return user_row[0, i]
        common = np.intersect1d(rated, self.sim[i].indices, assume_unique=True)
        if common.size == 0:
            return self.item_means[i]
        sims  = self.sim[i, common].A1
        diffs = user_row[0, common].A1 - self.item_means[common]
        return self.item_means[i] + sims.dot(diffs) / (np.abs(sims).sum() + 1e-9)

    def recommend(self, user_id, top_n=10, show_progress=False):
        u = user_id - 1 
        it = trange(self.n_items, desc=f"Scoring user {user_id}") \
             if show_progress else range(self.n_items)
        preds = np.array([self._predict(u, j) for j in it])

        seen = set(self.R_csr.getrow(u).indices)
        ranked = [j for j in preds.argsort()[::-1] if j not in seen][:top_n]
        recs = (self.books_df
                .set_index("book_id")
                .loc[[j+1 for j in ranked], ["title", "authors"]]
                .assign(score=preds[ranked]))
        return recs.reset_index()

In [None]:
model = ItemCFRecommender(clean_folder="clean", k=40)
    # (Optional) show tqdm bars when scoring:
top10 = model.recommend(user_id=123, top_n=10, show_progress=True)
print(top10)

In [None]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

class FastItemCF:
    def __init__(self, clean_folder="clean", k=40, n_factors=100):
        self.k = k
        # 1) load
        ratings = pd.read_csv(os.path.join(clean_folder, "ratings.csv"))
        self.books = pd.read_csv(os.path.join(clean_folder, "books.csv"))
        
        # 2) build CSR
        row = ratings.user_id.values - 1
        col = ratings.book_id.values - 1
        data= ratings.rating.values
        self.n_items = col.max()+1
        R = sparse.csr_matrix((data, (row, col)),
                              shape=(row.max()+1, self.n_items))
        
        # 3) mean-center
        self.mu = (R.sum(0)/(R!=0).sum(0)).A1
        Rc = R - self.mu
        self.Rc = sparse.csr_matrix(Rc)
        self.Rc.eliminate_zeros()
        
        # 4) SVD → item factors
        svd = TruncatedSVD(n_components=n_factors, random_state=42)
        self.item_factors = svd.fit_transform(self.Rc.T)    # (n_items × f)
        
        # 5) kNN in f-dim space
        nn = NearestNeighbors(n_neighbors=self.k+1,
                              metric="cosine",
                              algorithm="brute",
                              n_jobs=-1)
        nn.fit(self.item_factors)
        distances, neighbors = nn.kneighbors(self.item_factors)
        # drop self (first neighbor is itself)
        self.knn_idx = neighbors[:,1:]
        self.knn_sim = 1 - distances[:,1:]                # cosine → similarity

    def recommend(self, user_id, top_n=10):
        u = user_id - 1 #
        # Compute user_row correctly by summing over rated items
        rated_items = self.Rc[u, :].indices
        user_row = sparse.csr_matrix((self.Rc[u, rated_items].toarray().ravel(), 
                                      (np.zeros_like(rated_items), rated_items)),
                                     shape=(1, self.n_items))
        rated_mask = user_row.toarray().ravel() != 0
        diffs = (user_row.toarray().ravel() - self.mu) * rated_mask
        
        # build a sparse kNN-graph matrix on the fly:
        rows = np.repeat(np.arange(self.n_items), self.k)
        cols = self.knn_idx.flatten()
        sims = self.knn_sim.flatten()
        S = sparse.csr_matrix((sims, (rows, cols)), 
                              shape=(self.n_items, self.n_items))
        
        # vectorized numerator & denominator
        num = S[:, rated_mask].dot(diffs[rated_mask])
        den = np.abs(S[:, rated_mask]).sum(axis=1).A1 + 1e-9
        preds = self.mu + num/den
        
        # zero out already-rated
        preds[rated_mask] = -np.inf
        
        top_idx = np.argpartition(preds, -top_n)[-top_n:]
        top_idx = top_idx[np.argsort(preds[top_idx])[::-1]]
        
        recs = (self.books
                .set_index("book_id")
                .loc[top_idx+1, ["title","authors"]]
                .assign(score=preds[top_idx]))
        return recs.reset_index()

# Usage:
model = FastItemCF(clean_folder="clean", k=40, n_factors=100)
print(model.recommend(user_id=123, top_n=10))

   book_id                                          title  \
0    10000                            The First World War   
1     3337                          UnWholly (Unwind, #2)   
2     3336  Feast of Fools (The Morganville Vampires, #4)   
3     3335                  Imagine: How Creativity Works   
4     3334                      The Marriage of Opposites   
5     3333                                     Skinny Dip   
6     3332                   The Coldest Girl in Coldtown   
7     3338          Consider the Lobster and Other Essays   
8     3330          A Fistful of Charms (The Hollows, #4)   
9     3322                       The Lady and the Unicorn   

                authors  score  
0           John Keegan   -inf  
1       Neal Shusterman   -inf  
2          Rachel Caine   -inf  
3          Jonah Lehrer   -inf  
4         Alice Hoffman   -inf  
5          Carl Hiaasen   -inf  
6           Holly Black   -inf  
7  David Foster Wallace   -inf  
8          Kim Harrison   -inf 

In [7]:
print(model.recommend(user_id=0, top_n=10))

   book_id                                          title  \
0    10000                            The First World War   
1     3337                          UnWholly (Unwind, #2)   
2     3336  Feast of Fools (The Morganville Vampires, #4)   
3     3335                  Imagine: How Creativity Works   
4     3334                      The Marriage of Opposites   
5     3333                                     Skinny Dip   
6     3332                   The Coldest Girl in Coldtown   
7     3338          Consider the Lobster and Other Essays   
8     3330          A Fistful of Charms (The Hollows, #4)   
9     3322                       The Lady and the Unicorn   

                authors  score  
0           John Keegan   -inf  
1       Neal Shusterman   -inf  
2          Rachel Caine   -inf  
3          Jonah Lehrer   -inf  
4         Alice Hoffman   -inf  
5          Carl Hiaasen   -inf  
6           Holly Black   -inf  
7  David Foster Wallace   -inf  
8          Kim Harrison   -inf 

In [3]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.neighbors import NearestNeighbors

class ItemBasedCFOptimized:
    def __init__(self, clean_folder="clean", k=40):
        # 1) Load ratings & books
        ratings = pd.read_csv(os.path.join(clean_folder, "ratings.csv"))
        self.books = pd.read_csv(os.path.join(clean_folder, "books.csv"))

        # 2) Build user×item CSR
        row = ratings.user_id.values - 1
        col = ratings.book_id.values - 1
        data = ratings.rating.values
        self.n_items = col.max() + 1
        self.R_csr   = sparse.csr_matrix(
            (data, (row, col)),
            shape=(row.max()+1, self.n_items)
        )

        # 3) Compute item means μᵢ
        #    (note: .A1 turns the 1×n sparse sum into a flat numpy array)
        nz_counts    = (self.R_csr != 0).sum(0).A1
        self.mu      = self.R_csr.sum(0).A1 / nz_counts

        # 4) Demean *only* the nonzero entries via COO
        R_coo        = self.R_csr.tocoo(copy=True)
        R_coo.data = R_coo.data.astype(np.float64)  # Ensure data is float64
        R_coo.data -= self.mu[R_coo.col]
        self.Rc      = R_coo.tocsr()
        self.Rc.eliminate_zeros()

        # 5) Build the sparse kNN graph S (items×items), distances → similarities
        knn_graph = NearestNeighbors(
            n_neighbors=k + 1,  # Include self in neighbors
            metric="cosine",
            algorithm="brute",
            n_jobs=-1
        ).fit(self.Rc.T).kneighbors_graph(
            self.Rc.T,
            mode="distance"
        )
        # Remove self-loops by setting diagonal elements to 0
        knn_graph.setdiag(0)
        knn_graph.eliminate_zeros()
        # convert distances to cosine‐similarities
        knn_graph.data = 1.0 - knn_graph.data
        self.S = knn_graph.tocsr()

    def recommend(self, user_id, top_n=10):
        u = user_id - 1

        # 6) Grab the user's demeaned ratings (sparse row → dense mask)
        user_vec = self.Rc[u, :].toarray().ravel()     # Δrᵤ,i = rᵤ,i − μᵢ
        seen     = user_vec != 0
        if not seen.any():
            # cold-start fallback: most popular books
            pop = self.R_csr.sum(0).A1
            idx = np.argsort(pop)[::-1][:top_n]
            return (
                self.books.set_index("book_id")
                          .loc[idx+1, ["title","authors"]]
                          .assign(score=pop[idx])
                          .reset_index()
            )

        # 7) Vectorized prediction:
        #    numerator = ∑_{j∈seen} S[i,j] * Δrᵤ,j
        #    denom     = ∑_{j∈seen} |S[i,j]|
        num  = self.S[:, seen].dot(user_vec[seen])
        den  = np.abs(self.S[:, seen]).sum(axis=1).A1 + 1e-9
        preds = self.mu + num/den

        # 8) Mask out already-rated items & pick top-n
        preds[seen] = -np.inf
        top_idx = np.argpartition(preds, -top_n)[-top_n:]
        top_idx = top_idx[np.argsort(preds[top_idx])[::-1]]

        return (
            self.books.set_index("book_id")
                      .loc[top_idx+1, ["title","authors"]]
                      .assign(score=preds[top_idx])
                      .reset_index()
        )

# ─── USAGE ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    model = ItemBasedCFOptimized(clean_folder="clean", k=40)
    print(model.recommend(user_id=4, top_n=9))

   book_id                                              title  \
0      737          A Long Way Gone: Memoirs of a Boy Soldier   
1     6644                                         Jesus Land   
2     5658                      All Things Wise and Wonderful   
3      472                           Man's Search for Meaning   
4     1656  All Things Bright and Beautiful (All Creatures...   
5      994  All Creatures Great and Small (All Creatures G...   
6      513  The Hiding Place: The Triumphant True Story of...   
7     4242                      My Friend Flicka (Flicka, #1)   
8     2523                             The Incredible Journey   

                                             authors     score  
0                                       Ishmael Beah  5.672018  
1                                     Julia Scheeres  5.489396  
2                                      James Herriot  5.480613  
3                                   Viktor E. Frankl  5.463528  
4                       

In [4]:
import os
import math
import pandas as pd
from tqdm import tqdm

# ─── Your existing CF code ────────────────────────────────────────────
# (Make sure this is already defined or imported)
# from your_module import ItemBasedCFOptimized

# ─── DCG evaluation ────────────────────────────────────────────────────
def evaluate_dcg_cf(model, to_read_test_path, top_n=10):
    """
    Evaluate average DCG of an item-based CF model against a wishlist.

    Parameters
    ----------
    model : ItemBasedCFOptimized
      Fitted CF model with .recommend(user_id, top_n) → DataFrame(book_id,...)
    to_read_test_path : str
      Path to 'to_read_test.csv' containing columns ['user_id','book_id']
    top_n : int
      Number of recommendations to consider per user

    Returns
    -------
    avg_dcg : float
      Mean DCG score over all users in the test set
    """
    # load the wishlist
    to_read = pd.read_csv(to_read_test_path)
    users   = to_read['user_id'].unique()
    
    total_dcg = 0.0
    count     = 0

    for uid in tqdm(users, desc="Evaluating DCG"):
        wish = to_read.loc[to_read['user_id']==uid, 'book_id'].tolist()
        if not wish:
            continue

        # get top-N CF recommendations
        recs = model.recommend(user_id=uid, top_n=top_n)
        rec_list = recs['book_id'].tolist()

        # compute DCG: sum(1 / log2(rank+2)) for hits
        dcg = 0.0
        for rank, bid in enumerate(rec_list):
            if bid in wish:
                dcg += 1.0 / math.log2(rank + 2)

        total_dcg += dcg
        count     += 1

    return (total_dcg / count) if count > 0 else 0.0

# ─── USAGE ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # 1) instantiate your model
    model = ItemBasedCFOptimized(clean_folder="clean", k=40)

    # 2) evaluate DCG on your test wishlist
    test_path = os.path.join("clean", "to_read_test.csv")
    avg_dcg   = evaluate_dcg_cf(model, test_path, top_n=10)

    print(f"Average DCG@10: {avg_dcg:.4f}")

Evaluating DCG: 100%|██████████| 7479/7479 [00:16<00:00, 458.19it/s]

Average DCG@10: 0.0194



