In [1]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer, util

# Load data
clean_folder = "clean"
books_df = pd.read_csv(os.path.join(clean_folder, "books.csv"))
book_tags_df = pd.read_csv(os.path.join(clean_folder, "book_tags.csv"))
tags_df = pd.read_csv(os.path.join(clean_folder, "tags.csv"))

# Standardize column name for merging
book_tags_df.rename(columns={'book_id': 'goodreads_book_id'}, inplace=True)

# Rename 'book_id' in books_df to 'goodreads_book_id' for merging
books_df.rename(columns={'book_id': 'goodreads_book_id'}, inplace=True)

# Merge book_tags with tag names
book_tags_merged = pd.merge(book_tags_df, tags_df, on='tag_id', how='inner')

# Merge with books
books_with_tags = pd.merge(
    books_df[['goodreads_book_id', 'title', 'authors', 'average_rating',
              'ratings_count', 'original_publication_year', 'language_code']],
    book_tags_merged,
    on='goodreads_book_id',
    how='inner'
)

# Group by book and concatenate tag names into a single string
books_tagged = books_with_tags.groupby('goodreads_book_id').agg({
    'title': 'first',
    'authors': 'first',
    'average_rating': 'first',
    'ratings_count': 'first',
    'original_publication_year': 'first',
    'language_code': 'first',
    'tag_name': lambda x: ' '.join(set(x))  # deduplicated tag list
}).reset_index()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load model
model = SentenceTransformer('BAAI/bge-m3')

# Encode tag text for each book
books_tagged['tag_text'] = books_tagged['tag_name']
book_tag_embeddings0 = model.encode(books_tagged['tag_text'].tolist(), convert_to_tensor=True)

In [3]:
def recommend_by_multiple_genres(user_genres, top_n=10):
    """
    user_genres: str — comma-separated genres, e.g., "Fantasy, Mystery, Romance"
    top_n: int — number of results to return
    """
    # Parse and clean genres
    genre_list = [g.strip() for g in user_genres.split(',') if g.strip()]
    
    if not genre_list:
        raise ValueError("Please input at least one genre!")

    # Embed each genre separately
    genre_embeddings = model.encode(genre_list, convert_to_tensor=True)

    # Compute average embedding (user profile)
    user_embedding = genre_embeddings.mean(dim=0)

    # Compute cosine similarity with all books
    scores = util.pytorch_cos_sim(user_embedding, book_tag_embeddings0)[0]
    top_results = scores.topk(top_n)

    # Extract matching rows
    results = books_tagged.iloc[top_results[1].cpu().numpy()].copy()
    results['similarity'] = top_results[0].cpu().numpy()
    return results[[
        'goodreads_book_id', 'title', 'authors', 'average_rating',
        'ratings_count', 'original_publication_year',
        'language_code', 'similarity'
    ]]

In [4]:
user_input = "Fantasy, Science Fiction"
recommendations = recommend_by_multiple_genres(user_input, top_n=10)
print(recommendations)

      goodreads_book_id                                              title  \
8910               8911                The Blood Mirror (Lightbringer, #4)   
2055               2056                     New Spring (Wheel of Time, #0)   
6677               6678  The Wheel of Time: Boxed Set #1 (Wheel of Time...   
8550               8551            The Queen's Poisoner (Kingfountain, #1)   
5959               5960            Dragon Haven (Rain Wild Chronicles, #2)   
8633               8634                                     حوجن [Ḥawjan]   
8184               8185     Morgawr (The Voyage of the Jerle Shannara, #3)   
6637               6638                   Cross My Heart (Alex Cross, #21)   
5066               5067       The Dragon Keeper (Rain Wild Chronicles, #1)   
388                 389                    The Final Empire (Mistborn, #1)   

                                           authors  average_rating  \
8910                                   Brent Weeks            4.29   
2

In [5]:
ratings_df = pd.read_csv(os.path.join(clean_folder, "ratings_test.csv"))

In [9]:
# ------------------------------------
# 4. Filter Users + Sample
# ------------------------------------
#   We'll keep only users who rated >60 and <160 books,
#   then randomly sample 20% of those rating rows.

# Count how many ratings each user did
user_counts = ratings_df.groupby('user_id')['book_id'].count().reset_index()
user_counts.rename(columns={'book_id': 'count_ratings'}, inplace=True)

# Keep those who rated between 60 and 160
eligible_users = user_counts[
    (user_counts['count_ratings'] > 20) &
    (user_counts['count_ratings'] < 30)
]
print(f"Eligible users: {len(eligible_users)}")

Eligible users: 30776


In [10]:
# Merge to keep only those users' ratings
filtered_ratings = pd.merge(
    ratings_df,
    eligible_users[['user_id']],
    on='user_id',
    how='inner'
)

In [11]:


print(f"Filtered user count: {len(eligible_users)}")
print(f"Filtered ratings shape: {filtered_ratings.shape}")


Filtered user count: 30776
Filtered ratings shape: (745239, 3)


In [185]:
import numpy as np
import pandas as pd
import math
from tqdm import tqdm
from collections import Counter

def compute_ndcg_at_k(ground_truth_ids, recommended_ids, user_data, k=10):
    """
    Compute NDCG for a single user at cutoff k, manually.
    
    Params
    ------
    ground_truth_ids : set
        Set of book_ids the user rated >= 5 (i.e. 'relevant').
    recommended_ids : list
        List of recommended book_ids in the final top-k (rank order).
    user_data : pd.DataFrame
        The subset of the ratings DataFrame for this user alone
        (so we can look up the actual rating for each book).
    k : int
        The number of items to consider (already ensured recommended_ids has up to k items).
        
    Returns
    -------
    ndcg_val : float
        The NDCG for this user at k.
    df_debug : pd.DataFrame
        A table showing rank, book_id, actual rating, item-level DCG
        (so we can see how DCG is accumulated).
    """
    recommended_top_k = recommended_ids[:k]
    
    # 1) Build a binary relevance list: 1 if rating >=5, else 0
    relevance = [1 if b in ground_truth_ids else 0 for b in recommended_top_k]
    
    # 2) Compute DCG for each rank i in [0..k-1], storing item-level contributions
    dcg_values = []
    for i, rel in enumerate(relevance):
        rank = i + 1  # rank is 1-based
        dcg_i = (2 ** rel - 1) / math.log2(rank + 1)
        dcg_values.append(dcg_i)
    
    dcg = sum(dcg_values)
    
    # 3) Compute IDCG by sorting relevance in descending order
    ideal_relevance = sorted(relevance, reverse=True)
    idcg_values = []
    for i, rel in enumerate(ideal_relevance):
        rank = i + 1
        idcg_i = (2 ** rel - 1) / math.log2(rank + 1)
        idcg_values.append(idcg_i)
    
    idcg = sum(idcg_values)
    ndcg_val = dcg / idcg if idcg > 0 else 0.0

    # Build a debug DataFrame
    debug_rows = []
    for i, book_id in enumerate(recommended_top_k):
        rank = i + 1
        # Actual rating from user_data
        row = user_data[user_data['book_id'] == book_id]
        rating = row['rating'].values[0] if not row.empty else 0
        
        debug_rows.append({
            'Rank': rank,
            'book_id': book_id,
            'User Rating': rating,
            'DCG Contribution': dcg_values[i]
        })
        
    df_debug = pd.DataFrame(debug_rows)
    
    return ndcg_val, df_debug

def evaluate_ndcg(ratings_subset, top_n=10):
    """
    For each user:
      1) Gather user's relevant books (rating >=5).
      2) Build a 'genre query' from top-5 tags of those relevant books.
      3) Get a larger set of recommended items using `recommend_by_multiple_genres`.
      4) Skip any book that the user has not rated (rating=0).
      5) Keep collecting items (in rank order) up to `top_n`.
      6) Compute NDCG (manually) and store it.
      7) Print a table showing rank, book_id, user rating, DCG contribution for each user.
    Finally, return the average NDCG@k across users.
    """
    user_ids = ratings_subset['user_id'].unique()
    ndcg_list = []

    # Pre-build a dictionary from goodreads_book_id -> book_id
    if 'goodreads_book_id' not in books_tagged.columns:
        raise KeyError("'goodreads_book_id' column is missing in books_tagged DataFrame.")
    
    # Map from goodreads_book_id to book_id
    # (If your data is different, adjust accordingly.)
    gr2id = {}
    for gid in books_tagged['goodreads_book_id']:
        gr2id[gid] = gid  # If they are truly the same, or else do a real map

    # Alternatively, if books_tagged *does* have a separate 'book_id' column:
    #   gr2id = dict(zip(books_tagged['goodreads_book_id'], books_tagged['book_id']))

    all_debug_tables = []  # to store or display if you want

    for uid in tqdm(user_ids, desc="Evaluating NDCG"):
        # -- A) Get user data
        user_data = ratings_subset[ratings_subset['user_id'] == uid].copy()
        
        # relevant_books = rated >= 5
        relevant_books = set(user_data[user_data['rating'] >= 5]['book_id'].unique())
        if len(relevant_books) == 0:
            continue  # no relevant => skip

        # -- B) Build a naive 'genre query' from top 5 tags of relevant books
        #    You may need to use a suitable merge or direct indexing. 
        #    Below: if the 'goodreads_book_id' in books_tagged is the same as
        #    the user's 'book_id', it’s simpler, but typically you might need
        #    a separate map or join if columns differ.  
        #    We'll assume the columns line up or you can adapt as needed.

        relevant_tags = books_tagged[books_tagged['goodreads_book_id'].isin(relevant_books)]['tag_name']
        all_words = " ".join(relevant_tags.tolist()).split()
        word_counts = Counter(all_words)
        top_5_tags = [w for w, c in word_counts.most_common(5)]
        if not top_5_tags:
            continue
        
        user_genres = ", ".join(top_5_tags)

        # -- C) Get a larger set of recommendations
        #    We'll ask for top_n * 5 to have enough items to skip from.
        recs = recommend_by_multiple_genres(user_genres, top_n=top_n * 5)
        # recs should have 'goodreads_book_id' in rank order
        recommended_gids = recs['goodreads_book_id'].tolist()

        # -- D) Build a final list that ONLY includes items user rated >0
        #       i.e. skip rating=0. Keep collecting until we have `top_n`.
        final_recs = []
        for gid in recommended_gids:
            bk_id = gr2id.get(gid, None)
            if bk_id is None:
                continue  # not in dictionary

            # Check user's rating
            row = user_data[user_data['book_id'] == bk_id]
            user_rating = row['rating'].values[0] if not row.empty else 0
            if user_rating > 0:
                final_recs.append(bk_id)
                if len(final_recs) == top_n:
                    break

        if len(final_recs) < 1:
            # No recommended items that user actually rated => skip
            continue

        # -- E) Compute NDCG for these final recommendations
        ndcg_val, df_debug = compute_ndcg_at_k(
            ground_truth_ids=relevant_books,
            recommended_ids=final_recs,
            user_data=user_data,
            k=top_n
        )
        ndcg_list.append(ndcg_val)

        first = True
        if first:
            all_debug_tables.append(df_debug)
            first = False 
        # Print a debug table for this user
        # print(f"\nUser: {uid}  (NDCG@{top_n} = {ndcg_val:.4f})")
        # print(df_debug.to_string(index=False))  # or display as you like
        # print("----------------------------------------------------")

    # -- F) Return the average
    if ndcg_list:
        return np.mean(ndcg_list)
    else:
        return 0.0

# Example usage:
avg_ndcg = evaluate_ndcg(filtered_ratings, top_n=10)
print(f"\nGlobal Average NDCG@10 = {avg_ndcg:.4f}")

Evaluating NDCG: 100%|██████████| 3996/3996 [03:25<00:00, 19.47it/s]


Global Average NDCG@10 = 0.4389





Top users with the most rated books: [28158, 7563, 24143, 37834, 6630]
No data found for user 6630.


In [111]:
print(filtered_ratings_20p)

       user_id  book_id  rating
0        19009       56       2
1        10484     5500       3
2         3106      456       3
3        44407     3647       5
4         5419     6753       3
...        ...      ...     ...
74115     9814     5006       4
74116    39590       12       5
74117    38866     3070       5
74118    50769      345       2
74119    22529     2405       3

[74120 rows x 3 columns]


In [None]:
def evaluate_ndcg(ratings_subset, top_n=10):
    """
    Modified so that:
     - We DO skip items that the user rated but rated <5.
       This effectively ignores items the user explicitly
       disliked or gave a low rating to, ensuring we move
       further down the recommendation list until we find
       either not-rated (0) or relevant (≥5).
    """
    import numpy as np
    from sklearn.metrics import ndcg_score
    from collections import Counter
    import pandas as pd
    
    user_ids = ratings_subset['user_id'].unique()
    ndcg_list = []

    # Pre-build a dictionary from goodreads_book_id -> book_id
    if 'goodreads_book_id' not in books_tagged.columns:
        raise KeyError("'goodreads_book_id' column is missing in books_tagged DataFrame.")
    gr2id = books_tagged.set_index('goodreads_book_id').index.to_series().to_dict()

    first_user_debug = True  # We will only show the table for the first user

    for uid in tqdm(user_ids, desc="Evaluating NDCG (skip low-rated)"):
        # A) Get user data
        user_data = ratings_subset[ratings_subset['user_id'] == uid]
        # Relevant books = rated ≥ 5
        relevant_books = set(user_data[user_data['rating'] >= 5]['book_id'].unique())
        if len(relevant_books) == 0:
            continue

        # B) Build a naive 'genre query' from top 5 tags of relevant books
        relevant_tags = books_tagged[books_tagged['goodreads_book_id'].isin(relevant_books)]['tag_name']
        from collections import Counter
        all_words = " ".join(relevant_tags.tolist()).split()
        word_counts = Counter(all_words)
        top_5_tags = [w for w, c in word_counts.most_common(5)]
        if not top_5_tags:
            continue

        user_genres = ", ".join(top_5_tags)

        # C) Get a larger recommendation list
        len_books = len(books_df)
        # print(f"Number of books in the dataset: {len_books}")
        recs = recommend_by_multiple_genres(user_genres, len_books)
        recommended_gids = recs['goodreads_book_id'].tolist()

        # D) Build the final list:
        #    - Skip items that the user explicitly rated <5
        #    - Keep items user rated ≥5
        final_recs = []
        for gid in recommended_gids:
            bk_id = gr2id.get(gid, None)
            if bk_id is None:
                continue  # not found in dictionary

            row = user_data[user_data['book_id'] == bk_id]
            if not row.empty:
                user_rating = row['rating'].values[0]
            else:
                continue  # Skip if the user never rated this book

            # ### CHANGED HERE ###
            # If the user did rate it and rating < 5 => skip it
            if user_rating > 0 and user_rating < 5:
                continue

            # Otherwise (rating=0 or rating>=5), include it
            final_recs.append(bk_id)

            if len(final_recs) == top_n:
                break

        recommended_top_k = final_recs
        if len(recommended_top_k) < 2:
            continue

        # Compute a binary relevance label for each item in final_recs
        # (1 if user actually rated ≥5, 0 otherwise)
        relevance = [1 if b in relevant_books else 0 for b in recommended_top_k]

        # We create some dummy predicted scores just to have a strictly decreasing
        # list: e.g. [N, N-1, ..., 1].
        dcg = 0.0
        for i, rel in enumerate(relevance):
            dcg += rel / np.log2(i + 2)  # i+2 because log2(1) is undefined
        predicted_scores = [dcg] * len(recommended_top_k)

        # Show debug table for the first user only
        if first_user_debug:
            debug_rows = []
            for i, book_id in enumerate(recommended_top_k):
                # The user rating for this book:
                row = user_data.loc[user_data['book_id'] == book_id, 'rating']
                user_rating = row.values[0] if not row.empty else user_rating
                debug_rows.append({
                    'Rank': i + 1,
                    'book_id': book_id,
                    'User Rating': user_rating,
                    'Relevance': 1 if book_id in relevant_books else 0,
                    'Predicted Score': predicted_scores[i]
                })

            df_debug = pd.DataFrame(debug_rows)
            print("** DEBUG TABLE FOR USER:", uid, "**")
            print(df_debug)
            
            first_user_debug = False  # do not print for subsequent users

        # Compute NDCG for this user
        y_true = np.array([relevance])
        y_score = np.array([predicted_scores])
        ndcg_val = ndcg_score(y_true, y_score) 
        ndcg_list.append(ndcg_val)
        
        print(f"User {uid} NDCG@{top_n}: {ndcg_val:.4f}")

    if ndcg_list:
        return np.mean(ndcg_list)
    else:
        return 0.0
    
avg_ndcg = evaluate_ndcg(filtered_ratings, top_n=10)
print(f"\nAverage NDCG@10 = {avg_ndcg:.4f}")

Evaluating NDCG (skip low-rated):   0%|          | 1/3996 [00:01<1:21:50,  1.23s/it]

** DEBUG TABLE FOR USER: 7 **
   Rank  book_id  User Rating  Relevance  Predicted Score
0     1      416            5          1         3.304666
1     2      760            5          1         3.304666
2     3      612            5          1         3.304666
3     4     3711            5          1         3.304666
4     5       55            5          1         3.304666
5     6     2487            5          1         3.304666
User 7 NDCG@10: 1.0000


Evaluating NDCG (skip low-rated):   0%|          | 3/3996 [00:01<38:02,  1.75it/s]  

User 75 NDCG@10: 1.0000


Evaluating NDCG (skip low-rated):   0%|          | 4/3996 [00:02<40:39,  1.64it/s]

User 143 NDCG@10: 1.0000


Evaluating NDCG (skip low-rated):   0%|          | 5/3996 [00:03<42:58,  1.55it/s]

User 145 NDCG@10: 1.0000


Evaluating NDCG (skip low-rated):   0%|          | 6/3996 [00:04<43:57,  1.51it/s]

User 173 NDCG@10: 1.0000


Evaluating NDCG (skip low-rated):   0%|          | 7/3996 [00:04<44:37,  1.49it/s]

User 178 NDCG@10: 1.0000


Evaluating NDCG (skip low-rated):   0%|          | 8/3996 [00:05<45:09,  1.47it/s]

User 202 NDCG@10: 1.0000


Evaluating NDCG (skip low-rated):   0%|          | 9/3996 [00:06<45:01,  1.48it/s]

User 215 NDCG@10: 1.0000





KeyboardInterrupt: 