In [1]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer, util

# Load data
clean_folder = "clean"
books_df = pd.read_csv(os.path.join(clean_folder, "books.csv"))
book_tags_df = pd.read_csv(os.path.join(clean_folder, "book_tags.csv"))
tags_df = pd.read_csv(os.path.join(clean_folder, "tags.csv"))

# # Standardize column name for merging
# book_tags_df.rename(columns={'book_id': 'book_id'}, inplace=True)

# # Rename 'book_id' in books_df to 'book_id' for merging
# books_df.rename(columns={'book_id': 'book_id'}, inplace=True)

# Merge book_tags with tag names
book_tags_merged = pd.merge(book_tags_df, tags_df, on='tag_id', how='inner')

# Merge with books
books_with_tags = pd.merge(
    books_df[['book_id', 'title', 'authors', 'average_rating',
              'ratings_count', 'original_publication_year', 'language_code']],
    book_tags_merged,
    on='book_id',
    how='inner'
)

# Group by book and concatenate tag names into a single string
books_tagged = books_with_tags.groupby('book_id').agg({
    'title': 'first',
    'authors': 'first',
    'original_publication_year': 'first',
    'language_code': 'first',
    'tag_name': lambda x: ' '.join(set(x))  # deduplicated tag list
}).reset_index()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load model
model = SentenceTransformer('BAAI/bge-m3')

# Encode tag text for each book
books_tagged['tag_text'] = books_tagged['tag_name']
book_tag_embeddings0 = model.encode(books_tagged['tag_text'].tolist(), convert_to_tensor=True)

In [3]:
def recommend_by_multiple_genres(user_genres, top_n=10):
    """
    user_genres: str — comma-separated genres, e.g., "Fantasy, Mystery, Romance"
    top_n: int — number of results to return
    """
    # Parse and clean genres
    genre_list = [g.strip() for g in user_genres.split(',') if g.strip()]
    
    if not genre_list:
        raise ValueError("Please input at least one genre!")

    # Embed each genre separately
    genre_embeddings = model.encode(genre_list, convert_to_tensor=True)

    # Compute average embedding (user profile)
    user_embedding = genre_embeddings.mean(dim=0)

    # Compute cosine similarity with all books
    scores = util.pytorch_cos_sim(user_embedding, book_tag_embeddings0)[0]
    top_results = scores.topk(top_n)

    # Extract matching rows
    results = books_tagged.iloc[top_results[1].cpu().numpy()].copy()
    results['similarity'] = top_results[0].cpu().numpy()
    return results[[
        'book_id', 'title', 'authors', 'original_publication_year',
        'language_code', 'similarity'
    ]]

In [4]:
user_input = "Fantasy, Paranormal, Fiction, Science Fiction, Graphic Novels, Novel, Urban Fiction"
recommendations = recommend_by_multiple_genres(user_input, top_n=10)
print(recommendations)

      book_id                                     title  \
6637     6638          Cross My Heart (Alex Cross, #21)   
5691     5692         Alex Cross, Run (Alex Cross, #20)   
4021     4022           Cross Country (Alex Cross, #14)   
8466     8467        Silken Prey (Lucas Davenport, #23)   
6293     6294                Private Games (Private #3)   
3692     3693        Buried Prey (Lucas Davenport, #21)   
4489     4490          Kill Alex Cross (Alex Cross #18)   
8643     8644               Private London (Private #4)   
7937     7938  I, Michael Bennett (Michael Bennett, #5)   
2980     2981            Double Cross (Alex Cross, #13)   

                                authors  original_publication_year  \
6637                    James Patterson                     2013.0   
5691                    James Patterson                     2013.0   
4021                    James Patterson                     2008.0   
8466                      John Sandford                     2013.0   


In [10]:
ratings_df = pd.read_csv(os.path.join(clean_folder, "ratings.csv"))
to_read_test_df = pd.read_csv(os.path.join(clean_folder, "to_read_test.csv"))

In [12]:
# ------------------------------------
# 4. Filter Users + Sample
# ------------------------------------
#   We'll keep only users who rated >60 and <160 books,
#   then randomly sample 20% of those rating rows.

# Count how many ratings each user did
user_counts = ratings_df.groupby('user_id')['book_id'].count().reset_index()
user_counts.rename(columns={'book_id': 'count_ratings'}, inplace=True)

# Keep those who rated between 60 and 160
eligible_users = user_counts[
    (user_counts['count_ratings'] > 30) &
    (user_counts['count_ratings'] < 100)
]
print(f"Eligible users: {len(eligible_users)}")

Eligible users: 16150


In [13]:
# Merge to keep only those users' ratings
filtered_ratings = pd.merge(
    ratings_df,
    eligible_users[['user_id']],
    on='user_id',
    how='inner'
)

In [14]:


print(f"Filtered user count: {len(eligible_users)}")
print(f"Filtered ratings shape: {filtered_ratings.shape}")

# print user id in filtered ratings
print("User IDs in filtered ratings:")
print(filtered_ratings['user_id'].unique())


Filtered user count: 16150
Filtered ratings shape: (1355729, 3)
User IDs in filtered ratings:
[    2     6     8 ... 52013 33111 49802]


In [None]:
# Evaluation
# ------------------------------------
# input test file
# loop to all user
# get user rated book from ratings.csv
# get tag from first 3 book compare to tag_id in UI_tag.csv most match use that genre as a genre to test as user_test_genre
# get first 5 book from to_read_test.csv as wishlist (will be goal of evaluation)
# get recommendation from recommend_by_multiple_genres(user_test_genre, 5)
# compare the recommendation with wishlist score using dcg
# sum up all the score
# end loop
# average the score

In [17]:
from tqdm import tqdm
from collections import Counter
import math

# Evaluation BEST ONE
# ------------------------------------
# input test file
# loop to all user
# get user rated book from ratings.csv
# get tag from first 3 book compare to tag_id in UI_tag.csv most match use that genre as a genre to test as user_test_genre
# get first 5 book from to_read_test.csv as wishlist (will be goal of evaluation)
# get recommendation from recommend_by_multiple_genres(user_test_genre, 5)
# compare the recommendation with wishlist score using ndcg
# sum up all the score
# end loop
# average the score
# ------------------------------------

def evaluate_ndcg(eligible_users, ratings_df, books_tagged, to_read_test_df, top_n=10):
    """
    Evaluate the NDCG score for recommendations.

    Parameters:
    - eligible_users: DataFrame containing eligible user IDs.
    - ratings_df: DataFrame containing user ratings.
    - books_tagged: DataFrame containing books with their tags.
    - to_read_test: DataFrame containing users' wishlist books.
    - top_n: Number of recommendations to consider for NDCG calculation.

    Returns:
    - average_ndcg_score: The average NDCG score across all users.
    """

    # Initialize variables for evaluation
    total_ndcg_score = 0
    user_count = 0

    # Loop through all eligible users with tqdm progress bar
    for user_id in tqdm(eligible_users['user_id'], desc="Evaluating NDCG"):
        # Get books rated by the user
        user_ratings = ratings_df[ratings_df['user_id'] == user_id]
        rated_books = user_ratings.sort_values(by='rating', ascending=False).head(3)['book_id'].tolist()

        # Get tags for the top 3 rated books
        relevant_tags = books_tagged[books_tagged['book_id'].isin(rated_books)]['tag_name']
        all_words = " ".join(relevant_tags.tolist()).split()
        word_counts = Counter(all_words)
        top_tags = [w for w, c in word_counts.most_common(5)]
        if not top_tags:
            continue

        user_test_genre = ", ".join(top_tags)

        # Get the user's wishlist (goal of evaluation)
        wishlist = to_read_test_df[to_read_test_df['user_id'] == user_id]['book_id'].tolist()
        if not wishlist:
            continue

        # Get recommendations
        recommendations = recommend_by_multiple_genres(user_test_genre, top_n=top_n)
        recommended_books = recommendations['book_id'].tolist()

        # Calculate DCG score
        dcg_score = 0
        for i, book_id in enumerate(recommended_books):
            if book_id in wishlist:  # check if the recommended book is in the wishlist
                dcg_score += 1 / math.log2(i + 2)  # DCG formula

        # Calculate IDCG score
        idcg_score = 0
        for i in range(min(len(wishlist), top_n)):
            idcg_score += 1 / math.log2(i + 2)  # Ideal DCG formula

        # Calculate NDCG score
        ndcg_score = dcg_score / idcg_score if idcg_score > 0 else 0
        total_ndcg_score += ndcg_score
        user_count += 1

    # Calculate average NDCG score
    average_ndcg_score = total_ndcg_score / user_count if user_count > 0 else 0
    return average_ndcg_score

evaluation_score = evaluate_ndcg(eligible_users, ratings_df, books_tagged, to_read_test_df)
print(f"Average NDCG Score: {evaluation_score:.4f}")


Evaluating NDCG: 100%|██████████| 16150/16150 [00:42<00:00, 377.78it/s]

Average NDCG Score: 0.0043





In [None]:
import numpy as np
import pandas as pd
import math
from tqdm import tqdm
from collections import Counter

def compute_ndcg_at_k(ground_truth_ids, recommended_ids, user_data, k=10):
    """
    Compute NDCG for a single user at cutoff k, manually.
    
    Params
    ------
    ground_truth_ids : set
        Set of book_ids the user rated >= 5 (i.e. 'relevant').
    recommended_ids : list
        List of recommended book_ids in the final top-k (rank order).
    user_data : pd.DataFrame
        The subset of the ratings DataFrame for this user alone
        (so we can look up the actual rating for each book).
    k : int
        The number of items to consider (already ensured recommended_ids has up to k items).
        
    Returns
    -------
    ndcg_val : float
        The NDCG for this user at k.
    df_debug : pd.DataFrame
        A table showing rank, book_id, actual rating, item-level DCG
        (so we can see how DCG is accumulated).
    """
    recommended_top_k = recommended_ids[:k]
    
    # 1) Build a binary relevance list: 1 if rating >=5, else 0
    relevance = [1 if b in ground_truth_ids else 0 for b in recommended_top_k]
    
    # 2) Compute DCG for each rank i in [0..k-1], storing item-level contributions
    dcg_values = []
    for i, rel in enumerate(relevance):
        rank = i + 1  # rank is 1-based
        dcg_i = (2 ** rel - 1) / math.log2(rank + 1)
        dcg_values.append(dcg_i)
    
    dcg = sum(dcg_values)
    
    # 3) Compute IDCG by sorting relevance in descending order
    ideal_relevance = sorted(relevance, reverse=True)
    idcg_values = []
    for i, rel in enumerate(ideal_relevance):
        rank = i + 1
        idcg_i = (2 ** rel - 1) / math.log2(rank + 1)
        idcg_values.append(idcg_i)
    
    idcg = sum(idcg_values)
    ndcg_val = dcg / idcg if idcg > 0 else 0.0

    # Build a debug DataFrame
    debug_rows = []
    for i, book_id in enumerate(recommended_top_k):
        rank = i + 1
        # Actual rating from user_data
        row = user_data[user_data['book_id'] == book_id]
        rating = row['rating'].values[0] if not row.empty else 0
        
        debug_rows.append({
            'Rank': rank,
            'book_id': book_id,
            'User Rating': rating,
            'DCG Contribution': dcg_values[i]
        })
        
    df_debug = pd.DataFrame(debug_rows)
    
    return ndcg_val, df_debug

def evaluate_ndcg(ratings_subset, top_n=10):
    """
    For each user:
      1) Gather user's relevant books (rating >=5).
      2) Build a 'genre query' from top-5 tags of those relevant books.
      3) Get a larger set of recommended items using `recommend_by_multiple_genres`.
      4) Skip any book that the user has not rated (rating=0).
      5) Keep collecting items (in rank order) up to `top_n`.
      6) Compute NDCG (manually) and store it.
      7) Print a table showing rank, book_id, user rating, DCG contribution for each user.
    Finally, return the average NDCG@k across users.
    """
    user_ids = ratings_subset['user_id'].unique()
    ndcg_list = []

    # Pre-build a dictionary from book_id -> book_id
    if 'book_id' not in books_tagged.columns:
        raise KeyError("'book_id' column is missing in books_tagged DataFrame.")
    
    # Map from book_id to book_id
    # (If your data is different, adjust accordingly.)
    gr2id = {}
    for gid in books_tagged['book_id']:
        gr2id[gid] = gid  # If they are truly the same, or else do a real map

    # Alternatively, if books_tagged *does* have a separate 'book_id' column:
    #   gr2id = dict(zip(books_tagged['book_id'], books_tagged['book_id']))

    all_debug_tables = []  # to store or display if you want

    for uid in tqdm(user_ids, desc="Evaluating NDCG"):
        # -- A) Get user data
        user_data = ratings_subset[ratings_subset['user_id'] == uid].copy()
        
        # relevant_books = rated >= 5
        relevant_books = set(user_data[user_data['rating'] >= 5]['book_id'].unique())
        if len(relevant_books) == 0:
            continue  # no relevant => skip

        # -- B) Build a naive 'genre query' from top 5 tags of relevant books
        #    You may need to use a suitable merge or direct indexing. 
        #    Below: if the 'book_id' in books_tagged is the same as
        #    the user's 'book_id', it’s simpler, but typically you might need
        #    a separate map or join if columns differ.  
        #    We'll assume the columns line up or you can adapt as needed.

        relevant_tags = books_tagged[books_tagged['book_id'].isin(relevant_books)]['tag_name']
        all_words = " ".join(relevant_tags.tolist()).split()
        word_counts = Counter(all_words)
        top_5_tags = [w for w, c in word_counts.most_common(5)]
        if not top_5_tags:
            continue
        
        user_genres = ", ".join(top_5_tags)

        # -- C) Get a larger set of recommendations
        #    We'll ask for top_n * 5 to have enough items to skip from.
        recs = recommend_by_multiple_genres(user_genres, top_n=top_n * 5)
        # recs should have 'book_id' in rank order
        recommended_gids = recs['goodreads_book_id'].tolist()

        # -- D) Build a final list that ONLY includes items user rated >0
        #       i.e. skip rating=0. Keep collecting until we have `top_n`.
        final_recs = []
        for gid in recommended_gids:
            bk_id = gr2id.get(gid, None)
            if bk_id is None:
                continue  # not in dictionary

            # Check user's rating
            row = user_data[user_data['book_id'] == bk_id]
            user_rating = row['rating'].values[0] if not row.empty else 0
            if user_rating > 0:
                final_recs.append(bk_id)
                if len(final_recs) == top_n:
                    break

        if len(final_recs) < 1:
            # No recommended items that user actually rated => skip
            continue

        # -- E) Compute NDCG for these final recommendations
        ndcg_val, df_debug = compute_ndcg_at_k(
            ground_truth_ids=relevant_books,
            recommended_ids=final_recs,
            user_data=user_data,
            k=top_n
        )
        ndcg_list.append(ndcg_val)

        first = True
        if first:
            all_debug_tables.append(df_debug)
            first = False
        # Print a debug table for this user
        print(f"\nUser: {uid}  (NDCG@{top_n} = {ndcg_val:.4f})")
        print(df_debug.to_string(index=False))  # or display as you like
        print("----------------------------------------------------")

    # -- F) Return the average
    if ndcg_list:
        return np.mean(ndcg_list)
    else:
        return 0.0

# Example usage:
avg_ndcg = evaluate_ndcg(filtered_ratings, top_n=10)
print(f"\nGlobal Average NDCG@10 = {avg_ndcg:.4f}")

Evaluating NDCG: 100%|██████████| 3996/3996 [03:25<00:00, 19.47it/s]


Global Average NDCG@10 = 0.4389





Top users with the most rated books: [28158, 7563, 24143, 37834, 6630]
No data found for user 6630.


In [111]:
print(filtered_ratings_20p)

       user_id  book_id  rating
0        19009       56       2
1        10484     5500       3
2         3106      456       3
3        44407     3647       5
4         5419     6753       3
...        ...      ...     ...
74115     9814     5006       4
74116    39590       12       5
74117    38866     3070       5
74118    50769      345       2
74119    22529     2405       3

[74120 rows x 3 columns]
