In [3]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer, util

# Load data
# Paths are relative, so the notebook must be run from the directory that
# contains the "clean" folder.
clean_folder = "clean"
books_df = pd.read_csv(os.path.join(clean_folder, "books.csv"))
book_tags_df = pd.read_csv(os.path.join(clean_folder, "book_tags.csv"))
tags_df = pd.read_csv(os.path.join(clean_folder, "tags.csv"))

# Standardize the join key: both frames must expose it as 'goodreads_book_id'.
# rename() is used without inplace=True so re-running the cell always starts
# from the freshly loaded frames; a missing 'book_id' column is silently ignored.
# NOTE(review): assumes neither file already has BOTH 'book_id' and
# 'goodreads_book_id' (that would produce duplicate column names) — confirm.
book_tags_df = book_tags_df.rename(columns={'book_id': 'goodreads_book_id'})
books_df = books_df.rename(columns={'book_id': 'goodreads_book_id'})

# Merge book_tags with tag names (inner join: tag ids without a name are dropped)
book_tags_merged = pd.merge(book_tags_df, tags_df, on='tag_id', how='inner')

# Merge with books (inner join: only books that have at least one named tag survive)
books_with_tags = pd.merge(
    books_df[['goodreads_book_id', 'title', 'authors', 'average_rating',
              'ratings_count', 'original_publication_year', 'language_code']],
    book_tags_merged,
    on='goodreads_book_id',
    how='inner'
)

# Group by book and concatenate tag names into a single string.
# The book-level columns are repeated on every tag row, so 'first' recovers them.
books_tagged = books_with_tags.groupby('goodreads_book_id').agg({
    'title': 'first',
    'authors': 'first',
    'average_rating': 'first',
    'ratings_count': 'first',
    'original_publication_year': 'first',
    'language_code': 'first',
    # Deduplicated tag list. sorted() makes the string identical on every run
    # (plain set iteration order of strings changes between kernel sessions,
    # which would change the text that gets embedded). Missing tag names are
    # dropped and values coerced to str so ' '.join cannot fail.
    'tag_name': lambda x: ' '.join(sorted(set(x.dropna().astype(str))))
}).reset_index()

In [None]:
# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode tag text for each book.
# 'tag_text' is a copy of the concatenated tag string built in the previous cell.
books_tagged['tag_text'] = books_tagged['tag_name']

# One embedding per row of books_tagged, in row order, so positions returned
# by a similarity search can be mapped back with books_tagged.iloc.
# This is the name recommend_by_multiple_genres() reads; it must be defined
# here for the notebook to work after Restart & Run All.
book_tag_embeddings = model.encode(books_tagged['tag_text'].tolist(), convert_to_tensor=True)

# Backward-compatible alias for the name this cell originally defined.
book_tag_embeddings0 = book_tag_embeddings

In [8]:
def recommend_by_multiple_genres(user_genres, top_n=10):
    """
    Recommend books whose tag embedding is most similar to the average
    embedding of the requested genres.

    user_genres: str — comma-separated genres, e.g., "Fantasy, Mystery, Romance"
    top_n: int — maximum number of results to return; capped at the number of
           embedded books so an oversized request does not fail

    Returns: DataFrame — the selected rows of `books_tagged` (id, title,
             authors, rating and publication columns) plus a 'similarity'
             column holding the cosine score, highest score first.

    Raises: ValueError if `user_genres` contains no non-empty genre.

    Relies on the module-level `model`, `book_tag_embeddings` and
    `books_tagged`; the embeddings are looked up by position, so they must be
    in the same row order as `books_tagged`.
    """
    # Parse and clean genres (blank entries such as "a,,b" are skipped)
    genre_list = [g.strip() for g in user_genres.split(',') if g.strip()]

    if not genre_list:
        raise ValueError("Please input at least one genre!")

    # Embed each genre separately
    genre_embeddings = model.encode(genre_list, convert_to_tensor=True)

    # Compute average embedding (user profile)
    user_embedding = genre_embeddings.mean(dim=0)

    # Compute cosine similarity with all books; [0] takes the single row of
    # scores that corresponds to the one user embedding.
    scores = util.pytorch_cos_sim(user_embedding, book_tag_embeddings)[0]

    # topk raises if k exceeds the number of scores, so cap the request.
    k = min(top_n, scores.shape[0])
    top_results = scores.topk(k)

    # Extract matching rows by position and attach their scores
    results = books_tagged.iloc[top_results.indices.cpu().numpy()].copy()
    results['similarity'] = top_results.values.cpu().numpy()
    return results[[
        'goodreads_book_id', 'title', 'authors', 'average_rating',
        'ratings_count', 'original_publication_year',
        'language_code', 'similarity'
    ]]

In [None]:
# Example query: a comma-separated string of genre terms, which
# recommend_by_multiple_genres splits on ',' and embeds one term at a time.
user_input = "Fantasy, Science Fiction, Young"
# Ask for the 10 books whose tag embeddings best match the averaged query.
recommendations = recommend_by_multiple_genres(user_input, top_n=10)
# print() renders the DataFrame as plain text, wrapping wide frames into column groups.
print(recommendations)

      goodreads_book_id                                              title  \
5216               5217                 The Skull Throne (Demon Cycle, #4)   
3454               3455           The Lake House (When the Wind Blows, #2)   
5842               5843                                           The Fold   
1735               1736                                        Dark Matter   
9981               9982                              Feverborn (Fever, #8)   
2997               2998                    Magic Bleeds (Kate Daniels, #4)   
6317               6318  The Cat Who Walks Through Walls (The World As ...   
4327               4328                          A Knight in Shining Armor   
2083               2084                                            Anathem   
2605               2606                 The Daylight War (Demon Cycle, #3)   

                 authors  average_rating  ratings_count  \
5216      Peter V. Brett            4.14          15946   
3454     James Patterso