In [3]:
import pandas as pd
import os

# Load the three cleaned CSVs from the local "clean" folder
clean_folder = "clean"
books_df = pd.read_csv(os.path.join(clean_folder, "books.csv"))
book_tags_df = pd.read_csv(os.path.join(clean_folder, "book_tags.csv"))
tags_df = pd.read_csv(os.path.join(clean_folder, "tags.csv"))

# Give book_tags the same join-key name that the books frame is renamed to
# below. Reassign instead of inplace=True so the statement reads as a pure
# transformation of the freshly loaded frame.
book_tags_df = book_tags_df.rename(columns={'book_id': 'goodreads_book_id'})

# Attach the human-readable tag name to every (book, tag) pair
book_tags_merged = pd.merge(book_tags_df, tags_df, on='tag_id', how='inner')

# Attach tags to the book metadata; books without any tag are dropped
# by the inner join.
books_with_tags = pd.merge(
    books_df[['book_id', 'title', 'authors', 'average_rating',
              'ratings_count', 'original_publication_year', 'language_code']].rename(columns={'book_id': 'goodreads_book_id'}),
    book_tags_merged,
    on='goodreads_book_id',
    how='inner'
)

# Collapse to one row per book. Metadata columns are constant within a book,
# so 'first' is enough; the tag names are joined into a single
# space-separated string.
#
# De-duplicate with dict.fromkeys rather than set(): iterating a set of
# strings yields a different order on every interpreter start (string hash
# randomisation), which made the tag string -- and any embedding computed
# from it -- change between kernel restarts. dict.fromkeys keeps the order
# of first appearance, so the result is deterministic. dropna() guards
# against missing tag names, on which str.join would raise TypeError.
books_tagged = books_with_tags.groupby('goodreads_book_id').agg({
    'title': 'first',
    'authors': 'first',
    'average_rating': 'first',
    'ratings_count': 'first',
    'original_publication_year': 'first',
    'language_code': 'first',
    'tag_name': lambda names: ' '.join(dict.fromkeys(names.dropna()))
}).reset_index()

# Preview
print(books_tagged.head())

   goodreads_book_id                                              title  \
0                  1            The Hunger Games (The Hunger Games, #1)   
1                  2  Harry Potter and the Sorcerer's Stone (Harry P...   
2                  3                            Twilight (Twilight, #1)   
3                  4                              To Kill a Mockingbird   
4                  5                                   The Great Gatsby   

                       authors  average_rating  ratings_count  \
0              Suzanne Collins            4.34        4780653   
1  J.K. Rowling, Mary GrandPré            4.44        4602479   
2              Stephenie Meyer            3.57        3866839   
3                   Harper Lee            4.25        3198671   
4          F. Scott Fitzgerald            3.89        2683664   

   original_publication_year language_code  \
0                     2008.0           eng   
1                     1997.0           eng   
2                   

In [None]:
from sentence_transformers import SentenceTransformer, util

# Compact sentence-embedding model (MiniLM, 6 layers)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every book's tag string in one batch call. The resulting tensor has
# one row per row of books_tagged, in the same order.
book_tag_embeddings = model.encode(
    books_tagged['tag_name'].tolist(),
    convert_to_tensor=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def recommend_by_genre_tags(user_query: str, top_n: int = 10) -> pd.DataFrame:
    """Return the books whose tag strings are most similar to a free-text query.

    The query is embedded with the notebook-level ``model`` and compared by
    cosine similarity against ``book_tag_embeddings``; row ``i`` of that
    tensor is expected to correspond to row ``i`` of ``books_tagged``.

    Parameters
    ----------
    user_query : str
        Free-text genre/tag description, e.g. ``"Fantasy"``.
    top_n : int, default 10
        Maximum number of books to return. If it exceeds the number of
        embedded books, every book is returned instead of raising.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching ``books_tagged`` rows, best match first,
        restricted to the metadata columns plus a ``similarity`` column
        holding the cosine score.
    """
    # Embed user query (e.g. "Fantasy")
    query_emb = model.encode(user_query, convert_to_tensor=True)

    # Cosine similarity of the single query against every book; [0] selects
    # the only row of the (1, n_books) similarity matrix.
    scores = util.pytorch_cos_sim(query_emb, book_tag_embeddings)[0]

    # torch.topk raises RuntimeError when k is larger than the dimension
    # size, so never ask for more results than there are scores.
    k = min(top_n, scores.shape[0])
    top = scores.topk(k)

    # Positional lookup: indices from topk are row positions, not labels.
    results = books_tagged.iloc[top.indices.cpu().numpy()].copy()
    results['similarity'] = top.values.cpu().numpy()
    return results[[
        'goodreads_book_id', 'title', 'authors', 'average_rating',
        'ratings_count', 'original_publication_year',
        'language_code', 'similarity'
    ]]

In [8]:
# Demo: the ten books whose tags sit closest to the query "Fantasy"
recs = recommend_by_genre_tags(user_query="Fantasy", top_n=10)
print(recs)

      goodreads_book_id                                              title  \
1900               1901                            Shadowfever (Fever, #5)   
1890               1891                             Dreamfever (Fever, #4)   
1945               1946                               Faefever (Fever, #3)   
6718               6719                          Storm Born (Dark Swan #1)   
9981               9982                              Feverborn (Fever, #8)   
7279               7280                                 Burned (Fever, #7)   
9191               9192                        Thorn Queen (Dark Swan, #2)   
5565               5566                     Dragon Bound (Elder Races, #1)   
6677               6678  The Wheel of Time: Boxed Set #1 (Wheel of Time...   
5727               5728             Dance of the Gods (Circle Trilogy, #2)   

                 authors  average_rating  ratings_count  \
1900  Karen Marie Moning            4.46          73030   
1890  Karen Marie Monin