In [1]:
pip install transformers faiss-cpu sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, sentence-transformers
Successfully installed faiss-cpu-1.9.0 sentence-transformers-3.2.1


In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Placeholders for the login cookies and headers that are sent along with the
# shelf request; the real values are not stored in the notebook.
cookies = dict(
    # login cookies
)

headers = dict(
    # login headers
)

# Query-string parameters: which page of the shelf to request.
params = dict(page='16')

In [4]:
# Load the book table from books_info.csv (the same file the to_csv cell
# further down writes) and display its (rows, columns) shape.
df = pd.read_csv("books_info.csv")
df.shape

(699, 6)

In [6]:
# Scrape one page of the Goodreads "romance" shelf and append its books to df.
# NOTE: get_book_info must already be defined in the kernel before this runs.
params = {
    'page': '26',
}
response = requests.get(
    'https://www.goodreads.com/shelf/show/romance',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=30,  # never let a stalled connection hang the kernel
)
print(response.status_code)
# Stop on 4xx/5xx instead of parsing an error page as if it were a shelf.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# Every book on the shelf page is linked through an <a class="bookTitle"> tag.
books = soup.find_all('a', class_='bookTitle')
book_urls = ['https://www.goodreads.com' + book['href'] for book in books]

# Fetch the detail page of each book found on this shelf page.
book_info = [get_book_info(book_url) for book_url in book_urls]

# A page without book links yields nothing to append; leave df untouched
# rather than concatenating an empty frame.
if book_info:
    df = pd.concat([df, pd.DataFrame(book_info)], ignore_index=True)
df.shape

200


(699, 6)

In [5]:
def _div_text(soup, css_class):
    """Return the stripped text of the first <div> with css_class, or None if absent."""
    node = soup.find('div', class_=css_class)
    return node.text.strip() if node is not None else None


def get_book_info(url):
    """Scrape one book page and return its main fields.

    Args:
        url: Absolute URL of the book page to download.

    Returns:
        dict with the keys 'title', 'author', 'summary', 'rating' and
        'reviews'. 'title', 'author', 'summary' and 'rating' are the text of
        the matching page element, or None when the page has no such element.
        'reviews' is the text of at most the first five review sections joined
        by single spaces ('' when the page has none).

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: the request failed or timed out.
    """
    # A timeout keeps one stalled connection from hanging a whole scraping run.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error status instead of parsing the error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.find() returns None for a missing element; _div_text maps that to
    # None instead of raising AttributeError on `.text`.
    title = _div_text(soup, 'BookPageTitleSection__title')
    author = _div_text(soup, 'BookPageMetadataSection__contributor')
    summary = _div_text(soup, 'BookPageMetadataSection__description')

    # The rating is read with get_text(strip=True), which strips every text
    # fragment inside the element before concatenating them.
    rating_node = soup.find('div', class_='RatingStatistics__column')
    rating = rating_node.get_text(strip=True) if rating_node is not None else None

    # Keep only the first five review sections found on the page.
    reviews = soup.find_all('section', class_='ReviewText__content')
    review_texts = [review.get_text(strip=True) for review in reviews[:5]]
    # Join with a space so the last word of one review does not fuse with the
    # first word of the next.
    review_text = ' '.join(review_texts)

    return {
        'title': title,
        'author': author,
        'summary': summary,
        'rating': rating,
        'reviews': review_text
    }

In [None]:
# Download and parse the detail page behind every collected book URL, in order.
book_info = list(map(get_book_info, book_urls))
print(book_info)



In [None]:
# Turn the list of scraped records into a table (one row per record) and
# preview the first rows. This rebinds df to the freshly scraped books only.
df = pd.DataFrame(book_info)
df.head()

Unnamed: 0,title,author,summary,rating,reviews
0,Fallen #2Torment,Lauren Kate,Hell on earth.That’s what it’s like for Luce t...,3.84,"When I picked up this book, it had been about ..."
1,Bright Falls #3Iris Kelly Doesn't Date,Ashley Herring Blake,Everyone around Iris Kelly is in love. Her bes...,4.13,OKAY I CRIED this is literally the best one ye...
2,Lovelight #2In the Weeds,B.K. Borison,Evelyn St. James isn’t the kind of woman you f...,3.88,miss burger king wrote the HELL outta this rom...
3,Rokesbys #3The Other Miss Bridgerton,Julia Quinn,She was in the wrong place…Fiercely independen...,4.04,"Now, this is aKidnapped By A Pirateromance I c..."
4,Inn BoonsBoro Trilogy #2The Last Boyfriend,"Nora Roberts, MacLeod Andrews (Narrator)","Owen is the organizer of the Montgomery clan, ...",4.1,Avery and Owen are the ideal couple. I really ...


In [None]:
# Display the (rows, columns) shape of the current df.
df.shape

(201, 5)

In [59]:
# Display the (rows, columns) shape of the current df.
df.shape

(742, 6)

In [None]:
# Write the current table to books_info.csv; index=False keeps the row index
# out of the file.
df.to_csv('books_info.csv', index=False)

In [None]:
# Trope phrases searched for in the review text (matched case-insensitively by
# detect_tropes). Every entry must be followed by a comma: two adjacent string
# literals are silently concatenated by Python into a single bogus entry.
tropes_list = [
    'found family', 'love triangle', 'chosen one',
    'grumpy and sunshine', 'mentor', 'forbidden love', 'redemption arc', 'enemies to lovers', 'friends to lovers', 'grumpy x sunshine',
    'forced proximity', 'single parent', 'marriage of convenience', 'fake dating', 'billionaire romance', 'sport romance',
    'LGBTQ+ romance', 'workplace romance', 'age gap', 'slow burn', 'rom-com', 'second chance romance', 'one night stand', 'arranged marriage',
    'reformed playboy', 'celebrity romance', 'roommate romance', "best friend's brother", "brother's best friend", 'spicy', 'opposite attract', "opposite's attract",
    'coming of age', 'secret relationship', 'second chances', 'boss and employee', 'class difference', 'morally grey characters', 'harem', 'reverse harem', 'MMFM', 'RH', 'my wife trope',
    'badass FMC', 'he falls first', 'she falls first', 'just one bed', 'mafia', 'bad girl x good boy', 'bad boy x good girl',
]

def detect_tropes(reviews, tropes_list):
    """Return the set of tropes mentioned in any of the given review texts.

    A trope counts as mentioned when its lower-cased phrase occurs as a
    substring of the lower-cased review, so matching is case-insensitive and
    not limited to whole words.

    Args:
        reviews: Iterable of review texts. Entries that are not strings
            (for example None, or the NaN pandas uses for an empty CSV cell)
            are skipped instead of raising.
        tropes_list: Iterable of trope phrases to look for.

    Returns:
        set containing every phrase from tropes_list (in its original
        spelling) that occurs in at least one review.
    """
    # Lower-case every trope once instead of once per (review, trope) pair.
    lowered_tropes = [(trope, trope.lower()) for trope in tropes_list]

    detected_tropes = set()
    for review in reviews:
        if not isinstance(review, str):
            continue  # missing review: nothing to search
        review_lower = review.lower()
        detected_tropes.update(
            trope for trope, needle in lowered_tropes if needle in review_lower
        )
    return detected_tropes

# Tag every book with the tropes mentioned in its scraped reviews.
# detect_tropes expects a collection of texts, so each review string is
# wrapped in a one-element list.
df['book_tropes'] = df['reviews'].map(lambda text: detect_tropes([text], tropes_list))

In [None]:
# Store the tropes as one comma-separated string. A set has no fixed iteration
# order, so sort first: the same tropes then always produce the same string
# (and therefore the same text for the embedding step). Values that are not
# sets, e.g. after this cell already ran once, are left unchanged.
df['book_tropes'] = df['book_tropes'].apply(lambda x: ', '.join(sorted(x)) if isinstance(x, set) else x)

In [None]:

# Load a Sentence-BERT model for generating embeddings
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for book summaries. A missing summary is NaN/None rather
# than a string (pandas reads an empty CSV cell as NaN), which the encoder
# cannot tokenize, so missing summaries are embedded as empty text.
summary_texts = df['summary'].fillna('').astype(str).tolist()
summary_embeddings = embedder.encode(summary_texts)

# Create a FAISS index for efficient similarity search
dimension = summary_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# FAISS expects a C-contiguous float32 matrix; row i of the index is row i of df.
index.add(np.ascontiguousarray(summary_embeddings, dtype='float32'))

# Report how many vectors the index now holds
print(f"Number of books in index: {index.ntotal}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Number of books in index: 601


In [None]:

# Load the Sentence-BERT model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Build one text per book: summary, reviews and detected tropes, separated by
# the literal marker " [SEP] ".
# NOTE(review): the encoder may truncate long inputs, in which case the tropes
# placed at the end would be cut off; confirm against the model's max_seq_length.
df['combined_text'] = [
    f"{summary} [SEP] {reviews} [SEP] Tropes: {tropes}"
    for summary, reviews, tropes in zip(df['summary'], df['reviews'], df['book_tropes'])
]

# Embed the combined text of every book
combined_texts = df['combined_text'].tolist()
combined_embeddings = embedder.encode(combined_texts)

# Exact L2 index over the combined embeddings; this rebinds the notebook-wide
# `index`, so row i of the index corresponds to row i of df.
dimension = combined_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(combined_embeddings))

print(f"Number of books in index: {index.ntotal}")


Number of books in index: 601


In [None]:
# Load a Sentence-BERT model for generating embeddings
embedder = SentenceTransformer('all-MiniLM-L6-v2')


def get_recommendations(user_query, top_k=3, embedder=embedder):
    """Return the books whose indexed embedding is closest to a free-text query.

    Uses the notebook-level `index` (FAISS) and `df`: a hit at position i in
    the index is looked up as row i of df, so both must have been built from
    the same rows in the same order.

    Args:
        user_query: Text describing what the reader is looking for.
        top_k: Maximum number of books to return.
        embedder: Object with an `encode(list_of_texts)` method; defaults to
            the model loaded above (bound when this function is defined).

    Returns:
        list of dicts with the keys 'title', 'author', 'rating', 'tropes' and
        'summary', nearest book first. Holds fewer than top_k entries when the
        index contains fewer than top_k books.
    """
    # FAISS searches float32 matrices of shape (n_queries, dimension).
    query_embedding = np.asarray(embedder.encode([user_query]), dtype='float32')

    # Search FAISS index for top matching books
    _, indices = index.search(query_embedding, top_k)

    recommendations = []
    for i in indices[0]:
        # FAISS pads the result with -1 when it finds fewer than top_k
        # neighbours; df.iloc[-1] would silently return the last book.
        if i < 0:
            continue
        book = df.iloc[i]
        recommendations.append({
            "title": book['title'],
            "author": book['author'],
            "rating": book['rating'],
            "tropes": book['book_tropes'],
            "summary": book['summary']
        })

    return recommendations

In [None]:
# Test query embedding (user input for book tropes)
query = "friends to lovers"

# top_k is left at its default of 3, so at most three books come back; the
# whole list of dicts (including full summaries) is printed as-is.
top_books = get_recommendations(query)
print("Top recommended books: \n", top_books)

Top recommended books: 
 [{'title': 'Tomorrow, and Tomorrow, and Tomorrow', 'author': 'Gabrielle Zevin', 'rating': 4.15, 'tropes': 'love triangle', 'summary': "In this exhilarating novel, two friends—often in love, but never lovers—come together as creative partners in the world of video game design, where success brings them fame, joy, tragedy, duplicity, and, ultimately, a kind of immortality.On a bitter-cold day, in the December of his junior year at Harvard, Sam Masur exits a subway car and sees, amid the hordes of people waiting on the platform, Sadie Green. He calls her name. For a moment, she pretends she hasn't heard him, but then, she turns, and a game begins: a legendary collaboration that will launch them to stardom. These friends, intimates since childhood, borrow money, beg favors, and, before even graduating college, they have created their first blockbuster,  Ichigo. Overnight, the world is theirs. Not even twenty-five years old, Sam and Sadie are brilliant, successful, 