In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [2]:
%pip install langchain-huggingface

Note: you may need to restart the kernel to use updated packages.


In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
import pandas as pd
books = pd.read_csv("books_cleaned.csv")

In [5]:
books["tagged_description"]

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006178736 A memorable, mesmerizing heroine...
3       9780006280897 Lewis' work on the nature of lov...
4       9780006280934 "In The Problem of Pain, C.S. Le...
                              ...                        
5192    9788172235222 On A Train Journey Home To North...
5193    9788173031014 This book tells the tale of a ma...
5194    9788179921623 Wisdom to Create a Life of Passi...
5195    9788185300535 This collection of the timeless ...
5196    9789027712059 Since the three volume edition o...
Name: tagged_description, Length: 5197, dtype: object

In [6]:
books["tagged_description"].to_csv("tagged_description.txt",
                                   lineterminator="\n",
                                   index=False,
                                   header=False)


In [7]:
raw_documents = TextLoader("tagged_description.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 1170, which is longer than the specified 0
Created a chunk of size 1216, which is longer than the specified 0
Created a chunk of size 375, which is longer than the specified 0
Created a chunk of size 311, which is longer than the specified 0
Created a chunk of size 483, which is longer than the specified 0
Created a chunk of size 484, which is longer than the specified 0
Created a chunk of size 962, which is longer than the specified 0
Created a chunk of size 188, which is longer than the specified 0
Created a chunk of size 845, which is longer than the specified 0
Created a chunk of size 296, which is longer than the specified 0
Created a chunk of size 197, which is longer than the specified 0
Created a chunk of size 881, which is longer than the specified 0


Created a chunk of size 1090, which is longer than the specified 0
Created a chunk of size 1191, which is longer than the specified 0
Created a chunk of size 306, which is longer than the specified 0
Created a chunk of size 270, which is longer than the specified 0
Created a chunk of size 213, which is longer than the specified 0
Created a chunk of size 216, which is longer than the specified 0
Created a chunk of size 515, which is longer than the specified 0
Created a chunk of size 754, which is longer than the specified 0
Created a chunk of size 390, which is longer than the specified 0
Created a chunk of size 265, which is longer than the specified 0
Created a chunk of size 253, which is longer than the specified 0
Created a chunk of size 308, which is longer than the specified 0
Created a chunk of size 730, which is longer than the specified 0
Created a chunk of size 723, which is longer than the specified 0
Created a chunk of size 258, which is longer than the specified 0
Created 

In [8]:
documents[0]

Document(metadata={'source': 'tagged_description.txt'}, page_content='"9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, G

In [9]:
%pip install --upgrade google-genai

Note: you may need to restart the kernel to use updated packages.


In [10]:
%pip install langchain-google-genai google-generativeai langchain-chroma

Note: you may need to restart the kernel to use updated packages.


In [11]:
GEMINI_EMBEDDING_MODEL = "gemini-embedding-exp-03-07"

In [12]:
PERSIST_DIRECTORY = "db_books"

In [13]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma

In [14]:
import os
os.environ["GOOGLE_API_KEY"] = "AIzaSyCg_fq1QvhArgWOyEtn5kV_vvATPvamlAY"


In [15]:
import google.generativeai as genai

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


In [16]:
def get_gemini_embedding(text):
    response = genai.embed_content(
        model="models/gemini-embedding-exp-03-07",
        content={"parts": [{"text": text}]}
    )
    return response['embedding']



In [24]:
import os
import time
import random
import chromadb
from chromadb import PersistentClient

# Use a safe writable path
PERSIST_DIRECTORY = "db-books"  # Change if needed

# Ensure the directory is writable
os.makedirs(PERSIST_DIRECTORY, exist_ok=True)

# Initialize Chroma
client = PersistentClient(path=PERSIST_DIRECTORY)
collection = client.get_or_create_collection("books")

# Retry wrapper for the embedding function
def get_embedding_with_retry(text, max_retries=5):
    for attempt in range(max_retries):
        try:
            return get_gemini_embedding(text)
        except Exception as e:
            if "429" in str(e) or "ResourceExhausted" in str(e):
                wait = (2 ** attempt) + random.random()
                print(f"Rate limit hit. Retrying in {wait:.2f}s...")
                time.sleep(wait)
            else:
                raise
    raise RuntimeError("Max retries exceeded for embedding")

# Add documents safely
for i, doc in enumerate(documents):
    doc_id = str(i)

    # Skip if already exists
    existing = collection.get(ids=[doc_id])
    if existing["ids"]:
        print(f"Skipping existing ID: {doc_id}")
        continue

    try:
        embedding = get_embedding_with_retry(doc.page_content)
        collection.add(
            ids=[doc_id],
            embeddings=[embedding],
            metadatas=[{"source": "book"}],
            documents=[doc.page_content]
        )
        print(f"Added ID: {doc_id}")
    except Exception as e:
        print(f"Failed to add ID {doc_id}: {e}")



Added ID: 0
Added ID: 1
Added ID: 2
Added ID: 3
Added ID: 4
Added ID: 5
Added ID: 6
Added ID: 7
Added ID: 8
Added ID: 9
Added ID: 10
Added ID: 11
Added ID: 12
Added ID: 13
Added ID: 14
Added ID: 15
Added ID: 16
Added ID: 17
Added ID: 18
Added ID: 19
Added ID: 20
Added ID: 21
Added ID: 22
Added ID: 23
Added ID: 24
Added ID: 25
Added ID: 26
Added ID: 27
Added ID: 28
Added ID: 29
Added ID: 30
Added ID: 31
Added ID: 32
Added ID: 33
Added ID: 34
Added ID: 35
Added ID: 36
Added ID: 37
Added ID: 38
Added ID: 39
Added ID: 40
Added ID: 41
Added ID: 42
Added ID: 43
Added ID: 44
Added ID: 45
Added ID: 46
Added ID: 47
Added ID: 48
Added ID: 49
Added ID: 50
Added ID: 51
Added ID: 52
Added ID: 53
Added ID: 54
Added ID: 55
Added ID: 56
Added ID: 57
Added ID: 58
Added ID: 59
Added ID: 60
Added ID: 61
Added ID: 62
Added ID: 63
Added ID: 64
Added ID: 65
Added ID: 66
Added ID: 67
Added ID: 68
Added ID: 69
Added ID: 70
Added ID: 71
Added ID: 72
Added ID: 73
Rate limit hit. Retrying in 1.00s...
Rate limit 

In [29]:
query = "A book to teach children about nature"
results = collection.query(
	query_embeddings=[get_gemini_embedding(query)],
	n_results=10
)
results

{'ids': [['3747',
   '3797',
   '442',
   '1739',
   '3748',
   '4557',
   '3750',
   '3765',
   '855',
   '1970']],
 'embeddings': None,
 'documents': [['"9780786808069 Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience."',
   '"9780789458209 Photographs and text explore the anatomy and life cycle of trees, examining the different kinds of bark, seeds, and leaves, the commercial processing of trees to make lumber, the creatures that live in trees, and other aspects."',
   '"9780067575208 First published more than three decades ago, this reissue of Rachel Carson\'s award-winning classic brings her unique vision to a new generation of readers. Stunning new photographs by Nick Kelsh beautifully complement Carson\'s intimate account of adventures with her young nephew, Roger, as t

In [31]:
books[books["isbn13"] == int(documents[0].page_content.split()[0].strip('"'))]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...


In [36]:
def retrieve_semantic_recommendations(query: str, top_k: int = 10) -> pd.DataFrame:
	results = collection.query(
		query_embeddings=[get_gemini_embedding(query)],
		n_results=top_k
	)
	books_list = [int(doc.split()[0].strip('"')) for doc in results["documents"][0]]
	return books[books["isbn13"].isin(books_list)]

retrieve_semantic_recommendations("books to teach kids about nature")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
442,9780067575208,006757520X,The Sense of Wonder,Rachel Carson,Nature,http://books.google.com/books/content?id=Zee5S...,"First published more than three decades ago, t...",1998.0,4.39,112.0,1160.0,The Sense of Wonder,9780067575208 First published more than three ...
855,9780143037392,0143037390,The Read-aloud Handbook,Jim Trelease,Language Arts & Disciplines,http://books.google.com/books/content?id=B2_yU...,Explains the importance of reading aloud to ch...,2006.0,4.4,432.0,4122.0,The Read-aloud Handbook,9780143037392 Explains the importance of readi...
1907,9780393315110,0393315118,Uncommon Ground: Rethinking the Human Place in...,William Cronon,Law,http://books.google.com/books/content?id=w04mj...,"Essays by revisionist historians, scientists, ...",1996.0,4.16,560.0,649.0,Uncommon Ground: Rethinking the Human Place in...,9780393315110 Essays by revisionist historians...
1970,9780395746561,0395746566,Sector 7,David Wiesner,Juvenile Fiction,http://books.google.com/books/content?id=DNszR...,While on a school trip to the Empire State Bui...,1999.0,4.21,48.0,4962.0,Sector 7,9780395746561 While on a school trip to the Em...
3747,9780786808069,0786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals,9780786808069 Children will discover the excit...
3748,9780786808373,0786808373,Baby Einstein: Birds,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=0jxHP...,"Introducing your baby to birds, cats, dogs, an...",2002.0,3.78,20.0,9.0,Baby Einstein: Birds,"9780786808373 Introducing your baby to birds, ..."
3749,9780786808380,0786808381,Baby Einstein: Babies,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=jv4NA...,"Introduce your babies to birds, cats, dogs, an...",2002.0,4.03,20.0,29.0,Baby Einstein: Babies,"9780786808380 Introduce your babies to birds, ..."
3750,9780786808397,078680839X,Baby Einstein: Dogs,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=qut8t...,"Introduce your baby to birds, cats, dogs, and ...",2002.0,3.81,20.0,26.0,Baby Einstein: Dogs,"9780786808397 Introduce your baby to birds, ca..."
3797,9780789458209,0789458209,Tree,David Burnie,Juvenile Nonfiction,http://books.google.com/books/content?id=Qwsqj...,Photographs and text explore the anatomy and l...,2000.0,4.07,64.0,5.0,Tree,9780789458209 Photographs and text explore the...
4557,9781551052700,1551052709,Ecuador Nature Guide,Christopher D. Jiggins,Botanique,http://books.google.com/books/content?id=1JjbG...,The guide provides information on 76 species o...,2000.0,5.0,96.0,1.0,Ecuador Nature Guide: Southwest Forests : Sozo...,9781551052700 The guide provides information o...
