In [None]:
# Setting environment variable `PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION` due to issues with importing 'langchain_chroma'
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# Loading all the environment variables
from dotenv import load_dotenv
load_dotenv()

# Using Chroma for open-source vector database storage
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma

# Importing Data

In [4]:
import os

# Changing directory to main directory for easy data access
working_directory = os.getenv("WORKING_DIRECTORY")
os.chdir(working_directory)

In [5]:
import pandas as pd

# Loading is data previously cleaned data
path = 'data/books_cleaned.csv'
books = pd.read_csv(path)

# Printing the data to check validity
books.head(2)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,words_in_description,title_and_subtitle
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,199,Gilead
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,205,Spider's Web: A Novel


In [22]:
books.shape

(5197, 13)

# Constructing Vector Search

In [23]:
# Manually constructing LangChain Document object
# Disregarding the document tagging step performed earlier to create `title_and_subtitle`

from langchain.schema import Document

document_object = [
    Document(
        page_content=row["description"],
        metadata={"source": str(row["isbn13"])}
    )
    for _, row in books.iterrows()
]

# Making sure each description was passed
len(document_object)

5197

In [24]:
# Checking how the object looks
document_object[0]

Document(metadata={'source': '9780002005883'}, page_content='A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celeb

In [30]:
# Initialzing the embeddings object
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001" )

# Constructing the database
db_books = Chroma.from_documents(document_object, embedding=embeddings)

In [42]:
# Checking to see if the database was properly constucted
query = "A book to teach children about nature"
query_response = db_books.similarity_search(query, k=10)
query_response

[Document(id='35daba5e-7685-451f-b4e1-f40821d042ae', metadata={'source': '9780786808069'}, page_content='Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.'),
 Document(id='9cd94356-5454-4bf4-974f-369c4f43d58a', metadata={'source': '9780786808717'}, page_content='A very special puddle sets Violet the mouse off on her latest nature discovery. It is through this puddle that Violet observes the effect rain has on the world around her. A Mylar puddle on the last page offers children a chance to see their reflection in a puddle, just like Violet!'),
 Document(id='d4123c64-8429-409f-9d6a-a69c5ecfbc6e', metadata={'source': '9780067575208'}, page_content='First published more than three decades ago, this reissue of Rachel Carson\'s award-winning classic brings her unique vision to a n

In [51]:
books[books["isbn13"] == int(query_response[0].metadata["source"])]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,words_in_description,title_and_subtitle
3747,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,40,Baby Einstein: Neighborhood Animals


# Querying

In [58]:
# Function to retrieve all the recommendations as a dataframe
def retrive_books(books: pd.DataFrame, db_books: Chroma, query: str, no_of_recc: int = 10) -> pd.DataFrame:
    response_books = db_books.similarity_search(query, k=no_of_recc)
    isbn_list = [int(doc_object.metadata["source"]) for doc_object in response_books]
    return books[books["isbn13"].isin(isbn_list)]

In [59]:
retrive_books(books, db_books, "A book to teach children about nature", 10)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,words_in_description,title_and_subtitle
442,9780067575208,006757520X,The Sense of Wonder,Rachel Carson,Nature,http://books.google.com/books/content?id=Zee5S...,"First published more than three decades ago, t...",1998.0,4.39,112.0,1160.0,322,The Sense of Wonder
812,9780142302279,0142302279,Dirty Beasts,Roald Dahl,Juvenile Nonfiction,,Poems tell the stories of a smart pig who outw...,2002.0,4.02,32.0,3953.0,33,Dirty Beasts
1639,9780374422080,0374422087,Everything on a Waffle,Polly Horvath,Juvenile Fiction,http://books.google.com/books/content?id=NimVJ...,This Newbery Honor Book tells the story of 11 ...,2004.0,3.71,150.0,9631.0,44,Everything on a Waffle
2216,9780440421702,0440421705,Hoot,Carl Hiaasen,Bullying,http://books.google.com/books/content?id=uKUTo...,"Roy, who is new to his small Florida community...",2006.0,3.82,292.0,83557.0,40,Hoot
3214,9780689861130,0689861133,"Moo, Baa, la la La!",Sandra Boynton,Animal sounds,http://books.google.com/books/content?id=Gz40A...,Children will love joining in and imitating th...,2004.0,4.2,14.0,28261.0,27,"Moo, Baa, la la La!"
3522,9780753459645,0753459647,I Wonder Why the Sun Rises,Brenda Walpole,Juvenile Nonfiction,http://books.google.com/books/content?id=SqpYs...,What is a leap year? Why are bees busy in summ...,2006.0,3.95,32.0,50.0,48,I Wonder Why the Sun Rises: and Other Question...
3581,9780763620875,0763620874,Judy Moody Saves the World!,Megan McDonald,Juvenile Fiction,http://books.google.com/books/content?id=xDIRB...,When Judy Moody gets serious about protecting ...,2004.0,4.03,160.0,5883.0,34,Judy Moody Saves the World!
3747,9780786808069,0786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,40,Baby Einstein: Neighborhood Animals
3748,9780786808373,0786808373,Baby Einstein: Birds,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=0jxHP...,"Introducing your baby to birds, cats, dogs, an...",2002.0,3.78,20.0,9.0,64,Baby Einstein: Birds
3751,9780786808717,0786808713,Baby Einstein: What Does Violet See? Raindrops...,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=95IIA...,A very special puddle sets Violet the mouse of...,2002.0,3.25,18.0,16.0,52,Baby Einstein: What Does Violet See? Raindrops...
