In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

In [None]:
# Pull settings from a local .env file into the process environment
# (python-dotenv). No cell visible in this notebook reads those variables
# directly; presumably the imported libraries pick them up — TODO confirm
# which keys, if any, are actually required.
from dotenv import load_dotenv

load_dotenv()

In [None]:
import pandas as pd

# Read the book table from the working directory. Later cells rely on its
# 'tagged_description' and 'isbn13' columns.
books = pd.read_csv('books_cleaned.csv')
# Bare last expression: rendered as this cell's output.
books

In [None]:
# Preview the column that is exported and embedded below. Later cells parse
# the first whitespace-separated token of each value as the book's isbn13.
books['tagged_description']

In [None]:
# Write the tagged descriptions to a plain UTF-8 text file, one value per
# row, with neither the index nor a header line. The file is written in CSV
# format, so pandas may wrap a value in double quotes; the cells that parse
# these lines later strip a leading/trailing '"' for that reason.
books["tagged_description"].to_csv(
    "tagged_description.txt",
    index=False,
    header=False,
    encoding="utf-8",
)


In [None]:
# Load the exported text file as a single document, then split it on newlines
# so that every line (one tagged description) becomes its own Document.
raw_documents = TextLoader("tagged_description.txt", encoding='utf-8').load()

# chunk_size must be positive: current langchain-text-splitters releases raise
# ValueError("chunk_size must be > 0 ...") for chunk_size=0. Using the smallest
# legal value (1) with no overlap means two lines can never fit into one chunk
# together, so the splitter still emits exactly one chunk per "\n"-separated
# line. The splitter may log a "chunk longer than specified" warning per line;
# that is expected here and harmless.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Sanity check: inspect the first chunk produced by the splitter. The lookup
# cells below expect each chunk's text to begin with the book's isbn13.
documents[0]

In [None]:
# Embedding model used to vectorise every description chunk.
huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed all chunks and index them in a Chroma vector store.
# NOTE(review): no persist directory or collection name is passed, so the
# index presumably lives only for this kernel session, and re-running this
# cell without a restart may add the same documents again — confirm.
db_books = Chroma.from_documents(documents, embedding=huggingface_embeddings)

In [None]:
# Smoke-test the index: fetch the 10 chunks most similar to a sample query.
query = "A book to teach children about nature"
docs = db_books.similarity_search(query, k = 10)
# Bare last expression: display the retrieved documents.
docs

In [None]:
# Map the best hit back to its row in `books`: drop any surrounding CSV
# quotes from the chunk text, take the first whitespace-separated token as
# the isbn13, and select the matching row(s).
books.loc[
    books["isbn13"] == int(docs[0].page_content.strip('"').split()[0])
]

In [None]:
def _parse_isbn13(page_content: str) -> "int | None":
    """Return the isbn13 that leads a tagged-description chunk, or None if absent."""
    # Chunks come from a CSV-formatted export, so they may carry surrounding
    # double quotes; strip them before reading the first token.
    tokens = page_content.strip('"').split()
    if not tokens:
        return None
    try:
        return int(tokens[0])
    except ValueError:
        return None


def retrieve_semantic_recommendations(
        query: str,
        top_k: int = 10,
        initial_top_k: int = 100,
) -> pd.DataFrame:
    """Return the books whose descriptions are most similar to ``query``.

    Args:
        query: Free-text description of the kind of book wanted.
        top_k: Maximum number of rows to return.
        initial_top_k: Number of candidate chunks to request from the vector
            store before mapping them back to ``books``. At least ``top_k``
            candidates are always requested.

    Returns:
        Rows of ``books`` (original columns and index labels) ordered from
        most to least similar, at most ``top_k`` of them.
    """
    # Never ask for fewer candidates than the caller wants back.
    fetch_k = max(initial_top_k, top_k)
    recs = db_books.similarity_search(query, k=fetch_k)

    # Remember the best (first) similarity rank of every isbn13. Chunks that
    # do not start with a numeric isbn13 cannot be mapped to a book and are
    # skipped instead of aborting the whole lookup.
    rank_by_isbn = {}
    for rec in recs:
        isbn = _parse_isbn13(rec.page_content)
        if isbn is not None:
            rank_by_isbn.setdefault(isbn, len(rank_by_isbn))

    matches = books[books["isbn13"].isin(list(rank_by_isbn))]

    # A boolean mask returns rows in `books` order; re-sort by similarity rank
    # so that head(top_k) really keeps the closest matches.
    ranked = matches.sort_values(
        "isbn13",
        key=lambda col: col.map(rank_by_isbn),
        kind="stable",
    )
    return ranked.head(top_k)

In [None]:
# Example call using the function's default arguments; the returned
# DataFrame is rendered as this cell's output.
retrieve_semantic_recommendations("A book to teach children about nature")