In [10]:
from langchain_community.llms import Ollama
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader

In [11]:
import pandas as pd

books = pd.read_csv("books_cleaned.csv")

In [12]:
books["tagged_description"].to_csv("tagged_descriptions.txt", sep="\n", index= False, header=False)

In [13]:
# NOt working because the splitter ignores the separator and includes the next lines if the chunk size is not satisfied within the current line
# so have to do it manually

# raw_documents = TextLoader("tagged_descriptions.txt", encoding="utf-8").load()
# text_splitter = CharacterTextSplitter(
#     separator="\n",
#     chunk_size=999,
#     chunk_overlap=0
# )   # the chunk size and overlap are set to 9999 becuase then it will prioritize splitting on newline instead of chunk size
# documents = text_splitter.split_documents(raw_documents)

In [14]:
from langchain_core.documents import Document   

documents = []

with open("tagged_descriptions.txt", "r", encoding="utf-8") as f:
    for line in f:
        text = line.strip()
        if text:
            documents.append(Document(page_content=text))


In [15]:
documents[0]

Document(metadata={}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and 

In [16]:
max(len(doc.page_content) for doc in documents)

5836

In [17]:
from langchain_ollama import OllamaEmbeddings

retriever = Chroma.from_documents(documents=documents, 
                                  embedding=OllamaEmbeddings(
                                      model="nomic-embed-text:latest"
                                ))

In [19]:
query = "A book about crime"

docs = retriever.similarity_search(query, k=10)
docs

[Document(metadata={}, page_content='9780761929949 Examining the role of crime in American politics and culture, The Politics of Injustice, Second Edition provides a better understanding of the nature of crime and punishment in America, as well as the cultural and political contexts in which they occur. Updated throughout, this book will be of interest to students in all areas of Criminology especially those involved in critical issues in Criminal Justice.'),
 Document(metadata={}, page_content='9780131730366 Dispelling current myths regarding organized crime, Lyman and Potter’s fourth edition reveals a truer picture of organized crime and criminal activity today. Providing scholarly treatment and a social perspective, the authors explore the concept of organized crime, the historical foundation for its evolution and development, and the current status of criminal groups in today’s society. Offering timely and respected research, this edition includes a thorough examination of drug tra

In [20]:
books[books["isbn13"] == int(docs[0].page_content.split()[0].strip())] 

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
3573,9780761929949,761929940,The Politics of Injustice,Katherine Beckett;Theodore Sasson,Political Science,http://books.google.com/books/content?id=GWM5D...,Examining the role of crime in American politi...,2000.0,3.76,272.0,32.0,The Politics of Injustice: Crime and Punishmen...,9780761929949 Examining the role of crime in A...


In [None]:
def retrieve_semantic_recommmendatinos(
        query: str,
        top_k: int = 10,
) -> pd.DataFrame:
    recs = retriever.similarity_search(query=query, k=50)
    book_list = []

    for i in range(0, len(recs)):
        book_list += [int(recs[i].page_content.strip('"').split()[0])] # had to strip the double colons because some of the descriptions are in them which raises a value error

    return books[books["isbn13"].isin(book_list)].head(top_k)

In [22]:
retrieve_semantic_recommmendatinos("A book about cold war")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
66,9780007162994,7162995,If I Die in a Combat Zone,Tim O'Brien,"Vietnam War, 1961-1975",http://books.google.com/books/content?id=0qUtS...,Perhaps the best book to emerge from the Vietn...,2003.0,3.95,208.0,11.0,If I Die in a Combat Zone,9780007162994 Perhaps the best book to emerge ...
282,9780060915186,60915188,An American Childhood,Annie Dillard,Biography & Autobiography,http://books.google.com/books/content?id=tRihT...,A book that instantly captured the hearts of r...,1988.0,3.91,255.0,7086.0,An American Childhood,9780060915186 A book that instantly captured t...
332,9780060976095,60976098,Fiskadoro,Denis Johnson,Fiction,http://books.google.com/books/content?id=YSTe7...,"Hailed by the New York Times as ""wildly ambiti...",1995.0,3.54,221.0,1463.0,Fiskadoro,9780060976095 Hailed by the New York Times as ...
375,9780061137037,61137030,Spider's House,Paul Bowles,Fiction,http://books.google.com/books/content?id=3_jMW...,"Set in Fez, Morocco, during that country's 195...",2006.0,4.03,432.0,1060.0,Spider's House: A Novel,"9780061137037 Set in Fez, Morocco, during that..."
518,9780099477310,99477319,Catch-22,Joseph Heller,Fiction,http://books.google.com/books/content?id=uciYm...,WITH AN INTRODUCTION BY HOWARD JACOBSON Explos...,1994.0,3.98,519.0,5307.0,Catch-22,9780099477310 WITH AN INTRODUCTION BY HOWARD J...
524,9780099483472,99483475,All Quiet on the Western Front,Erich Maria Remarque,"World War, 1914-1918",,All Quiet on the Western Front is probably the...,2005.0,3.95,216.0,1018.0,All Quiet on the Western Front,9780099483472 All Quiet on the Western Front i...
568,9780140139976,140139974,Sailor Song,Ken Kesey,Fiction,http://books.google.com/books/content?id=-pPSO...,"In Alaska to film a famous children's book, th...",1993.0,3.57,533.0,1956.0,Sailor Song,9780140139976 In Alaska to film a famous child...
578,9780140157185,140157182,The Assault,Reinaldo Arenas;Andrew Hurley,Fiction,http://books.google.com/books/content?id=Ftyzj...,"A surrealistic novel on a dictatorship, a Cuba...",1995.0,3.78,176.0,137.0,The Assault,9780140157185 A surrealistic novel on a dictat...
756,9780141185163,141185163,Orwell in Spain,George Orwell,Fiction,http://books.google.com/books/content?id=uVNpA...,"Including Homage to Catalonia, Orwell's profou...",2001.0,4.33,416.0,203.0,Orwell in Spain: the full text of Homage to Ca...,"9780141185163 Including Homage to Catalonia, O..."
1059,9780226817415,226817415,From Counterculture to Cyberculture,Fred Turner,Social Science,http://books.google.com/books/content?id=wz5Em...,"In the early 1960s, computers haunted the Amer...",2006.0,3.96,327.0,323.0,From Counterculture to Cyberculture: Stewart B...,"9780226817415 In the early 1960s, computers ha..."
