In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

In [2]:
from dotenv import load_dotenv
import os

# Load file .env
load_dotenv()
os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")

In [3]:
import pandas as pd

books = pd.read_csv('D://Workspace//LLM//Book Recommendation//dataset//books_cleaned.csv')
books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine..."
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le..."


### **1. Splitting the books using CharacterTextSplitter**

**The problem in here:** When we query the database, the results will return the descriptions of the books. However, users don't want to receive the desciptions of books, the things that they need are the titles and the authors of the books.

**First solution (not effective):**
- We can filter Dataframe that store all books based on matching with the descriptions from the database -> This way is really random, slow and unaccurate because matching all words that are queried with the desciptions is impossible.

**Effective solution:**
- We merged the ISBN as the identifier in front of each descriptions and store into a file `tagged_description.txt`, as you can see below.
- When the database returns the descriptions, we can split the scripts to take the ISBN (identifier) to filter the accurate books from DataFrame and easily filter the titles and authors based on the ISBN instead of using the first solution.

In [4]:
books['tagged_description'].head()

0    9780002005883 A NOVEL THAT READERS and critics...
1    9780002261982 A new 'Christie for Christmas' -...
2    9780006178736 A memorable, mesmerizing heroine...
3    9780006280897 Lewis' work on the nature of lov...
4    9780006280934 "In The Problem of Pain, C.S. Le...
Name: tagged_description, dtype: object

In [None]:
# books['tagged_description'].to_csv("tagged_description.txt",
#                                    sep = '\n',
#                                    index = False,
#                                    header = False)

In [5]:
lengths = books['tagged_description'].str.len()
print(f"Mean length: {lengths.mean():.0f} characters")
print(f"Max length: {lengths.max():.0f} characters")

Mean length: 499 characters
Max length: 5800 characters


In [6]:
books[books['isbn13'] == 9780687002825]['tagged_description']

3176    9780687002825 Life at the end of the twentieth...
Name: tagged_description, dtype: object

Why we set the `CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator='\n')` ?
- `chunk_overlap = 0:` Each description is independent and seperate 'docs'. They don't have any semantic relations that need to be overlaped.

- `seperator = '\n':` You want to ensure that each book description (which is a line in the .txt file) is treated as a completely separate document. Setting the delimiter as the newline character () helps to ensure this.

- `chunk_size = 0:` This is an important and somewhat special setting in LangChain. When chunk_size is set to 0, the **CharacterTextSplitter** will prioritize splitting based on the **separator** rather than trying to divide based on the **chunk_size**.

- By setting `chunk_size = 0`, you force the **text_splitter** to only use `'\n'` as the primary splitting point, ensuring that each book description (one line) is treated as a complete and separate 'chunk' or 'document', regardless of its length.

In [7]:
loader = TextLoader("D:\\Workspace\\LLM\\Book Recommendation\\tagged_description.txt", encoding = 'UTF-8')
raw_documents = loader.load()

import logging

# Hinder the warnings from text_splitters
logging.getLogger("langchain_text_splitters.base").setLevel(logging.ERROR)

text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator='\n')
documents = text_splitter.split_documents(raw_documents)

In [None]:
documents[0]

Document(metadata={'source': '/content/drive/MyDrive/LLM/tagged_description.txt'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the sma

### **2. Building the vector database**

In [8]:
# Create embeddings and vector database
huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

database_books = Chroma.from_documents(
    documents,
    embedding=huggingface_embeddings,
    persist_directory="D:\\Workspace\\LLM\\Book Recommendation\\chroma")

### **3. Getting book recommendations using vector search**

In [9]:
query = "a book has some love stories"
docs = database_books.similarity_search(query, k = 3)
docs

[Document(id='5686327d-1bad-4f4b-992c-c9aa0e5db398', metadata={'source': 'D:\\Workspace\\LLM\\Book Recommendation\\tagged_description.txt'}, page_content='9780571206926 Laughable loves is a collection of stories that first appeared in print in Prague before 1968, but then was banned. The seven stories are all concerned with love, or rather with the complex erotic games and strategems employed by women and especially men as they try to come to terms with needs ad impulses that can start a terrifying train of events. Sexual attraction is shown as a game that often turns sour, an experience that brings with it painful insisghts and releases uncertainty, panic, vanity and a constant need for reassurance.'),
 Document(id='e534b907-28df-4acf-acde-c4d0e2a446c3', metadata={'source': 'D:\\Workspace\\LLM\\Book Recommendation\\tagged_description.txt'}, page_content="9780571193783 A parable about love, literature and fanaticism. A young university student becomes obsessed with a magical book that 

The query was running well, but it just returned the description of books. As we need, it have to return the titles and authors from the searching. So let's do it.

In [10]:
books[books['isbn13'] == int(docs[0].page_content.split()[0].strip())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
2825,9780571206926,571206921,Laughable Loves,Milan Kundera,Czech Republic,http://books.google.com/books/content?id=ZpupP...,Laughable loves is a collection of stories tha...,1999.0,3.87,287.0,14380.0,Laughable Loves,9780571206926 Laughable loves is a collection ...


In [12]:
# Function to retrieve books from description
def retrieve_semantic_recommendations(query: str, top_k: int = 5) -> pd.DataFrame:
  recommendations = database_books.similarity_search(query, k = top_k)

  book_list = []

  for i in range(0, len(recommendations)):
    book_list += ([int(recommendations[i].page_content.strip('"').split()[0])])

  return books[books['isbn13'].isin(book_list)]


In [13]:
retrieve_semantic_recommendations('books related to advanture')

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
215,9780060760441,60760443,The Reading Group,Elizabeth Noble,Fiction,http://books.google.com/books/content?id=IagWj...,The Reading Group follows the trials and tribu...,2005.0,3.34,429.0,6408.0,The Reading Group: A Novel,9780060760441 The Reading Group follows the tr...
990,9780192854520,192854526,Augustine: A Very Short Introduction,Henry Chadwick,Biography & Autobiography,http://books.google.com/books/content?id=QnrRB...,"Very Short Introductions offer stimulating, ac...",2001.0,3.74,144.0,219.0,Augustine: A Very Short Introduction,9780192854520 Very Short Introductions offer s...
2982,9780671035976,671035975,How to Stop Worrying and Start Living,Dale Carnegie,Psychology,http://books.google.com/books/content?id=zHBEK...,The first trade paperback edition of the class...,2004.0,4.11,358.0,57572.0,How to Stop Worrying and Start Living,9780671035976 The first trade paperback editio...
4052,9780822007807,822007800,CliffsNotes on Flaubert's Madame Bovary,James L. Roberts,Literary Criticism,http://books.google.com/books/content?id=aXHg3...,The original CliffsNotes study guides offer ex...,1964.0,3.23,80.0,13.0,CliffsNotes on Flaubert's Madame Bovary,9780822007807 The original CliffsNotes study g...
4084,9780826414755,826414753,Gabriel Garcia Marquez's Love in the Time of C...,Tom Fahy,Literary Criticism,http://books.google.com/books/content?id=ayjUA...,"A brilliant idea--short, perceptive books whic...",2003.0,4.23,188.0,1729.0,Gabriel Garcia Marquez's Love in the Time of C...,"9780826414755 A brilliant idea--short, percept..."


### **4. Reload the database and Run query**

In [None]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
import os

# Reload the stored database
persist_dir = 'D://Workspace//LLM//Book Recommendation//chroma'

huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
database_books = Chroma(persist_directory=persist_dir, embedding_function=huggingface_embeddings)

In [None]:
# Run query
query = "a book has a love story"
docs = database_books.similarity_search(query, k = 3)
docs

In [None]:
# Function to retrieve books from description
def retrieve_semantic_recommendations(query: str, top_k: int = 5) -> pd.DataFrame:
  recommendations = database_books.similarity_search(query, k = top_k)

  book_list = []

  for i in range(0, len(recommendations)):
    book_list += ([int(recommendations[i].page_content.strip('"').split()[0])])

  return books[books['isbn13'].isin(book_list)]
