In [None]:
!pip install langchain-community pypdf

In [None]:
!pip install sentence-transformers

In [None]:
!pip install chromadb

In [None]:
!pip install langchain_huggingface

In [5]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Load Document

In [198]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('/content/drive/My Drive/code/Generate/glasses.pdf')

data = loader.load()
print(data)

[Document(metadata={'producer': 'Skia/PDF m136 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'glasses', 'source': '/content/drive/My Drive/code/Generate/glasses.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Title:  The  Beginner’s  Guide  to  Keeping  Your  Glasses  Clean  (and  Actually  Seeing  Clearly)  \n1.  Introduction  \nGlasses  are  like  windows  for  your  eyes—except  they  somehow  attract  more  smudges  than  a  \ntoddler’s\n \nhands\n \non\n \na\n \ntouchscreen.\n \nWhether\n \nyou\n \nwear\n \nthem\n \nfor\n \nstyle\n \nor\n \nsurvival,\n \nkeeping\n \nthem\n \nclean\n \nand\n \nwell-maintained\n \nis\n \nessential\n \nfor\n \nboth\n \nyour\n \nvision\n \nand\n \nyour\n \nsanity.\n \n2.  How  to  Properly  Clean  Your  Glasses  \nSpoiler  alert:  Your  T-shirt  is  not  the  right  tool.  Here’s  what  to  do  instead:  \nThe  Right  Way:  \n1.  Rinse  First  –  Run  your  glasses  under  lukewarm  water  to  remove  dus

## Parsing Document

In [199]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import re
separator = r"\d+\."

chunk_size = 200
chunk_overlap = 50

rc_splitter = RecursiveCharacterTextSplitter(
        separators=separator,
		chunk_size=chunk_size,
		chunk_overlap=chunk_overlap)

docs = rc_splitter.split_documents(data)

In [200]:
print([doc.page_content for doc in docs[0:3]])

['Title:  The  Beginner’s  Guide  to  Keeping  Your  Glasses  Clean  (and  Actually  Seeing  Clearly)  \n1.  Introduction  \nGlasses  are  like  win', 'duction  \nGlasses  are  like  windows  for  your  eyes—except  they  somehow  attract  more  smudges  than  a  \ntoddler’s\n \nhan', 'dges  than  a  \ntoddler’s\n \nhands\n \non\n \na\n \ntouchscreen.\n \nWhether\n \nyou\n \nwear\n \nthem\n \nfor\n \nstyle\n \nor\n \nsurvival,\n \nkeeping\n \nthem\n \nclean\n \nand\n \nwell-maintaine']


In [201]:
print(f'Character count for each chunk: {[len(doc.page_content) for doc in docs[0:3]]}')

Character count for each chunk: [144, 127, 166]


In [202]:
# Iterate over docs and clean the \n in the text content (page_content)
cleaned_docs = []
for doc in docs:
    # Clean the \n in the page_content of each document
    cleaned_text = doc.page_content.replace('\n', ' ')
    # Create a new Document with the cleaned content
    cleaned_docs.append(doc.copy(update={'page_content': cleaned_text}))

print([cleaned_doc.page_content for cleaned_doc in cleaned_docs[0:3]])

['Title:  The  Beginner’s  Guide  to  Keeping  Your  Glasses  Clean  (and  Actually  Seeing  Clearly)   1.  Introduction   Glasses  are  like  win', 'duction   Glasses  are  like  windows  for  your  eyes—except  they  somehow  attract  more  smudges  than  a   toddler’s   han', 'dges  than  a   toddler’s   hands   on   a   touchscreen.   Whether   you   wear   them   for   style   or   survival,   keeping   them   clean   and   well-maintaine']


<ipython-input-202-9224ce36f7bb>:7: PydanticDeprecatedSince20: The `copy` method is deprecated; use `model_copy` instead. See the docstring of `BaseModel.copy` for details about how to handle `include` and `exclude`. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  cleaned_docs.append(doc.copy(update={'page_content': cleaned_text}))


## Initialize Embedding Model

In [203]:
from langchain.embeddings import HuggingFaceEmbeddings
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/bert-base-nli-mean-tokens")

## Save Embedding to ChromaDB

In [205]:
from langchain.vectorstores import Chroma

persist_directory = '/content/drive/MyDrive/code/Generate/chroma_vectorstore'

vectorstore = Chroma.from_documents(
    cleaned_docs,
    embedding=embed_model,
    persist_directory=persist_directory
)

## Initialize Text Generation Model

In [212]:
from langchain_huggingface import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id='google/flan-t5-large',
    task='text2text-generation',
    pipeline_kwargs={
        'max_new_tokens': 100,
        'temperature': 1,
        'top_p': 0.9, # Consider top 90% of the probability distribution for next token
        'do_sample': True
    }
)

Device set to use cpu


## Create Template for Prompt

In [213]:
from langchain_core.prompts import ChatPromptTemplate

message = """
Should I {question} to clean my glasses? Consider the following guidelines:
Guidelines:
{guideline}

Answer:
"""

prompt_template = ChatPromptTemplate.from_messages([("human", message)])

## Initialize Retriver from vectorstore
Enabling searching for most similar documents

In [214]:
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2}
)

## Chaining Prompt with LLM

In [215]:
from langchain_core.runnables import RunnablePassthrough

rag_chain = ({"guideline": retriever, "question": RunnablePassthrough()}
						| prompt_template
						| llm)

In [216]:
query = "Use strong detergent"

retrieved_docs = vectorstore.similarity_search(query, k=2)
print("Retrieved Documents:")
for doc in retrieved_docs:
    print(doc.page_content)

Retrieved Documents:
d   well-maintained   is   essential   for   both   your   vision   an
Title:  The  Beginner’s  Guide  to  Keeping  Your  Glasses  Clean  (and  Actually  Seeing  Clearly)   1.  Introduction   Glasses  are  like  win


In [217]:
response = rag_chain.invoke(query)
print("LLM Response:", response)

LLM Response: [['Avoid using strong detergents']]
