## RAG

In [1]:
import bs4
from langchain import hub
from langchain.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

USER_AGENT environment variable not set, consider setting it to identify your requests.


#### Chat model - OpenAI 

In [44]:
# import getpass
# import os

# if not os.environ.get("OPENAI_API_KEY"):
#   os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

# from langchain.chat_models import init_chat_model

# llm = init_chat_model("gpt-4o-mini", model_provider="openai")

#### Embedding model - OpenAI

In [45]:
# import getpass
# import os

# if not os.environ.get("OPENAI_API_KEY"):
#   os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

# from langchain_openai import OpenAIEmbeddings

# embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

#### Load PDF document

In [2]:
from langchain_community.document_loaders import PyPDFLoader

file_path = (
    './data/houndofbaskervil00doyluoft.pdf'
)
loader = PyPDFLoader(file_path)
pages = [] #List of Document
async for page in loader.alazy_load():
    pages.append(page)

#### Test that the document is loaded properly 

In [None]:
print(f"Metadata = \n {pages[0].metadata}")

# Print first 500 characters of 15th page
print(f"Content = \n {pages[25].page_content[:100]}")

# Total characters in 15th page
print(f"\n Total characters = {len(pages[25].page_content)}")

# Print the 35th page content, but not evertything, only first 300 characters
print(f"\n Content = \n {pages[35].page_content[:300]}")

Metadata = 
 {'producer': 'Internet Archive PDF 1.4.22; including mupdf and pymupdf/skimage', 'creator': 'Internet Archive', 'creationdate': '2023-10-24T08:24:30+00:00', 'title': 'The hound of the Baskervilles : another adventure of Sherlock Holmes', 'keywords': 'https://archive.org/details/houndofbaskervil00doyluoft', 'author': 'Doyle, Arthur Conan, Sir, 1859-1930; Paget, Sidney, 1861-1908', 'moddate': '2023-10-24T08:24:30+00:00', 'source': './data/houndofbaskervil00doyluoft.pdf', 'total_pages': 408, 'page': 0, 'page_label': ''}
Content = 
 The  Hound  of  the  Baskervilles 
from  his  friends  of  the  C.C.H.,"  was  en- 
graved upon  it,  with  the  date  "  1884."  It 
was  just  such  a  stick  as  the  old-fashioned 
family  practitioner  used  to  carry — dignified, 
solid,  and  reassuring. 
"  Well,  Watson,  what  do  you  make  of  it  ?  " 
Holmes  was  sitting  with  his  back  to  me, 
and  I  had  given  him  no  sign  

 Total characters = 1097


#### Split the document 

In [38]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
chunks = text_splitter.split_documents(pages)

print(f"Number of chunks = {len(chunks)}")

Number of chunks = 696


#### Index chunks of the documents 

In [50]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

embedding_function = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

vector_store = Chroma(
    embedding_function=embedding_function,
    persist_directory="./chroma_langchain_db",
    collection_name="rag_demo",
    #embedding_model=sbert_model,
)

_ = vector_store.add_documents(documents=chunks)


#### Chat Model

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


#### Build & Compile Graph for Retrieval & Generation 

In [None]:



prompt = hub.pull("rlm/rag-prompt")

# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# Retrieve similar docs from vector db
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"], k=2)
    return {"context": retrieved_docs, "question": state["question"]}

# Lambda function to generate response
def generate_response(prompt_text: str) -> str:
    inputs = tokenizer(prompt_text, return_tensors="pt")
    outputs = model.generate(
        inputs["input_ids"],
        # max_new_tokens=200, 
        max_length=1000,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Generate answer using LLM
def generate_answer(state: State):
    docs_context = "\n".join([doc.page_content for doc in state["context"]])
    messages = prompt.invoke(
        {
            "context": docs_context,
            "question": state["question"],
        }
    )

    #print(f"Prompt = \n {messages}")

    prompt_text = messages.to_string()

    # Generate response using GPT-2
    response_text = generate_response(prompt_text)

    
    # Generate response using GPT-2
    response_text = generate_response(prompt_text)
    return {"answer": response_text}



In [None]:
question = "Who stole the shoe?"
state_input = {"question": question}
result = generate_answer(retrieve(state_input))
print(result["answer"])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: Who stole the shoe? 
Context: The  Hound  of  the  Baskervilles 
sucker  in    this  hotel,"  he   cried.     "  They'll 
find  they've  started  in  to  monkey  with  the 
wrong   man    unless    they  are   careful.     By 
thunder,  if  that  chap  can't  find  my  missing 
boot  there  will  be  trouble.    I  can  take  a  joke 
with  the  best,  Mr.  Holmes,  but  they've  got 
a  bit  over  the  mark  this  time." 
"  Still  looking  for  your  boot  ?'* 
"  Yes,  sir,  and  mean  to  find  it." 
"  But,  surely,  you  said  that  it  was  a  new 
brown  boot  ?  " 
"So  it  was,  sir.  And  now  it's  an  old 
black  one." 
"  What !  you  don't  mean  to  say   ?  " 
**  That's  just  what  I  do  mean  to  say.  I 
only  had