In [1]:
#!pip install langchain==0.1.14
##!pip install langchain-core==0.1.13
#!pip install langchain_experimental==0.0.56
#!pip install langchain-community==0.0.31
#!pip install faiss-cpu==1.8.0
#!pip install pdfplumber==0.11.0
#!pip install gradio==4.25.0

In [1]:
from langchain_community.document_loaders import PDFPlumberLoader,PyMuPDFLoader
import re
from unidecode import unidecode
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.chat_models import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from collections import defaultdict
from langchain.chains import RetrievalQA
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
import os

os.environ["OPENAI_API_KEY"] = "sk-your-openai-key"

In [2]:
def clean_text(text):
    # 1. Normalize unicode characters
    text = unidecode(text)

    # 2. Remove repeated dots or whitespace clutter
    text = re.sub(r'([!?.])(?:\s*\1)+', r'\1', text)
    text = re.sub(r'\s+', ' ', text)

    # 3. Remove page numbers or repeated headers/footers (basic heuristic)
    text = re.sub(r'Page\s+\d+|\d+\s+Page', '', text, flags=re.IGNORECASE)

    # 4. Remove any non-printable/control characters
    text = re.sub(r'[^\x20-\x7E]', '', text)

    # 5. Remove long runs of punctuation or special chars
    text = re.sub(r'[~`_*^=\\]+', '', text)

    # 6. Strip any leading/trailing spaces
    text = text.strip()

    return text

In [45]:
loader = PyMuPDFLoader("policy_docs.pdf")
docs = loader.load()


In [46]:
docs = [
    Document(
        page_content=clean_text(doc.page_content),
        metadata=doc.metadata
    )
    for doc in docs
]

In [47]:
combined_text = ""
for i, doc in enumerate(docs):
    combined_text += f"\n\n[Page {i + 1}]\n" + doc.page_content

In [48]:
#text_splitter = SemanticChunker(HuggingFaceEmbeddings())
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
chunks = text_splitter.create_documents([combined_text])

In [49]:
page_marker_regex = re.compile(r"\[Page (\d+)\]")

last_known_page = None

for chunk in chunks:
    match = page_marker_regex.search(chunk.page_content)
    if match:
        page_num = int(match.group(1))
        chunk.metadata["page"] = page_num
        last_known_page = page_num
    elif last_known_page is not None:
        chunk.metadata["page"] = last_known_page


In [50]:
len(chunks)

138

In [51]:
# Instantiate the embedding model
embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [52]:
# Create the vector store 
vector = FAISS.from_documents(chunks, embedder)

In [53]:
# Input
retriever = vector.as_retriever(search_type="similarity", search_kwargs={"k": 10})


In [54]:
retrieved_docs = retriever.invoke("List all action items in Focus Area 2.")
for doc in retrieved_docs:
    print(doc.metadata)

{'page': 7}
{'page': 21}
{'page': 22}
{'page': 7}
{'page': 10}
{'page': 12}
{'page': 15}
{'page': 22}
{'page': 18}
{'page': 21}


In [55]:
# Define llm
llama_llm = Ollama(model="llama3.1:latest")
openai_llm = ChatOpenAI(
    model_name="gpt-4o",  # Or "gpt-3.5-turbo"
    temperature=0,
    verbose=True
)

In [56]:
prompt = """
        You are given a question and a list of raw texts extracted from a PDF. There might or might not be a correlation between the texts.
        Your task is to answer the question based on the information from raw texts.
        
        Context: {context}
        
        Question: {question}


        Instructions:
        Consider yourself as a helpful RAG assistant that answers questions.
        Find and use the information from the raw texts to answer the question.
        Answer the question in a direct way without referring to the texts.
        List the individual page number(s) of the information that you used to answer the question. Cite it at the end of your response.
        Ensure that the answer is coherent, logically structured, and uses clear language.
    """

In [57]:
QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt) 

llm_chain = LLMChain(
                  llm=openai_llm, 
                  prompt=QA_CHAIN_PROMPT, 
                  callbacks=None, 
                  verbose=True)



In [58]:
# Create a dictionary to store chunks by page number
chunks_by_page = defaultdict(list)

# Organize chunks by their page number
for chunk in chunks:
    page_num = chunk.metadata.get("page")
    if page_num is not None:
        chunks_by_page[page_num].append(chunk)

# Function to get neighboring pages' chunks
def get_neighboring_chunks(page_num):
    neighboring_chunks = []
    
    # Get chunks for the current page
    neighboring_chunks.extend(chunks_by_page.get(page_num, []))
    
    # Get chunks for the previous page
    if page_num - 1 > 0:
        neighboring_chunks.extend(chunks_by_page.get(page_num - 1, []))
    
    # Get chunks for the next 2 pages
    neighboring_chunks.extend(chunks_by_page.get(page_num + 1, []))
    neighboring_chunks.extend(chunks_by_page.get(page_num + 2, []))
    
    return neighboring_chunks


In [59]:
query = "Summarize focus area 3"

# LLM response with extrapolated pages

In [60]:
retrieved_docs = retriever.invoke(query)

# Get the neighboring chunks for each retrieved document
neighboring_sets = []
for doc in retrieved_docs:
    page_num = doc.metadata["page"]
    neighboring_chunks = get_neighboring_chunks(page_num)
    
    # Add the set of neighboring chunks to the list
    neighboring_sets.append(neighboring_chunks)

# Combine the neighboring chunks into single text blocks
neighboring_texts = []
for chunks_set in neighboring_sets:
    combined_text = "\n".join([chunk.page_content for chunk in chunks_set])
    neighboring_texts.append(combined_text)

# Use the LLM to answer the query based on the top 3 sets
extrapolated_RAG_response = llm_chain.run({"context": neighboring_texts[:3], "question": query})




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
        You are given a question and a list of raw texts extracted from a PDF. There might or might not be a correlation between the texts.
        Your task is to answer the question based on the information from raw texts.
        
        Context: ['[Page 21]\n21 Focus Area 4: Develop AI expertise within the USPTO\'s workforce. The USPTO\'s dedicated workforce is our backbone, and the success of our AI Strategy rests on our employees\' readiness to navigate an increasingly AI-forward world. The USPTO is fortunate to have an exceptionally skilled workforce, with approximately 14,000 personnel boasting diverse talents across the sciences, engineering, law, IT, and other disciplines. We will invest in growing our workforce\'s AI expertise to meet the ever-changing landscape of AI opportunities and challenges. Providing foundational AI training will assist in carrying out the agency\'s work, which, in turn


[1m> Finished chain.[0m


# LLM response without extrapolated pages

In [61]:
document_prompt = PromptTemplate(
    input_variables=["page_content"],
    template="Context:\ncontent:{page_content}",
)

combine_documents_chain = StuffDocumentsChain(
                  llm_chain=llm_chain,
                  document_variable_name="context",
                  document_prompt=document_prompt,
                  callbacks=None,
              )

qa = RetrievalQA(
                  combine_documents_chain=combine_documents_chain,
                  verbose=True,
                  retriever=retriever,
                  return_source_documents=True,
              )

vanilla_RAG_response = qa(query)["result"]



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
        You are given a question and a list of raw texts extracted from a PDF. There might or might not be a correlation between the texts.
        Your task is to answer the question based on the information from raw texts.
        
        Context: Context:
content:21 Focus Area 4: Develop AI expertise within the USPTO's workforce. The USPTO's dedicated workforce is our backbone, and the success of our AI Strategy rests on our employees' readiness to navigate an increasingly AI-forward world. The USPTO is fortunate to have an exceptionally skilled workforce, with approximately 14,000 personnel boasting diverse talents across the sciences, engineering, law, IT, and other disciplines. We will invest in growing our workforce's AI expertise to meet the ever-changing landscape of AI opportunities and challenges. Providing foundational AI training will assist in c


[1m> Finished chain.[0m

[1m> Finished chain.[0m


# Compare both responses

In [62]:
print(extrapolated_RAG_response)

Focus Area 3 of the USPTO AI Strategy is centered on promoting the responsible use of AI within the USPTO and across the broader innovation ecosystem. This involves maximizing the benefits of AI in a responsible manner by adhering to principles such as safety, fairness, transparency, privacy, reliability, and accountability. The USPTO aims to maintain public trust in its AI adoption through value-aligned product development, risk mitigation, and transparent communication with stakeholders. The agency will also monitor the use of AI in the innovation ecosystem to encourage beneficial and responsible use through policy and guidance. This includes understanding AI's role in legal practice and promoting IP awareness and compliance with IP laws and policies. The USPTO emphasizes the importance of responsible AI practices, including lawful data sourcing and respecting IP rights, as essential components of the AI innovation ecosystem. [Pages 18, 19, 20]


In [63]:
print(vanilla_RAG_response)

Focus Area 3 of the USPTO AI Strategy is centered on promoting the responsible use of AI within the USPTO and across the broader innovation ecosystem. It emphasizes maximizing the benefits of AI in a responsible manner by adhering to principles such as safety, fairness, transparency, privacy, reliability, and accountability. The strategy is guided by Executive Orders 13960 and 14110, which establish principles for the trustworthy use of AI by Federal agencies and direct them to advance the responsible use of AI, including the secure use of generative AI systems. The USPTO aims to adopt AI responsibly in its operations and promote its responsible development and use across society. [Page 18]
