In [1]:
# Path to the PDF that gets loaded and indexed; despite the name, this is a single file, not a directory.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
directory = "/home/shreya/sem6/nlp/papers/qa_over_large_struct_doc.pdf"

In [2]:
from langchain_community.document_loaders import PyPDFLoader
# NOTE(review): no later cell shown here reads `loader`; load_docs() builds its own PyPDFLoader.
loader = PyPDFLoader(directory, extract_images=True)

In [3]:
def load_docs(filename):
    """Load a PDF file and return the documents produced by the loader.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader(...).load()`` returns for the file.
    """
    # extract_images=True is passed through to the loader unchanged.
    return PyPDFLoader(filename, extract_images=True).load()

In [4]:
# Load the PDF once; `documents` holds whatever PyPDFLoader.load() returned inside load_docs().
documents = load_docs(directory)

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        documents: The documents to split.
        chunk_size: Chunk size handed to RecursiveCharacterTextSplitter.
        chunk_overlap: Overlap between chunks handed to the splitter.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # split_documents works across multiple documents rather than on a single string.
    return splitter.split_documents(documents)

# Chunk the loaded documents with the default chunk_size/chunk_overlap and report the chunk count.
docs = split_docs(documents)
print(len(docs))

58


In [7]:
from langchain_community.embeddings import OllamaEmbeddings

# Embedding model built with OllamaEmbeddings' defaults (no model name is passed).
# NOTE(review): the Pinecone index dimension set later (DIMENSION) must match this model's vector size — confirm.
embeddings = OllamaEmbeddings()

In [8]:
import pinecone
from langchain.vectorstores import Pinecone

  from tqdm.autonotebook import tqdm


In [9]:
def connect_db(api_key=None, environment=None):
    """Initialise the Pinecone client without embedding credentials in the notebook.

    Args:
        api_key: Pinecone API key. When omitted, the ``PINECONE_API_KEY``
            environment variable is used.
        environment: Pinecone environment name. When omitted, the
            ``PINECONE_ENVIRONMENT`` environment variable is used, falling
            back to ``"gcp-starter"``.

    Raises:
        RuntimeError: If no API key is passed and ``PINECONE_API_KEY`` is unset.
    """
    # Local import keeps this cell runnable on its own.
    import os

    # SECURITY: an API key used to be hardcoded here. Any key that was ever
    # committed with this notebook should be treated as leaked and rotated.
    api_key = api_key or os.environ.get("PINECONE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Pinecone API key not provided: pass api_key=... or set the "
            "PINECONE_API_KEY environment variable."
        )

    environment = environment or os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")
    pinecone.init(api_key=api_key, environment=environment)

In [10]:
# Initialise the Pinecone client; run this before the index cells below.
connect_db()

In [11]:
def create_index(index_name, dimension, documents=None, embedding=None, metric="cosine"):
    """(Re)create a Pinecone index and populate it with embedded documents.

    WARNING: if an index called ``index_name`` already exists it is deleted
    first, so every call starts from an empty index.

    Args:
        index_name: Name of the Pinecone index to (re)create.
        dimension: Vector dimension of the index.
        documents: Chunks to embed and upload. Defaults to the notebook-level
            ``docs`` so existing two-argument calls keep working.
        embedding: Embedding model used for the upload. Defaults to the
            notebook-level ``embeddings``.
        metric: Similarity metric for the index (default ``"cosine"``).

    Returns:
        The vector store returned by ``Pinecone.from_documents``.
    """
    # Fall back to the notebook globals only when the caller did not supply
    # the inputs explicitly; this keeps the old call signature working while
    # making the data dependencies visible.
    if documents is None:
        documents = docs
    if embedding is None:
        embedding = embeddings

    # Drop a stale index of the same name before recreating it.
    if index_name in pinecone.list_indexes():
        pinecone.delete_index(index_name)

    pinecone.create_index(name=index_name, metric=metric, dimension=dimension)

    return Pinecone.from_documents(documents=documents, embedding=embedding, index_name=index_name)

# Vector dimension of the Pinecone index.
# NOTE(review): presumably the output size of the OllamaEmbeddings model — confirm.
DIMENSION = 4096
# Caution: create_index() deletes any existing index with this name before recreating and repopulating it.
index = create_index("langchain-pdfqna", DIMENSION)

In [12]:
from langchain_community.llms import Ollama
# LLM shared by the question-rewriting chain and the answering chain defined in later cells.
llm = Ollama(model="llama2")

In [29]:
from langchain.prompts import PromptTemplate
from langchain_core.messages.human import HumanMessage
# Similarity search over the index, returning only the single closest chunk (k=1) per query.
retriever = index.as_retriever(search_type="similarity", search_kwargs={"k": 1})
# Single-turn prompt left commented out; the chains in later cells use chat-style prompts instead.
#prompt = PromptTemplate.from_template("You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:")

In [34]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough #used to map input to keys in prompt 
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder


def format_docs(docs):
    """Join the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values separated by a blank line; an empty
        string when ``docs`` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# System instruction for the rewriting step: turn a follow-up that depends on the
# chat history into a standalone question, without answering it.
# The trailing backslashes join the source lines, so the prompt is one line of text.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
# Message layout: system instruction, then the prior turns ("chat_history"), then the new question.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)
# Rewriting pipeline: fill the prompt, call the LLM, parse the reply with StrOutputParser.
contextualize_q_chain = contextualize_q_prompt | llm | StrOutputParser()

In [46]:
# System instruction for the answering step. The trailing backslashes join the
# source lines; the blank line before {context} survives as a single newline.
# {context} is filled with the retrieved text by rag_chain.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
# Message layout: system instruction (with context), then the prior turns, then the user's question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)


def contextualized_question(input: dict):
    """Choose what should be used as the retrieval query for this turn.

    Args:
        input: Chain input holding ``"question"`` and, optionally,
            ``"chat_history"``.

    Returns:
        ``input["question"]`` when there is no (or an empty) chat history;
        otherwise the ``contextualize_q_chain`` runnable.

    Raises:
        KeyError: If there is no chat history and ``"question"`` is missing.
    """
    # Nothing to resolve against on the first turn: use the question as typed.
    if not input.get("chat_history"):
        return input["question"]
    return contextualize_q_chain


# Answering pipeline:
#   1. Add a `context` entry to the input: pick the retrieval query via
#      contextualized_question, fetch matching chunks, join them with format_docs.
#   2. Fill qa_prompt, call the LLM, and parse the reply with StrOutputParser.
# Only retrieval sees the rewritten question; qa_prompt still receives the original {question}.
# NOTE(review): when contextualized_question returns contextualize_q_chain instead of a string,
# this relies on LangChain running that returned chain on the same input — confirm against the LCEL docs.
rag_chain = (
    RunnablePassthrough.assign(
        context=contextualized_question | retriever | format_docs
    )
    | qa_prompt
    | llm
    | StrOutputParser()
)

In [47]:
# Conversation memory is kept by hand: a list of messages passed to every call.
chat_history = []

# First turn: the history is empty, so contextualized_question() returns the raw question for retrieval.
question = "What is PDF question answering"
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
print(ai_msg)


Based on the context provided, it appears that you are asking about PDF question answering. To answer this question, I can tell you that PDF question answering refers to the task of using natural language processing (NLP) techniques to extract relevant information from PDF documents. This can involve tasks such as identifying and extracting text, classifying the content of a document, or answering questions based on the contents of the document. The field of PDF question answering is a growing area of research, with applications in industries such as healthcare, finance, and legal.


In [48]:
from langchain_core.messages.ai import AIMessage
# Record the first exchange so the follow-up below can be interpreted against it.
# NOTE(review): re-running this cell appends the same pair to chat_history again.
chat_history.extend([HumanMessage(content=question), AIMessage(content = ai_msg)])

# Follow-up turn: chat_history is now non-empty, so the question-rewriting chain is used for retrieval.
second_question = "Why is it used?"
ai_message = rag_chain.invoke({"question": second_question, "chat_history": chat_history})
print(ai_message)

PDF question answering is used to extract relevant information from PDF documents for various purposes, such as:

1. Automating document review and analysis: By using NLP techniques, PDF question answering can help automate the process of reviewing and analyzing large volumes of PDF documents, such as contracts, legal documents, or medical records.
2. Improving document search and retrieval: PDF question answering can improve the accuracy and speed of document search and retrieval by allowing users to ask natural language questions about the contents of a document.
3. Facilitating document-based decision-making: By providing relevant information from PDF documents, PDF question answering can help support decision-making processes by providing access to key data and insights.


In [50]:
# Same chain, streamed: print each chunk as soon as it arrives instead of waiting for the full answer.
# NOTE(review): chat_history still holds only the first exchange here; the second question/answer was never appended.
for chunk in rag_chain.stream({"question":"What is PDF question answering", "chat_history":chat_history}):
    print(chunk, end="", flush=True)

PDF question answering is a task that involves using natural language processing techniques to extract relevant information from PDF documents. This can include identifying and extracting text, classifying the content of a document, or answering questions based on the contents of the document. The field of PDF question answering is a growing area of research with applications in industries such as healthcare, finance, and legal.

In [51]:
# Stream the answer, echoing each chunk as it arrives while also accumulating the full text.
response = ""
for chunk in rag_chain.stream({"question":"What are the pre-retrieval steps?", "chat_history":chat_history}):
    print(chunk, end="", flush=True)
    response+=chunk

# The streamed chunks are printed with end="", so terminate that line explicitly;
# otherwise the separator is glued onto the last sentence of the answer.
print()
print('-------------')
print(response)

AI: Based on the context provided, it seems that you are asking about the pre-retrieval steps for PDF question answering. To answer this question, I can tell you that the pre-retrieval steps typically involve several activities, including:

1. Document Preparation: The document is prepared by extracting relevant information, such as text, tables, and figures, into a structured format.
2. Question Formulation: Annotators read the document and formulate questions based on the content of the document.
3. Question Categorization: The formed questions are then categorized into different types based on their nature, such as Figure Questions, Text Questions, Table Reasoning, etc.
4. Data Collection: The annotated questions are then collected and stored in a database for further analysis.-------------
AI: Based on the context provided, it seems that you are asking about the pre-retrieval steps for PDF question answering. To answer this question, I can tell you that the pre-retrieval steps typi