In [1]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
# !pip install pdfplumber

In [19]:
from pprint import pprint
from dotenv import dotenv_values
import openai
import pickle
from pypdf import PdfReader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

env_vars = dotenv_values('.env')
openai.api_key = env_vars.get('OPENAI_API_KEY')

In [11]:
import os, sys
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

import utils.chroma as chom

## Parent Document Retriever

In [116]:
Page = namedtuple("Page", ["id", "page_content", "metadata"])

def pdf_reader(file_path):
    reader = PdfReader(file_path)
    pdf_pages = []
    for page_number, page in enumerate(reader.pages):
        page_content = page.extract_text().strip()
        if page_content:
            metadata = {"page_number": page_number}  # Add any additional metadata as needed
            pdf_pages.append(Page(id=page_number, page_content=page_content, metadata=metadata))
    return pdf_pages

file_path = '../data/RaptorContract.pdf'
pdf_pages = pdf_reader(file_path)

In [118]:
# parent_docs = documents

# Embedding Model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")


# Splitters
child_splitter = RecursiveCharacterTextSplitter(chunk_size= 200)
# We don't need a parent splitter because the data cames from CSV file, and each row is a parent doc.
store = InMemoryStore()
vectorstore = Chroma(embedding_function=embeddings, collection_name="ppp")


parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    # parent_splitter =parent_splitter
)

parent_document_retriever.add_documents(pdf_pages , ids=None)

In [119]:
print(f"Number of parent chunks  is: {len(list(store.yield_keys()))}")

print(f"Number of child chunks is: {len(parent_document_retriever.vectorstore.get()['ids'])}")

Number of parent chunks  is: 72
Number of child chunks is: 2475


In [120]:
parent_document_retriever.vectorstore.get()

{'ids': ['000f34e4-0326-4fc2-a9dc-1ec53d4d781b',
  '005350ae-82f9-4ffc-bafb-087f25ef0a26',
  '00668aaf-e27d-4f83-9fd0-117b250f5127',
  '00cb6be3-0539-4636-a1bc-0272f598b050',
  '00d95488-6a8a-4fa4-a184-bc1290127837',
  '0122cde2-300e-48b9-bb58-98c99d680f8b',
  '0135e15e-0f21-4adc-8d84-db71112a7036',
  '015e1e57-6341-4632-abbb-afabfede0101',
  '0167d1e1-7ef0-47cc-8441-025052c37fbd',
  '017363ca-e993-4913-8314-11d5edcfff4e',
  '019e6dc9-6806-4793-88c1-0f11b8359fd2',
  '01a9bb6c-621a-494c-96d4-83c99a117491',
  '01dd4d19-98d4-4d68-8d60-c65439abca44',
  '01f46f8f-af9f-48f8-a01a-65bf4d14c86b',
  '01f98737-c0c8-4192-8b3a-847557e8ab52',
  '0214f98f-c7f7-43e8-80d5-c9d1e929e8fe',
  '021da6a6-4d62-4be3-a2d9-ea9675aadf17',
  '02296faf-83d5-451c-bd7d-2c43df76771d',
  '022995e1-a130-490c-a5af-a151596a7692',
  '0231e011-d935-4de8-9c70-c87ab61247b0',
  '0231f8e5-4014-455b-8885-6c40401e8734',
  '0233044f-556f-4253-b77f-243ccee3df5c',
  '023c8237-baea-486b-a738-601583a1a4d8',
  '0282b04a-364b-439c-961c-

In [121]:
from langchain_core.prompts import ChatPromptTemplate

TEMPLATE = """\
You are happy assistant. Use the context provided below to answer the question.

Answer question in summarization, put section number of the answer from the file for example like `Except in the case of fraud, the Sellers have no liability for breach of representations and
warranties (See section 10.01)`
 
Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(TEMPLATE)

In [122]:
from langchain_community.chat_models import ChatOpenAI

chat_model = ChatOpenAI()

In [141]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

setup_and_retrieval = RunnableParallel({"question": RunnablePassthrough(), "context": parent_document_retriever })
output_parser = StrOutputParser()


parent_retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser


parent_retrieval_chain.invoke("Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?")
print(parent_document_retriever )

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] Entering Chain run with input:
[0m{
  "input": "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?"
}
[36

In [136]:
from langchain.globals import set_verbose, set_debug

set_debug(True)
parent_retrieval_chain.invoke("Does the Buyer needs the Sellers’ consent in the event of an assignment of the Agreement to a third party who is not a Buyer’s Affiliates?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Does the Buyer needs the Sellers’ consent in the event of an assignment of the Agreement to a third party who is not a Buyer’s Affiliates?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] Entering Chain run with input:
[0m{
  "input": "Does the Buyer needs the Sellers’ consent in the event of an assignment of the Agreement to a third party who is not a Buyer’s Affiliates?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Does the Buyer needs the Sellers’ consent in the event of an assignment of the Agreement to a third party who is not a Buyer’s Affiliates?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:Runnabl

"Yes, the Buyer needs the Sellers' consent in the event of an assignment of the Agreement to a third party who is not a Buyer's Affiliates (See section 11.02)."