In [1]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
# !pip install pdfplumber

In [19]:
from pprint import pprint
from dotenv import dotenv_values
import openai
import pickle
from pypdf import PdfReader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

env_vars = dotenv_values('.env')
openai.api_key = env_vars.get('OPENAI_API_KEY')

In [11]:
import os, sys
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

import utils.chroma as chom

## Parent Document Retriever

In [97]:
Page = namedtuple("Page", ["id", "page_content", "metadata"])

def pdf_reader(file_path):
    reader = PdfReader(file_path)
    pdf_pages = []
    for page_number, page in enumerate(reader.pages):
        page_content = page.extract_text().strip()
        if page_content:
            metadata = {"page_number": page_number}  # Add any additional metadata as needed
            pdf_pages.append(Page(id=page_number, page_content=page_content, metadata=metadata))
    return pdf_pages

file_path = '../data/RobinsonAdvisory.pdf'
pdf_pages = pdf_reader(file_path)

In [None]:
# parent_docs = documents

# Embedding Model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")


# Splitters
child_splitter = RecursiveCharacterTextSplitter(chunk_size= 200)
# We don't need a parent splitter because the data cames from CSV file, and each row is a parent doc.
store = InMemoryStore()
vectorstore = Chroma(embedding_function=embeddings, collection_name="papp")


parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    # parent_splitter =parent_splitter
)

parent_document_retriever.add_documents(pdf_pages , ids=None)

In [99]:
print(f"Number of parent chunks  is: {len(list(store.yield_keys()))}")

print(f"Number of child chunks is: {len(parent_document_retriever.vectorstore.get()['ids'])}")

Number of parent chunks  is: 4
Number of child chunks is: 132


In [100]:
parent_document_retriever.vectorstore.get()

{'ids': ['010f076c-09cb-4405-bc38-fc039cd4d57e',
  '0187ccde-b08c-49a5-ad06-cdd8852f8a05',
  '028040bf-2722-407f-b917-685c5dfb6d89',
  '02ea12ca-d29a-4337-99e6-7b7fc188dc6e',
  '044c9529-2ef1-4d8c-8d0f-668e40bd285c',
  '04b6b4a0-22e8-463b-9e64-e4722db70c61',
  '0504fc25-91fa-463a-bf91-b9d9f5b722a9',
  '0863c66b-d37f-49f9-9065-0b3a6aa45378',
  '0be35a65-0133-4ba7-a619-15920d388547',
  '0e24b960-b9cc-4715-b118-56232f9cb267',
  '130c2688-8e95-49cb-9d42-08359d30aa62',
  '158f0d69-8741-4800-898e-4e2f67875241',
  '1bf7d69b-c91b-48d2-9900-cd610164c5ec',
  '1e78855b-1a51-4825-b256-d04ccb97fe30',
  '1f28d961-9a72-40b0-adb6-cc1d7426f05c',
  '1f670610-18ca-4815-9ba4-5d6a2547d7b1',
  '249aab39-f286-4721-a2e3-b59b06e677f7',
  '2679ffd4-f5fc-4a06-95b7-d4a450f3e6ef',
  '27260dd5-9487-4086-82f5-5180c3b06ac4',
  '27ed1561-049b-4674-b537-672cf365a127',
  '290d81ec-e8f2-4fbc-ae4f-cc3af606ecba',
  '29428075-10a3-4151-be6a-e04d972884b0',
  '2accacbd-9752-4794-9550-71cff2853b33',
  '2db2c49b-73d9-44c1-ab84-

In [106]:
from langchain_core.prompts import ChatPromptTemplate

TEMPLATE = """\
You are happy assistant. Use the context provided below to answer the question.

Answer question in summarization
 
Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(TEMPLATE)

In [107]:
from langchain_community.chat_models import ChatOpenAI

chat_model = ChatOpenAI()

In [113]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

setup_and_retrieval = RunnableParallel({"question": RunnablePassthrough(), "context": parent_document_retriever })
output_parser = StrOutputParser()


parent_retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser


parent_retrieval_chain.invoke("Can the Agreement or any of its obligations be assigned?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Can the Agreement or any of its obligations be assigned?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] Entering Chain run with input:
[0m{
  "input": "Can the Agreement or any of its obligations be assigned?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Can the Agreement or any of its obligations be assigned?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "Can the Agreement or any of its obligations be assigned?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] [1.95s] Exi

"No, the Agreement cannot be assigned by the Advisor for any reason. However, the Company may assign the Agreement to a successor, provided the assignee assumes the Company's obligations under the Agreement."

In [114]:
from langchain.globals import set_verbose, set_debug

set_debug(True)
parent_retrieval_chain.invoke("What are the payments to the Advisor under the Agreement?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What are the payments to the Advisor under the Agreement?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] Entering Chain run with input:
[0m{
  "input": "What are the payments to the Advisor under the Agreement?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "What are the payments to the Advisor under the Agreement?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "What are the payments to the Advisor under the Agreement?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] [1.78s]

'The payments to the Advisor under the Agreement include hourly fees at a rate of USD 9 per Billable Hour, limited to a maximum of USD 1,500 per month, as well as USD 100 per month for workspace expenses. Additionally, the Company shall reimburse Advisor for any reasonable and actual expenses incurred in connection with the performance of the Services.'