In [1]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
# !pip install pdfplumber

In [19]:
from pprint import pprint
from dotenv import dotenv_values
import openai
import pickle
from pypdf import PdfReader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

env_vars = dotenv_values('.env')
openai.api_key = env_vars.get('OPENAI_API_KEY')

In [11]:
import os, sys
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

import utils.chroma as chom

In [12]:
file_path = '../data/RaptorContract.pdf'
pdftexts = chom.pdf_reader(file_path)
pdftexts[0]

'[R&G Draft 12.__.2021] \n112923184_5  \n \nSTOCK PURCHASE AGREEMENT \nBY AND AMONG \n[BUYER], \n[TARGET COMPANY], \nTHE SELLERS LISTED ON SCHEDULE I HERETO \nAND  \nTHE SELLERS ’ REPRESENTATIVE NAMED HEREIN \nDated as of [●]  \n \n[This document is intended solely to facilitate discussions among the parties identified herein.  \nNeither this document nor such discussions are intended to create, nor will either or both be \ndeemed to create, a legally binding or enforceable offer or agreement of any type or nature, \nunless and until a definitive written agreement is executed and delivered by each of th e parties \nhereto. \n \nThis document shall be kept confidential pursuant to the terms of the Confidentiality \nAgreement entered into by the parties and, if applicable, its affiliates with respect to the subject \nmatter hereof.]'

In [20]:
docs = []
for loader in pdftexts:
    docs.extend(loader.load())
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)

AttributeError: 'str' object has no attribute 'load'

In [57]:
Page = namedtuple("Page", ["id", "page_content", "metadata"])

def pdf_reader(file_path):
    reader = PdfReader(file_path)
    pdf_pages = []
    for page_number, page in enumerate(reader.pages):
        page_content = page.extract_text().strip()
        if page_content:
            metadata = {"page_number": page_number}  # Add any additional metadata as needed
            pdf_pages.append(Page(id=page_number, page_content=page_content, metadata=metadata))
    return pdf_pages

file_path = '../data/RaptorContract.pdf'
pdf_pages = pdf_reader(file_path)

In [30]:
# parent_docs = documents

# Embedding Model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")


# Splitters
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)
# We don't need a parent splitter because the data cames from CSV file, and each row is a parent doc.

file_path = '../data/RaptorContract.pdf'
store_path = '../data/parentdoc'
# Stores
store = InMemoryStore()
vectorstore = Chroma(embedding_function=embeddings, collection_name="fullDoc", persist_directory=store_path)


parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    # parent_splitter =parent_splitter
)

parent_document_retriever.add_documents(pdf_pages , ids=None)

In [31]:
print(f"Number of parent chunks  is: {len(list(store.yield_keys()))}")

print(f"Number of child chunks is: {len(parent_document_retriever.vectorstore.get()['ids'])}")

Number of parent chunks  is: 72
Number of child chunks is: 2475


In [32]:
parent_document_retriever.vectorstore.get()

{'ids': ['004c7f11-43f2-40ae-bdde-dfe0fd70936e',
  '008b022c-b117-40f7-be88-cd48cf681fb2',
  '00a60622-a2a2-41ce-a7e3-28c71e9582fc',
  '0114fdc9-f3e8-47eb-8b0c-989ecc25026a',
  '01382a10-1ee3-4890-bf34-bf30c012ec6e',
  '017a3111-f62c-4ce4-97e2-a96d91f7ab88',
  '01b5b3bb-ed31-4b5c-b9e5-24b670b7b8a4',
  '01b9e7ed-d787-4424-b7b5-4e790f3e61d8',
  '01c21fe2-3556-47ac-96df-715367b43280',
  '01d8ecf9-4313-4a8e-982c-560a13d6eb42',
  '01ea0609-d00d-49c9-afdc-9b60fafce533',
  '01ef9ed6-46cb-4eff-9a6e-99f12daf074c',
  '01f72605-4035-4165-a868-b65b5381019b',
  '020f621e-0919-491e-b793-fce99f0c126d',
  '02193f10-6dfa-4738-b236-b9614e5536ec',
  '021a35d6-7744-4eff-a825-453fb6759d02',
  '026babc5-f744-45d9-99c9-fc3205dfed04',
  '02749f9a-8c08-40ee-97e7-fe55ef9b98f7',
  '0289adb5-a4b8-4b08-b764-c88747f82537',
  '028b33b7-fb20-49b6-8682-de9c2a1c8617',
  '02c04fec-c0fb-4296-b970-52e936657e3a',
  '02e9b7d7-4362-4916-b57c-e214004491f6',
  '02f278c3-6198-48e0-ba3b-b616af7d69b8',
  '03084c03-2c18-4fde-b60a-

In [51]:
from langchain_core.prompts import ChatPromptTemplate

TEMPLATE = """\
You are happy assistant. Use the context provided below to answer the question.

Answer question in summarization, in not more 2 line of sentence 
Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(TEMPLATE)

In [44]:
from langchain_community.chat_models import ChatOpenAI

chat_model = ChatOpenAI()

  warn_deprecated(


In [60]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

setup_and_retrieval = RunnableParallel({"question": RunnablePassthrough(), "context": parent_document_retriever })
output_parser = StrOutputParser()


parent_retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser


parent_retrieval_chain.invoke("Whose consent is required for the assignment of the Agreement by the Buyer?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Whose consent is required for the assignment of the Agreement by the Buyer?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] Entering Chain run with input:
[0m{
  "input": "Whose consent is required for the assignment of the Agreement by the Buyer?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Whose consent is required for the assignment of the Agreement by the Buyer?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "Whose consent is required for the assignment of the Agreement by the Buyer?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chai

'The consent required for the assignment of the Agreement by the Buyer is not specified in the provided context.'

In [59]:
from langchain.globals import set_verbose, set_debug

set_debug(True)
parent_retrieval_chain.invoke("Whose consent is required for the assignment of the Agreement by the Buyer?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Whose consent is required for the assignment of the Agreement by the Buyer?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context>] Entering Chain run with input:
[0m{
  "input": "Whose consent is required for the assignment of the Agreement by the Buyer?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Whose consent is required for the assignment of the Agreement by the Buyer?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<question,context> > 3:chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "Whose consent is required for the assignment of the Agreement by the Buyer?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chai

'The consent required for the assignment of the Agreement by the Buyer is not specified in the provided context.'