In [1]:
import warnings
warnings.filterwarnings("ignore")

In [50]:
# !pip install pdfplumber
# !pip install faiss-cpu
# !pip install tiktoken
# !pip install lancedb

In [2]:
import pprint
from dotenv import dotenv_values
import openai
import pickle
from pypdf import PdfReader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import FAISS


env_vars = dotenv_values('.env')
openai.api_key = env_vars.get('OPENAI_API_KEY')

In [3]:
import os, sys
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

# import utils.chroma as chom

## Parent Document Retriever

In [4]:
from collections import namedtuple
Page = namedtuple("Page", ["id", "page_content", "metadata"])

def pdf_reader(file_path):
    reader = PdfReader(file_path)
    pdf_pages = []
    for page_number, page in enumerate(reader.pages):
        page_content = page.extract_text().strip()
        if page_content:
            metadata = {"page_number": page_number}  # Add any additional metadata as needed
            pdf_pages.append(Page(id=page_number, page_content=page_content, metadata=metadata))
    return pdf_pages

file_path = '../data/RaptorContract.pdf'
pdf_pages = pdf_reader(file_path)

In [60]:
pdf_pages 
# pdf_pages

[Page(id=0, page_content='[R&G Draft 12.__.2021] \n112923184_5  \n \nSTOCK PURCHASE AGREEMENT \nBY AND AMONG \n[BUYER], \n[TARGET COMPANY], \nTHE SELLERS LISTED ON SCHEDULE I HERETO \nAND  \nTHE SELLERS ’ REPRESENTATIVE NAMED HEREIN \nDated as of [●]  \n \n[This document is intended solely to facilitate discussions among the parties identified herein.  \nNeither this document nor such discussions are intended to create, nor will either or both be \ndeemed to create, a legally binding or enforceable offer or agreement of any type or nature, \nunless and until a definitive written agreement is executed and delivered by each of th e parties \nhereto. \n \nThis document shall be kept confidential pursuant to the terms of the Confidentiality \nAgreement entered into by the parties and, if applicable, its affiliates with respect to the subject \nmatter hereof.]', metadata={'page_number': 0}),
 Page(id=1, page_content='-i- \n112923184_5 TABLE OF CONTENTS \nARTICLE I DEFINITIONS; CERTAIN RULES

In [32]:
pages = []
for page in pdf_pages:
    page_data = {
        "id": page.id,
        "page_content": page.page_content,
        "metadata": page.metadata
    }
    pages.append(page_data)

In [33]:
import numpy as np
embed_model = OpenAIEmbeddings(model="text-embedding-3-small")
embeddings = []
for page in pages:
    # Embed the page content
    content_embedding = embed_model.embed_query(page["page_content"])

    # Encode the page id
    page_id_feature = np.array([page["id"]])

    # Encode the metadata
    metadata_features = np.array(list(page["metadata"].values()))

    # Combine the embeddings and features
    final_embedding = np.concatenate([content_embedding, page_id_feature, metadata_features])
    embeddings.append(final_embedding)



In [None]:
import lancedb
from langchain.vectorstores import LanceDB
my_db = lancedb.connect("./by_db")

# Embedding Model
embed = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai.api_key)
text = "clark this isn't righ, now right"
# embeddings = embed_model.embed_documents(text)

# vectorstore = Chroma(embedding_function=embed, collection_name="pppp")

lance_table = my_db.create_table("pdf_pages", data=[
    {
        "id": page.id,
        "page_content": page.page_content,
        "metadata": page.metadata
    } for page in pdf_pages
])
lance_table
# Create the LanceDB vector store
# vectorstore = LanceDB(lance_table, embed)

# # # Initialize the FAISSVectorStore
# # vectorstore = FAISS.from_documents(pdf_pages, embeddings)
# # vectorstore = FAISSVectorStore(dimension=768) 

# parent_document_retriever = ParentDocumentRetriever(
#     vectorstore=vectorstore,
#     docstore=store,
#     child_splitter=child_splitter
# )

# parent_document_retriever.add_documents(pdf_pages , ids=None)

In [None]:
child_splitter = RecursiveCharacterTextSplitter(chunk_size= 512)
from langchain.text_splitter import RecursiveCharacterTextSplitter
small_chunk_splitter = RecursiveCharacterTextSplitter(chunk_size = 512)
store = InMemoryStore()
vectorstore = LanceDB(lance_table, embed)
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=small_chunk_splitter
)


parent_document_retriever.add_documents(pdf_pages , ids=None)

In [None]:
print(f"Number of parent chunks  is: {len(list(store.yield_keys()))}")

print(f"Number of child chunks is: {len(parent_document_retriever.vectorstore['ids'])}")

Number of parent chunks  is: 72


TypeError: 'FAISS' object is not subscriptable

In [None]:
parent_document_retriever.vectorstore.get()

In [None]:

# To retrieve documents
# query = "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?"
# results = vectorstore.similarity_search(query)
# results

def parent_document_retriever_wrapper(question: str) -> dict:
    return {"context": vectorstore.similarity_search(question)}

In [None]:
from langchain_core.prompts import ChatPromptTemplate

TEMPLATE = """\
You are happy assistant. Use the context provided below to answer the question.

Answer question in summarization, put section number of the answer from the file for example like 
 
Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(TEMPLATE)

In [None]:
from langchain_community.chat_models import ChatOpenAI

chat_model = ChatOpenAI()

In [None]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

setup_and_retrieval = RunnableParallel({"question": RunnablePassthrough(), "context": parent_document_retriever_wrapper })
output_parser = StrOutputParser()


parent_retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser


parent_retrieval_chain.invoke("Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?")
print(parent_document_retriever )



tags=None metadata=None vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x7998d84005e0> docstore=<langchain.storage.in_memory.InMemoryStore object at 0x7998d8400d00> id_key='doc_id' search_kwargs={} child_splitter=<langchain.text_splitter.RecursiveCharacterTextSplitter object at 0x7998d8400cd0> parent_splitter=None


In [None]:
parent_retrieval_chain.invoke("""How much is the escrow amount in dollar?""")



'Answer:\nThe escrow amount is $1,000,000 (Section number: 6f2d1ccc-42a7-4f9e-8b0b-4bad42c90abe)'

In [None]:
parent_retrieval_chain.invoke("What is the purpose of the escrow?")



'Answer:\nThe purpose of the escrow according to Section 2.07 is to hold the Escrow Amount and release it to the Company Securityholders in accordance with the Escrow Agreement. (Section 2.08)'

Rerank

In [None]:
import cohere
co = cohere.Client(os.getenv("COHERE_API_KEY"))

In [None]:
query = "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"
rerank_docs = co.rerank(
    query=query, documents=parent_document_retriever.vectorstore.get(), top_n=25, model="rerank-english-v2.0"
)

In [None]:
type(rerank_docs[0])

In [None]:
# Retrieve the documents from the parent_document_retriever
documents = parent_document_retriever.vectorstore.get()
query = "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"

# Convert the documents to a list of objects
document_objects = []
for document in documents:
    document_objects.append({"text": document})

# Perform the reranking using Cohere
rerank_docs = co.rerank(
    query=query,
    documents=document_objects,
    top_n=25,
    model="rerank-english-v2.0"
)

In [None]:
rerank_docs



## PREPARING THE EVALUATION TEST

In [50]:
import pdfplumber
import pandas as pd

evaluation = []
with pdfplumber.open("../data/RobinsonQ&A.pdf") as pdf:
    for page in pdf.pages:
        item = page.extract_text()
        evaluation.append(item)
pprint.pp(evaluation)

['Q1: Who are the parties to the Agreement and what are their defined names?\n'
 'A1: Cloud Investments Ltd. (“Company”) and Jack Robinson (“Advisor”)\n'
 'Q2: What is the termination notice?\n'
 'A2: According to section 4:14 days for convenience by both parties. The '
 'Company may terminate without notice if\n'
 'the Advisor refuses or cannot perform the Services or is in breach of any '
 'provision of this Agreement.\n'
 'Q3: What are the payments to the Advisor under the Agreement?\n'
 'A3: According to section 6: 1. Fees of $9 per hour up to a monthly limit of '
 '$1,500, 2. Workspace expense of $100\n'
 'per month, 3. Other reasonable and actual expenses if approved by the '
 'company in writing and in advance.\n'
 'Q4: Can the Agreement or any of its obligations be assigned?\n'
 'A4: 1. Under section 1.1 the Advisor can’t assign any of his obligations '
 'without the prior written consent of the\n'
 'Company, 2. Under section 9 the Advisor may not assign the Agreement and the '

In [49]:
len(evaluation)

1

In [47]:
import pandas as pd

questions = [
    "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?",
    "Would the Sellers be responsible if after the closing it is determined that there were inaccuracies in the representation provided by them where such inaccuracies are the resolute of the Sellers' gross negligence?",
    "How much is the escrow amount?",
    "Is escrow amount grete then the Retention Amount?",
    "What is the purpose of the escrow?",
    "May the Escrow Amount serve as a recourse for the Buyer in case of breach of representations by the Company?",
    "Are there any conditions to the closing?",
    "Are Change of Control Payments considered a Seller Transaction Expense?",
    "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?",
    "Does the Buyer need to pay the Employees Closing Bonus Amount directly to the Company's employees?",
    "Does any of the Sellers provide a representation with respect to any Tax matters related to the Company?",
    "Is any of the Sellers bound by a non-competition covenant after the Closing?",
    "Whose consent is required for the assignment of the Agreement by the Buyer?",
    "Does the Buyer needs the Sellers' consent in the event of an assignment of the Agreement to a third party who is not a Buyer's Affiliates?"
]

answers = [
    "Except in the case of fraud, the Sellers have no liability for breach of representations and warranties (See section 10.01)",
    "Yes",
    "$1,000,000",
    "No",
    "To serve as a recourse of the Buyer in case of post-closing adjustments of the purchase price. (See section 2.07(e)).",
    "No",
    "No, as the signing and closing are simultaneous.",
    "Yes. (See defining of Sellers Transaction Expenses).",
    "Yes (See Section 2.07)",
    "No. (See Section 2.10)",
    "No. Only the Company provides such a representation.",
    "No.",
    "If the assignment is to an Affiliate or purchaser of all of the Buyer's assets, no consent is required. Otherwise, the consent of the Company and the Seller Representative is required.",
    "No. If the assignment is not part of a sale of all or substantially all of the Buyer's assets, the assignment requires the consent of the Company and the Seller's Representative."
]

# Create a pandas DataFrame
df = pd.DataFrame({'Question': questions, 'Answer': answers}, index=range(len(questions)))

# Rearrange the DataFrame based on the question numbers
# df = df.sort_values('Question')

# Display the rearranged DataFrame
df.to_csv('../data/RaptorQA.csv')

In [46]:
df["Answer"][13]

"No. If the assignment is not part of a sale of all or substantially all of the Buyer's assets, the assignment requires the consent of the Company and the Seller's Representative."

In [51]:
questions = [
"Q1: Who are the parties to the Agreement and what are their defined names?",
"Q2: What is the termination notice?",
"Q3: What are the payments to the Advisor under the Agreement?",
"Q4: Can the Agreement or any of its obligations be assigned?",
"Q5: Who owns the IP?",
"Q6: Is there a non-compete obligation to the Advisor?",
"Q7: Can the Advisor charge for meal time?",
"Q8: In which street does the Advisor live?",
"Q9: Is the Advisor entitled to social benefits?",
"Q10: What happens if the Advisor claims compensation based on employment relationship with the Company?"
]

answers = [
"""Cloud Investments Ltd. ("Company") and Jack Robinson ("Advisor")""",
"According to section 4:14 days for convenience by both parties. The Company may terminate without notice if the Advisor refuses or cannot perform the Services or is in breach of any provision of this Agreement.",
"According to section 6: 1. Fees of $9 per hour up to a monthly limit of $1,500, 2. Workspace expense of $100 per month, 3. Other reasonable and actual expenses if approved by the company in writing and in advance.",
"1. Under section 1.1 the Advisor can't assign any of his obligations without the prior written consent of the Company, 2. Under section 9 the Advisor may not assign the Agreement and the Company may assign it, 3 Under section 9 of the Undertaking the Company may assign the Undertaking.",
"According to section 4 of the Undertaking (Appendix A), Any Work Product, upon creation, shall be fully and exclusively owned by the Company.",
"Yes. During the term of engagement with the Company and for a period of 12 months thereafter.",
"No. See Section 6.1, Billable Hour doesn't include meals or travel time.",
"1 Rabin st, Tel Aviv, Israel",
"No. According to section 8 of the Agreement, the Advisor is an independent consultant and shall not be entitled to any overtime pay, insurance, paid vacation, severance payments or similar fringe or employment benefits from the Company.",
"If the Advisor is determined to be an employee of the Company by a governmental authority, payments to the Advisor will be retroactively reduced so that 60% constitutes salary payments and 40% constitutes payment for statutory rights and benefits. The Company may offset any amounts due to the Advisor from any amounts payable under the Agreement. The Advisor must indemnify the Company for any losses or expenses incurred if an employer/employee relationship is determined to exist."
]

dff = pd.DataFrame({'Question': questions, 'Answer': answers}, index=range(len(questions)))

# Rearrange the DataFrame based on the question numbers
# df = df.sort_values('Question')

# Display the rearranged DataFrame
dff.to_csv('../data/RobinsonQA.csv')

In [60]:
dff = dff.rename(columns={'Answers': 'Ground Truth'})
dff["rag_response"] = ['a','b', 'c', 'd', 'e', 'f', 'g', 'e', 'f', 'g']
dff["source_document"] = ['a','b', 'c', 'd', 'e', 'f', 'g', 'e', 'f', 'g']
dff

Unnamed: 0,Question,Answer,rag_response,source_document
0,Q1: Who are the parties to the Agreement and w...,"Cloud Investments Ltd. (""Company"") and Jack Ro...",a,a
1,Q2: What is the termination notice?,According to section 4:14 days for convenience...,b,b
2,Q3: What are the payments to the Advisor under...,According to section 6: 1. Fees of $9 per hour...,c,c
3,Q4: Can the Agreement or any of its obligation...,1. Under section 1.1 the Advisor can't assign ...,d,d
4,Q5: Who owns the IP?,According to section 4 of the Undertaking (App...,e,e
5,Q6: Is there a non-compete obligation to the A...,Yes. During the term of engagement with the Co...,f,f
6,Q7: Can the Advisor charge for meal time?,"No. See Section 6.1, Billable Hour doesn't inc...",g,g
7,Q8: In which street does the Advisor live?,"1 Rabin st, Tel Aviv, Israel",e,e
8,Q9: Is the Advisor entitled to social benefits?,"No. According to section 8 of the Agreement, t...",f,f
9,Q10: What happens if the Advisor claims compen...,If the Advisor is determined to be an employee...,g,g


RAGAS IMPLEMENTATION

In [None]:
# !pip install ragas==0.0.11