In [1]:
import warnings
warnings.filterwarnings("ignore")

In [50]:
# !pip install pdfplumber
# !pip install faiss-cpu
# !pip install tiktoken
# !pip install lancedb

In [2]:
import pprint
import pandas as pd
from dotenv import dotenv_values
import openai
import pickle
from pypdf import PdfReader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Weaviate
from langchain_community.vectorstores import LanceDB


env_vars = dotenv_values('.env')
openai.api_key = env_vars.get('OPENAI_API_KEY')

In [3]:
import os, sys
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

# import utils.chroma as chom

## Parent Document Retriever

In [4]:
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [
                f"Document {i+1}:\n\n{d.page_content}\nMetadata: {d.metadata}"
                for i, d in enumerate(docs)
            ]
        )
    )

In [4]:
from collections import namedtuple
Page = namedtuple("Page", ["id", "page_content", "metadata"])

def pdf_reader(file_path):
    reader = PdfReader(file_path)
    pdf_pages = []
    for page_number, page in enumerate(reader.pages):
        page_content = page.extract_text().strip()
        if page_content:
            metadata = {"page_number": page_number}  # Add any additional metadata as needed
            pdf_pages.append(Page(id=page_number, page_content=page_content, metadata=metadata))
    return pdf_pages

# file_path = '../data/RaptorContract.pdf'
file_path = "../data/RobinsonAdvisory.pdf"
pdf_pages = pdf_reader(file_path)

In [22]:
# pretty_print_docs(pdf_pages)
pdf_pages

[Page(id=0, page_content='ADVISORY SERVICES AGREEMENT \n \nThis Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date ”), by and \nbetween Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the " Company "), and Mr. Jack \nRobinson, Passport Number 780055578, residing at 1 Rabin st, Tel Aviv, Israel, Email: \njackrobinson@gmail.com ("Advisor "). \n \nWhereas, Advisor has expertise and/or knowledge and/or relationships, which are relevant to the \nCompany ’s business and the Company has asked Advisor to provide it with certain Advisory \nservices, as described in this Agreement; and \nWhereas,  Advisor has agreed to provide the Company with such services, subject to the terms set forth \nin this Agreement. \n \nNOW THEREFORE THE PARTIES AGREE AS FOLLOWS: \n \n1. Services:   \n1.1 Advisor shall provide to the Company, as an independent contractor, software development \nservices, and / or any other services as agreed by the parties from time to t

In [5]:
pages = []
for page in pdf_pages:
    page_data = {
        "id": page.id,
        "page_content": page.page_content,
        "metadata": page.metadata
    }
    pages.append(page_data)

In [6]:
import numpy as np
embed_model = OpenAIEmbeddings(model="text-embedding-3-small")
embeddings = []
for page in pages:
    content_embedding = embed_model.embed_query(page["page_content"])
    page_id_feature = np.array([page["id"]])
    metadata_features = np.array(list(page["metadata"].values()))
    final_embedding = np.concatenate([content_embedding, page_id_feature, metadata_features])
    embeddings.append(final_embedding)

  warn_deprecated(


In [6]:
# !pip install weaviate-client lancedb
from langchain_core.documents import Document
full_docs = []
for page in pdf_pages:
    doc_metadata = {
        "page_id": page.id,
        "page_content": page.page_content,
        "page_length": len(page.page_content),
        # Add any other relevant metadata
    }
    full_docs.append((page.id, doc_metadata))
# full_docs

documents = []
for doc_id, doc_data in full_docs:
    document = Document(
        page_content=doc_data['page_content'],
        metadata=doc_data
    )
    documents.append(document)

In [23]:
embed = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai.api_key)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)

from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator = "\n\n",
    chunk_size = 200,
    chunk_overlap = 100,
    is_separator_regex = False,
    model_name='text-embedding-3-small', #used to calculate tokens
    encoding_name='text-embedding-3-small'
)

store = InMemoryStore()
# vectorstore = Chroma(embedding_function=embed, collection_name="pppp")

# from langchain.storage import LocalFileStore
# from langchain.embeddings import CacheBackedEmbeddings
# store = LocalFileStore("./cache/")
# cached_embedder = CacheBackedEmbeddings.from_bytes_store(
#     embed, store, namespace=embed.model
# )

# # # Initialize the FAISSVectorStore

vectorstore = FAISS.from_documents(pdf_pages, embed)
# vectorstore = FAISSVectorStore(dimension=768) 

# # #  Weaviate VectorStore
# vectorstore = Weaviate.from_documents(
#     pdf_pages, embed, weaviate_url="http://127.0.0.1:8080"
# )




In [24]:
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter
)

parent_document_retriever.add_documents(pdf_pages , ids=None)

In [None]:
# import lancedb
# from langchain.vectorstores import LanceDB
# my_db = lancedb.connect("./by_db")

# lance_table = my_db.create_table("pdf_pages", data=[
#     {
#         "id": page.id,
#         "page_content": page.page_content,
#         "metadata": page.metadata
#     } for page in pdf_pages
# ])
# lance_table
# Create the LanceDB vector store
# vectorstore = LanceDB(lance_table, embed)

# child_splitter = RecursiveCharacterTextSplitter(chunk_size= 512)
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# small_chunk_splitter = RecursiveCharacterTextSplitter(chunk_size = 512)
# store = InMemoryStore()
# vectorstore = LanceDB(lance_table, embed)
# parent_document_retriever = ParentDocumentRetriever(
#     vectorstore=vectorstore,
#     docstore=store,
#     child_splitter=small_chunk_splitter
# )

# parent_document_retriever.add_documents(pdf_pages , ids=None)

In [15]:
print(f"Number of parent chunks  is: {len(list(store.yield_keys()))}")

# print(f"Number of child chunks is: {len(parent_document_retriever.vectorstore.aget_by_ids('ids'))}")
vectorstore.aget_by_ids('ids')

Number of parent chunks  is: 4


<coroutine object VectorStore.aget_by_ids at 0x783bb06f1b60>

In [9]:
# parent_document_retriever.vectorstore.get()
# print(vectorstore.index.ntotal)
# pprint.pp(vectorstore.docstore._dict)
print(vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={'score_threshold': 0.8}
))

tags=['FAISS', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x783bb098e890> search_type='similarity_score_threshold' search_kwargs={'score_threshold': 0.8}


In [30]:

# To retrieve documents
# query = "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?"
# results = vectorstore.similarity_search(query)
# results
# query = "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"
query ="Who are the parties to the Agreement and what are their defined names?"
def parent_document_retriever_wrapper(question: str) -> dict:
    return {"context": vectorstore.similarity_search(question, k=10)}
result = parent_document_retriever_wrapper(query)

In [31]:
result

{'context': [Document(metadata={'page_number': 2, 'doc_id': '76dae4e1-7000-4350-a11f-70b37b5da0ed'}, page_content='Agreement.   \n \n10. Governing Law and Jurisdiction :  This Agreement shall be governed by the laws of the State of'),
  Document(metadata={'page_number': 2, 'doc_id': '76dae4e1-7000-4350-a11f-70b37b5da0ed'}, page_content='9. Entire Agreement; No Waiver or Assignment : This Agreement together with the Exhibits, which \nare attached hereto and incorporated herein, set forth the entire Agreement between the pa rties and'),
  Document(metadata={'page_number': 2, 'doc_id': '76dae4e1-7000-4350-a11f-70b37b5da0ed'}, page_content='are attached hereto and incorporated herein, set forth the entire Agreement between the pa rties and \nshall supersede all previous communications and agreements between the parties, either oral or'),
  Document(metadata={'page_number': 1, 'doc_id': '054354f3-88e2-4c4f-8b2a-0e23b448b247'}, page_content='Exhibit A .  \n \n8. Relationship of the Parties; 

In [16]:
from langchain_core.prompts import ChatPromptTemplate

TEMPLATE = """\
You are happy assistant. Use the context provided below to answer the question.

Answer question in summarization, put section number of the answer from the file for example like 
 
Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(TEMPLATE)

In [18]:
from langchain_community.chat_models import ChatOpenAI

chat_model = ChatOpenAI()

In [19]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

setup_and_retrieval = RunnableParallel({"question": RunnablePassthrough(), "context": parent_document_retriever_wrapper })
output_parser = StrOutputParser()


parent_retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser


# parent_retrieval_chain.invoke("Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?")
# print(parent_document_retriever )

In [27]:
parent_retrieval_chain.invoke("""Who are the parties to the Agreement and what are their defined names?""")
# answer = result.split('\n\nAnswer:')[0].strip()

# print(answer)

'Answer:\nThe parties to the Agreement are not explicitly mentioned in the provided context.'

In [105]:
parent_retrieval_chain.invoke("Is escrow amount greater than the Retention Amount?")
# answer = result.split('\n\nAnswer:')[0].strip()

# print(answer)

'Answer:\nThe Retention Amount is $5,000,000. \n\nSection number: 14'

In [77]:
result

'Query:\nWhat is the purpose of the escrow?\n\nAnswer:\nSection 2.08 of the document states that the purpose of the escrow is for Buyer to deposit the Escrow Amount in escrow on behalf of the Sellers in accordance with the Escrow Agreement. The Escrow Amount shall be held and, subject to Section 2.07, released to the Company Securityholders in accordance with the agreement.'

## Chatbox

In [12]:
import os
from langchain.chat_models import ChatOpenAI

# os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") or "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"
# sk-proj-Tzc9mrWyEFVxyDsq5HiWT3BlbkFJxp47toOztG4XRILBeRxr
chat = ChatOpenAI(
    openai_api_key="sk-proj-Tzc9mrWyEFVxyDsq5HiWT3BlbkFJxp47toOztG4XRILBeRxr",
    model='gpt-3.5-turbo'
)

from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)

messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content="I'd like to understand string theory.")
]

In [13]:
def augment_prompt(query: str):
    results = parent_document_retriever_wrapper(query)
    source_knowledge = [doc.page_content for doc in results['context']]
    augmented_prompt = f"""Using the contexts below, answer the query from the context provide.
    Contexts:
    {source_knowledge}
    Query: {query}"""
    return augmented_prompt

In [14]:
def rag_generator(query):
    # query = "How much is the escrow amount in dollar?"
    prompt = HumanMessage(
        content=augment_prompt(query)
    )
    messages.append(prompt)
    res = chat(messages)
    # print(res.content)
    return res.content

In [None]:
query="how much is the escrow amount in dollar?"
rag_generator(query)

In [107]:
evaluation = pd.read_csv('../data/RaptorQA.csv')

answers = []
contexts = []

for question in evaluation['question']:
    # answer = rag_generator(question)
    answer = parent_retrieval_chain.invoke(question)
    answers.append(answer)
    result = parent_document_retriever_wrapper(question)
    context = [doc.page_content for doc in result['context']]
    contexts.append(context)

evaluation['answer'] = answers
evaluation['contexts'] = contexts
# evaluation.head()
evaluation.to_csv('../data/updated_raptor_parnentdoc_CharcterSplit_chunk200_evaluation.csv', index=False)

## Reranking Types

**cohere**

In [185]:
# !pip install --upgrade --quiet  cohere
# !pip install langchain_cohere

In [171]:
import cohere
co = cohere.Client(os.getenv("COHERE_API_KEY"))

In [181]:
query = "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"
rerank_docs = co.rerank(
    query=query, documents=parent_document_retriever.vectorstore.get(), top_n=25, model="rerank-english-v3.0"
)

unknown field: parameter model is not a valid field


In [None]:
type(rerank_docs[0])

In [None]:
# Retrieve the documents from the parent_document_retriever
documents = parent_document_retriever.vectorstore.get()
query = "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"

# Convert the documents to a list of objects
document_objects = []
for document in documents:
    document_objects.append({"text": document})

# Perform the reranking using Cohere
rerank_docs = co.rerank(
    query=query,
    documents=document_objects,
    top_n=25,
    model="rerank-english-v2.0"
)

In [None]:
rerank_docs



In [None]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.llms import Cohere

llm = Cohere(temperature=0)
compressor = CohereRerank(model="rerank-english-v3.0")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=parent_document_retriever
)

compressed_docs = compression_retriever.invoke(
    "What did the president say about Ketanji Jackson Brown"
)
pretty_print_docs(compressed_docs)

In [None]:
from langchain.chains import RetrievalQA
chain = RetrievalQA.from_chain_type(
    llm=Cohere(temperature=0), retriever=compression_retriever
)
chain({"query": query})

**cross-encoder**

In [191]:
import numpy as np
#cross encoder reranker
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

query_text = "How much is the escrow amount?"
# query_text = "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?"
result = parent_document_retriever_wrapper(query_text)
pairs = [[query_text, doc.page_content] for doc in result['context']]

# Predict scores for pairs
scores = cross_encoder.predict(pairs)
# Print scores
print("Scores:")
for score in scores:
    print(score)


print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o+1)

INFO:sentence_transformers.cross_encoder.CrossEncoder:Use pytorch device: cpu
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Scores:
-0.4082793
0.12048418
0.86506265
-2.7284338
New Ordering:
3
2
1
4


In [196]:
pairs

[['How much is the escrow amount?',
  'the Escrow Agreement (and any remaining balance of the Escrow Amount not required to be \npaid to the Buyer shall be released to Company Securityholders in accordance with the terms of \nthe Escrow Agreement). \nSection 2.08  Escrow.   \n(a) At Closing, Buyer will deposit the Escrow Amount in escrow on behalf of the'],
 ['How much is the escrow amount?',
  'the Escrow Agreement). \nSection 2.08  Escrow.   \n(a) At Closing, Buyer will deposit the Escrow Amount in escrow on behalf of the \nSellers in accordance with the Escrow Agreement.  The Escrow Amount shall be held and, \nsubject to Section 2.07, released to the Company Securityholders in accordance with the'],
 ['How much is the escrow amount?',
  'Escrow Agent shall release the Escrow Amount to Company Securityholders in accordance with \nthe Escrow Agreement or (ii) the amount, if any, by which such estimated Purchase Price paid at \nClosing in accordance with Section 2.05(a)(i) and Section 

In [195]:
print(len(result['context']))
lists = [doc.page_content for doc in result['context']]
pprint.pp(lists[0])

4
('the Escrow Agreement (and any remaining balance of the Escrow Amount not '
 'required to be \n'
 'paid to the Buyer shall be released to Company Securityholders in accordance '
 'with the terms of \n'
 'the Escrow Agreement). \n'
 'Section 2.08  Escrow.   \n'
 '(a) At Closing, Buyer will deposit the Escrow Amount in escrow on behalf of '
 'the')


In [170]:
# drf = amnesty_qa["eval"].to_pandas()
# pprint.pp(drf['contexts'][0])
# type(drf['contexts'][0])

**colbert reranker**

In [159]:
# !pip install -U ragatouille

In [160]:
from ragatouille import RAGPretrainedModel

RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

INFO:faiss.loader:Loading faiss with AVX512 support.
INFO:faiss.loader:Successfully loaded faiss with AVX512 support.


artifact.metadata:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.huggingface.co/repos/a0/4c/a04cd86fe01b157b8cc5a51615eef5d3e9dce42c5569c2b74b15f66522dbba90/3f58890b1dfdfec066ef12ba431fa9d56992da9e30a53489242c4156e37e9017?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1721313900&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMTMxMzkwMH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy9hMC80Yy9hMDRjZDg2ZmUwMWIxNTdiOGNjNWE1MTYxNWVlZjVkM2U5ZGNlNDJjNTU2OWMyYjc0YjE1ZjY2NTIyZGJiYTkwLzNmNTg4OTBiMWRmZGZlYzA2NmVmMTJiYTQzMWZhOWQ1Njk5MmRhOWUzMGE1MzQ4OTI0MmM0MTU2ZTM3ZTkwMTc%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=fFWGwba-oe22SEZjCYerZ0edi3VS-FpKy4sA9bMb%7E-N8AN2nxUB1WyWPlva7xsQLD6EXQOB32LMkL5JjotQulxhfIyfa1KuaBj1cYG%7EtsjFGXCF8Q9KAoARD%7EGfPOhtLPr4fPNuX6DGarS2ZCt-v6koFbMw4-Doo2w6DD5-nTnRCyZEJTC12txlO1v6uewtbsBPPPZBLND46lAA4Lt-j161YXU1BsMkGL40N43-KqxxE1SOt

model.safetensors:  81%|########1 | 357M/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[Jul 15, 17:56:04] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


In [166]:
from langchain.retrievers import ContextualCompressionRetriever

# Assuming you have a properly configured `parent_document_retriever`
compression_retriever = ContextualCompressionRetriever(
    base_compressor=RAG.as_langchain_document_compressor(k=3),  # Set k to a value less than or equal to the number of documents
    base_retriever=parent_document_retriever
)

compressed_docss = compression_retriever.invoke(
    "How much is the escrow amount in number?"
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
100%|██████████| 1/1 [00:02<00:00,  2.02s/it]


In [169]:
print([doc.metadata["page_number"] for doc in compressed_docss])
scores = [doc.metadata["relevance_score"] for doc in compressed_docss]
print(scores)

relevance_scores = [doc.metadata["relevance_score"] for doc in compressed_docss]
print(relevance_scores)

print("New Ordering:")
for o in np.argsort(relevance_scores)[::-1]:
    print(o+1)

[25, 24, 9]
[14.916751861572266, 5.888035297393799, 4.762559413909912]
[14.916751861572266, 5.888035297393799, 4.762559413909912]
New Ordering:
1
2
3


In [167]:
pretty_print_docs(compressed_docss)

Document 1:

-21- 
112923184_5 provisions of the Escrow Agreement with the Company Securityholders being entitled to share in 
such released amounts in accordance with their Pro Rata Percentages.  From and after the 
Closing, Buyer and the Sellers ’ Representative will direct the Escrow Agent to disburse 
payments from the Escrow Account in accordance with the purchase price adjustment provisions 
of this Agreement and the terms of the Escrow Agreement including: (a) in the case of any 
disbursement that is required by the terms of this Agreement and as to which there is no dispute 
(or as to which the disputing party has failed to notify the Escrow Agent and the other parties of 
its dispute in accordance with any applicable requirements under this Agreement and the Escrow 
Agreement), they will provide prompt joint payment instructions directing the Escrow Agent to 
make such disbursement and (b) in the case of a disbursement as to which either the Buyer or the 
Seller ’s Representat

**flashrank reranker**

In [127]:
# !pip install flashrank
# !pip install --upgrade --quiet flashrank
# !pip install --upgrade --quiet faiss
# !pip install --upgrade --quiet faiss_cpu

In [142]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0)

compressor = FlashrankRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=parent_document_retriever
)

compressed_docs = compression_retriever.invoke(
    "How much is the escrow amount in number?"
)
print([doc.metadata["page_number"] for doc in compressed_docs])
scores = [doc.metadata["relevance_score"] for doc in compressed_docs]
print(scores)

relevance_scores = [doc.metadata["relevance_score"] for doc in compressed_docs]
print(relevance_scores)

print("New Ordering:")
for o in np.argsort(relevance_scores)[::-1]:
    print(o+1)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[25, 24, 9]


In [146]:
pretty_print_docs(compressed_docs)

Document 1:

-21- 
112923184_5 provisions of the Escrow Agreement with the Company Securityholders being entitled to share in 
such released amounts in accordance with their Pro Rata Percentages.  From and after the 
Closing, Buyer and the Sellers ’ Representative will direct the Escrow Agent to disburse 
payments from the Escrow Account in accordance with the purchase price adjustment provisions 
of this Agreement and the terms of the Escrow Agreement including: (a) in the case of any 
disbursement that is required by the terms of this Agreement and as to which there is no dispute 
(or as to which the disputing party has failed to notify the Escrow Agent and the other parties of 
its dispute in accordance with any applicable requirements under this Agreement and the Escrow 
Agreement), they will provide prompt joint payment instructions directing the Escrow Agent to 
make such disbursement and (b) in the case of a disbursement as to which either the Buyer or the 
Seller ’s Representat

In [148]:
from langchain.chains import RetrievalQA

chain = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever)

In [233]:
query="How much is the Retention Amount?"
chain.invoke(query)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'query': 'How much is the Retention Amount? and is escrow amount greater than the Retention Amount',
 'result': 'The text does not specify a specific amount for the Retention Amount. Therefore, I cannot provide the exact value of the Retention Amount. Additionally, without a specific amount for the Retention Amount, I cannot compare it to the Escrow Amount to determine if the Escrow Amount is greater or not.'}

**rank fusion**

ContextualCompressionRetriever(base_compressor=FlashrankRerank(client=<flashrank.Ranker.Ranker object at 0x73a21e185450>, top_n=3, model='ms-marco-MultiBERT-L-12'), base_retriever=ParentDocumentRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x73a228f62ce0>, docstore=<langchain_core.stores.InMemoryStore object at 0x73a228f62c80>, child_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x73a228f60e80>))

## PREPARING THE EVALUATION TEST

In [50]:
import pdfplumber
import pandas as pd

evaluation = []
with pdfplumber.open("../data/RobinsonQ&A.pdf") as pdf:
    for page in pdf.pages:
        item = page.extract_text()
        evaluation.append(item)
pprint.pp(evaluation)

['Q1: Who are the parties to the Agreement and what are their defined names?\n'
 'A1: Cloud Investments Ltd. (“Company”) and Jack Robinson (“Advisor”)\n'
 'Q2: What is the termination notice?\n'
 'A2: According to section 4:14 days for convenience by both parties. The '
 'Company may terminate without notice if\n'
 'the Advisor refuses or cannot perform the Services or is in breach of any '
 'provision of this Agreement.\n'
 'Q3: What are the payments to the Advisor under the Agreement?\n'
 'A3: According to section 6: 1. Fees of $9 per hour up to a monthly limit of '
 '$1,500, 2. Workspace expense of $100\n'
 'per month, 3. Other reasonable and actual expenses if approved by the '
 'company in writing and in advance.\n'
 'Q4: Can the Agreement or any of its obligations be assigned?\n'
 'A4: 1. Under section 1.1 the Advisor can’t assign any of his obligations '
 'without the prior written consent of the\n'
 'Company, 2. Under section 9 the Advisor may not assign the Agreement and the '

In [49]:
len(evaluation)

1

In [7]:
import pandas as pd

questions = [
    "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?",
    "Would the Sellers be responsible if after the closing it is determined that there were inaccuracies in the representation provided by them where such inaccuracies are the resolute of the Sellers' gross negligence?",
    "How much is the escrow amount?",
    "Is escrow amount grete then the Retention Amount?",
    "What is the purpose of the escrow?",
    "May the Escrow Amount serve as a recourse for the Buyer in case of breach of representations by the Company?",
    "Are there any conditions to the closing?",
    "Are Change of Control Payments considered a Seller Transaction Expense?",
    "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?",
    "Does the Buyer need to pay the Employees Closing Bonus Amount directly to the Company's employees?",
    "Does any of the Sellers provide a representation with respect to any Tax matters related to the Company?",
    "Is any of the Sellers bound by a non-competition covenant after the Closing?",
    "Whose consent is required for the assignment of the Agreement by the Buyer?",
    "Does the Buyer needs the Sellers' consent in the event of an assignment of the Agreement to a third party who is not a Buyer's Affiliates?"
]

answers = [
    "Except in the case of fraud, the Sellers have no liability for breach of representations and warranties (See section 10.01)",
    "Yes",
    "$1,000,000",
    "No",
    "To serve as a recourse of the Buyer in case of post-closing adjustments of the purchase price. (See section 2.07(e)).",
    "No",
    "No, as the signing and closing are simultaneous.",
    "Yes. (See defining of Sellers Transaction Expenses).",
    "Yes (See Section 2.07)",
    "No. (See Section 2.10)",
    "No. Only the Company provides such a representation.",
    "No.",
    "If the assignment is to an Affiliate or purchaser of all of the Buyer's assets, no consent is required. Otherwise, the consent of the Company and the Seller Representative is required.",
    "No. If the assignment is not part of a sale of all or substantially all of the Buyer's assets, the assignment requires the consent of the Company and the Seller's Representative."
]

# Create a pandas DataFrame
df = pd.DataFrame({'question': questions, 'ground_truth': answers}, index=range(len(questions)))

# Rearrange the DataFrame based on the question numbers
# df = df.sort_values('Question')

# Display the rearranged DataFrame
df.to_csv('../data/RaptorQA.csv')

In [None]:
df["ground_truth"][13]

In [9]:
questions = [
"Q1: Who are the parties to the Agreement and what are their defined names?",
"Q2: What is the termination notice?",
"Q3: What are the payments to the Advisor under the Agreement?",
"Q4: Can the Agreement or any of its obligations be assigned?",
"Q5: Who owns the IP?",
"Q6: Is there a non-compete obligation to the Advisor?",
"Q7: Can the Advisor charge for meal time?",
"Q8: In which street does the Advisor live?",
"Q9: Is the Advisor entitled to social benefits?",
"Q10: What happens if the Advisor claims compensation based on employment relationship with the Company?"
]

answers = [
"""Cloud Investments Ltd. ("Company") and Jack Robinson ("Advisor")""",
"According to section 4:14 days for convenience by both parties. The Company may terminate without notice if the Advisor refuses or cannot perform the Services or is in breach of any provision of this Agreement.",
"According to section 6: 1. Fees of $9 per hour up to a monthly limit of $1,500, 2. Workspace expense of $100 per month, 3. Other reasonable and actual expenses if approved by the company in writing and in advance.",
"1. Under section 1.1 the Advisor can't assign any of his obligations without the prior written consent of the Company, 2. Under section 9 the Advisor may not assign the Agreement and the Company may assign it, 3 Under section 9 of the Undertaking the Company may assign the Undertaking.",
"According to section 4 of the Undertaking (Appendix A), Any Work Product, upon creation, shall be fully and exclusively owned by the Company.",
"Yes. During the term of engagement with the Company and for a period of 12 months thereafter.",
"No. See Section 6.1, Billable Hour doesn't include meals or travel time.",
"1 Rabin st, Tel Aviv, Israel",
"No. According to section 8 of the Agreement, the Advisor is an independent consultant and shall not be entitled to any overtime pay, insurance, paid vacation, severance payments or similar fringe or employment benefits from the Company.",
"If the Advisor is determined to be an employee of the Company by a governmental authority, payments to the Advisor will be retroactively reduced so that 60% constitutes salary payments and 40% constitutes payment for statutory rights and benefits. The Company may offset any amounts due to the Advisor from any amounts payable under the Agreement. The Advisor must indemnify the Company for any losses or expenses incurred if an employer/employee relationship is determined to exist."
]

dff = pd.DataFrame({'question': questions, 'ground_truth': answers}, index=range(len(questions)))

# Rearrange the DataFrame based on the question numbers
# df = df.sort_values('Question')

# Display the rearranged DataFrame
dff.to_csv('../data/RobinsonQA.csv')

In [60]:
dff = dff.rename(columns={'Answers': 'Ground Truth'})
dff["rag_response"] = ['a','b', 'c', 'd', 'e', 'f', 'g', 'e', 'f', 'g']
dff["source_document"] = ['a','b', 'c', 'd', 'e', 'f', 'g', 'e', 'f', 'g']
dff

Unnamed: 0,Question,Answer,rag_response,source_document
0,Q1: Who are the parties to the Agreement and w...,"Cloud Investments Ltd. (""Company"") and Jack Ro...",a,a
1,Q2: What is the termination notice?,According to section 4:14 days for convenience...,b,b
2,Q3: What are the payments to the Advisor under...,According to section 6: 1. Fees of $9 per hour...,c,c
3,Q4: Can the Agreement or any of its obligation...,1. Under section 1.1 the Advisor can't assign ...,d,d
4,Q5: Who owns the IP?,According to section 4 of the Undertaking (App...,e,e
5,Q6: Is there a non-compete obligation to the A...,Yes. During the term of engagement with the Co...,f,f
6,Q7: Can the Advisor charge for meal time?,"No. See Section 6.1, Billable Hour doesn't inc...",g,g
7,Q8: In which street does the Advisor live?,"1 Rabin st, Tel Aviv, Israel",e,e
8,Q9: Is the Advisor entitled to social benefits?,"No. According to section 8 of the Agreement, t...",f,f
9,Q10: What happens if the Advisor claims compen...,If the Advisor is determined to be an employee...,g,g


RAGAS IMPLEMENTATION

In [None]:
# !pip install ragas==0.0.11

# Query Expansion

Query expansion (QE) is a process in Information Retrieval which consists of selecting and adding terms to the user’s query with the goal of minimizing query-document mismatch and thereby improving retrieval performance.

In [92]:
#we are using openai for generating query
import os
import openai
from openai import OpenAI
openai.api_key = os.environ['OPENAI_API_KEY']
openai_client = OpenAI()


def augment_query_generated(query, model="gpt-3.5-turbo"):
    messages = [
        {
            "role": "system",
            "content": f"you should give hypothetical answer for the following question: {query}"
        },
        {"role": "user", "content": query}
    ]

    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    content = response.choices[0].message.content
    return content


original_query = "How much is the escrow amount?"
hypothetical_answer = augment_query_generated(original_query)
# we are combining our orignal query + hypothetical_answer 
joint_query = f"{original_query} {hypothetical_answer}"

In [93]:
hypothetical_answer

'The escrow amount can vary depending on the specific transaction or agreement involved. Typically, the escrow amount may be a percentage of the total transaction value, such as 1-2% of the purchase price in a real estate transaction. In other cases, it could be a fixed amount agreed upon by the parties involved. Ultimately, the escrow amount is determined by the terms of the contract or agreement.'

In [95]:
pprint.pp(joint_query)

('How much is the escrow amount? The escrow amount can vary depending on the '
 'specific transaction or agreement involved. Typically, the escrow amount may '
 'be a percentage of the total transaction value, such as 1-2% of the purchase '
 'price in a real estate transaction. In other cases, it could be a fixed '
 'amount agreed upon by the parties involved. Ultimately, the escrow amount is '
 'determined by the terms of the contract or agreement.')


In [97]:
rag_generator(joint_query)

'The escrow amount can vary depending on the specific transaction or agreement involved. Typically, the escrow amount may be a percentage of the total transaction value, such as 1-2% of the purchase price in a real estate transaction. In other cases, it could be a fixed amount agreed upon by the parties involved. Ultimately, the escrow amount is determined by the terms of the contract or agreement.'

# Multi-Query Approach

In [5]:
import os
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.load import dumps, loads


## Environment Variables
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = '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'
os.environ['OPENAI_API_KEY'] = 'sk-proj-Tzc9mrWyEFVxyDsq5HiWT3BlbkFJxp47toOztG4XRILBeRxr'
# this is optional, before using this line, create a project with this name in the langsmith
os.environ['LANGCHAIN_PROJECT']='parentdoc-multiquery-retriever'

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [6]:
# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [8]:
import openai
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import create_client
from dotenv import dotenv_values

env_vars = dotenv_values('../.env')
supabase_url = env_vars.get('SUPABASE_URL')
supabase_key = env_vars.get('SUPABASE_KEY')
def fetch_stored_embedding():
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai.api_key)
    supabase_client = create_client(supabase_url, supabase_key)
    vector_store = SupabaseVectorStore(
        client=supabase_client,
        embedding=embeddings,
        table_name="documents",
        query_name="match_documents",
    )
    return vector_store
vector_store = fetch_stored_embedding()
retriever = vector_store.as_retriever()

In [9]:
from sentence_transformers import CrossEncoder

def reranker(documents: list[list]):
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    pairs = [[question, doc] for sublist in documents for doc in sublist]
    scores = cross_encoder.predict(pairs)
    ordered_indices = np.argsort(scores)[::-1]
    top_scored_docs = [documents[i] for i in ordered_indices[:15]]
    return top_scored_docs

In [11]:
from sentence_transformers import CrossEncoder
import numpy as np
def reranker(documents: list[list]):
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    pairs = [[question, doc.__str__()] for sublist in documents for doc in sublist]
    scores = cross_encoder.predict(pairs)
    ordered_indices = np.argsort(scores)[::-1]
    # Get the top scored document pairs
    top_scored_pairs = [pairs[i] for i in ordered_indices[:min(15, len(pairs))]]

    # Find the top scored documents in the original documents list
    top_scored_docs = []
    for pair in top_scored_pairs:
        for doc_list in documents:
            for doc in doc_list:
                if doc.__str__() == pair[1]:
                    top_scored_docs.append(doc)
                    break

    return top_scored_docs

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

# Retrieve
# question = query

retrieval_chain = generate_queries | retriever.map() | get_unique_union
# Tesing a single retriever
# docs = retrieval_chain.invoke({"question":question})
# len(docs)

In [36]:
type(retrieval_chain)

langchain_core.runnables.base.RunnableSequence

In [37]:
docs

[Document(metadata={'page_number': 2, 'doc_id': '76dae4e1-7000-4350-a11f-70b37b5da0ed'}, page_content='Agreement.   \n \n10. Governing Law and Jurisdiction :  This Agreement shall be governed by the laws of the State of'),
 Document(metadata={'page_number': 3, 'doc_id': 'e448c7d3-5b7f-4be4-acd0-a2df775e9305'}, page_content='assign this undertaking to third parties. \nIN WITNESS WHEREOF , the undersigned has executed this Undertaking as of the Effective Date. \nAdvisor   \nBy: _____________________________________'),
 Document(metadata={'page_number': 1, 'doc_id': '054354f3-88e2-4c4f-8b2a-0e23b448b247'}, page_content='5. Termination : Either party, at any given time, may terminate this Agreement, for any reason \nwhatsoever, with or without cause, upon fourteen (14) days ’ prior written notice. Notwithstanding the'),
 Document(metadata={'page_number': 2, 'doc_id': '76dae4e1-7000-4350-a11f-70b37b5da0ed'}, page_content='proof of acceptance by the other party. \n \nIN WITNESS WHEREOF the p

In [23]:
# ! pip show chromadb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: chromadb
Version: 0.5.3
Summary: Chroma.
Home-page: 
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: /home/ek/.local/lib/python3.10/site-packages
Requires: bcrypt, build, chroma-hnswlib, fastapi, grpcio, httpx, importlib-resources, kubernetes, mmh3, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-instrumentation-fastapi, opentelemetry-sdk, orjson, overrides, posthog, pydantic, pypika, PyYAML, requests, tenacity, tokenizers, tqdm, typer, typing-extensions, uvicorn
Required-by: 


In [12]:
template = """
Provide an answer to the following question based on the given legal contract context. Be sure to include the relevant section number(s) in your response. If there are multiple possible answers, list them all.

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)


# result = final_rag_chain.invoke({"question": question})

In [15]:
query = "Can the Advisor charge for meal time?"
question = query
final_rag_chain.invoke({"question": question})

'There is no specific mention of an escrow amount or retention amount in the provided legal contract context.'

In [20]:
evaluation = pd.read_csv('../data/RaptorQA.csv')

answers = []
contexts = []
for question in evaluation['question']:
    answer = final_rag_chain.invoke({"question":question})
    answers.append(answer)
    docs = retrieval_chain.invoke({"question":question})
    context = [doc.page_content for doc in docs]
    contexts.append(context)

evaluation['answer'] = answers
evaluation['contexts'] = contexts
evaluation.head()
evaluation.to_csv('../data/updated_raptor_multiquery_chunk2000_evaluation.csv', index=False)