In [1]:
import warnings
# Installs a blanket "ignore" filter, so no warning category is shown from here on.
# NOTE(review): this also hides deprecation notices — consider a narrower filter.
warnings.filterwarnings("ignore")

In [50]:
# !pip install pdfplumber
# !pip install faiss-cpu
# !pip install tiktoken
# !pip install lancedb

In [31]:
import pprint
import pandas as pd
import numpy as np
from dotenv import dotenv_values
import openai
import pickle
from pypdf import PdfReader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Weaviate
from langchain_community.vectorstores import LanceDB
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from sentence_transformers import CrossEncoder


# Load the key/value pairs from the local .env file.
env_vars = dotenv_values('.env')
# .get() yields None when the entry is missing, which leaves openai.api_key unset.
openai.api_key = env_vars.get('OPENAI_API_KEY')

In [3]:
import os, sys
# Put the parent directory at the front of sys.path (once) so that modules living
# one level above this notebook can be imported.
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

# import utils.chroma as chom

# Parent Document RAG Retriever Section

In [4]:
def pretty_print_docs(docs):
    """Print each document's content and metadata, separated by a dashed rule.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.
    """
    rule = f"\n{'-' * 100}\n"
    rendered = []
    # Documents are numbered from 1 in the printed output.
    for position, document in enumerate(docs, start=1):
        rendered.append(
            f"Document {position}:\n\n{document.page_content}\nMetadata: {document.metadata}"
        )
    print(rule.join(rendered))

In [32]:
from collections import namedtuple
# Lightweight record for one extracted PDF page. It exposes the `page_content` and
# `metadata` attributes that the rest of this notebook reads from each document.
Page = namedtuple("Page", ["id", "page_content", "metadata"])

def pdf_reader(file_path):
    """Extract the text of every non-empty page of a PDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of ``Page`` records in document order. ``id`` and
        ``metadata["page_number"]`` both hold the page's zero-based index;
        pages whose stripped text is empty are left out.
    """
    extracted = []
    for index, pdf_page in enumerate(PdfReader(file_path).pages):
        text = pdf_page.extract_text().strip()
        if not text:
            # Nothing extractable on this page; skip it.
            continue
        extracted.append(
            Page(id=index, page_content=text, metadata={"page_number": index})
        )
    return extracted

# Contract PDF to index; the commented-out line below is an alternative input file.
file_path = '../data/RaptorContract.pdf'
# file_path = "../data/RobinsonAdvisory.pdf"
# Extract the pages once; later cells index and retrieve from `pdf_pages`.
pdf_pages = pdf_reader(file_path)

In [22]:
# pretty_print_docs(pdf_pages)
# Bare expression: the cell output is the repr of the full list of extracted pages.
pdf_pages

[Page(id=0, page_content='ADVISORY SERVICES AGREEMENT \n \nThis Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date ”), by and \nbetween Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the " Company "), and Mr. Jack \nRobinson, Passport Number 780055578, residing at 1 Rabin st, Tel Aviv, Israel, Email: \njackrobinson@gmail.com ("Advisor "). \n \nWhereas, Advisor has expertise and/or knowledge and/or relationships, which are relevant to the \nCompany ’s business and the Company has asked Advisor to provide it with certain Advisory \nservices, as described in this Agreement; and \nWhereas,  Advisor has agreed to provide the Company with such services, subject to the terms set forth \nin this Agreement. \n \nNOW THEREFORE THE PARTIES AGREE AS FOLLOWS: \n \n1. Services:   \n1.1 Advisor shall provide to the Company, as an independent contractor, software development \nservices, and / or any other services as agreed by the parties from time to t

In [33]:
# Embedding model used for the vectors in this section.
embed = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai.api_key)
# Splitter used by the parent-document retriever to cut pages into child chunks.
# NOTE(review): only chunk_size is set; chunk_overlap stays at the library default —
# confirm that default is sensible for a chunk size of 200.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)
# In-memory store passed to the ParentDocumentRetriever as its docstore.
store = InMemoryStore()
# Builds the FAISS index directly from the full extracted pages.
# NOTE(review): the same index is later handed to ParentDocumentRetriever, whose
# add_documents() call presumably adds child chunks to it as well, so a search may
# return whole pages next to chunks — confirm this mix is intended.
vectorstore = FAISS.from_documents(pdf_pages, embed)

In [34]:
# Wire the retriever to the FAISS index (search side), the in-memory docstore and
# the child splitter defined in the previous cell.
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter
)

# Register the extracted pages with the retriever; ids=None passes no explicit ids.
parent_document_retriever.add_documents(pdf_pages , ids=None)

In [None]:
# To retrieve documents
# query = "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?"
# results = vectorstore.similarity_search(query)
# results
# query = "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"
query ="Who are the parties to the Agreement and what are their defined names?"
def parent_document_retriever_wrapper(question: str) -> dict:
    """Return the 10 nearest vector-store hits for `question` under the "context" key.

    NOTE(review): despite its name, this queries `vectorstore` directly and never
    calls `parent_document_retriever`, so its results can differ from the documents
    the retrieval chain feeds to the model — confirm this is intended.
    """
    return {"context": vectorstore.similarity_search(question, k=10)}
# Run the wrapper once for the sample query above.
result = parent_document_retriever_wrapper(query)

In [87]:
# def reranker(retrieved_documents):
#     cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
      
#     pairs = [[question, doc] for doc in retrieved_documents]
    
#     scores = cross_encoder.predict(pairs)
#     for score in scores:
#         f"{score:.2f}"
#     ordered_indices = np.argsort(scores)[::-1]
#     for i in ordered_indices:
#         f"{i+1}. {retrieved_documents[i]}"
#     top_scored_docs = [retrieved_documents[i] for i in ordered_indices[:15]]

#     return top_scored_docs

def reranker(documents: list[list], question=None, top_k: int = 15):
    """Re-rank retrieved documents against a question with a cross-encoder.

    Args:
        documents: List of result lists (e.g. one list per generated query).
        question: Text to score every document against. When omitted, the
            notebook-level ``question`` variable is used, which is what this
            function always did before the parameter existed.
        top_k: Maximum number of documents to return (default 15).

    Returns:
        dict with ``"context"`` (documents ordered from highest to lowest
        cross-encoder score, at most ``top_k`` of them) and ``"question"``.

    Raises:
        NameError: if ``question`` is not passed and no notebook-level
            ``question`` variable exists.
    """
    if question is None:
        # Backward-compatible fallback to the notebook global.
        try:
            question = globals()["question"]
        except KeyError:
            raise NameError("name 'question' is not defined") from None

    # Flatten once and keep this list aligned index-for-index with `pairs`, so a
    # score index maps straight back to its document. This replaces a string-equality
    # re-scan that appended the same document repeatedly when it appeared in more
    # than one result list.
    flat_docs = [doc for sublist in documents for doc in sublist]
    if not flat_docs:
        # Nothing to score: skip loading the model and calling predict().
        return {"context": [], "question": question}

    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    pairs = [[question, str(doc)] for doc in flat_docs]
    scores = cross_encoder.predict(pairs)

    # argsort is ascending; reverse it for best-first order.
    ordered_indices = np.argsort(scores)[::-1]
    top_scored_docs = [flat_docs[i] for i in ordered_indices[:top_k]]

    return {"context": top_scored_docs, "question": question}


# Parent Document RAG Generator Section

In [21]:
from langchain_core.prompts import ChatPromptTemplate

TEMPLATE = """\
You are happy assistant. Use the context provided below to answer the question.

Answer question in summarization, put section number of the answer from the file for example like 
 
Query:
{question}

Context:
{context}
"""

# Build the chat prompt from TEMPLATE; its {question} and {context} placeholders are
# filled in when the chain runs.
# NOTE(review): TEMPLATE's instruction sentence ends with "for example like" and no
# example follows — the intended example text appears to be missing.
rag_prompt = ChatPromptTemplate.from_template(TEMPLATE)

In [25]:
# Chat model created with no arguments, i.e. whatever defaults the library applies.
chat_model = ChatOpenAI()

  warn_deprecated(


In [92]:
# LCEL IMPLEMENTATION
# Fan the incoming question out into the two prompt inputs: "question" is passed
# through unchanged, "context" is whatever the parent-document retriever returns.
setup_and_retrieval = RunnableParallel({"question": RunnablePassthrough(), "context": parent_document_retriever })
output_parser = StrOutputParser()
# Full pipeline: retrieve -> fill prompt -> chat model -> plain string.
parent_retrieval_chain = setup_and_retrieval | rag_prompt | chat_model | output_parser

In [None]:
# Ad-hoc check of the parent-document chain with a single question.
parent_retrieval_chain.invoke("""Who are the parties to the Agreement and what are their defined names?""")

In [95]:
# Ad-hoc check of the parent-document chain with a comparison question.
parent_retrieval_chain.invoke("Is escrow amount greater than the Retention Amount?")
# answer = result.split('\n\nAnswer:')[0].strip()
# print(answer)

"Yes, the escrow amount is greater than the Retention Amount.\n\nContext: Section 2.08 (a) At Closing, Buyer will deposit the Escrow Amount in escrow on behalf of the Sellers in accordance with the Escrow Agreement. The Escrow Amount shall be held and, subject to Section 2.07, released to the Company Securityholders in accordance with the'"

In [94]:
# Ad-hoc check of the parent-document chain with a factual lookup question.
parent_retrieval_chain.invoke("How much is the escrow amount in dollar?")

'The Escrow amount is $1,000,000. \n\n(Page 9)'

**CREATING THE EVALUATION DATA**

In [96]:
evaluation = pd.read_csv('../data/RaptorQA.csv')

# Generation half of parent_retrieval_chain. Retrieval is done explicitly in the loop
# so that the contexts written to the evaluation file are exactly the documents the
# answer was generated from (previously they came from a separate k=10 vector-store
# search that the answering chain never saw).
answer_chain = rag_prompt | chat_model | output_parser

answers = []
contexts = []

# The loop variable is not named `question` on purpose: that is a notebook-level
# name read by other cells and must not be overwritten here.
for eval_question in evaluation['question']:
    retrieved_docs = parent_document_retriever.invoke(eval_question)
    answer = answer_chain.invoke({"question": eval_question, "context": retrieved_docs})
    answers.append(answer)
    contexts.append([doc.page_content for doc in retrieved_docs])

evaluation['answer'] = answers
evaluation['contexts'] = contexts
# evaluation.head()
output_path = '../data/evaluation_data/updated_raptor_parnentdoc_evaluation.csv'
# Create the target folder first so a long run is not lost to a missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
evaluation.to_csv(output_path, index=False)

# Multi-Query Approach

In [5]:
import os
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.load import dumps, loads


## Environment Variables
# Credentials must never be written into the notebook. Each secret is taken from the
# process environment first, then from the untracked .env file.
# NOTE(review): the keys that used to be hard-coded in this cell have been exposed
# and must be revoked and re-issued.
dotenv_secrets = dotenv_values('.env')

openai_api_key = os.environ.get('OPENAI_API_KEY') or dotenv_secrets.get('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; export it or add it to the .env file")
os.environ['OPENAI_API_KEY'] = openai_api_key

# LangSmith tracing is optional: it is switched on only when a key is available.
langchain_api_key = os.environ.get('LANGCHAIN_API_KEY') or dotenv_secrets.get('LANGCHAIN_API_KEY')
if langchain_api_key:
    os.environ['LANGCHAIN_TRACING_V2'] = 'true'
    os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
    os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
    # this is optional, before using this line, create a project with this name in the langsmith
    os.environ['LANGCHAIN_PROJECT']='parentdoc-multiquery-retriever'

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [6]:
# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

# Pipeline: perspectives prompt -> chat model -> raw text -> list of query strings.
# The model's reply is split on newlines; blank or whitespace-only lines (models
# often put empty lines between alternatives) are dropped so that no empty query
# is forwarded to the retriever.
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

In [11]:
def get_unique_union(documents: list[list]):
    """Merge several retrieval result lists into one list without duplicates.

    Args:
        documents: List of result lists, one per query.

    Returns:
        The distinct documents in first-seen order. Two documents count as
        duplicates when their serialized (``dumps``) forms are equal.
    """
    # Serialize each document so it can be compared and used as a dict key.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while keeping insertion order. A set would hand
    # the documents back in an arbitrary order that can change between kernel
    # sessions, making the prompt context non-reproducible.
    unique_docs = dict.fromkeys(serialized)
    return [loads(doc) for doc in unique_docs]

# Multi-query retrieval: generate alternative questions, run the retriever on each
# of them via .map(), then merge the result lists with get_unique_union.
# NOTE(review): `retriever` is not defined anywhere in the visible notebook, so this
# line fails with NameError on a fresh kernel — confirm which retriever is intended.
retrieval_chain = generate_queries | retriever.map() | get_unique_union
# Testing a single retriever
# docs = retrieval_chain.invoke({"question":question})
# len(docs)

In [12]:
template = """
Provide an answer to the following question based on the given legal contract context. Be sure to include the relevant section number(s) in your response. If there are multiple possible answers, list them all.

Context: {context}

Question: {question}
"""

# Answer prompt built from the `template` string defined just above in this cell.
prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

# The chain takes a dict with a "question" key: "context" is produced by running the
# multi-query retrieval chain on that input, "question" is copied through unchanged,
# and the filled prompt is sent to the model and parsed to a plain string.
final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

In [15]:
query = "Can the Advisor charge for meal time?"
# Also binds the notebook-level `question` name, which reranker() reads as a global.
question = query
final_rag_chain.invoke({"question": question})

'There is no specific mention of an escrow amount or retention amount in the provided legal contract context.'

**CREATING EVALUATION DATA**

In [20]:
evaluation = pd.read_csv('../data/RaptorQA.csv')

# Generation half of final_rag_chain. Retrieval runs once per question in the loop
# and its result is used both for the answer and for the recorded contexts. The
# previous version ran the whole multi-query retrieval (including an LLM call) a
# second time, so the saved contexts were not guaranteed to match the answer.
answer_chain = prompt | llm | StrOutputParser()

answers = []
contexts = []
# The loop variable is not named `question` on purpose: that is a notebook-level
# name read by other cells and must not be overwritten here.
for eval_question in evaluation['question']:
    docs = retrieval_chain.invoke({"question": eval_question})
    answer = answer_chain.invoke({"context": docs, "question": eval_question})
    answers.append(answer)
    contexts.append([doc.page_content for doc in docs])

evaluation['answer'] = answers
evaluation['contexts'] = contexts
evaluation.to_csv('../data/updated_raptor_multiquery_chunk2000_evaluation.csv', index=False)
# Last expression of the cell, so the preview is actually displayed.
evaluation.head()