In [1]:
import warnings
warnings.filterwarnings("ignore")


In [2]:
#!pip install chromadb
#!pip install sentence-transformers
#!pip install langchain openai

In [98]:
import matplotlib.pyplot as plt
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import openai
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import create_client
import numpy as np
from sentence_transformers import CrossEncoder
from dotenv import dotenv_values

# Read secrets from the .env file one directory up instead of hard-coding them.
env_vars = dotenv_values('../.env')
# .get() yields None when a key is absent, so a missing entry does not raise here.
openai.api_key = env_vars.get('OPENAI_API_KEY')
# Shared client object; the generator functions further down call it directly.
openai_client = openai.OpenAI(api_key=openai.api_key)
# Supabase credentials are loaded here; no use of them is visible in this part of the notebook.
supabase_url = env_vars.get('SUPABASE_URL')
supabase_key = env_vars.get('SUPABASE_KEY')

In [2]:
# Put the parent of the current working directory on sys.path (once) so that
# packages located there can be imported from this notebook.
import os, sys
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

# Project-local helper module; imported after the path tweak above because it
# presumably lives under that parent directory.
import utils.chroma as chom

## The Rag Retriever Section

* The retriever section chunks the document, embeds the chunks, and saves them to the Chroma vector DB
* It runs a similarity search against the vector DB based on the user's question
* Before the retrieved documents are passed to the generator section, they are reranked by a cross-encoder

In [82]:
# PDF to index. Swap in the commented line below to run against the other sample PDF.
file_path = '../data/RaptorContract.pdf'
# file_path = "../data/RobinsonAdvisory.pdf"

In [10]:
from pypdf import PdfReader
from collections import namedtuple
# Lightweight record for one extracted PDF page.
Page = namedtuple("Page", ["id", "page_content", "metadata"])


def pdf_reader_to_document_format(file_path):
    """Read a PDF and return one ``Page`` per page that has extractable text.

    Args:
        file_path: Path of the PDF file handed to ``PdfReader``.

    Returns:
        A list of ``Page`` tuples in document order. ``id`` and
        ``metadata["page_number"]`` both hold the zero-based page index.
        Pages whose extracted text is empty after stripping are left out.
    """
    # Pair every page index with its stripped text, lazily, in reading order.
    indexed_text = (
        (index, pdf_page.extract_text().strip())
        for index, pdf_page in enumerate(PdfReader(file_path).pages)
    )
    # Keep only pages that actually produced text.
    return [
        Page(id=index, page_content=body, metadata={"page_number": index})
        for index, body in indexed_text
        if body
    ]
# Extract the selected PDF and preview the first page as a sanity check.
pdf_pages = pdf_reader_to_document_format(file_path)
pdf_pages[0]

Page(id=0, page_content='[R&G Draft 12.__.2021] \n112923184_5  \n \nSTOCK PURCHASE AGREEMENT \nBY AND AMONG \n[BUYER], \n[TARGET COMPANY], \nTHE SELLERS LISTED ON SCHEDULE I HERETO \nAND  \nTHE SELLERS ’ REPRESENTATIVE NAMED HEREIN \nDated as of [●]  \n \n[This document is intended solely to facilitate discussions among the parties identified herein.  \nNeither this document nor such discussions are intended to create, nor will either or both be \ndeemed to create, a legally binding or enforceable offer or agreement of any type or nature, \nunless and until a definitive written agreement is executed and delivered by each of th e parties \nhereto. \n \nThis document shall be kept confidential pursuant to the terms of the Confidentiality \nAgreement entered into by the parties and, if applicable, its affiliates with respect to the subject \nmatter hereof.]', metadata={'page_number': 0})

In [86]:
def chunking_RecursiveCharacterTextSplitter(pdf_doc):
    """Split the extracted pages into overlapping chunks tagged with source pages.

    All page texts are joined with a blank line and split with
    ``RecursiveCharacterTextSplitter`` (200-character chunks, 100-character
    overlap). Each chunk is then mapped back to the page(s) it came from.

    The previous implementation tested ``page.page_content in chunk.page_content``,
    i.e. whether an entire page fits inside one chunk, so ``page_number`` was
    almost always an empty list. Page attribution is now derived from character
    offsets, which also handles chunks that straddle a page boundary.

    Args:
        pdf_doc: Sequence of page objects exposing ``page_content`` (str) and
            ``metadata["page_number"]``.

    Returns:
        A list of ``Document`` objects with ``id`` set to the chunk index and
        ``metadata == {"page_number": [<page numbers the chunk overlaps>]}``.
    """
    page_separator = '\n\n'
    text = page_separator.join([page.page_content for page in pdf_doc])
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " "],
        chunk_size=200,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
        # Ask the splitter to record each chunk's character offset in `text`.
        add_start_index=True,
    )

    # (start, end, page_number) character span of every page inside `text`.
    page_spans = []
    cursor = 0
    for page in pdf_doc:
        page_end = cursor + len(page.page_content)
        page_spans.append((cursor, page_end, page.metadata["page_number"]))
        cursor = page_end + len(page_separator)

    chunk_list = []
    for i, chunk in enumerate(text_splitter.create_documents([text])):
        start = chunk.metadata.get("start_index", -1)
        if start >= 0:
            stop = start + len(chunk.page_content)
            # A page is a source of the chunk when their character ranges overlap.
            page_numbers = [
                number
                for page_start, page_end, number in page_spans
                if start < page_end and stop > page_start
            ]
        else:
            # No usable offset: fall back to substring containment (chunk inside page).
            page_numbers = [
                page.metadata["page_number"]
                for page in pdf_doc
                if chunk.page_content in page.page_content
            ]
        chunk_list.append(
            Document(id=i, page_content=chunk.page_content, metadata={"page_number": page_numbers})
        )

    return chunk_list

# Chunk the extracted pages; the result is what gets stored in the vector DB below.
tsplit_texts = chunking_RecursiveCharacterTextSplitter(pdf_pages)

In [5]:
# Embedding function from the project's utils.chroma helper; it is passed to the
# Chroma collection below. NOTE(review): which model it wraps is not visible here.
embedding_function = chom.embedding()

In [93]:
import chromadb

def connect_with_chromadb(embedding_function, token_split_texts):
    """Create (or refresh) the Chroma collection holding the chunk texts.

    The function is safe to re-run in the same kernel: the collection is
    fetched if it already exists instead of failing on a duplicate name, and
    any entries left over from an earlier run are removed before the current
    chunks are added, so the collection always mirrors ``token_split_texts``.

    Args:
        embedding_function: Embedding function handed to the collection.
        token_split_texts: Iterable of chunk objects exposing ``page_content``.

    Returns:
        The Chroma collection, with one document per chunk and ids
        ``"0" .. "n-1"`` in chunk order.
    """
    chroma_client = chromadb.Client()
    chroma_collection = chroma_client.get_or_create_collection(
        "microsoft_annual_report_cp5", embedding_function=embedding_function
    )

    # Drop whatever a previous run stored so stale chunks cannot be retrieved.
    stale_ids = chroma_collection.get(include=[])["ids"]
    if stale_ids:
        chroma_collection.delete(ids=stale_ids)

    text_list = [doc.page_content for doc in token_split_texts]
    ids = [str(i) for i in range(len(text_list))]
    if text_list:  # nothing to add for an empty document
        chroma_collection.add(ids=ids, documents=text_list)
    return chroma_collection

In [None]:
# Build the Chroma collection from the chunk list produced earlier.
chroma_collection = connect_with_chromadb(embedding_function, tsplit_texts)
# chroma_collection.count()

In [95]:
def vectordb_answer_question(query, chroma_collection, n_results=None):
    """Return chunk texts from the collection ordered by similarity to ``query``.

    Args:
        query: The user's question.
        chroma_collection: Collection exposing ``count()`` and ``query(...)``.
        n_results: Optional cap on the number of chunks returned. The default
            (``None``) keeps the original behaviour of returning every chunk in
            the collection, leaving the selection to the reranker.

    Returns:
        A list of document strings (the first, and only, row of the query
        result). An empty list is returned when the collection is empty or the
        requested cap is not positive, instead of issuing a zero-result query.
    """
    total = chroma_collection.count()
    limit = total if n_results is None else min(n_results, total)
    if limit <= 0:
        return []

    results = chroma_collection.query(query_texts=[query], n_results=limit)
    # One query text was sent, so the matches are in the first row.
    return results['documents'][0]


* If the `retrieved_documents` argument passed to the reranker function below is a list of `Document` objects, use this for the `pairs` line inside the function:

- pairs = [[query_text, doc.page_content] for doc in retrieved_documents]
* If the `retrieved_documents` argument passed to the reranker function below is a plain list of strings, use this for the `pairs` line instead:

- pairs = [[query_text, doc] for doc in retrieved_documents]

In [105]:
# Loaded cross-encoders keyed by model name, so the weights are read once per kernel.
_CROSS_ENCODER_CACHE = {}


def _load_cross_encoder(model_name):
    """Return the CrossEncoder for ``model_name``, constructing it on first use only."""
    if model_name not in _CROSS_ENCODER_CACHE:
        _CROSS_ENCODER_CACHE[model_name] = CrossEncoder(model_name)
    return _CROSS_ENCODER_CACHE[model_name]


def reranker(query, retrieved_documents, top_k=15, model_name='cross-encoder/ms-marco-MiniLM-L-6-v2'):
    """Rerank retrieved chunks with a cross-encoder and keep the best ones.

    Args:
        query: The user's question.
        retrieved_documents: List of candidate chunk texts.
        top_k: How many of the highest-scoring chunks to return (default 15,
            the value previously hard-coded).
        model_name: Cross-encoder checkpoint to score with (default unchanged).

    Returns:
        Up to ``top_k`` entries of ``retrieved_documents``, best score first.
        An empty input yields an empty list without loading the model.
    """
    if len(retrieved_documents) == 0:
        return []

    # Reuse the model across calls; it used to be re-created on every call.
    cross_encoder = _load_cross_encoder(model_name)
    query_text = query

    pairs = [[query_text, doc] for doc in retrieved_documents]

    scores = cross_encoder.predict(pairs)
    # argsort is ascending, so reverse it to get the highest score first.
    ordered_indices = np.argsort(scores)[::-1]
    top_scored_docs = [retrieved_documents[i] for i in ordered_indices[:top_k]]

    return top_scored_docs

## The Rag Generator section

In [100]:
def generate_llm_response(question, ranked_docs, model="gpt-4"):
    """Ask the chat model to answer ``question`` from the reranked chunks.

    Args:
        question: The user's question.
        ranked_docs: Reranked context; it is interpolated into the prompt as-is.
        model: Chat model name (default ``"gpt-4"``, the value previously
            hard-coded).

    Returns:
        The model's answer with surrounding whitespace removed, or an empty
        string when the response carries no text content. If the request
        fails, the string ``'Error: <message>'`` is returned instead of
        raising, so callers looping over many questions keep going; check for
        that prefix before treating the value as an answer.
    """
    try:
        messages = [
            {"role": "system", "content": "You are an AI assistant that provides answers to questions based on the given information."},
            {"role": "user", "content": f"Question: {question}. Information: {ranked_docs}"}
        ]

        response = openai_client.chat.completions.create(
            model=model,
            messages=messages,
        )

        # A missing content value used to surface as a confusing
        # "'NoneType' object has no attribute 'strip'" error answer.
        content = response.choices[0].message.content
        return (content or "").strip()
    except Exception as e:
        return f'Error: {str(e)}'

**openai stream response**

In [78]:
from openai import ChatCompletion

def get_response_stream(question, ranked_docs):
    """Open a streaming chat completion answering ``question`` from ``ranked_docs``.

    Args:
        question: The user's question.
        ranked_docs: Reranked context; interpolated into the prompt as-is.

    Returns:
        Whatever ``openai_client.chat.completions.create`` returns for a
        ``stream=True`` request; the caller iterates over it chunk by chunk.
    """
    user_message = {
        "role": "user",
        "content": f"Question: {question}. Information: {ranked_docs}",
    }
    request_options = dict(
        model="gpt-3.5-turbo",
        messages=[user_message],
        temperature=0,
        max_tokens=729,
        top_p=1,
        stream=True,
    )
    return openai_client.chat.completions.create(**request_options)

import textwrap
from IPython.display import display, clear_output, HTML


def process_streamed_responses(response_stream):
    """Render a streamed chat completion live and return the full answer text.

    After each chunk the text accumulated so far is re-wrapped to 80 columns
    and redrawn in place in the cell output.

    Args:
        response_stream: Iterable of streamed chunks, each exposing
            ``choices[0].delta.content`` and ``choices[0].finish_reason``.

    Returns:
        The concatenated (unwrapped, unescaped) response text. Consumption
        stops at the first chunk whose ``finish_reason`` is set.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    response_text = ""
    for chunk in response_stream:
        # NOTE(review): some streams may emit chunks without any choices
        # (e.g. usage-only chunks); skip them rather than fail on [0].
        if not chunk.choices:
            continue
        choice = chunk.choices[0]
        chunk_message = choice.delta.content
        if chunk_message is not None:  # the delta may carry no text
            response_text += chunk_message
        wrapped_text = textwrap.fill(response_text, width=80)
        clear_output(wait=True)
        # Escape the model text so '<', '>' and '&' are shown literally instead
        # of being interpreted as markup by the HTML display.
        display(HTML(f"<div style='text-align: left;'><pre>{escape(wrapped_text)}</pre></div>"))
        if choice.finish_reason is not None:
            break
    return response_text


def main(query, ranked_docs):
    """Stream an answer for ``query`` built from ``ranked_docs`` and return its text.

    Opens the streaming completion, renders it live while it arrives, and
    hands back the complete response string.
    """
    return process_streamed_responses(get_response_stream(query, ranked_docs))


In [107]:
# Sample questions: exactly one `query` line is active; uncomment another to switch.
# query = "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?"
query = "Is the escrow amount greater than the retention amount?"
# query ="Who are the parties to the Agreement and what are their defined names?"
# query = "Is there a non-compete obligation to the Advisor?"
# query = "In which street does the Advisor live?"
# Fetch the candidate chunks for the active question from the vector DB.
retrieved_documents = vectordb_answer_question(query, chroma_collection)
# Number of candidates that will be handed to the reranker.
len(retrieved_documents)

2333

In [108]:
# Rerank the candidates, then stream the model's answer built from the top-ranked chunks.
top_docs = reranker(query, retrieved_documents)
output_answer = main(query, top_docs)
# Non-streaming alternative:
# output_answer = generate_llm_response(query, top_docs)
print(output_answer)

Based on the information provided, the escrow amount is $1,000,000 and the retention amount is $5,000,000. Therefore, the escrow amount is not greater than the retention amount.


**CREATING THE EVALUATION DATA**

In [113]:
import os
import pandas as pd
import pprint

EVALUATION_OUTPUT_PATH = '../data/evaluation_data/updated_raptor_crossendoderReranker_evaluation.csv'
# Create the output folder before the loop: a missing folder would otherwise
# only surface at to_csv(), after every LLM call has already been made.
os.makedirs(os.path.dirname(EVALUATION_OUTPUT_PATH), exist_ok=True)

evaluation = pd.read_csv('../data/RaptorQA.csv')

answers = []
contexts = []
for question in evaluation['question']:
    # `retrieved_documents` comes from the query cell above. That retrieval asks
    # for every chunk in the collection, so the candidate set does not depend on
    # the question; the reranker selects the relevant chunks per question.
    docs = reranker(question, retrieved_documents)
    answer = generate_llm_response(question, docs)
    pprint.pp(answer)
    answers.append(answer)

    # Store a copy of the context list used for this answer.
    contexts.append(list(docs))

evaluation['answer'] = answers
evaluation['contexts'] = contexts
evaluation.to_csv(EVALUATION_OUTPUT_PATH, index=False)

('The Sellers are responsible for a breach of representations and warranties '
 'under the circumstance that any of its Affiliates or its or its Affiliates’ '
 'Representatives violate the provisions of Section 6.05(a). Each Seller is '
 'severally responsible, meaning not jointly, for their individual conduct. '
 'The extent of their responsibility includes any Liability required to be '
 'disclosed on Schedule 4.06. In addition, responsibility can also extend to '
 'any breach under Government Orders affecting the Seller or its properties. '
 'Nevertheless, the Sellers’ Representative is not liable for these actions or '
 'omissions unless in the case of gross negligence, bad faith, or willful '
 'misconduct. Cooperation and non-disclosure following the Closing Date are '
 'also expected from the Sellers.')
('The provided information does not offer a definitive answer to the question '
 'of whether the Sellers would be responsible for inaccuracies in their '
 'representations due to 