In [1]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
import pprint
from dotenv import dotenv_values
import openai
import pickle
from pypdf import PdfReader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

env_vars = dotenv_values('.env')
openai.api_key = env_vars.get('OPENAI_API_KEY')

In [3]:
import os, sys
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

# Custom Chunking

chunking by  section

In [6]:
import PyPDF2
from collections import namedtuple

file_path = '../data/RaptorContract.pdf'

# Load the PDF
with open(file_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    text = ''
    for page_num in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()

# Split the text into general information and main content
general_info = []
main_content = []
in_main_content = False
for line in text.split('\n'):
    if line.startswith('ARTICLE I'):
        in_main_content = True
    if in_main_content:
        main_content.append(line)
    else:
        general_info.append(line)

# Split the main content into sections
sections = []
current_section = []
for line in main_content:
    if line.startswith('Section'):
        if current_section:
            sections.append('\n'.join(current_section))
        current_section = [line]
    else:
        current_section.append(line)
if current_section:
    sections.append('\n'.join(current_section))

# Define the Page namedtuple
Page = namedtuple("Page", ["id", "page_content", "metadata"])

# Create Page objects for each section
pages = []
for section_num, section in enumerate(sections):
    section_lines = section.split('\n')
    section_metadata = {
        'section_num': section_num,
        # Add any other section-level metadata here
    }
    pages.append(Page(id=section_num, page_content=section, metadata=section_metadata))

# Print the results
# print('===== General Information =====')
# print('\n'.join(general_info))
# print('\n')
# print('===== Sections =====')
# for page in pages:
#     print(f'===== Section {page.metadata["section_num"]} =====\n{page.section_content}\n')


In [7]:
pages

[Page(id=0, page_content='ARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION .................................... 2  ', metadata={'section_num': 0}),
 Page(id=1, page_content='Section 1.01  Definitions ......................................................................................................... 2  ', metadata={'section_num': 1}),
 Page(id=2, page_content='Section 1.02  Certain Matters of Construction ...................................................................... 13  \nARTICLE II PURCHASE AND SALE OF SHARES AND WARRANTS; \nTREATMENT OF OPTIONS; CLOSING. .................................................................................. 14  ', metadata={'section_num': 2}),
 Page(id=3, page_content='Section 2.01  Purchase and Sale of Shares ............................................................................ 14  ', metadata={'section_num': 3}),
 Page(id=4, page_content='Section 2.02  Purchase Price...............................................................

In [8]:
page_contents = [page.page_content for page in pages]

embed_model = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key="sk-proj-Tzc9mrWyEFVxyDsq5HiWT3BlbkFJxp47toOztG4XRILBeRxr")
embedded_documents = embed_model.embed_documents(page_contents)
embedded_documents



[[0.020905759049312852,
  -0.007360992439549479,
  0.04957845029707897,
  0.008592334519956123,
  0.013653017491534487,
  0.004201448960789209,
  -0.03767096226699703,
  -0.020838104052643382,
  -0.029389845015804283,
  0.0454649512487117,
  0.028577970154609555,
  -0.015790952266663428,
  0.03615546202939484,
  0.018253637358799285,
  0.03639902523281131,
  0.043327018336227895,
  0.02936278264460747,
  -0.007273039267498545,
  0.04459895537041362,
  0.04178445759007402,
  0.018037138389224765,
  0.0045397295320719665,
  -0.03753564854836782,
  0.03507296345623196,
  -0.009607177165126964,
  -0.024762162777342667,
  -0.010249911740680313,
  -0.02557403577589226,
  0.047169889947336736,
  -0.009005038009013975,
  0.031717218251956066,
  -0.036615524202385834,
  -0.0136800798627313,
  -0.06289318162939528,
  0.020080353933842283,
  0.05044444617537705,
  -0.03618252626323679,
  -0.014992609522389675,
  0.011887190786023037,
  -0.009715427581236794,
  0.006833274338566447,
  -0.007009180

In [9]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Assuming you've already created the `pages` array
page_contents = [page.page_content for page in pages]

# Create FAISS vector store
faiss_store = FAISS.from_documents([Document(page_content=page_content) for page_content in page_contents], embed_model)



In [10]:
query = "Can you please clarify the circumstances and limitations under which the Sellers can be held liable for breaches of the representations and warranties in the agreement?"
docs = faiss_store.similarity_search(query)



In [11]:
print(len(docs))
print(docs[3].page_content)

4
Section 10.02  Recourse Limitations.  
(a) Other than claims arising from fraud committed by the Person, no director, 
officer, employee, incorporator, manager, member, partner, stockholder, shareholder, Affiliate, 

20 Note to Draft:  Subject to finalization of transaction structure.   
-59- 
112923184_5 parent of, or holder of any equity interest in, any tier, agent, attorney or representative of the 
Company, Sellers or Buyer (each, a “Non-Party Affiliate ”) shall have any liability (whether in 
contract or in tort, in law or in equity, or based upon any theory that seeks to impose liability of 
an entity party against its owners or Affiliates, whether by or through attempted piercing of the 
corporate, limited partnership or limited liability company veil or any other theory or doctrine, 
including alter ego or otherwise) to any Person for any obligations or liabilities arising under, in 
connection with or related to this Agreeme

In [48]:
query = "How much is the escrow amount in number dollar?"
docs = faiss_store.similarity_search(query)



In [53]:
print(len(docs))
print(docs[3].page_content)

4
Section 2.01  Purchase and Sale of Shares.  Upon the terms and subject to the conditions 
set forth in this Agreement, at the Closing, each of the Sellers shall sell, transfer and deliver to 
Buyer, free and clear of all Encumbrances, and Buyer shall purchase from each of such Se llers, 
all of the outstanding Shares.   
-16- 
112923184_5 Section 2.02  Purchase Price.  In consideration for the purchase by Buyer of the Shares, 
Buyer shall pay, or cause to be paid, to the Sellers at Closing the Closing Cash Considera tion, as 
calculated and adjusted (a) at the Closing as described in Section 2.04 and Section 2.05(a)(i), and 
(b) if applicable, following the Closing as described in Section 2.07(f). The Closing Cash 
Consideration plus the Escrow Amount, after all adjustments contemplated in this Agreement, is 
referred to herein as the “Purchase Price ”. 


In [55]:
query = "What is the purpose of the escrow?"
docs = faiss_store.similarity_search(query)



In [59]:
print(len(docs))
print(docs[0].page_content)

4
Section 2.08  Escrow.   
(a) At Closing, Buyer will deposit the Escrow Amount in escrow on behalf of the 
Sellers in accordance with the Escrow Agreement.  The Escrow Amount shall be held and, 
subject to Section 2.07, released to the Company Securityholders in accordance with the   
-21- 
112923184_5 provisions of the Escrow Agreement with the Company Securityholders being entitled to share in 
such released amounts in accordance with their Pro Rata Percentages.  From and after the 
Closing, Buyer and the Sellers ’ Representative will direct the Escrow Agent to disburse 
payments from the Escrow Account in accordance with the purchase price adjustment provisions 
of this Agreement and the terms of the Escrow Agreement including: (a) in the case of any 
disbursement that is required by the terms of this Agreement and as to which there is no dispute 
(or as to which the disputing party has failed to notify the Escrow Agent and the other parties of 
its dispute in accordance with any a

In [60]:
query = """May the Escrow Amount serve as a recourse for the Buyer in case of breach of
representations by the Company?"""
docs = faiss_store.similarity_search(query)



In [17]:
print(len(docs))
print(docs[3].page_content)

4
Section 2.04  Closing Payments.  
(a) At the Closing, Buyer shall pay, or cause to be paid, through payment to the 
Paying Agent which will in turn make the following payments, in cash by wire transfer of 
immediately available funds –(a) the Closing Cash Consideration to or for the account of Sellers, 
(b) the Warrant Closing Cash Consideration to or for the account of Warrantholders, (c) the 
Closing Vested Options Cash Consideration to the [102 Trustee] for the benefit of the Vested 
Optionholders, and (d) the Employee Closing Bonus Amount [to the Company] for the benefit of 
the Employee Closing Bonus Beneficiaries (the payments set forth in (a) through (d), the 
“Closing Cash Amoun ts”).  Exhibit 2.04 attached hereto is a statement (the “Estimated Closing 
Statement ”) setting forth (i) the Sellers ’ Representative ’s good faith estimate (together with 
reasonable detail) of the Closing Cash Amounts, (ii) a list of all payees to whom any portion of 
the Closing Debt Amount is pa

In [61]:
query = "Are there any conditions to the closing?"
docs = faiss_store.similarity_search(query)



In [67]:
print(len(docs))
print(docs[1].page_content)

4
Section 2.03  The Closing. The purchase and sale of the Shares (the “Closing ”) shall 
take place on the date hereof (the “Closing Date ”) by exchange of documents and signatures (or 
their electronic counterparts). The consummation of the transactions contemplated by this 
Agreement shall be deemed to occur at 12:01 a.m. Pacific Standard Time on the Closing Date. 


In [20]:
query = "Are Change of Control Payments considered a Seller Transaction Expense?"
docs = faiss_store.similarity_search(query)



In [21]:
print(len(docs))
print(docs[3].page_content)

4
Section 6.05(a) by any of its Affiliates or its or its Affiliates ’ Representatives. 
(b) Notwithstanding the foregoing, each of the parties hereto and their respective 
Representatives may disclose to any and all Persons, without limitation of any kind, the tax 
treatment and tax structure of the Contemplated Transactions and all materials of any kind 
(including opinions or other tax analyses) that are provided to it relating to such tax treatment 
and tax structure, all as contemplated by Treasury Regulation Section 1.6011-4(b)(3)(iii). For the 
avoidance of doubt, nothing contained in this Agreement limits, restricts or in any other way 
affects any Seller ’s communicating with any Governmental Authority, or communicating with 
any official or staff person of a Governmental Authority, concerning matters relevant to such 
Governmental Authority. 


In [22]:
query = """Would the aggregate amount payable by the Buyer to the Sellers be affected if it is
determined that the actual Closing Debt Amount is greater the estimated Closing Debut Amount?"""
docs = faiss_store.similarity_search(query)



In [26]:
print(len(docs))
print(docs[0].page_content)

4
Section 2.05(a)(i) (the “Estimated Purchase Price ”) shall be calculated using the estimated 
Closing Debt Amount, estimated Closing Cash Amount and estimated Seller Transaction 
Expenses set forth on the Estimated Closing Statement. 
(b) Proposed Final Closing Statement.  Within sixty (60) calendar days after the 
Closing Date, the Company shall prepare or cause to be prepared, and will provide to the Sellers ’ 
Representative, a written statement setting forth in reasonable detail its proposed final 
determination of the Closing Debt Amount, Closing Cash Amount, and the Seller Transaction 
Expenses (the “Proposed Final Closing Statement ”).  The Proposed Final Closing Statement will 
be prepared in accordance with the Accounting Principles and without giving effect to any 
changes resulting from the consummation of the Contemplated Transactions on the Closing 
Date.  The Sellers ’ Representative and its Representatives shall have reasonable access to the 
work papers and other book

# Chunking Models

Character Text Splitter

In [21]:
from langchain.text_splitter import CharacterTextSplitter

# Combine the page contents into a single string
text = '\n\n'.join([page.page_content for page in pages])

# Create the text splitter
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1200,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False
)

# Split the text into paragraphs
para_list = text_splitter.create_documents([text])

# Print the first two paragraphs
print(para_list[:2])

Created a chunk of size 294, which is longer than the specified 200
Created a chunk of size 324, which is longer than the specified 200
Created a chunk of size 276, which is longer than the specified 200
Created a chunk of size 341, which is longer than the specified 200
Created a chunk of size 218, which is longer than the specified 200
Created a chunk of size 246, which is longer than the specified 200
Created a chunk of size 256, which is longer than the specified 200
Created a chunk of size 229, which is longer than the specified 200
Created a chunk of size 273, which is longer than the specified 200
Created a chunk of size 3587, which is longer than the specified 200
Created a chunk of size 30048, which is longer than the specified 200
Created a chunk of size 2025, which is longer than the specified 200
Created a chunk of size 6199, which is longer than the specified 200
Created a chunk of size 284, which is longer than the specified 200
Created a chunk of size 1129, which is long

[Document(page_content='ARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION .................................... 2'), Document(page_content='Section 1.01  Definitions ......................................................................................................... 2')]


In [22]:
print(len(para_list))
print("\n")
pprint.pp(para_list[0])

149


Document(page_content='ARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION .................................... 2')


Recursive Character Splitter

In [27]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text = '\n\n'.join([page.page_content for page in pages])
def len_func(text):
    return len(text)

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n","\n", " "],
    chunk_size = 200,
    chunk_overlap = 100,
    length_function = len_func,
    is_separator_regex=False
)

chunk_list = text_splitter.create_documents([text])

In [28]:
print(len(chunk_list))
print("\n")
pprint.pp(chunk_list[100])

139


Document(page_content='has there been any settlements or similar out-of-court or pre-litigation arrangement relating to \nany such matters, nor has any such Action, settlement or other arrangement been proposed or \nthreatened.')


tiktoken

tiktoken is a fast BPE(Byte-Pair encoding) tokenizer created by OpenAI to count the number of tokens.

In [None]:
# Splitting based on the token limit
from langchain.text_splitter import CharacterTextSplitter
text = '\n\n'.join([page.page_content for page in pages])

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator = "\n\n",
    chunk_size = 1200,
    chunk_overlap = 100,
    is_separator_regex = False,
    model_name='text-embedding-3-small', #used to calculate tokens
    encoding_name='text-embedding-3-small'
)

doc_list = text_splitter.create_documents(text)
doc_list # returns list of document objects

In [32]:
line_list = text_splitter.split_text(text)
line_list[0]

Created a chunk of size 6459, which is longer than the specified 1200


Created a chunk of size 1537, which is longer than the specified 1200
Created a chunk of size 1603, which is longer than the specified 1200
Created a chunk of size 2198, which is longer than the specified 1200
Created a chunk of size 3061, which is longer than the specified 1200
Created a chunk of size 3279, which is longer than the specified 1200
Created a chunk of size 3212, which is longer than the specified 1200


'ARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION .................................... 2  \n\nSection 1.01  Definitions ......................................................................................................... 2  \n\nSection 1.02  Certain Matters of Construction ...................................................................... 13  \nARTICLE II PURCHASE AND SALE OF SHARES AND WARRANTS; \nTREATMENT OF OPTIONS; CLOSING. .................................................................................. 14  \n\nSection 2.01  Purchase and Sale of Shares ............................................................................ 14  \n\nSection 2.02  Purchase Price.................................................................................................. 14  \n\nSection 2.03  The Closing...................................................................................................... 14  \n\nSection 2.04  Closing Payments. .................................

Splitting by code

We can split codes written in any programming language. Here is an example using PythonTextSplitter.

In [33]:
python_code = """def peer_review(article_id):
    chat = ChatOpenAI()
    loader = ArxivLoader(query=article_id, load_max_docs=2)
    data = loader.load()
    first_record = data[0]
    page_content = first_record.page_content
    title = first_record.metadata['Title']
    summary = first_record.metadata['Summary']
    ''''''''''''
    ''''''''''''
    return response.content"""

In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size = 50,
    chunk_overlap = 10
)

text_splitter.create_documents(texts = [python_code])

[Document(page_content='def peer_review(article_id):'),
 Document(page_content='chat = ChatOpenAI()'),
 Document(page_content='loader = ArxivLoader(query=article_id,'),
 Document(page_content='load_max_docs=2)'),
 Document(page_content='data = loader.load()'),
 Document(page_content='first_record = data[0]'),
 Document(page_content='page_content = first_record.page_content'),
 Document(page_content="title = first_record.metadata['Title']"),
 Document(page_content="summary = first_record.metadata['Summary']"),
 Document(page_content="''''''''''''\n    ''''''''''''"),
 Document(page_content='return response.content')]