## 1. importing data

In [1]:
import os
contracts = "full_contract_pdf"

pdf_paths = []

for root, dirs, files in os.walk(contracts):
    for file in files:
        if file.endswith(".pdf"):
            pdf_paths.append(os.path.join(root, file))


for pdf_path in pdf_paths:
    print(pdf_path)

full_contract_pdf\Part_I\Affiliate_Agreements\CreditcardscomInc_20070810_S-1_EX-10.33_362297_EX-10.33_Affiliate Agreement.pdf
full_contract_pdf\Part_I\Affiliate_Agreements\CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf
full_contract_pdf\Part_I\Affiliate_Agreements\DigitalCinemaDestinationsCorp_20111220_S-1_EX-10.10_7346719_EX-10.10_Affiliate Agreement.pdf
full_contract_pdf\Part_I\Affiliate_Agreements\LinkPlusCorp_20050802_8-K_EX-10_3240252_EX-10_Affiliate Agreement.pdf
full_contract_pdf\Part_I\Affiliate_Agreements\SouthernStarEnergyInc_20051202_SB-2A_EX-9_801890_EX-9_Affiliate Agreement.pdf
full_contract_pdf\Part_I\Affiliate_Agreements\SteelVaultCorp_20081224_10-K_EX-10.16_3074935_EX-10.16_Affiliate Agreement.pdf
full_contract_pdf\Part_I\Affiliate_Agreements\TubeMediaCorp_20060310_8-K_EX-10.1_513921_EX-10.1_Affiliate Agreement.pdf
full_contract_pdf\Part_I\Affiliate_Agreements\UnionDentalHoldingsInc_20050204_8-KA_EX-10_3345577_EX-10_Affiliate Agreemen

## 2. data pre-processing

In [None]:
# opening and reading the files, and doing some investigations
import fitz
from tqdm.auto import tqdm
def text_formatter(text: str) -> str:
    text = text.replace("\n", " ").strip()
    return text

def open_read_pdf(pdf_path: str) -> list[dict]:
  doc = fitz.open(pdf_path)
  page_texts = []
  for page_number, page in tqdm(enumerate(doc), total = len(doc), desc=f"Processing {os.path.basename(pdf_path)}"):
      text = page.get_text()
      text = text_formatter(text=text)
      page_texts.append({"page_number": page_number+1,
                        "page_char_count": len(text),
                        "page_word_count": len(text.split()),
                        "page_senetence_count": len(text.split(". ")),
                        "page_token_count": len(text) / 4,
                        "text": text})

  return page_texts

In [None]:
# combining all the files
all_pages_and_texts = []
for path in pdf_paths:
    pages_and_texts = open_read_pdf(pdf_path=path)
    all_pages_and_texts.extend(pages_and_texts)

print(all_pages_and_texts[:2])

## 3. some data analysis to choose the right embedding model

In [4]:
import pandas as pd

df = pd.DataFrame(all_pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_senetence_count,page_token_count,text
0,1,2561,362,3,640.25,"Exhibit 10.33 Last Updated: April 6, 2007 ..."
1,2,1651,265,1,412.75,• Incorporates any materials which infringe or...
2,3,1934,302,5,483.5,• STARBUCKS • SUBARU • TEMPLE UNIVERSITY • TOY...
3,4,2795,432,10,698.75,• If Affiliate manages a sub-affiliate network...
4,5,2599,393,8,649.75,7. Order Processing Chase will be solely res...


In [5]:
df.shape

(3512, 6)

In [6]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_senetence_count,page_token_count
count,3512.0,3512.0,3512.0,3512.0,3512.0
mean,20.54,2841.99,431.36,13.33,710.5
std,20.91,1429.1,218.07,10.43,357.27
min,1.0,32.0,5.0,1.0,8.0
25%,5.0,2028.5,310.0,6.0,507.12
50%,13.0,2996.0,456.5,12.0,749.0
75%,29.0,3687.0,567.0,18.0,921.75
max,111.0,9575.0,1426.0,325.0,2393.75


## 4. chunking

In [None]:
from spacy.lang.en import English

nlp = English()

#add a sentenciser pipeline https://spacy.io/api/sentencizer

sentencizer = nlp.add_pipe("sentencizer")

for item in tqdm(all_pages_and_texts):
  item["sentences"] = list(nlp(item["text"]).sents)
  for i in range(len(item["sentences"])):
    item["sentences"][i] = str(item["sentences"][i])

    item["spacy_page_senetence_count"] = len(item["sentences"])

In [8]:
all_pages_and_texts[567]

{'page_number': 2,
 'page_char_count': 2530,
 'page_word_count': 410,
 'page_senetence_count': 34,
 'page_token_count': 632.5,
 'text': 'ARTICLE 5 -- COMPENSATION 5.1 Payment for Services. EHN will pay Dr. Murray $8,333 per month at the end of each month during the first twelve months that this agreement is in effect. 5.2 Options. Upon execution of this Agreement and on each anniversary date of this Agreement for as long as this Agreement is active, EHS will grant Dr. Murray options to purchase 25,000 shares of EHS common stock at their then fair market value (the “Options”). The Options will vest immediately on the date of grant. 5.3 Royalty/Commission Payments. Dr. Murray will receive an annual royalty on net sales (defined as gross sales minus returns) for any products (the “Dr. Murray Products”) developed by Dr. Murray for EHN for as long as the Dr. Murray Products are being sold. The Dr. Murray Products will be listed on Schedule A attached hereto as they are developed and added t

In [9]:
df = pd.DataFrame(all_pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_senetence_count,page_token_count,spacy_page_senetence_count
count,3512.0,3512.0,3512.0,3512.0,3512.0,3512.0
mean,20.54,2841.99,431.36,13.33,710.5,13.92
std,20.91,1429.1,218.07,10.43,357.27,8.68
min,1.0,32.0,5.0,1.0,8.0,1.0
25%,5.0,2028.5,310.0,6.0,507.12,8.0
50%,13.0,2996.0,456.5,12.0,749.0,13.0
75%,29.0,3687.0,567.0,18.0,921.75,18.0
max,111.0,9575.0,1426.0,325.0,2393.75,67.0


In [10]:
num_sentences_per_chunk = 8

def split_list(input_list: list[str], slice_size: int=num_sentences_per_chunk) -> list[list[str]]:
  return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

for item in tqdm(all_pages_and_texts):
  item["sentences_chunks"] = split_list(input_list=item["sentences"],
                                       slice_size = num_sentences_per_chunk)
  item["num_chunks_per_page"] = len(item["sentences_chunks"])

  0%|          | 0/3512 [00:00<?, ?it/s]

In [11]:
all_pages_and_texts[5]

{'page_number': 6,
 'page_char_count': 2806,
 'page_word_count': 436,
 'page_senetence_count': 6,
 'page_token_count': 701.5,
 'text': "13.\xa0Commercial\xa0Use \xa0 This\xa0program\xa0is\xa0intended\xa0for\xa0commercial\xa0use\xa0only.\xa0Commissions\xa0are\xa0payable\xa0for\xa0Approved\xa0Accounts\xa0to\xa0third\xa0parties\xa0who\xa0access\xa0the\xa0Chase\xa0URL's (marketing\xa0pages)\xa0through\xa0the\xa0Links\xa0located\xa0on\xa0Affiliate’s\xa0sponsoring\xa0Web\xa0site.\xa0Affiliates\xa0who\xa0use\xa0this\xa0program\xa0to\xa0apply\xa0for\xa0credit\xa0cards\xa0for\xa0their own\xa0use\xa0are\xa0NOT\xa0in\xa0violation\xa0of\xa0this\xa0Agreement. \xa0 14.\xa0Trademarks \xa0 All\xa0Chase\xa0trademarks,\xa0trade\xa0names\xa0and\xa0service\xa0marks\xa0(collectively,\xa0the\xa0“Marks”)\xa0are\xa0the\xa0exclusive\xa0property\xa0of\xa0Chase.\xa0Notwithstanding\xa0anything\xa0set forth\xa0in\xa0this\xa0Agreement,\xa0Chase\xa0reserves\xa0full\xa0ownership\xa0of\xa0the\xa0Marks\xa0and\xa0the\xa

In [12]:
df = pd.DataFrame(all_pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_senetence_count,page_token_count,spacy_page_senetence_count,num_chunks_per_page
count,3512.0,3512.0,3512.0,3512.0,3512.0,3512.0,3512.0
mean,20.54,2841.99,431.36,13.33,710.5,13.92,2.21
std,20.91,1429.1,218.07,10.43,357.27,8.68,1.07
min,1.0,32.0,5.0,1.0,8.0,1.0,1.0
25%,5.0,2028.5,310.0,6.0,507.12,8.0,1.0
50%,13.0,2996.0,456.5,12.0,749.0,13.0,2.0
75%,29.0,3687.0,567.0,18.0,921.75,18.0,3.0
max,111.0,9575.0,1426.0,325.0,2393.75,67.0,9.0


spilitting chunks into its own item so we can dice specifically into the text sample that was used in our model cuz this is what rags is all about you want generations you need refrences

In [13]:
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_senetence_count,page_token_count,text,sentences,spacy_page_senetence_count,sentences_chunks,num_chunks_per_page
0,1,2561,362,3,640.25,"Exhibit 10.33 Last Updated: April 6, 2007 ...","[Exhibit 10.33 Last Updated: April 6, 2007 ...",13,"[[Exhibit 10.33 Last Updated: April 6, 2007 ...",2
1,2,1651,265,1,412.75,• Incorporates any materials which infringe or...,[• Incorporates any materials which infringe o...,3,[[• Incorporates any materials which infringe ...,1
2,3,1934,302,5,483.5,• STARBUCKS • SUBARU • TEMPLE UNIVERSITY • TOY...,[• STARBUCKS • SUBARU • TEMPLE UNIVERSITY • TO...,8,[[• STARBUCKS • SUBARU • TEMPLE UNIVERSITY • T...,1
3,4,2795,432,10,698.75,• If Affiliate manages a sub-affiliate network...,[• If Affiliate manages a sub-affiliate networ...,21,[[• If Affiliate manages a sub-affiliate netwo...,3
4,5,2599,393,8,649.75,7. Order Processing Chase will be solely res...,"[7., Order Processing Chase will be solely ...",29,"[[7., Order Processing Chase will be solely...",4


In [14]:
import re

pages_and_chunks = []
for item in tqdm(all_pages_and_texts):
  for sentence_chunk in item["sentences_chunks"]:
    chunk_dict= {}
    chunk_dict["page_number"] = item["page_number"]

    joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()

    # Add space after periods followed by capital letters
    joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

    # Remove multiple hyphens
    joined_sentence_chunk = re.sub(r'-', '', joined_sentence_chunk)

    # Remove multiple asterisks
    #joined_sentence_chunk = re.sub(r'\*', '', joined_sentence_chunk)

    # remove the "•" symbole
    joined_sentence_chunk = re.sub(r'•', '', joined_sentence_chunk)


    # Replace multiple spaces or newlines with a single space
    joined_sentence_chunk = re.sub(r'\s+', ' ', joined_sentence_chunk).strip()

     # Remove spaces in ".com" patterns
    joined_sentence_chunk = re.sub(
            r'([A-Za-z0-9]+)\s*\.\s*com\b',
            r'\1.COM',
            joined_sentence_chunk,
            flags=re.IGNORECASE
        )

    chunk_dict["sentences_chunks"] = joined_sentence_chunk

    chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
    chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split())
    chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

    pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)


  0%|          | 0/3512 [00:00<?, ?it/s]

7749

In [18]:
pages_and_chunks[1256]

{'page_number': 4,
 'sentences_chunks': 'a) Independent Contractor Status. The Developer agrees to perform the Services hereunder solely as an independent contractor. The Parties agree that nothing in this Agreement shall be construed as creating a joint venture, partnership, franchise, agency, employer/employee, or similar relationship between the Parties, or as authorizing either Party to act as the agent of the other. The Developer is and will remain an independent contractor in its relationship to the Client. The Client shall not be responsible for withholding taxes with respect to the Developer’s compensation hereunder. The Developer shall have no claim against the Client hereunder or otherwise for vacation pay, sick leave, retirement benefits, social security, worker’s compensation, health or disability benefits, unemployment insurance benefits, or employee benefits of any kind. Nothing in this Agreement shall create any obligation between either Party or a third party. (b) Indem

In [19]:
df = pd.DataFrame(pages_and_chunks)
df.head()

Unnamed: 0,page_number,sentences_chunks,chunk_char_count,chunk_word_count,chunk_token_count
0,1,"Exhibit 10.33 Last Updated: April 6, 2007 CHAS...",909,144,227.25
1,1,"BY SUBMITTING YOUR REGISTRATION FORM, AFFILIAT...",1604,213,401.0
2,2,Incorporates any materials which infringe or a...,1546,223,386.5
3,3,"STARBUCKS SUBARU TEMPLE UNIVERSITY TOYS ""R"" US...",1839,260,459.75
4,4,"If Affiliate manages a subaffiliate network, t...",1346,212,336.5


In [20]:
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,7749.0,7749.0,7749.0,7749.0
mean,17.99,1255.51,194.27,313.88
std,19.13,825.34,128.84,206.34
min,1.0,2.0,1.0,0.5
25%,5.0,638.0,98.0,159.5
50%,11.0,1184.0,182.0,296.0
75%,24.0,1715.0,266.0,428.75
max,111.0,6425.0,1003.0,1606.25


In [21]:
1606 - 512

1094

In [22]:
indexes = []
for i in range(len(df)):
    if len(df.loc[i, "sentences_chunks"]) < 15:
        indexes.append(i)

print(indexes)
print(len(indexes))


[31, 38, 45, 98, 125, 208, 217, 230, 233, 246, 1122, 1130, 1140, 1143, 1160, 1198, 1299, 1314, 1334, 1344, 1377, 1380, 1666, 1686, 1694, 1698, 1719, 1737, 1745, 1753, 1770, 1789, 1795, 1817, 1894, 1990, 2000, 2006, 2020, 2282, 2292, 2333, 2341, 2363, 2373, 2376, 2379, 2381, 2396, 2400, 2408, 2449, 2590, 2628, 2630, 2641, 2682, 2775, 2791, 2994, 2998, 3027, 3044, 3051, 3101, 3141, 3154, 3166, 3173, 3190, 3204, 3308, 3322, 3326, 3330, 3356, 3561, 3563, 3575, 3607, 3620, 3622, 3629, 3655, 3663, 3838, 3850, 3855, 3857, 3874, 3946, 4045, 4048, 4058, 4098, 4164, 4196, 4218, 4263, 4542, 4568, 4576, 4598, 4656, 4928, 4937, 4949, 5005, 5008, 5019, 5023, 5026, 5030, 5038, 5051, 5063, 5080, 5082, 5099, 5102, 5105, 5424, 5448, 5470, 5529, 5537, 5614, 5622, 5629, 5898, 5912, 5915, 5939, 5988, 6186, 6190, 6203, 6511, 6523, 6533, 6544, 6560, 6565, 6591, 7019, 7057, 7072, 7115, 7130, 7134, 7138, 7141, 7145, 7148, 7286, 7319, 7401]
157


In [23]:
min_token_len = 15
for row in df[df['chunk_char_count'] <= min_token_len].sample(5).iterrows():
    print(f'chunk token count: {row[1]["chunk_token_count"]} | text: {row[1]["sentences_chunks"]}')

chunk token count: 3.75 | text: S1/A, 1/21/2020
chunk token count: 3.5 | text: 10K, 2/27/2020
chunk token count: 3.0 | text: S1, 9/9/2011
chunk token count: 3.5 | text: S1, 11/15/2019
chunk token count: 3.5 | text: 8K, 11/17/2017


In [24]:
pages_and_chunks_more_min_token_len = df[df["chunk_token_count"] > min_token_len].to_dict(orient="records")
pages_and_chunks_more_min_token_len[:8]

[{'page_number': 1,
  'sentences_chunks': 'Exhibit 10.33 Last Updated: April 6, 2007 CHASE AFFILIATE AGREEMENT THIS AGREEMENT sets forth the terms and conditions agreed to between Chase Bank USA, N. A. (?Chase?) and you as an “Affiliate” in the Chase Affiliate Program (the “Affiliate Program”). Once accepted into the Affiliate Program, an Affiliate can establish links from the Affiliate’s Website to [Chase.COM]. Chase will pay Affiliate a fee for each approved credit card account that originates from a link in Affiliate’s Website. THIS IS A LEGAL AND CONTRACTUALLY BINDING AGREEMENT BETWEEN AFFILIATE AND CHASE. TO APPLY TO THE AFFILIATE PROGRAM, YOU MUST COMPLETE AND SUBMIT THE AFFILIATE REGISTRATION FORM AND CLICK ON THE “AGREE” BUTTON BELOW TO INDICATE YOUR WILLINGNESS TO BE BOUND TO CHASE BY THIS AGREEMENT. THIS AGREEMENT WILL TAKE EFFECT IF AND WHEN CHASE REVIEWS AND ACCEPTS YOUR REGISTRATION FORM AND PROVIDES YOU NOTICE OF ACCEPTANCE.',
  'chunk_char_count': 909,
  'chunk_word_coun

In [25]:
df1 = pd.DataFrame(pages_and_chunks_more_min_token_len)
df1.describe().round(2)

#2h

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,7346.0,7346.0,7346.0,7346.0
mean,18.02,1322.7,204.7,330.67
std,19.2,794.82,124.17,198.7
min,1.0,61.0,7.0,15.25
25%,5.0,732.0,114.0,183.0
50%,11.0,1229.0,190.0,307.25
75%,25.0,1750.0,272.0,437.5
max,111.0,6425.0,1003.0,1606.25


## 5. Embedding

In [13]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("sentence-transformers/all-distilroberta-v1",
                            device = "cuda")

sentences = ["Just experimenting", "Each sentence is converted", "I build rag systems"]

embeddings = model.encode(sentences)
print(embeddings)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[ 0.00935405 -0.00927446 -0.01081148 ...  0.09628305 -0.02519781
  -0.03742711]
 [-0.01409588  0.00091119 -0.00096315 ... -0.02571573 -0.00289072
  -0.00579968]
 [-0.04448302 -0.02874016 -0.02722801 ...  0.04932087  0.00666179
  -0.06584102]]


In [31]:
n = 0
for i in embeddings:
  n = n + 1
  print(f"shape of sentence {n}, {i.shape}")

shape of sentence 1, (768,)
shape of sentence 2, (768,)
shape of sentence 3, (768,)


In [27]:
%%time

# embed each chunk one by one

model.to("cuda")

for item in tqdm(pages_and_chunks_more_min_token_len):
  item["embedding"] = model.encode(item["sentences_chunks"])

  0%|          | 0/7346 [00:00<?, ?it/s]

CPU times: total: 5min 18s
Wall time: 7min 56s


In [28]:
pages_and_chunks_more_min_token_len[5988]

{'page_number': 20,
 'sentences_chunks': 'Annual Marketing Plan and any related agreements between the Parties or their Affiliates (each an “Alliance Manager”). The Alliance Managers shall endeavor to ensure clear and responsive communication between the Parties and the effective exchange of information, and shall serve as a single point of contact for any matters arising under this Agreement. The Alliance Managers shall have the right to attend all JSC and subcommittee meetings as nonvoting participants and may bring to the attention of the JSC or subcommittee any matters or issues either of them reasonably believes should be discussed, and shall have such other responsibilities as the Parties may mutually agree in writing. Each Party may designate different Alliance Managers by notice in writing to the other Party.2.6 Compliance Managers. Within thirty (30) days after the Effective Date, Pfizer and Exact each agrees to appoint a Representative who (a) has received compliance training

## 6. Saving embeddings

In [29]:
text_chunks_embeddings_df = pd.DataFrame(pages_and_chunks_more_min_token_len)
embeddings_df_path = "text_chunks_embeddings_df.csv"
text_chunks_embeddings_df.to_csv(embeddings_df_path, index = False)

## 7. Import saved file

In [30]:
data_embeddings = pd.read_csv("text_chunks_embeddings_df.csv")
data_embeddings.head()

Unnamed: 0,page_number,sentences_chunks,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,1,"Exhibit 10.33 Last Updated: April 6, 2007 CHAS...",909,144,227.25,[-7.72002479e-03 -7.91429132e-02 4.01982898e-...
1,1,"BY SUBMITTING YOUR REGISTRATION FORM, AFFILIAT...",1604,213,401.0,[-1.30979512e-02 -9.78020430e-02 -3.97627195e-...
2,2,Incorporates any materials which infringe or a...,1546,223,386.5,[ 1.25754634e-02 -8.29192773e-02 -1.99667308e-...
3,3,"STARBUCKS SUBARU TEMPLE UNIVERSITY TOYS ""R"" US...",1839,260,459.75,[ 9.46377404e-03 -7.31573775e-02 -4.14279141e-...
4,4,"If Affiliate manages a subaffiliate network, t...",1346,212,336.5,[-1.98420454e-02 -3.04132663e-02 2.94256560e-...


# Search and Answer

In [20]:
import random 

import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

In [21]:
device

'cuda'

In [22]:
data_embeddings = pd.read_csv("text_chunks_embeddings_df.csv")

In [23]:
pages_and_chunks = data_embeddings.to_dict(orient = "records")

In [24]:
data_embeddings['embedding'] = data_embeddings['embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

In [25]:
embeddings = torch.tensor(np.stack(data_embeddings['embedding'].tolist(), axis=0), dtype=torch.float32).to(device)

In [26]:
data_embeddings

Unnamed: 0,page_number,sentences_chunks,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,1,"Exhibit 10.33 Last Updated: April 6, 2007 CHAS...",909,144,227.25,"[-0.00772002479, -0.0791429132, 0.00401982898,..."
1,1,"BY SUBMITTING YOUR REGISTRATION FORM, AFFILIAT...",1604,213,401.00,"[-0.0130979512, -0.097802043, -0.00397627195, ..."
2,2,Incorporates any materials which infringe or a...,1546,223,386.50,"[0.0125754634, -0.0829192773, -0.0199667308, -..."
3,3,"STARBUCKS SUBARU TEMPLE UNIVERSITY TOYS ""R"" US...",1839,260,459.75,"[0.00946377404, -0.0731573775, -0.0414279141, ..."
4,4,"If Affiliate manages a subaffiliate network, t...",1346,212,336.50,"[-0.0198420454, -0.0304132663, 0.0029425656, 0..."
...,...,...,...,...,...,...
7341,11,"The provisions of Articles 9, 10 and 11, Secti...",2144,333,536.00,"[-0.0406014025, 0.00543733779, -0.0385607183, ..."
7342,11,The table of contents and headings contained i...,235,37,58.75,"[0.0626992732, -0.0125608072, 4.40777294e-05, ..."
7343,12,"used in this Agreement, such term shall be dee...",1495,231,373.75,"[0.0048785666, 0.00535674207, 0.00341437897, 0..."
7344,12,Section 15.7 ASSIGNMENT. Neither this Agreemen...,1332,211,333.00,"[0.0107778069, 0.0233041253, -0.0609544106, -0..."


In [27]:
embeddings.shape

torch.Size([7346, 768])

In [29]:
from sentence_transformers import util, SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-distilroberta-v1",
                            device = device)



In [30]:
embeddings.shape

torch.Size([7346, 768])

In [31]:
# define query
query = "charges of creating and hosting a website"
print(f"query: {query}")

# embed the query
query_embedding = model.encode(query, convert_to_tensor = True).to(device)

# similarity scores with dot product or with cosine similarity if outputs are not normalized
dot_scores = util.dot_score(a=query_embedding, b= embeddings)[0]

# top k results
k = 5
top_results_dot_product = torch.topk(dot_scores, k=k)
top_results_dot_product

query: charges of creating and hosting a website


torch.return_types.topk(
values=tensor([0.5586, 0.4996, 0.4991, 0.4756, 0.4582], device='cuda:0'),
indices=tensor([3463, 3636, 4492, 3467,  436], device='cuda:0'))

In [32]:
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [33]:
#query = "charges of creating and hosting a website"
print(f"query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indices from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentences_chunks"])
    print(f"Index: {idx}, Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

query: 'charges of creating and hosting a website'

Results:
Score: 0.5586
Text:
By Client: /s/ Natalija Tunevic Website Design, Development and Hosting
Agreement This Website Design, Development and Hosting Agreement the
(“Agreement”) is entered into on January 11, 2018 by and between Natalija
Tunevic, director of FreeCook (hereinafter referred to as “Client”) and Mitchell
Vitalis, director of Mitchell's Web Advance, PLC (hereinafter referred to as
“Company”).1. Website Design and Development. Client agrees to pay to Company
the sum of $5,000 (the “Contract Price”) to design and develop a website for
Client (the “Client Website”) in accordance with the accompanying Scope of Work,
attached to this Agreement as Exhibit A. (a) Change Orders. Any changes to the
Scope of Work following the execution of this Agreement requiring Additional
Work must be submitted to and accepted by Company in writing as a “Change
Order”. The costs of any such changes shall be added to the Contract Price.
Addi

## 8. fuctionizing the vector search pipeline

In [34]:
def retrieve(query: str,
             embeddings: torch.tensor,
             model: SentenceTransformer = model,
             num_resources_to_return: int = 2):
    #embed query
    query_embedding = model.encode(query, convert_to_tensor=True)

    #dot product
    dot_score = util.dot_score(query_embedding, embeddings)[0]

    scores, indices = torch.topk(input = dot_score, k=num_resources_to_return)
    return scores, indices

def top_results(query:str,
                embeddings:torch.tensor,
                pages_and_chunks: list[dict]=pages_and_chunks,
                num_resources_to_return: int=5):
    scores, indices = retrieve (query=query,
                                embeddings=embeddings,
                                num_resources_to_return=num_resources_to_return)
    
    for score, idx in zip(scores, indices):
        print(f"Score: {score:.4f}")
        print("Text:")
        print_wrapped(pages_and_chunks[idx]["sentences_chunks"])
        print(f"Index: {idx}, Page number: {pages_and_chunks[idx]['page_number']}")
        print("\n")

In [35]:
query = "artificial intelligence"
retrieve(query = query, embeddings=embeddings) 

(tensor([0.3217, 0.3126], device='cuda:0'),
 tensor([4175, 5477], device='cuda:0'))

In [36]:
top_results(query=query, embeddings=embeddings)

Score: 0.3217
Text:
EXHIBIT B “Robots” Wireless Products A minimum of: 12D Java Game in connection
with the initial theatrical release 1 3D Java Game in connection with the DVD
release 5 Java Applications (‘Screensavers’) (2 3D Screensavers and 3 2D
Screensavers) in connection with the initial theatrical release 5 MMS 10
Wallpapers 5 Voicetones Wireless Content License Agreement Multiple Properties /
Sorrent, Inc. / Final PLZ 31 Source: GLU MOBILE INC, S1/A, 3/19/2007
Index: 4175, Page number: 31


Score: 0.3126
Text:
"ACSI Intellectual Property" means, collectively, any ACSI Existing Intellectual
Property, ACSI Future Intellectual Property and ACSI Derivative Works, but
excluding any Joint Works. ***Confidential Information has been omitted and has
been filed separately with the Securities and Exchange Commission. 1 "ACSI
Product Section" means a discrete group of products available on the ACSI Site
which is identified by a tab or other toplevel product category identifier on
the ACSI

## 9. runing an llm

In [37]:
from transformers.utils import is_flash_attn_2_available
is_flash_attn_2_available()

False

In [38]:
import torch
print(torch.cuda.get_device_capability(0))

(8, 6)


In [43]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

"""
from transformers import BitsAndBytesConfig


quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

"""

model_id = "microsoft/Phi-3-mini-4k-instruct"
#model_id = "gpt2"
#model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
#model_id = "google/gemma-7b-it"

#instatiate tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path= model_id, trust_remote_code=True)

#instatiate the model
llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch.float16,
    #quantization_config=quantization_config,
    low_cpu_mem_usage=False,
    trust_remote_code=True
)

llm_model.to("cuda")

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_features=3206

In [44]:
llm_model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_features=3206

In [45]:
input_query = "how to build a contract document ?"
print(f"input query:\n{input_query}")

dialogue = [
    {
        "role":"user",
        "content":input_query
    }
]

prompt = tokenizer.apply_chat_template(conversation=dialogue,
                                       tokenize=False,
                                       add_generation_prompt=True)
print("")
print(f"Prompt:\n{prompt}")


input query:
how to build a contract document ?

Prompt:
<|user|>
how to build a contract document ?<|end|>
<|assistant|>



In [46]:
%%time
#tokenize input
input_ids = tokenizer(prompt,
                   return_tensors='pt').to('cuda')

#geneterete output

outputs = llm_model.generate(**input_ids,
                             max_new_tokens= 256)

print(f"output tokens:\n{outputs[0]}")

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


output tokens:
tensor([32010,   920,   304,  2048,   263,  8078,  1842,  1577, 32007, 32001,
        17166,   263,  8078,  1842, 20789,  3196,  6576,   304,  9801,   393,
          278, 17327,   338,  2821, 29892, 15171,  6270, 29892,   322,  2814,
          635,  9956, 29889,  2266, 29915, 29879,   263,  2498, 10754,   304,
         1371,   366,  1653,   263,  8078,  1842, 29901,    13,    13, 29896,
        29889, 13355,  1598,   278, 13973,  9701, 29901, 17732,   368,  2106,
          278,  2983,   322,  6958,  2472,   310,   599, 13973,  9701,   297,
          278, 17327, 29889,    13,    13, 29906, 29889, 22402,   278,  4967,
         4383, 29901, 17732,   368,  8453,   278,  6437, 29892,  6874, 29892,
          322,  4967,  4383,   310,   278,  8078, 29889,   910,  1122,  3160,
         5786, 29892,  9316, 29892,   470,   916, 10788,   800, 29889,    13,
           13, 29941, 29889, 12048,  1598,   278,  4958,   322,  5855, 29901,
         4451,  1220,   278, 18853,  4958,   322,

In [47]:
#convert the output tokens into text

output_decoded = tokenizer.decode(outputs[0])
print(f"model output:\n{output_decoded}")

model output:
<|user|> how to build a contract document ?<|end|><|assistant|> Building a contract document involves several steps to ensure that the agreement is clear, comprehensive, and legally binding. Here's a general guide to help you create a contract document:

1. Identify the parties involved: Clearly state the names and contact information of all parties involved in the agreement.

2. Define the subject matter: Clearly describe the purpose, scope, and subject matter of the contract. This may include services, products, or other obligations.

3. Specify the terms and conditions: Outline the essential terms and conditions of the agreement, such as payment terms, delivery dates, warranties, and any other relevant provisions.

4. Determine the duration: Specify the start and end dates of the contract, as well as any provisions for renewal or termination.

5. Assign responsibilities: Clearly define the roles and responsibilities of each party involved in the contract.

6. Include d

## 10. augementing the prompt

there are sometechnics that we can use but for this project we decided to use:

1: give clear instructions  
2: give examples of I/O  
3: give room to think like a scratchpad "show your wokring space" or "let's think step by step"

our prompt should look like this:

based on your infomation and the following examples:
- expamle 1
- example 2
- example 3 ...

answer the following query: how to build a contract from scratch

In [48]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    context = "- " + "\n- ".join(
        [item["sentences_chunks"] for item in context_items]
    )

    base_prompt = """Based on the following context items, generate a comprehensive and well-structured contract. Ensure the contract covers all necessary elements and addresses the query effectively.
Context items:
{context}
Query:{query}
Answer:
"""
    base_prompt = base_prompt.format(context=context,
                                query=query)

    dialogue_template = [
        {"role": "user",
         "content": base_prompt}
    ]
    prompt = tokenizer.apply_chat_template(conversation = dialogue_template,
                                           tokenize = False,
                                           add_generation_prompt=True)
    return prompt

In [49]:
query = "how to build a contract"
print(f"query: {query}")

#retrieve relevant ressources
scores, indices = retrieve(
    query = query,
    embeddings = embeddings
)

#list of context item
context_items = [pages_and_chunks[i] for i in indices]

prompt = prompt_formatter(query=query,
                          context_items=context_items)

print(prompt)

query: how to build a contract
<|user|>
Based on the following context items, generate a comprehensive and well-structured contract. Ensure the contract covers all necessary elements and addresses the query effectively.
Context items:
- Name and business address of the developer; 2. Full name, address and ID card number of the person in charge; 3. Purpose and content of the development. Article IX. Duties of Party B I. Party B shall follow all instructions of Party A and ensure all plans and designs of the project to meet relevant construction laws and regulations. II. Party B shall take on a professional attitude for various services mentioned in Article III of the Contract, maintain benefits and interests of Party A all the time and adopt the most economical option under the precondition of safety and reliability. III.
- c) You must employ a qualified, licensed general contractor to construct the Franchised Business and complete all improvements. (d) Your architect or engineer must a

In [50]:
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = llm_model.generate(**input_ids,
                             temperature = 0.7,
                            do_sample=True,
                            max_new_tokens=256)

output_text = tokenizer.decode(outputs[0])

print(f"query: {query}")
print(f"RAG answer : {output_text.replace(prompt, '')}")

query: how to build a contract
RAG answer : <|user|> Based on the following context items, generate a comprehensive and well-structured contract. Ensure the contract covers all necessary elements and addresses the query effectively.
Context items:
- Name and business address of the developer; 2. Full name, address and ID card number of the person in charge; 3. Purpose and content of the development. Article IX. Duties of Party B I. Party B shall follow all instructions of Party A and ensure all plans and designs of the project to meet relevant construction laws and regulations. II. Party B shall take on a professional attitude for various services mentioned in Article III of the Contract, maintain benefits and interests of Party A all the time and adopt the most economical option under the precondition of safety and reliability. III.
- c) You must employ a qualified, licensed general contractor to construct the Franchised Business and complete all improvements. (d) Your architect or en

In [51]:
def ask_rag(query: str,
            temperature: float = 0.7,
            max_new_tokens: int= 256,
            format_answer_text = True,
            return_answer_only = True):
    
    # RETRIEVAL
    # scores and indices of top related results
    scores, indices = retrieve(query=query,
                               embeddings=embeddings)

    # create a list of context items
    context_items = [pages_and_chunks[i] for i in indices] 

    # add score to context item
    for i, item in enumerate(context_items): 
        item["score"] = scores[i].cpu()

    # AUGMENTATION
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # GENERATION
    # tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # decode the tokens into text
    output_text = tokenizer.decode(outputs[0])

    # format the answer
    if format_answer_text:
        # Replace prompt and special tokens
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "")

    # only return the answer without context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [52]:
ask_rag(query = "what are the key elements to build a contract", temperature = 0.1)

"<|user|> Based on the following context items, generate a comprehensive and well-structured contract. Ensure the contract covers all necessary elements and addresses the query effectively.\nContext items:\n- Name and business address of the developer; 2. Full name, address and ID card number of the person in charge; 3. Purpose and content of the development. Article IX. Duties of Party B I. Party B shall follow all instructions of Party A and ensure all plans and designs of the project to meet relevant construction laws and regulations. II. Party B shall take on a professional attitude for various services mentioned in Article III of the Contract, maintain benefits and interests of Party A all the time and adopt the most economical option under the precondition of safety and reliability. III.\n- 2. This Agreement is a legal, valid, and binding obligation of each Party, enforceable against it in accordance with its terms (except as may be limited by bankruptcy, insolvency, moratorium, 