In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from collections import Counter

In [None]:
# Folder that holds the raw contract .txt files.
folder_path = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\Raw\full_contract_txt"

data = []

# os.listdir() returns entries in arbitrary, platform-dependent order; sorting
# makes the row order of `df` reproducible across runs and machines.
for file in sorted(os.listdir(folder_path)):
    if file.endswith(".txt"):
        with open(os.path.join(folder_path, file), "r", encoding="utf-8") as f:
            text = f.read()
        data.append([file, text])


# One row per contract: the file name and its full raw text.
df = pd.DataFrame(data, columns=["filename", "content"])
df.head()


In [None]:
# (rows, columns) of the contract frame: one row per .txt file that was read.
df.shape

Shape of the contracts DataFrame (built from the text files): it contains 510 rows and 2 columns.

In [None]:
# Number of missing values in each column.
df.isnull().sum()


Finding Sum of the Null Values for Each Column

In [None]:
# Character count of each contract's text, stored alongside the content.
df["length"] = df["content"].str.len()
df[["filename", "length"]].head()


Lengths of Each file

In [None]:
import json
# Path to the CUAD annotation file (JSON).
json_file = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\Raw\CUAD_v1.json"


# Parse the whole annotation file into memory.
with open(json_file, "r", encoding="utf-8") as f:
    cuad_json = json.load(f)

# Each entry under "data" is counted as one labeled contract.
print("Total labeled contracts:", len(cuad_json["data"]))


"510" labeled contracts are present

In [None]:
# Load the master clause table from CSV.
df_clauses = pd.read_csv(r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\Raw\master_clauses.csv")

df_clauses.head()


In [None]:
# (rows, columns) of the master clause table.
df_clauses.shape

master clause csv file contains 510 rows and 83 columns

In [None]:
# Flatten contract -> paragraph -> QA into one list, then tally from it.
every_qa = [
    qa
    for contract in cuad_json["data"]
    for para in contract["paragraphs"]
    for qa in para["qas"]
]

total_qust = len(every_qa)
# A QA counts as answered when it is not flagged as impossible.
total_ans = sum(not qa["is_impossible"] for qa in every_qa)

print("Total QAs:", total_qust)
print("Questions with answers:", total_ans)
print("Unanswerable questions:", total_qust - total_ans)


Json file contains 20910 Questions and Answers
                    , 6702 Questions with answers  and
                    14208 Questions are without answers

In [None]:
# Count, per clause question, how many QAs actually have an answer.
# Counting every QA is uninformative: the totals reported in this notebook
# (20910 QAs over 510 contracts) indicate each question is asked of every
# contract, so every question would get the same count. Only QAs with
# is_impossible == False mean the clause was found in the contract.
clause_counter = Counter()

for contract in cuad_json["data"]:
    for para in contract["paragraphs"]:
        for qa in para["qas"]:
            if not qa["is_impossible"]:
                clause_counter[qa["question"]] += 1

clause_counter.most_common(10)


In [None]:
len(cuad_json["data"])


length of the Json file is "510"

In [None]:
from collections import Counter
# Frequency of each clause question, counting only answerable QAs.
# Counting every QA would give each question the same total, because the
# figures in this notebook (20910 QAs over 510 contracts) indicate every
# question is asked of every contract; is_impossible == False is what marks a
# clause as actually present.
clause_counter = Counter()

for contract in cuad_json["data"]:
    for para in contract["paragraphs"]:
        for qa in para["qas"]:
            if not qa["is_impossible"]:
                clause_counter[qa["question"]] += 1

# Tabulate and sort so the most frequently present clauses come first.
freq_df = pd.DataFrame(clause_counter.items(), columns=["Clause", "Count"]).sort_values("Count", ascending=False)
freq_df.head(15)


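This table is meant to show which legal clauses appear most often. Note that the JSON asks every clause question of every contract (20910 QAs / 510 contracts), so only QAs with `is_impossible == False` indicate that a clause is actually present in a contract.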

In [None]:
# Tukey fences on contract length: anything beyond 1.5 * IQR from the
# quartiles is reported as an outlier.
lengths = df["length"]

Q1, Q3 = lengths.quantile(0.25), lengths.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

unusually_short = lengths < lower_bound
unusually_long = lengths > upper_bound
outliers = df[unusually_short | unusually_long]
outliers


Q1 (25th percentile): 25% of files are shorter than this length

Q3 (75th percentile): 75% of files are shorter than this length

IQR = Q3 - Q1 (used to find the outliers, i.e. the files that are unusually small or unusually large)

* lower bound = Q1 - 1.5 * IQR and upper bound = Q3 + 1.5 * IQR

* Any files outside this range are treated as outliers

In [None]:
# Number of contracts flagged as length outliers.
len(outliers)

In [None]:
# Histogram (with KDE curve) of contract lengths, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df["length"], bins=30, kde=True, ax=ax)

ax.set_title("Distribution of Contract Lengths")
ax.set_xlabel("Length of Contract (characters)")
ax.set_ylabel("Number of Contracts")

plt.show()


The histogram above is drawn from the lengths of the contracts:

    * It shows how long the contracts are and how many contracts fall into each length range.

    * This plays a crucial role in understanding the data complexity, planning the multi-agent setup, and optimizing chunking and analysis.

    * A few very large contracts lie far from the remaining contracts; these are known as outliers. We need to handle them, e.g. by chunking.

In [None]:
# Horizontal boxplot of contract lengths. df["length"] holds character counts
# (content.str.len()), so the title and axis label say characters, not words.
plt.figure(figsize=(6,4))
sns.boxplot(x=df['length'])
plt.title("Boxplot of Contract Lengths (characters)")
plt.xlabel("Length of Contract (characters)")
plt.show()


Understanding the boxplot (drawn horizontally, so length runs along the x-axis):

* The blue box represents the middle 50% of contracts:

                -> Left edge of the box (Q1) - 25% of contracts are shorter than this value
                -> Right edge of the box (Q3) - 75% of contracts are shorter than this value
                -> Width of the box (IQR) - helps to identify whether the dataset is consistent or highly variable.

* The bold line (median) [vertical line inside the box]:

          -> the median is closer to the left (lower) part of the box
          -> many contracts are short
          -> a few very long contracts pull the average above the median

* The circles at the right are the outliers, which are far longer than the remaining data. These need attention (e.g. cleaning) before chunking.

In [None]:
pip install wordcloud


In [None]:
from collections import Counter
import re
from wordcloud import WordCloud, STOPWORDS

# Merge every contract into one lower-cased corpus string.
all_text = " ".join(df["content"].tolist()).lower() # combine all content

# Alphabetic tokens of three or more letters.
words = re.findall(r"\b[a-zA-Z]{3,}\b", all_text)

# Generic English stopwords plus a few domain words that would dominate the counts.
legal_stopwords = {"herein", "thereof", "agreement", "party", "contract"}
stopwords = STOPWORDS | legal_stopwords

filtered_words = list(filter(lambda token: token not in stopwords, words))
word_freq = Counter(filtered_words)
top_words = word_freq.most_common(20)
top_words


We import `re` (regular expressions) to extract words that match a given pattern.

* The `wordcloud` package is used here for its built-in stopword list; the cell finds the most frequent words across all contract files

* legal_stopwords are domain words that we expect to appear very often. We list them so they are removed from the final ranking.

* Stop words are very common words such as "it", "the", "are". We exclude them as well.

In [None]:
# Split the 20 most common (word, count) pairs into parallel lists for plotting.
top_words = word_freq.most_common(20)

words = [term for term, _ in top_words]
counts = [freq for _, freq in top_words]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=counts, y=words, ax=ax)
ax.set_title("Top 20 Most Frequent Legal Keywords")
ax.set_xlabel("Frequency")
ax.set_ylabel("Keyword")
plt.show()


Bar plot of the 20 most frequent words and their frequencies.

In [None]:
# Whitespace-delimited word count of each contract.
word_counts = df["content"].map(lambda body: len(body.split()))

# Characters on x, words on y: one point per contract.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=df["length"], y=word_counts, ax=ax)

ax.set_xlabel("File Size (Number of Characters)")
ax.set_ylabel("Word Count")
ax.set_title("File Size vs Word Count")
plt.show()


Understanding the scatter plot: it checks whether a contract with more characters also contains more words.

* The graph shows that the points form a tight, upward-sloping line, which means:

    -> As file size increases, word count also increases almost proportionally.

    -> Larger contracts have more words.

    -> This shows a strong linear relationship.

In [None]:
import re
import pandas as pd

cleaned_data = []

def clean_contract(text):
    """Strip page furniture (page numbers, all-caps header lines, footer lines).

    Steps, in order:
      1. remove "Page N" / "Page N of M" markers (case-insensitive, whole word);
      2. blank out lines that contain only a number;
      3. blank out lines made only of capital letters and spaces/tabs
         (at least 6 characters), e.g. "TERMINATION OF AGREEMENT";
      4. blank out lines that start with a footer keyword;
      5. collapse runs of blank lines to one paragraph break and runs of
         spaces/tabs to a single space.

    Returns the cleaned text with leading/trailing whitespace removed.
    """
    # \b keeps words that merely end in "page" (e.g. "homepage 2") intact.
    text = re.sub(r"\bPage\s+\d+(\s+of\s+\d+)?", "", text, flags=re.IGNORECASE)

    # Standalone page numbers like "3" or "6". [ \t] instead of \s so the match
    # stays on a single line (\s also matches "\n").
    text = re.sub(r"^[ \t]*\d+[ \t]*$", "", text, flags=re.MULTILINE)

    # All-caps header lines. With \s in the class the pattern could run across
    # line breaks, so a short caps line followed by blank lines was deleted
    # while the same line followed by text was kept; [ \t] makes the rule
    # depend only on the line itself.
    text = re.sub(r"^[A-Z][A-Z \t]{5,}$", "", text, flags=re.MULTILINE)

    # Lines that begin with a typical footer keyword are dropped entirely.
    footer_keywords = ["CONFIDENTIAL", "PROPRIETARY", "DRAFT", "COPYRIGHT"]
    for word in footer_keywords:
        text = re.sub(rf"^{re.escape(word)}.*$", "", text, flags=re.MULTILINE)

    text = re.sub(r"\n{2,}", "\n\n", text)   # Keep paragraph structure
    text = re.sub(r"[ \t]+", " ", text)      # Remove tabs and long spaces

    return text.strip()

# Clean every contract and pair the result with its file name.
cleaned_data = [
    [name, clean_contract(raw_text)]
    for name, raw_text in zip(df["filename"], df["content"])
]

# Cleaned counterpart of `df`, with the character length recomputed.
df_clean = pd.DataFrame(cleaned_data, columns=["filename", "content"])
df_clean["length"] = df_clean["content"].str.len()
df_clean.head()


Removing headers and footers from each contract file's content:

* removing page numbers like "Page 3" or "Page 3 of 10"

* removing standalone page numbers like 6, 7

* removing repeated (all-caps) headers

* removing footer lines that start with one of the footer_keywords

* normalizing whitespace

* maintaining paragraph structure

==> after performing all of these steps, the cleaned data is stored in a DataFrame called df_clean.

In [None]:
# Raw frame with its columns prefixed "raw_"
df_orig = df.rename(columns={"content": "raw_content", "length": "raw_length"})

# Cleaned frame with its columns prefixed "clean_"
df_clean2 = df_clean.rename(columns={"content": "clean_content", "length": "clean_length"})

# Left join on file name so every raw contract keeps its row.
clean_columns = df_clean2[["filename", "clean_content", "clean_length"]]
df_merged = pd.merge(df_orig, clean_columns, on="filename", how="left")
df_merged.head()

In [None]:
# Overlay the raw and cleaned length distributions on one Axes.
fig, ax = plt.subplots(figsize=(10, 5))
for frame, series_label in ((df, "Before Cleaning"), (df_clean, "After Cleaning")):
    ax.hist(frame["length"], alpha=0.5, label=series_label)
ax.legend()
ax.set_title("Text Length Before vs After Cleaning")
ax.set_xlabel("Length (characters)")
ax.set_ylabel("Number of Contracts")
plt.show()


Plotting a histogram of the original data's lengths against the cleaned data's lengths.

It shows the difference between the two length distributions.

In [None]:
import re

def normalize_whitespace(text):
    """Normalize spacing while keeping paragraph breaks.

    Tabs become spaces, runs of spaces shrink to one, spaces touching a line
    break are removed, and three or more consecutive line breaks collapse to
    exactly two. Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    text = text.replace("\t", " ")  # tabs -> spaces
    text = re.sub(r" {2,}", " ", text)  # multiple spaces to one space
    # Trim spaces around newlines BEFORE collapsing newline runs; otherwise
    # "blank" lines that contain a space ("\n \n \n") hide the run from the
    # next pattern and three or more line breaks survive.
    text = re.sub(r" *\n *", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)  # 3+ line breaks to 2 line breaks
    return text.strip()  # strip global whitespace

# Normalize whitespace in every cleaned contract, then refresh the length column
# so it reflects the updated text.
df_clean["content"] = df_clean["content"].apply(normalize_whitespace)
df_clean["length"] = df_clean["content"].str.len()

df_clean.head()


In [None]:
import re
# Work on a string-typed view of the column so the regex replacements are safe.
content = df_clean["content"].astype(str)

# Collapse runs of horizontal whitespace (anything in \s except "\n") to one space.
# Newlines are deliberately kept: collapsing them (as r"\s+" would) flattens each
# contract into one line and defeats the per-line case normalization, the
# hyphenated line-break repair and the "\n\n" / "\n" chunk separators used later.
content = content.str.replace(r"[^\S\n]+", " ", regex=True)

# Collapse two or more line breaks (optionally separated by a space) into a single
# paragraph break. The pattern uses a real newline; r"\\n" would only match the
# two-character sequence backslash + "n".
content = content.str.replace(r"(?: ?\n){2,} ?", "\n\n", regex=True)

# Replace every run of non-ASCII characters with a space.
content = content.str.replace(r"[^\x00-\x7F]+", " ", regex=True)

df_clean["content"] = content
df_clean["length"] = df_clean["content"].str.len()
df_clean[["filename","content" ,"length"]].head()


In [None]:
import re

def clean_noise(text):
    """Remove formatting noise (tabs, bullet glyphs, non-ASCII) from contract text.

    Hyphens inside or attached to a word ("non-exclusive", "termi- nation") are
    kept; only a hyphen run standing on its own (a dash bullet or spaced dash)
    is removed. Line breaks are preserved. Returns the stripped text.
    """
    # Remove tabs
    text = re.sub(r'\t+', ' ', text)

    # Bullet symbols and en/em dashes, written as escapes so the pattern does
    # not depend on how this file is encoded:
    # bullet, small square, circles, squares, stars, en dash, em dash.
    text = re.sub(r'[\u2022\u25aa\u25cf\u25a0\u25cb\u2605\u2606\u2013\u2014]+', ' ', text)

    # Standalone ASCII hyphens only. Stripping every "-" would split hyphenated
    # terms and erase the "word- word" breaks a later hyphenation repair needs.
    text = re.sub(r'(?<!\S)-+(?!\S)', ' ', text)

    # Remove non-ASCII characters
    text = text.encode("ascii", "ignore").decode()

    # Collapse runs of horizontal whitespace; "\n" is excluded so paragraph
    # breaks ("\n\n") are not turned into a single space.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)

    return text.strip()

# Apply cleaning directly to df_clean and keep the length column in sync
df_clean["content"] = df_clean["content"].apply(clean_noise)
df_clean["length"] = df_clean["content"].str.len()

print("Noise removal completed successfully!")

# Preview the frame that was just cleaned; the raw `df` is not modified by this
# step, so showing it would not reflect the noise removal.
df_clean.head()

1. To remove noise that is not part of the legal meaning
Like tabs, weird symbols, bullets (•, ▪), page numbers, OCR garbage — these do not help analysis.

2. To make the text uniform for AI processing
Fixed spacing, proper sentence flow, and normalized formatting help multi-agent models correctly identify clauses.

3. To improve accuracy in clause detection and embeddings
Cleaner text → better vector similarity + fewer errors when extracting important contract obligations/risks.

In [None]:
import re
# A hyphen followed by a line break: the signature of a word split across lines.
hyphen_linebreak = r"-\s*\n"

# Count on df_clean BEFORE repairing it. Counting on the raw `df` would compare
# two different texts, so the before/after numbers would not measure this step.
before = df_clean["content"].str.contains(hyphen_linebreak).sum()

df_clean["content"] = (
    df_clean["content"]
    .str.replace(r"-\s*\n\s*", "", regex=True)          # termi-\nnation -> termination
    .str.replace(r"(\w)-\s+(\w)", r"\1\2", regex=True)  # termi- nation  -> termination
)

after = df_clean["content"].str.contains(hyphen_linebreak).sum()

print("Hyphenation issues before:", before)
print("Hyphenation issues after:", after)


Hyphenation happens when a long word is split across two lines using a hyphen (-), usually in scanned or PDF-extracted text. For example, the word "termination" might appear split like "termi- nation" or across a line break like "termi-\nnation". This is a formatting artifact of the original document layout. When we process the text for AI or keyword matching, these broken words can cause errors — the model will treat them as two different words. So we fix hyphenation by joining these split pieces back into complete words before sending the data to our agents or NLP pipeline.

In [None]:

# Lower-cased copy of the text in its own column; "content" itself is left as is.
df_clean["clean_lower"] = df_clean["content"].str.lower()



We lower-case the text so that:
* comparisons become easier

* repeated terms like "Liability" vs "liability" are treated the same

* we do not lose the original content → useful for UI display later

In [None]:
import re

def smart_case_normalization(text):
    """Lower-case body lines but leave short upper-case lines (headers) untouched.

    Every line is stripped of surrounding whitespace. A stripped line shorter
    than 60 characters for which str.isupper() is true is kept verbatim; any
    other line is lower-cased. Lines are re-joined with "\\n".
    """
    def _normalize_line(raw_line):
        candidate = raw_line.strip()
        looks_like_header = candidate.isupper() and len(candidate) < 60
        return candidate if looks_like_header else candidate.lower()

    return "\n".join(map(_normalize_line, text.splitlines()))

# Apply to cleaned DataFrame: overwrites "content" with the case-normalized text
# (header-like lines keep their case, everything else is lower-cased).
df_clean["content"] = df_clean["content"].apply(smart_case_normalization)


In [None]:
import os

# Destination folder for the cleaned contract files (created when missing)
output_path = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\Transformed\full_contract_txt_cleaned"
os.makedirs(output_path, exist_ok=True)

# Write each cleaned contract to its own .txt file, reusing the original file name
for name, body in zip(df_clean["filename"], df_clean["content"]):
    cleaned_file_path = os.path.join(output_path, name)
    with open(cleaned_file_path, "w", encoding="utf-8") as f:
        f.write(body)

print("All cleaned contracts saved successfully in:", output_path)


In [None]:
pip install langchain_text_splitters

In [None]:
# <------->
# CHUNKING
# <----->


import os
import json
import pandas as pd
import matplotlib.pyplot as plt
from langchain_text_splitters import RecursiveCharacterTextSplitter



In [None]:
import os
import json
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Folder paths: cleaned contracts are read from the first, chunk JSON files are
# written to the second (created if it does not exist).
cleaned_folder_path = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\Transformed\full_contract_txt_cleaned"
chunk_dir = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\chunks"
os.makedirs(chunk_dir, exist_ok=True)

# Text Splitter Settings
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) -- confirm against the langchain documentation.
chunk_size = 1000
chunk_overlap = 200
# Separators are listed from coarsest to finest: paragraph break, line break,
# sentence end, single space.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", ". ", " "]
)

def chunk_text(text):
    """Split `text` into chunks with the module-level `splitter` and return them."""
    pieces = splitter.split_text(text)
    return pieces

# Chunk every cleaned contract and write one JSON file per contract.
# Sorted so the processing order is reproducible.
for file in sorted(os.listdir(cleaned_folder_path)):
    if not file.endswith(".txt"):
        continue

    file_path = os.path.join(cleaned_folder_path, file)

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Nothing to chunk in an empty / whitespace-only file.
    if not content.strip():
        continue

    chunks = chunk_text(content)

    output_data = {
        "filename": file,
        "total_chunks": len(chunks),
        "chunks": [
            {"chunk_id": idx, "chunk_text": chunk, "chunk_length": len(chunk)}
            for idx, chunk in enumerate(chunks)
        ],
    }

    # Name the output "<name>_chunks.json" (extension dropped), the same scheme
    # the overlap-verification cell uses. Writing "<name>.txt_chunks.json" here
    # left two JSON files per contract in the folder, so every later step that
    # scans the folder processed each contract twice.
    # NOTE: files named "<name>.txt_chunks.json" from earlier runs are not
    # removed by this cell and should be deleted manually.
    stem = os.path.splitext(file)[0]
    out_path = os.path.join(chunk_dir, f"{stem}_chunks.json")
    with open(out_path, "w", encoding="utf-8") as jf:
        json.dump(output_data, jf, indent=4)

print("Chunking completed! Check Data/chunks folder ‚úî")


The splitter tries to split at paragraphs first, then newlines, then sentences (". "), then spaces.

Each chunk is about 1000 characters, and each new chunk overlaps the previous one by up to 200 characters, so context isn't lost.

In this step we convert each full cleaned contract into smaller, overlapping, meaning-preserving text chunks — which are later used by the AI agents for clause analysis.

Large contracts (50–300 pages) cannot be processed at once by LLMs because of token limits.

In [None]:
import json
import os

# Path to your chunks folder
chunks_folder = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\chunks"

# Pick one sample file: the first chunk JSON in sorted order. os.listdir() gives
# no ordering guarantee and may return non-JSON entries, so filter and sort.
chunk_json_files = sorted(
    name for name in os.listdir(chunks_folder) if name.endswith(".json")
)
if not chunk_json_files:
    raise FileNotFoundError(f"No chunk JSON files found in {chunks_folder}")
sample_file = chunk_json_files[0]
sample_path = os.path.join(chunks_folder, sample_file)

with open(sample_path, "r", encoding="utf-8") as f:
    sample_json = json.load(f)

print("üìå Previewing Chunk File:", sample_file)
print("Contract Filename:", sample_json["filename"])
print("Total Chunks:", sample_json["total_chunks"])
print("=" * 80)

# Show at most the first two chunks
for chunk in sample_json["chunks"][:2]:
    print(f"--- Chunk {chunk['chunk_id']} ---")
    print(chunk["chunk_text"][:350], "...")  # show only first 350 chars
    print("Chunk Length:", chunk["chunk_length"])
    print("-" * 80)


Previewing one chunk JSON file.

Displaying the content (first 350 characters) and length of the first two chunks in that JSON file.

In [None]:
import os
import json
import matplotlib.pyplot as plt

chunks_folder = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\chunks"  # update if your path differs
chunk_lengths = []

# Read all chunk JSON files and collect the stored length of every chunk.
for file in sorted(os.listdir(chunks_folder)):
    if not file.endswith(".json"):
        continue
    file_path = os.path.join(chunks_folder, file)
    with open(file_path, "r", encoding="utf-8") as f:
        # Named `chunk_file` so the notebook-level `data` variable is not clobbered.
        chunk_file = json.load(f)
    chunk_lengths.extend(chunk["chunk_length"] for chunk in chunk_file["chunks"])

# Plot histogram
plt.figure(figsize=(10,5))
plt.hist(chunk_lengths, bins=20, color="steelblue", edgecolor="black")
plt.title("Distribution of Chunk Lengths")
plt.xlabel("Chunk Length (characters)")
plt.ylabel("Number of Chunks")
plt.grid(axis='y', linestyle="--", alpha=0.5)
plt.show()

print(f"Total Chunks Analyzed: {len(chunk_lengths)}")
# Guard the average: with no chunk files the division would raise ZeroDivisionError.
if chunk_lengths:
    print(f"Average Chunk Length: {sum(chunk_lengths) // len(chunk_lengths)} characters")
else:
    print("Average Chunk Length: n/a (no chunks found)")


* Observations from the histogram:

Most of the chunks fall between 900–1000 characters = Chunking is working as expected — chunks are close to the ~1000-character target

A few chunks are much smaller (100–500 chars) = These are probably end portions of contracts, e.g., signature blocks, exhibits

Very tiny chunks (0–100 chars) exist = Indicates leftover noise or very short paragraphs — may need refining

Distribution shape is left-skewed (most of the mass near 1000, with a tail toward short chunks) = Most text is continuous enough to produce full-sized chunks

* This is good because:

-> Large consistent chunks mean efficient embedding & retrieval

-> Small number of small chunks means very little fragmentation

-> More accurate context preservation in RAG / LLM reasoning


In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Paths
clean_folder = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\Transformed\full_contract_txt_cleaned"
chunk_folder = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\chunks"
os.makedirs(chunk_folder, exist_ok=True)

# Chunk Configuration
chunk_size = 1000
chunk_overlap = 200

# Initialize the splitter
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", ". ", " "]
)


def measure_overlap(prev_chunk, chunk, max_overlap):
    """Return the length of the longest suffix of `prev_chunk` that is also a
    prefix of `chunk`, looking at no more than `max_overlap` characters.

    This is the number of characters the two chunks actually share at their
    boundary. Intersecting the *sets* of characters in the two windows instead
    only counts distinct shared characters, which is bounded by the alphabet
    size and says nothing about overlap length.
    NOTE(review): this is an exact-match measure; if the splitter trims
    whitespace at chunk edges the value may slightly under-report -- confirm
    against the splitter documentation.
    """
    limit = min(max_overlap, len(prev_chunk), len(chunk))
    for size in range(limit, 0, -1):
        if prev_chunk.endswith(chunk[:size]):
            return size
    return 0


overlap_store = []  # measured overlap (characters) for every chunk boundary

# Processing each cleaned contract
for file in os.listdir(clean_folder):
    if not file.endswith(".txt"):
        continue

    filepath = os.path.join(clean_folder, file)
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    # Skip empty files
    if not content.strip():
        continue

    chunks = splitter.split_text(content)

    contract_data = {
        "filename": file,
        "total_chunks": len(chunks),
        "chunks": []
    }

    for i, chunk in enumerate(chunks):
        contract_data["chunks"].append({
            "chunk_id": i,
            "chunk_text": chunk,
            "chunk_length": len(chunk)
        })

        # Measure the overlap with the previous chunk
        if i > 0:
            overlap_store.append(measure_overlap(chunks[i - 1], chunk, chunk_overlap))

    # Save each contract's chunk JSON. A dedicated name is used so the
    # `output_path` folder variable of the save-cleaned-files cell is not overwritten.
    chunk_json_path = os.path.join(chunk_folder, file.replace(".txt", "_chunks.json"))
    with open(chunk_json_path, "w", encoding="utf-8") as f:
        json.dump(contract_data, f, indent=4)

# ------------------------ Visualization ------------------------
plt.figure(figsize=(10,5))
plt.scatter(range(len(overlap_store)), overlap_store, alpha=0.6)
plt.axhline(chunk_overlap, color="red", linestyle="dashed", label="Target Overlap")
plt.title("Overlap Verification Scatter Plot")
plt.xlabel("Chunk Index")
plt.ylabel("Overlap Characters")
plt.legend()
plt.show()


Why overlap matters — if we don't use overlap:

* Sentences will get cut

* A clause might split into 2 chunks

* Meaning can be lost

* LLM agents may miss key obligations

Observations:

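* We see values of ~10 to 40, not near 200. Caution: these values were produced by counting the distinct characters shared by the two boundary windows (a set intersection), which can never exceed the size of the alphabet; they are not overlap lengths. The true overlap is the length of the longest suffix of one chunk that is also a prefix of the next.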

* Overlap isn't exactly 200  = 	But enough context is preserved
* Most overlaps are 20‚Äì40 characters = 	Helps maintain clause continuity
* Strong consistency in overlap values = 	Chunking is stable ‚Äî good for LLM multi-analysis
* Larger files show more chunks	= Confirms successful splitting
* No extreme spikes = Section boundaries handled cleanly

* It confirms that our  chunking  works correctly.

******************************EMBEDDINGS*****************

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model used for every chunk in this notebook.
model = SentenceTransformer("all-MiniLM-L6-v2")

print("‚úÖ Sentence Transformer loaded")
# Report the vector size the model produces.
print("Embedding dimension:", model.get_sentence_embedding_dimension())


In [None]:
def get_embedding(text: str):
    """Encode `text` with the module-level `model` (normalization requested)
    and return the vector as a plain Python list."""
    return model.encode(text, normalize_embeddings=True).tolist()

In [None]:
import os
import json

chunk_folder = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\chunks"
embedding_folder = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\embeddings"
os.makedirs(embedding_folder, exist_ok=True)

chunk_files = os.listdir(chunk_folder)

# One embeddings JSON per chunk JSON; anything that is not a .json file is ignored.
for file in chunk_files:
    if file.endswith(".json"):
        with open(os.path.join(chunk_folder, file), "r", encoding="utf-8") as f:
            chunk_data = json.load(f)

        # Each record keeps the chunk id, its vector and the text it was built from.
        embedding_output = {
            "filename": chunk_data["filename"],
            "total_chunks": chunk_data["total_chunks"],
            "embeddings": [
                {
                    "chunk_id": chunk["chunk_id"],
                    "embedding": get_embedding(chunk["chunk_text"]),
                    "text": chunk["chunk_text"],
                }
                for chunk in chunk_data["chunks"]
            ],
        }

        out_file = file.replace("_chunks.json", "_embeddings.json")
        out_path = os.path.join(embedding_folder, out_file)

        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(embedding_output, f, indent=2)

        print(f"‚úÖ Saved embeddings ‚Üí {out_file}")


Generating embeddings of all chunks with Sentence Transformers, using the "all-MiniLM-L6-v2" model.

In [None]:
import json
import os

embedding_folder = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\embeddings"

# Pick one embedding file: the first "*_embeddings.json" in sorted order.
# os.listdir() gives no ordering guarantee and may return unrelated entries.
embedding_json_files = sorted(
    name for name in os.listdir(embedding_folder) if name.endswith("_embeddings.json")
)
if not embedding_json_files:
    raise FileNotFoundError(f"No embedding JSON files found in {embedding_folder}")
sample_file = embedding_json_files[0]
sample_path = os.path.join(embedding_folder, sample_file)

with open(sample_path, "r", encoding="utf-8") as f:
    emb_json = json.load(f)

print("üìå Embedding File:", sample_file)
print("Contract:", emb_json["filename"])
print("Total Chunks:", emb_json["total_chunks"])

# Inspect the first stored vector of the sample file.
first_embedding = emb_json["embeddings"][0]
print("Sample Chunk ID:", first_embedding["chunk_id"])
print("Embedding Vector Length:", len(first_embedding["embedding"]))
print("First 5 values:", first_embedding["embedding"][:5])


Previewing the first embedding file: displaying the file name, contract name, total chunks, a sample chunk id, the embedding vector length, and the first 5 values.


In [None]:
# Dimensionality of every stored vector in the sample file; a single unique
# value means all vectors have the same size.
vector_lengths = [len(entry["embedding"]) for entry in emb_json["embeddings"]]

print("Unique vector lengths:", set(vector_lengths))


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# L2 norm of each stored embedding in the sample file.
norms = [
    np.linalg.norm(np.array(entry["embedding"]))
    for entry in emb_json["embeddings"]
]

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(norms, bins=30)
ax.set_title("Embedding Vector Norm Distribution")
ax.set_xlabel("Vector Norm")
ax.set_ylabel("Number of Chunks")
plt.show()


X-axis → Vector Norm

Shows the magnitude (L2 norm) of each embedding

Values clustered within a narrow range

Y-axis → Number of Chunks

How many chunks fall into each norm range

What we see:

Most embeddings are clustered tightly

No extreme spikes or flat zeros

No exploding values

✅ This is good and expected


Embeddings are stable

 -> No broken or empty vectors

 -> No numerical overflow

Chunks are being embedded consistently

 -> The embeddings are generated with `normalize_embeddings=True`, so every vector should have a norm of approximately 1 regardless of the text size

Well suited for cosine similarity

 -> Cosine similarity divides by the vector norms, so it does not itself require normalized vectors

 -> Because these vectors are unit-length, the dot product and the cosine similarity give the same value

Sentence-Transformer is behaving correctly

 -> all-MiniLM-L6-v2 is producing compact, consistently scaled embeddings

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# First two stored embeddings of the sample file, reshaped to (1, n) rows because
# cosine_similarity is called here with 2-D inputs.
vec1 = np.array(emb_json["embeddings"][0]["embedding"]).reshape(1, -1)
vec2 = np.array(emb_json["embeddings"][1]["embedding"]).reshape(1, -1)

# The result is a 1x1 matrix; [0][0] extracts the single similarity value.
cos_sim = cosine_similarity(vec1, vec2)[0][0]
print("Cosine Similarity:", round(cos_sim, 4))


In [None]:
# Dot product of the same two vectors (vec1 and vec2 come from the cosine cell);
# (1, n) . (n, 1) yields a 1x1 matrix, hence the [0][0].
dot_sim = np.dot(vec1, vec2.T)[0][0]
print("Dot Product Similarity:", round(dot_sim, 4))


************************************Pinecone****************************

In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec


In [None]:
# Load embedding model (same model name as used for the chunk embeddings)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Sanity check: encode a short phrase and report the length of the resulting vector
test_vec = model.encode("termination clause")
print("Embedding dimension:", len(test_vec))


In [None]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone

# Load variables from .env
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Fail here with a clear message instead of a confusing client error later
# (same check as the upsert cell).
if not PINECONE_API_KEY:
    raise ValueError("Missing PINECONE_API_KEY")
INDEX_NAME = "cuad-index"

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
import os
import json

# Folder that holds the per-contract embedding JSON files
embedding_folder = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\embeddings"

vectors = []  # (id, values, metadata) tuples collected for upsert

for file in os.listdir(embedding_folder):
    if file.endswith("_embeddings.json"):
        file_path = os.path.join(embedding_folder, file)

        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        filename = data["filename"]

        # The vector id combines the contract file name with the chunk id.
        vectors.extend(
            (
                f"{filename}_chunk_{emb['chunk_id']}",
                emb["embedding"],
                {
                    "filename": filename,
                    "chunk_id": emb["chunk_id"],
                    "text": emb["text"],
                },
            )
            for emb in data["embeddings"]
        )

print(f"‚úÖ Step 4 Complete: Prepared {len(vectors)} vectors for upsert")


In [None]:
import os
import json
from pinecone import Pinecone
from dotenv import load_dotenv

load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise ValueError("Missing PINECONE_API_KEY")

pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("cuad-index")

embedding_folder = r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\embeddings"

# Upload cap: only this many embedding files are upserted; set to None for all.
# NOTE(review): presumably 20 keeps the upload small -- confirm the intended scope.
MAX_FILES = 20

# Filter first, then sort, then cap. Slicing the raw os.listdir() result first
# picks an arbitrary subset and lets non-embedding entries use up the quota.
embedding_files = sorted(
    name for name in os.listdir(embedding_folder) if name.endswith("_embeddings.json")
)
if MAX_FILES is not None:
    embedding_files = embedding_files[:MAX_FILES]

vectors_to_upsert = []

for file in embedding_files:
    with open(os.path.join(embedding_folder, file), "r", encoding="utf-8") as f:
        data = json.load(f)

    filename = data["filename"]

    # The vector id combines the contract file name with the chunk id.
    for item in data["embeddings"]:
        vectors_to_upsert.append((
            f"{filename}_chunk_{item['chunk_id']}",
            item["embedding"],
            {
                "filename": filename,
                "chunk_id": item["chunk_id"],
                "text": item["text"]
            }
        ))

# Batch upsert
BATCH_SIZE = 100
for i in range(0, len(vectors_to_upsert), BATCH_SIZE):
    index.upsert(vectors=vectors_to_upsert[i:i + BATCH_SIZE])

print(f"‚úÖ Upserted {len(vectors_to_upsert)} vectors into Pinecone")


The vectors of the first 20 embedding files are upserted into Pinecone (not the whole dataset), using the Pinecone API key and the "cuad-index" index.

About 912 vectors are upserted into Pinecone.

In [None]:
from sentence_transformers import SentenceTransformer
# Same model name as used for the stored chunk embeddings.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Free-text query to search the index with.
query_text = "I am Something else who you dont know"

query_vector = model.encode(query_text).tolist()  # query embedding as a plain list

# Query Pinecone: ask for the 5 nearest vectors and include their stored metadata
results = index.query(
    vector=query_vector,
    top_k=5,
    include_metadata=True
)

print("üîç Semantic Search Results Retrieved:", len(results["matches"]))


In [None]:
print("\nüîé Top 5 Retrieved Chunks:\n")

# Print one block per match (numbered from 1): score, source file, chunk id and
# the first 300 characters of the stored chunk text.
for i, match in enumerate(results["matches"], start=1):
    print(f"Result {i}")
    print("Score:", round(match["score"], 4))
    print("Filename:", match["metadata"]["filename"])
    print("Chunk ID:", match["metadata"]["chunk_id"])
    print("Text Preview:")
    print(match["metadata"]["text"][:300], "...")
    print("-" * 70)


Score → similarity (closer to 1 = more relevant)

Filename → which contract

Chunk ID → exact chunk inside that contract

Text preview → lets us check the relevance of the retrieved clause

These are the 4 things displayed for each result retrieved for the query sentence.

In [None]:
import matplotlib.pyplot as plt

# Similarity score of every retrieved match.
scores = [hit["score"] for hit in results["matches"]]

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(scores, bins=5, edgecolor="black")
ax.set_title("Similarity Score Distribution (Top-K Results)")
ax.set_xlabel("Cosine Similarity Score")
ax.set_ylabel("Number of Chunks")
plt.show()


In the plot above:

X-axis = each value represents how semantically close a retrieved chunk is to our query (the range is roughly 0.114 to 0.130).

Y-axis = how many chunks fall into each similarity range (3 chunks have a similarity of about 0.115, 1 chunk about 0.122, and 1 chunk about 0.129).





This histogram shows how similar the top-K retrieved contract chunks are to your query using cosine similarity. The clustering of scores indicates that Pinecone successfully retrieved semantically related legal clauses, with higher scores representing more relevant chunks. Although the absolute similarity values are small, the ranking order is what matters for retrieval quality. Cosine similarity is used instead of dot product because it compares semantic meaning independent of text length, making it ideal for chunk-based legal document search in ClauseAI.

We use cosine similarity here instead of the dot product because, for this project:

    -> Contracts vary hugely in length

    -> Clauses may be short or long

    -> Meaning matters more than size

* Cosine similarity ensures:

    -> A short termination clause is not unfairly penalized

    -> A long clause doesn't dominate just because it's long

    -> Retrieval stays semantically accurate

In [None]:
pip install pinecone

*********************************RAG****************************************

In [None]:
# ===== CELL 1: Pinecone Setup (NO .env) =====
from pinecone import Pinecone

import os

# Settings are read from the process environment (no .env file needed) so that
# no credential is ever stored in the notebook itself.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")   # example: "us-east-1"; optional
# Falls back to the index name used by the upsert cells of this notebook.
PINECONE_INDEX = os.environ.get("PINECONE_INDEX", "cuad-index")

if not PINECONE_API_KEY:
    raise ValueError("Missing PINECONE_API_KEY")

# Pass `environment` only when it is configured; the other cells of this
# notebook create the client from the API key alone.
client_kwargs = {"api_key": PINECONE_API_KEY}
if PINECONE_ENV:
    client_kwargs["environment"] = PINECONE_ENV
pc = Pinecone(**client_kwargs)

index = pc.Index(PINECONE_INDEX)

print("index object:", index)
print("index exists in globals:", "index" in globals())


In [None]:
# ===== CELL 2: RAG Functions =====
from sentence_transformers import SentenceTransformer
from typing import List, Dict

# Same model name as used for the stored chunk embeddings.
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_query(query: str):
    """Embed a query string and return the vector as a plain Python list.

    Normalization is requested explicitly so queries are encoded exactly like
    the stored chunks (get_embedding also passes normalize_embeddings=True);
    scores then stay comparable even if the model is swapped for one that does
    not normalize its output.
    """
    return model.encode(query, normalize_embeddings=True).tolist()

def rag_search(query: str, index, top_k: int = 10):
    """
    Retrieve the chunks most similar to ``query`` from a vector index.

    Args:
        query: Free-text search query; it is embedded with ``embed_query``.
        index: Object exposing ``query(vector=..., top_k=..., include_metadata=...)``
            whose response has a ``matches`` attribute.
        top_k: Maximum number of matches to request.

    Returns:
        A list of dicts with the keys ``score``, ``filename``, ``chunk_id`` and
        ``text`` (one per match, in the order the index returned them), or an
        empty list when nothing matched. Metadata fields that are absent on a
        match are returned as ``None``.
    """
    response = index.query(
        vector=embed_query(query),
        top_k=top_k,
        include_metadata=True
    )

    results = []
    for match in response.matches or []:
        # A match can come back without any metadata; fall back to an empty
        # mapping so the lookups below yield None instead of raising.
        metadata = match.metadata or {}
        results.append({
            "score": float(match.score),
            "filename": metadata.get("filename"),
            "chunk_id": metadata.get("chunk_id"),
            "text": metadata.get("text")
        })

    return results


The RAG search function is the core retrieval logic of your system.

It does 4 things in order:

Takes a user query (plain English text)

Converts the query into an embedding (same model used for documents)

Searches Pinecone to find the most similar vectors (chunks)

Returns structured results (score + text + metadata)

This function does NOT generate answers yet.
It only retrieves relevant contract chunks that will later be fed to agents or LLMs.

In [None]:
def pretty_print_results(results, preview_chars=300):
    """
    Display retrieved RAG results in a clean, readable format.

    Args:
        results: List of result dicts with the keys ``score``, ``filename``,
            ``chunk_id`` and ``text`` (the shape produced by ``rag_search``).
        preview_chars: Maximum number of characters of each chunk to print.

    Returns:
        None. Everything is written to standard output.
    """

    if not results:
        print("‚ö†Ô∏è No results to display.")
        return

    print("\nüîç RAG Search Results")
    print("=" * 80)

    for i, r in enumerate(results, start=1):
        # "text" is filled from optional metadata and may be None; treat that
        # as an empty preview instead of failing on slicing / len().
        text = r["text"] or ""

        print(f"\nResult {i}")
        print("-" * 80)
        print(f"Similarity Score : {round(r['score'], 4)}")
        print(f"Filename         : {r['filename']}")
        print(f"Chunk ID         : {r['chunk_id']}")
        print("\nText Preview:")
        print(text[:preview_chars])
        # An ellipsis marks a truncated preview; otherwise a blank line is printed.
        print("..." if len(text) > preview_chars else "")


query = "termination clause notice period governing law jurisdiction"

results = rag_search(query, index, top_k=5)

pretty_print_results(results)


In [None]:
# ===== CELL 3: Run RAG =====
query = "termination clause notice period governing law jurisdiction"

print("index visible here:", index)

results = rag_search(query, index, top_k=5)

print("Results count:", len(results))


In [None]:
import json
import os
from datetime import datetime

def save_rag_results(query, results, output_dir=r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\rag_results"):
    """
    Persist one RAG search (query plus retrieved results) as a JSON file.

    Args:
        query: The query string that was searched.
        results: List of result records; must be JSON-serialisable.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        Path of the written file, named ``rag_results_<YYYYmmdd_HHMMSS>.json``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # The second-resolution timestamp keeps successive runs in separate files.
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    output_path = os.path.join(output_dir, "rag_results_" + stamp + ".json")

    payload = {}
    payload["query"] = query
    payload["num_results"] = len(results)
    payload["results"] = results

    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=4)

    print(f"‚úÖ RAG results saved successfully at:\n{output_path}")
    return output_path


In [None]:
query = """
confidentiality obligations, data security measures,
comply with laws and regulations, statutory requirements,
audit and inspection, internal controls,
record keeping, reporting to authorities,
information governance
"""




# Run RAG search (already working for you)
results = rag_search(
    query=query,
    index=index,
    top_k=5
)

# Save results
output_path = save_rag_results(
    query=query,
    results=results
)

output_path


*******************AGENT Framework Setup***************

In [None]:
import os

from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable instead of
# pasting it into the notebook. When the variable is unset, token=None is
# passed and huggingface_hub asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

MODEL_NAME = "google/gemma-2b-it"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

gemma_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32
)

print("‚úÖ Gemma model loaded successfully")


In [None]:
AGENT_OUTPUT_SCHEMA = {
    "clause_type": "",
    "extracted_clauses": [],
    "risk_level": "unknown",
    "confidence": 0.0,
    "evidence": []
}

AGENT_OUTPUT_SCHEMA


In [None]:
class BaseAgent:
    """
    Minimal prompt-in / text-out agent around a causal language model.

    The agent stores a system prompt, a model and its tokenizer; ``run`` wraps
    the given contract text in the prompt, generates greedily and returns only
    the newly generated text.
    """

    def __init__(self, agent_name: str, system_prompt: str, model, tokenizer):
        """
        Args:
            agent_name: Label identifying the agent.
            system_prompt: Instruction text placed at the top of every prompt.
            model: Object exposing ``generate(...)``.
            tokenizer: Callable that tokenizes a prompt and exposes
                ``eos_token_id`` and ``decode(...)``.
        """
        self.agent_name = agent_name
        self.system_prompt = system_prompt
        self.model = model
        self.tokenizer = tokenizer

    def run(self, context_text: str, max_tokens: int = 300) -> str:
        """
        Run agent reasoning on provided contract context.

        Args:
            context_text: Contract text inserted under ``CONTRACT TEXT:``.
            max_tokens: Upper bound on newly generated tokens.

        Returns:
            The decoded continuation (prompt tokens removed, special tokens
            skipped). The text is returned as-is and is not parsed here.
        """

        prompt = f"""
{self.system_prompt}

CONTRACT TEXT:
{context_text}

Return ONLY valid JSON.
"""

        # Tokenize input
        # NOTE(review): truncation=True is given without an explicit max_length,
        # so the effective limit depends on the tokenizer's own configuration — confirm.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True
        )

        # Generate response. Decoding is greedy (do_sample=False); no
        # temperature is passed because it is a sampling-only setting and has
        # no effect on greedy decoding.
        outputs = self.model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            do_sample=False,
            eos_token_id=self.tokenizer.eos_token_id
        )

        # Remove prompt echo: the output sequence starts with the input tokens,
        # so everything up to the input length is sliced off.
        input_length = inputs["input_ids"].shape[1]
        generated_tokens = outputs[0][input_length:]

        response = self.tokenizer.decode(
            generated_tokens,
            skip_special_tokens=True
        )

        return response


In [None]:
LEGAL_AGENT_PROMPT = """
You are a legal contract analysis engine.

TASK:
Extract ONLY termination-related clauses.

RULES:
- Use ONLY the contract text
- Copy clauses VERBATIM
- Do NOT paraphrase
- Evidence must match extracted_clauses exactly
- If none found, return empty arrays

OUTPUT JSON:
{
  "extracted_clauses": [],
  "risk_level": "low | medium | high | unknown",
  "confidence": 0.0,
  "evidence": []
}
"""


* The BaseAgent was adapted for Gemma-2B-IT by using a single prompt-based generation flow with deterministic decoding, making it suitable for          structured legal analysis on CPU.

* The BaseAgent constructs a prompt, tokenizes it, generates a deterministic response using Gemma with controlled stopping and padding awareness, and returns structured text suitable for validation.

In [None]:
import json
import re

def extract_json_from_text(text: str) -> dict:
    """
    Parse the first JSON object embedded in ``text``.

    Decoding starts at the first ``{`` and stops at the end of that JSON value,
    so explanations or stray braces after the object no longer break parsing
    (a greedy first-``{``-to-last-``}`` match would include them).

    Args:
        text: Raw text that is expected to contain a JSON object.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If ``text`` contains no ``{`` at all.
        json.JSONDecodeError: (a ``ValueError`` subclass) if the text starting
            at the first ``{`` is not valid JSON.
    """
    start = text.find("{")
    if start == -1:
        raise ValueError("‚ùå No JSON found")

    # raw_decode returns (value, end_index) and ignores whatever follows the value.
    parsed, _end = json.JSONDecoder().raw_decode(text, start)
    return parsed


In [None]:
def validate_agent_output(raw_output: str, clause_type: str) -> dict:
    """
    Parse raw agent text and coerce it into the fixed agent output schema.

    Every field starts from a safe default and is overwritten only when the
    parsed value passes its check, so malformed fields never reach callers.

    Args:
        raw_output: Raw text returned by an agent; parsed with
            ``extract_json_from_text``.
        clause_type: Label stored unchanged under ``"clause_type"``.

    Returns:
        A dict with the keys ``clause_type``, ``extracted_clauses`` (list of
        stripped strings), ``risk_level`` (one of low / medium / high /
        unknown), ``confidence`` (float clamped to [0.0, 1.0]) and
        ``evidence`` (list of stripped strings).

    Raises:
        ValueError: Propagated from ``extract_json_from_text`` when no JSON
            can be parsed out of ``raw_output``.
    """
    import math

    parsed = extract_json_from_text(raw_output)

    validated = {
        "clause_type": clause_type,
        "extracted_clauses": [],
        "risk_level": "unknown",
        "confidence": 0.0,
        "evidence": []
    }

    # List fields: keep only string items, with surrounding whitespace removed.
    if isinstance(parsed.get("extracted_clauses"), list):
        validated["extracted_clauses"] = [
            c.strip() for c in parsed["extracted_clauses"] if isinstance(c, str)
        ]

    if isinstance(parsed.get("evidence"), list):
        validated["evidence"] = [
            e.strip() for e in parsed["evidence"] if isinstance(e, str)
        ]

    # Only strings can be a risk level; checking the type first avoids a
    # TypeError from testing an unhashable value (list/dict) against the set.
    # Case and padding are normalised so "High" or " low " are accepted.
    risk_level = parsed.get("risk_level")
    if isinstance(risk_level, str):
        risk_level = risk_level.strip().lower()
        if risk_level in {"low", "medium", "high", "unknown"}:
            validated["risk_level"] = risk_level

    try:
        confidence = float(parsed.get("confidence", 0.0))
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings, containers, or integers too large for a float.
        confidence = 0.0

    # NaN compares false with everything, so min()/max() would turn it into 1.0;
    # treat it as "no confidence" instead.
    if math.isnan(confidence):
        confidence = 0.0

    validated["confidence"] = max(0.0, min(1.0, confidence))

    return validated


* Never trust raw LLM output

* This function:

        -> Blocks malformed JSON

        -> Enforces allowed risk levels

        ->Clamps confidence to [0,1]

* This is mandatory for legal/enterprise AI

In [None]:
LEGAL_AGENT_PROMPT = """
You are a legal contract analyst.

From the given contract text, extract TERMINATION-related clauses.

Assess legal risk.

Return ONLY valid JSON in this format:
{
  "extracted_clauses": ["..."],
  "risk_level": "low | medium | high",
  "confidence": 0.0,
  "evidence": ["exact sentence from contract"]
}
"""


In [None]:
dummy_contract_text = """
Either party may terminate this Agreement upon thirty (30) days written notice.
Termination may occur immediately in the event of material breach.
This Agreement shall be governed by the laws of India.
"""

# Instantiate Legal Agent
legal_agent = BaseAgent(
    agent_name="LegalAgent",
    system_prompt=LEGAL_AGENT_PROMPT,
    model=gemma_model,
    tokenizer=tokenizer

)

# Run agent
raw_output = legal_agent.run(dummy_contract_text)

print("RAW AGENT OUTPUT:")
print(raw_output)

# Validate output
validated_output = validate_agent_output(
    raw_output=raw_output,
    clause_type="legal"
)

print("\nVALIDATED OUTPUT:")
print(validated_output)


*****************Legal Agent**********

In [None]:
LEGAL_AGENT_PROMPT = """
You are a legal contract analysis engine.

TASK:
Extract ONLY termination-related clauses from the contract.

STRICT RULES:
- Use ONLY the provided contract text
- Copy clauses VERBATIM (exact sentences)
- Do NOT paraphrase
- Evidence MUST exactly match extracted clauses
- If none found, return empty arrays

OUTPUT JSON ONLY:
{
  "extracted_clauses": [],
  "risk_level": "low | medium | high | unknown",
  "confidence": 0.0,
  "evidence": []
}
"""


In [None]:
legal_query = (
    "termination clause, termination rights, material breach, "
    "notice period, termination for cause, termination without cause"
)

legal_results = rag_search(
    query=legal_query,
    index=index,
    top_k=8
)

print(f"‚úÖ Retrieved {len(legal_results)} legal chunks")


In [None]:
# Keep a named handle on the exact RAG results fed to the Legal Agent; the
# grounding-check cells further down read `legal_context`.
legal_context = legal_results

# Join the retrieved chunks into one context string, separated by blank lines.
combined_legal_text = "\n\n".join(
    chunk["text"] for chunk in legal_context
)

print(combined_legal_text[:300])


In [None]:
legal_agent = BaseAgent(
    agent_name="LegalAgent",
    system_prompt=LEGAL_AGENT_PROMPT,
    model=gemma_model,
    tokenizer=tokenizer
)


In [None]:
import json
import re

def extract_and_fix_json(text):
    """
    Pull the outermost ``{...}`` span out of ``text``, patch a few known quote
    slips, and parse the result as JSON.

    Repairs, applied in this order:
      1. runs of newlines become a single space;
      2. a ``']`` directly before the closing ``}`` becomes ``"]``;
      3. a ``']`` at the very end of the string becomes ``"]``;
      4. if an ``"evidence": [`` list is present and no ``"]`` exists anywhere,
         every remaining ``']`` becomes ``"]``.

    Raises:
        ValueError: When no braces are found, or when the repaired text still
            fails to parse (the broken text is printed first).
    """
    found = re.search(r'\{[\s\S]*\}', text)
    if found is None:
        raise ValueError("‚ùå No JSON detected")

    candidate = found.group(0)

    # Ordered regex repairs (see docstring, steps 1-3).
    repairs = (
        (r'\n+', ' '),
        (r"']\s*\}", "\"]}"),
        (r"'\]\s*$", "\"]"),
    )
    for pattern, replacement in repairs:
        candidate = re.sub(pattern, replacement, candidate)

    # Step 4: last-resort swap, only when no properly closed string list exists.
    has_evidence_list = '"evidence": [' in candidate
    if has_evidence_list and re.search(r'\"\]', candidate) is None:
        candidate = candidate.replace("']", "\"]")

    # Final Parse
    try:
        parsed = json.loads(candidate)
    except Exception as err:
        print("\n‚ùå JSON still broken:\n", candidate)
        raise ValueError(f"Final parse failed ‚Üí {err}")
    return parsed


In [None]:
raw_output = legal_agent.run(combined_legal_text)
print("RAW OUTPUT:")
print(raw_output)


In [None]:
# Store the validated Legal Agent result under its agent-specific name: the
# save, combine and grounding cells below read `validated_legal_output`.
validated_legal_output = validate_agent_output(
    raw_output=raw_output,
    clause_type="legal"
)

# Generic alias kept so existing references to `validated_output` still work.
validated_output = validated_legal_output

print("\nVALIDATED OUTPUT:")
print(validated_legal_output)


In [None]:
import os
import json
from datetime import datetime

def save_legal_agent_output(output, output_dir=r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\legal_agent_outputs"):
    """
    Write a Legal Agent result to a timestamped JSON file.

    Args:
        output: JSON-serialisable agent output.
        output_dir: Destination directory; created when missing.

    Returns:
        Path of the file written, ``legal_agent_output_<YYYYmmdd_HHMMSS>.json``.
    """
    os.makedirs(output_dir, exist_ok=True)

    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    file_name = "legal_agent_output_{}.json".format(stamp)
    destination = os.path.join(output_dir, file_name)

    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(output, handle, indent=4)

    print(f"‚úÖ Legal Agent output saved successfully:\n{destination}")
    return destination


# üîπ Save the validated output
saved_file_path = save_legal_agent_output(validated_legal_output)
saved_file_path


*****************Compliance Agent**********

In [None]:
COMPLIANCE_AGENT_PROMPT = """
	You are a Compliance Risk Analysis Agent.

	Your task:
	1. Identify compliance-related clauses:
	- Data protection
	- Regulatory requirements
	- Audits & reporting
	2. Extract exact compliance obligations
	3. Assess compliance risk (low/medium/high)

	Return ONLY valid JSON:
	{
	extracted_clauses: [],
	risk_level: "",
	confidence: 0.0,
	evidence: []
	}
"""

In [None]:
compliance_query = """
data protection, privacy, GDPR, HIPAA, SOC2,
ISO, audits, regulatory compliance,
information security, reporting obligations
"""
compliance_results = rag_search(
    query=compliance_query,
    index=index,
    top_k=8
)

print(f"‚úÖ Retrieved {len(compliance_results)} compliance-related chunks")


In [None]:
compliance_context = compliance_results
combined_compliance_text = "\n\n".join(
    [chunk["text"] for chunk in compliance_context]
)

print("üîç Combined Compliance Text Preview:")
print(combined_compliance_text[:300])


In [None]:
compliance_agent = BaseAgent(
    agent_name="ComplianceAgent",
    system_prompt=COMPLIANCE_AGENT_PROMPT,
    model=gemma_model,
    tokenizer=tokenizer
)

print("‚úÖ Compliance Agent initialized successfully")


In [None]:
# STEP 4: Run Compliance Agent
raw_compliance_output = compliance_agent.run(combined_compliance_text)

print("üîπ RAW COMPLIANCE AGENT OUTPUT:")
print(raw_compliance_output)


In [None]:
# STEP 5: Validate Compliance Agent Output
# Parses the raw compliance text and coerces it into the fixed output schema,
# labelled with clause_type "Compliance".

validated_output = validate_agent_output(
    raw_output=raw_compliance_output,
    clause_type="Compliance"
)

validated_output


In [None]:
import os
import json
from datetime import datetime

def save_compliance_agent_output(output, output_dir=r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\compliance_agent_outputs"):
    """
    Write a Compliance Agent result to a timestamped JSON file.

    Args:
        output: JSON-serialisable agent output.
        output_dir: Destination directory; created when missing.

    Returns:
        Path of the file written, ``compliance_agent_output_<YYYYmmdd_HHMMSS>.json``.
    """
    os.makedirs(output_dir, exist_ok=True)

    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = os.path.join(output_dir, "compliance_agent_output_" + stamp + ".json")

    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(output, handle, indent=4)

    print(f"‚úÖ Compliance Agent output saved at:\n{destination}")
    return destination


# Save validated compliance output
saved_compliance_path = save_compliance_agent_output(
    validated_output
)

saved_compliance_path


In [None]:
COMPLIANCE_AGENT_PROMPT = """
You are a Compliance Risk Analysis Agent.

Your task:
1. Identify compliance-related clauses, including:
   - Data protection and privacy
   - Regulatory requirements
   - Audits & reporting
   - Industry and regulatory standards such as GDPR, SOC2, ISO, HIPAA
2. Extract exact compliance obligations as they appear in the contract text.
3. Assess compliance risk (low / medium / high).
IMPORTANT:
- Extract ONLY exact sentences from the contract text.
- Do NOT paraphrase, summarize, or rewrite.
- Evidence must be copied verbatim.

Rules:
- extracted_clauses must be a LIST of exact text copied from the contract.
- evidence must be a LIST of exact sentences from the contract.
- Do NOT summarize or explain clauses.
- Use ONLY the provided contract text.
- If compliance obligations exist, extracted_clauses must NOT be empty.

Return ONLY valid JSON:
{
  "extracted_clauses": [],
  "risk_level": "",
  "confidence": 0.0,
  "evidence": []
}
"""


In [None]:
compliance_agent = BaseAgent(
    agent_name="ComplianceAgent",
    system_prompt=COMPLIANCE_AGENT_PROMPT,
    model=gemma_model,
    tokenizer=tokenizer
)


In [None]:
raw_compliance_output = compliance_agent.run(combined_compliance_text)

print("üîπ RAW COMPLIANCE AGENT OUTPUT:")
print(raw_compliance_output)


In [None]:
validated_compliance_output = validate_agent_output(
    raw_output=raw_compliance_output,
    clause_type="Compliance"
)

validated_compliance_output


Modifying the Compliance Agent prompt to include GDPR, SOC2, ISO, and HIPAA improves regulatory awareness, but extracted clauses depend entirely on whether the contract explicitly references these standards.

Agents do not "know" compliance by default; they must be instructed what compliance means.

* Adding GDPR, SOC2, ISO, HIPAA shows:

    Prompt engineering skill

    Regulatory awareness

    Real-world applicability

Without them:

* Agent may:

        Ignore subtle compliance clauses

        Treat them as generic text

    Risk:

        Under-reporting compliance risk

With them:

        Agent becomes regulation-sensitive

        Still avoids hallucination

*******************Finance Agent********

In [None]:
# üîç Finance-focused RAG query
finance_query = """
payment terms, fees, invoices, billing,
charges, penalties, late payment,
financial liability, costs, expenses,
interest, taxes, compensation
"""

# üîé Run RAG search for finance context
finance_results = rag_search(
    query=finance_query,
    index=index,
    top_k=8
)

print(f"‚úÖ Retrieved {len(finance_results)} finance-related chunks")


In [None]:
# üîπ Finance context for Finance Agent
finance_context = finance_results
finance_context[0]


Retrieved finance-relevant contract text

Reduced noise from legal/compliance clauses

Prepared clean input for Finance Agent reasoning

In [None]:
FINANCE_AGENT_PROMPT = """
You are a Finance Risk Analysis Agent.

Your task:
1. Identify finance-related clauses, including:
   - Payment terms
   - Fees and invoices
   - Penalties, late fees, or interest
   - Financial liability and costs
2. Extract exact financial obligations as they appear in the contract.
3. Assess financial risk (low / medium / high).

Rules:
- extracted_clauses must be a LIST of exact text copied from the contract.
- evidence must be a LIST of exact sentences from the contract.
- Do NOT summarize or explain clauses.
- Use ONLY the provided contract text.
- If financial obligations exist, extracted_clauses must NOT be empty.

Return ONLY valid JSON:
{
  "extracted_clauses": [],
  "risk_level": "",
  "confidence": 0.0,
  "evidence": []
}
"""

In [None]:
finance_agent = BaseAgent(
    agent_name="FinanceAgent",
    system_prompt=FINANCE_AGENT_PROMPT,
    model=gemma_model,
    tokenizer=tokenizer
)

print("‚úÖ Finance Agent initialized successfully")


In [None]:
# üîπ Combine finance-related chunk text
combined_finance_text = "\n\n".join(
    [chunk["text"] for chunk in finance_context]
)

print("üîç Combined Finance Text Preview:")
print(combined_finance_text[:300])


In [None]:
raw_finance_output = finance_agent.run(combined_finance_text)

print("üîπ RAW FINANCE AGENT OUTPUT:")
print(raw_finance_output)


The Finance Agent correctly identified that the retrieved clause involved no payment or penalty, resulting in low financial risk and no extractable financial obligations, while the truncation warning indicates a tokenizer configuration improvement rather than an error.

In [None]:
validated_finance_output = validate_agent_output(
    raw_output=raw_finance_output,
    clause_type="Finance"
)

validated_finance_output


In [None]:
import os
import json
from datetime import datetime

def save_finance_agent_output(output, output_dir=r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\finance_agent_outputs"):
    """
    Write a Finance Agent result to a timestamped JSON file.

    Args:
        output: JSON-serialisable agent output.
        output_dir: Destination directory; created when missing.

    Returns:
        Path of the file written, ``finance_agent_output_<YYYYmmdd_HHMMSS>.json``.
    """
    os.makedirs(output_dir, exist_ok=True)

    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    file_name = "finance_agent_output_{}.json".format(stamp)
    destination = os.path.join(output_dir, file_name)

    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(output, handle, indent=4)

    print(f"‚úÖ Finance Agent output saved successfully:\n{destination}")
    return destination


# üîπ Save validated finance output
saved_finance_path = save_finance_agent_output(
    validated_finance_output
)

saved_finance_path


In [None]:
FINANCE_AGENT_PROMPT = """
You are a Finance Risk Analysis Agent.

Your task:
1. Identify finance-related clauses, including:
   - Payment terms
   - Fees and invoices
   - Late fees and penalties
   - Financial liability and costs
2. Extract exact financial obligations as they appear in the contract.

STRICT RULES:
- extracted_clauses MUST be copied verbatim from the contract text.
- evidence MUST be copied verbatim from the contract text.
- DO NOT summarize, paraphrase, or explain.
- If you cannot copy an exact sentence, leave the field empty.

Return ONLY valid JSON:
{
  "extracted_clauses": [],
  "risk_level": "",
  "confidence": 0.0,
  "evidence": []
}
"""


Late fees and penalties are added to the prompt so that the agent treats clauses related to those keywords as financial risk rather than as generic text.

In [None]:
finance_agent = BaseAgent(
    agent_name="FinanceAgent",
    system_prompt=FINANCE_AGENT_PROMPT,
    model=gemma_model,
    tokenizer=tokenizer
)


In [None]:
raw_finance_output = finance_agent.run(combined_finance_text)

print("üîπ RAW FINANCE AGENT OUTPUT:")
print(raw_finance_output)


In [None]:
validated_finance_output = validate_agent_output(
    raw_output=raw_finance_output,
    clause_type="Finance"
)

validated_finance_output


*****************Operations Agent********

In [None]:
# üîç Operations-focused RAG query
operations_query = """
deliverables, scope of work, services to be provided,
timelines, milestones, schedule,
service obligations, performance standards,
service levels, SLA, execution requirements
"""

# üîé Run RAG search for operational context
operations_results = rag_search(
    query=operations_query,
    index=index,
    top_k=8
)

print(f"‚úÖ Retrieved {len(operations_results)} operationally relevant chunks")


In [None]:
# üîπ Operations context for Operations Agent
operations_context = operations_results
operations_context[0]


In [None]:
OPERATIONS_AGENT_PROMPT = """
You are an Operations Risk Analysis Agent.

Your task:
1. Identify operational clauses, including:
   - Deliverables
   - Timelines and schedules
   - Service obligations
   - Performance standards and SLAs
2. Extract exact operational obligations as they appear in the contract.
3. Assess execution risk (low / medium / high).

Rules:
- extracted_clauses must be a LIST of exact text copied from the contract.
- evidence must be a LIST of exact sentences from the contract.
- Do NOT summarize or explain clauses.
- Use ONLY the provided contract text.
- If operational obligations exist, extracted_clauses must NOT be empty.

Return ONLY valid JSON:
{
  "extracted_clauses": [],
  "risk_level": "",
  "confidence": 0.0,
  "evidence": []
}
"""


In [None]:
operations_agent = BaseAgent(
    agent_name="OperationsAgent",
    system_prompt=OPERATIONS_AGENT_PROMPT,
    model=gemma_model,
    tokenizer=tokenizer
)

print("‚úÖ Operations Agent initialized successfully")


In [None]:
# üîπ Combine operationally relevant chunk text
combined_operations_text = "\n\n".join(
    [chunk["text"] for chunk in operations_context]
)

print("üîç Combined Operations Text Preview:")
print(combined_operations_text[:300])


In [None]:
# STEP 4: Run Operations Agent
raw_operations_output = operations_agent.run(combined_operations_text)

print("üîπ RAW OPERATIONS AGENT OUTPUT:")
print(raw_operations_output)

In [None]:
validated_operations_output = validate_agent_output(
    raw_output=raw_operations_output,
    clause_type="Operations"
)

validated_operations_output

In [None]:
import os
import json
from datetime import datetime

def save_operations_agent_output(output, output_dir=r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\operations_agent_outputs"):
    """
    Write an Operations Agent result to a timestamped JSON file.

    Args:
        output: JSON-serialisable agent output.
        output_dir: Destination directory; created when missing.

    Returns:
        Path of the file written, ``operations_agent_output_<YYYYmmdd_HHMMSS>.json``.
    """
    os.makedirs(output_dir, exist_ok=True)

    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = os.path.join(output_dir, "operations_agent_output_" + stamp + ".json")

    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(output, handle, indent=4)

    print(f"‚úÖ Operations Agent output saved successfully:\n{destination}")
    return destination


# üîπ Save current validated operations output
saved_ops_path = save_operations_agent_output(validated_operations_output)
saved_ops_path


In [None]:
OPERATIONS_AGENT_PROMPT = """
You are an Operations Risk Analysis Agent.

Your task:
1. Identify operational clauses, including:
   - Deliverables
   - Timelines, schedules, and milestones
   - Service obligations
   - Performance standards and SLAs
2. Extract exact operational obligations as they appear in the contract.
3. Assess execution risk (low / medium / high).
IMPORTANT:
- Extract ONLY exact sentences from the contract text.
- Do NOT paraphrase, summarize, or rewrite.
- Evidence must be copied verbatim.

Rules:
- extracted_clauses must be a LIST of exact text copied from the contract.
- evidence must be a LIST of exact sentences from the contract.
- Do NOT summarize or explain clauses.
- Use ONLY the provided contract text.
- If operational obligations exist, extracted_clauses must NOT be empty.

Return ONLY valid JSON:
{
  "extracted_clauses": [],
  "risk_level": "",
  "confidence": 0.0,
  "evidence": []
}
"""


In [None]:
operations_agent = BaseAgent(
    agent_name="OperationsAgent",
    system_prompt=OPERATIONS_AGENT_PROMPT,
    model=gemma_model,
    tokenizer=tokenizer
)


In [None]:
raw_operations_output = operations_agent.run(combined_operations_text)

print("üîπ RAW OPERATIONS AGENT OUTPUT (After Prompt Update):")
print(raw_operations_output)


In [None]:
validated_operations_output = validate_agent_output(
    raw_output=raw_operations_output,
    clause_type="Operations"
)

validated_operations_output


By expanding the Operations Agent prompt to include timeline and milestone language, the agent becomes more sensitive to execution-related obligations while still avoiding hallucination when such clauses are absent.

In [None]:
combined_agent_output = {
    "document_analysis": {
        "legal": validated_legal_output,
        "compliance": validated_compliance_output,
        "finance": validated_finance_output,
        "operations": validated_operations_output
    }
}

combined_agent_output


In [None]:
import os
import json
from datetime import datetime

def save_combined_agent_output(output, output_dir=r"C:\Users\anvesh4\OneDrive\Desktop\CLAUSEAI\Data\multi_agents_outputs"):
    """
    Write the combined multi-agent analysis to a timestamped JSON file.

    Args:
        output: JSON-serialisable combined analysis.
        output_dir: Destination directory; created when missing.

    Returns:
        Path of the file written, ``combined_contract_analysis_<YYYYmmdd_HHMMSS>.json``.
    """
    os.makedirs(output_dir, exist_ok=True)

    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    file_name = "combined_contract_analysis_{}.json".format(stamp)
    destination = os.path.join(output_dir, file_name)

    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(output, handle, indent=4)

    print(f"‚úÖ Combined Agent Output saved at:\n{destination}")
    return destination


final_path = save_combined_agent_output(combined_agent_output)
final_path


*********************Grounding Check************



purpose:


Ensures agent output is strictly derived from RAG content

Prevents hallucination

Required for legal systems

In [None]:
def check_agent_grounding(agent_output, rag_results):
    """
    Check whether agent output is grounded in RAG-retrieved text.

    A clause or evidence string counts as grounded when it occurs in the
    concatenated chunk text, compared case-insensitively and with every run of
    whitespace collapsed to one space. The whitespace normalisation keeps line
    breaks or double spaces inside a chunk from flagging an otherwise exact
    quote as ungrounded.

    Args:
        agent_output (dict): Validated agent output JSON; the lists under
            ``"extracted_clauses"`` and ``"evidence"`` are checked (a missing
            or ``None`` field is treated as empty).
        rag_results (list): RAG results used to build combined_text; each item
            needs a ``"text"`` entry (``None`` is treated as empty text).

    Returns:
        dict: Grounding report with ``is_grounded`` (bool) and the lists
        ``ungrounded_extracted_clauses`` and ``ungrounded_evidence`` holding
        the original strings that were not found.
    """

    def _normalize(text):
        # Lower-case and collapse all whitespace runs to single spaces.
        return " ".join(text.lower().split())

    grounding_report = {
        "is_grounded": True,
        "ungrounded_extracted_clauses": [],
        "ungrounded_evidence": []
    }

    # Combine RAG text into one normalised string; `or ""` tolerates chunks
    # whose text is None.
    combined_rag_text = _normalize(
        "\n\n".join(chunk["text"] or "" for chunk in rag_results)
    )

    # Check extracted clauses
    for clause in agent_output.get("extracted_clauses") or []:
        if _normalize(clause) not in combined_rag_text:
            grounding_report["ungrounded_extracted_clauses"].append(clause)
            grounding_report["is_grounded"] = False

    # Check evidence
    for ev in agent_output.get("evidence") or []:
        if _normalize(ev) not in combined_rag_text:
            grounding_report["ungrounded_evidence"].append(ev)
            grounding_report["is_grounded"] = False

    return grounding_report


In [None]:
finance_grounding_report = check_agent_grounding(
    agent_output=validated_finance_output,
    rag_results=finance_context   # SAME RAG results used for this agent
)

finance_grounding_report


In [None]:
operations_grounding_report = check_agent_grounding(
    agent_output=validated_operations_output,
    rag_results=operations_context  # SAME RAG results used for this agent
)

operations_grounding_report


In [None]:
compliance_grounding_report = check_agent_grounding(
    agent_output=validated_compliance_output,
    rag_results=compliance_context  # SAME RAG results used for this agent
)

compliance_grounding_report


In [None]:
legal_grounding_report = check_agent_grounding(
    agent_output=validated_legal_output,
    rag_results=legal_context  # SAME RAG results used for this agent
)

legal_grounding_report


In [None]:
def clean_ungrounded_output(agent_output, grounding_report):
    """
    Return a copy of ``agent_output`` without the entries a grounding report
    listed as ungrounded.

    Args:
        agent_output: Agent output dict holding ``"extracted_clauses"`` and
            ``"evidence"`` lists.
        grounding_report: Dict holding ``"ungrounded_extracted_clauses"`` and
            ``"ungrounded_evidence"`` lists.

    Returns:
        A shallow copy of ``agent_output`` whose two list fields are replaced by
        new, filtered lists; the input dict and its lists are left untouched.
    """
    cleaned = agent_output.copy()

    # (field in the agent output, report key naming the entries to drop)
    field_pairs = (
        ("extracted_clauses", "ungrounded_extracted_clauses"),
        ("evidence", "ungrounded_evidence"),
    )
    for field, rejected_key in field_pairs:
        kept = []
        for item in agent_output[field]:
            if item in grounding_report[rejected_key]:
                continue
            kept.append(item)
        cleaned[field] = kept

    return cleaned


In [None]:
grounding_report = check_agent_grounding(
    agent_output=validated_legal_output,
    rag_results=legal_context
)
cleaned_legal_output = clean_ungrounded_output(
    agent_output=validated_legal_output,
    grounding_report=grounding_report
)
cleaned_legal_output


In [None]:
grounding_report = check_agent_grounding(
    agent_output=validated_operations_output,
    rag_results=operations_context
)
cleaned_operation_output = clean_ungrounded_output(
    agent_output=validated_operations_output,
    grounding_report=grounding_report
)
cleaned_operation_output


In [None]:
grounding_report = check_agent_grounding(
    agent_output=validated_finance_output,
    rag_results=finance_context
)
cleaned_finance_output = clean_ungrounded_output(
    agent_output=validated_finance_output,
    grounding_report=grounding_report
)
cleaned_finance_output


"We validate grounding by ensuring every extracted clause and evidence sentence exists verbatim in the RAG-retrieved contract text."

***********************************Cross Verification********************

In [None]:
def cross_verify_agent_output(agent_output, rag_results):
    """
    Cross verify agent output with real contract content (RAG).

    Each extracted clause and evidence string is looked up in the concatenated
    chunk text, compared case-insensitively and with every run of whitespace
    collapsed to one space, so line breaks inside a chunk do not turn an exact
    quote into a miss.

    Args:
        agent_output (dict): Agent output; the lists under
            ``"extracted_clauses"`` and ``"evidence"`` are verified (a missing
            or ``None`` field is treated as empty).
        rag_results (list): Retrieved chunks; each item needs a ``"text"``
            entry (``None`` is treated as empty text).

    Returns:
        dict: Report with ``verified`` (False as soon as anything is missing)
        and the lists ``verified_extracted_clauses``, ``verified_evidence``,
        ``missing_extracted_clauses`` and ``missing_evidence`` holding the
        original strings.
    """

    def _normalize(text):
        # Lower-case and collapse all whitespace runs to single spaces.
        return " ".join(text.lower().split())

    verification_report = {
        "verified": True,
        "verified_extracted_clauses": [],
        "verified_evidence": [],
        "missing_extracted_clauses": [],
        "missing_evidence": []
    }

    # Combine all RAG text; `or ""` tolerates chunks whose text is None.
    combined_rag_text = _normalize(
        "\n\n".join(chunk["text"] or "" for chunk in rag_results)
    )

    # Verify extracted clauses
    for clause in agent_output.get("extracted_clauses") or []:
        if _normalize(clause) in combined_rag_text:
            verification_report["verified_extracted_clauses"].append(clause)
        else:
            verification_report["missing_extracted_clauses"].append(clause)
            verification_report["verified"] = False

    # Verify evidence
    for ev in agent_output.get("evidence") or []:
        if _normalize(ev) in combined_rag_text:
            verification_report["verified_evidence"].append(ev)
        else:
            verification_report["missing_evidence"].append(ev)
            verification_report["verified"] = False

    return verification_report


In [None]:
verification_result = cross_verify_agent_output(
    agent_output=validated_finance_output,
    rag_results=finance_context
)

verification_result


"We cross-verify every extracted clause and evidence sentence by matching it verbatim against the RAG-retrieved contract text. We additionally map verified evidence back to its originating file and chunk for audit validation."