In [None]:
!pip install -q transformers torch datasets


In [None]:
!pip install pdfplumber




In [None]:
import torch
import json
from datasets import load_dataset,VerificationMode
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Load the CUAD training split from the Hugging Face Hub.
# NOTE(review): NO_CHECKS presumably skips the `datasets` integrity verification
# step — confirm against the `datasets` documentation before relying on it.
dataset = load_dataset(
    "theatticusproject/cuad",
    split="train",
    verification_mode=VerificationMode.NO_CHECKS
)

# Sanity check: report the row count and the columns of the first row
# (the recorded run shows 511 rows with a single 'pdf' column).
print("✅ Loaded CUAD successfully")
print("Total contracts:", len(dataset))
print("Keys:", dataset[0].keys())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/714 [00:00<?, ?it/s]

✅ Loaded CUAD successfully
Total contracts: 511
Keys: dict_keys(['pdf'])


Column([<pdfplumber.pdf.PDF object at 0x79368cc0b260>, <pdfplumber.pdf.PDF object at 0x793626fade50>, <pdfplumber.pdf.PDF object at 0x79368b22f9b0>, <pdfplumber.pdf.PDF object at 0x79368b22d6a0>, <pdfplumber.pdf.PDF object at 0x79368b2438f0>])

In [None]:
import pdfplumber

In [None]:
def extract_text_from_cuad(pdf_obj):
    """
    Collect the text of every page of a PDF object into a single string.

    Args:
        pdf_obj: Object exposing a `pages` iterable whose items provide
            `extract_text()`.

    Returns:
        The stripped text of each page followed by a newline, concatenated in
        page order. Pages whose `extract_text()` result is falsy (``None`` or
        an empty string) contribute nothing.
    """
    raw_pages = (page.extract_text() for page in pdf_obj.pages)
    # Only pages that actually yielded text are kept; each ends with "\n".
    return "".join(raw.strip() + "\n" for raw in raw_pages if raw)


In [None]:
# Extracted text of the first few dataset rows; very short extractions are skipped.
texts = []

# small batch
for i in range(10):
    text = extract_text_from_cuad(dataset[i]["pdf"])
    if len(text) <= 1000:
        # Too little text to be useful downstream — leave it out.
        continue
    texts.append(text)

print("Contracts extracted:", len(texts))


Contracts extracted: 10


In [None]:
def chunk_text(text, chunk_size=800, overlap=150):
    """
    Split `text` into character windows of `chunk_size` that overlap by `overlap`.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than `chunk_size`.

    Returns:
        List of chunks in document order. The final chunk may be shorter than
        `chunk_size`. An empty `text` yields an empty list.

    Raises:
        ValueError: If `chunk_size` is not positive or `overlap >= chunk_size`
            (the window would never advance, i.e. an endless loop).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text we are done. Stepping back
        # by `overlap` here would only emit a redundant chunk that is entirely
        # contained in the one just appended.
        if end >= len(text):
            break
        start += step
    return chunks

# Flatten the chunks of every extracted document into one list; list position
# is what later cells use to map search hits back to text.
all_chunks = []
for doc in texts:
    all_chunks += chunk_text(doc)

print("Total chunks:", len(all_chunks))
print("\nSample chunk:\n", all_chunks[0][:500])


Total chunks: 567

Sample chunk:
 Datasheet for Contract Understanding Atticus Dataset (CUAD)
I.MOTIVATION
A. Who created the dataset (e.g., which team, research group) and on behalf of which entity (e.g. company,
institution, organization)?
The Atticus Project is a non-profit organization whose mission is to harness the power of AI to accelerate
accurate and efficient contract review. The Atticus Project started as a grassroots movement by experienced
lawyers in public companies and leading law firms aiming to achieve high-qual


In [None]:
!pip -q install -U sentence-transformers faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.1/494.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer



In [None]:
# Sentence-embedding model used for both the chunks and the search queries.
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(embed_model_name)  # auto uses GPU if available in Colab


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Embed every chunk in one batched pass.
# all_chunks must already exist (from the chunking step).

chunk_embeddings = embed_model.encode(
    all_chunks,
    batch_size=64,                 # increase/decrease depending on RAM
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True      # normalized so that inner product can serve as cosine similarity
)

print("✅ Embeddings shape:", chunk_embeddings.shape)  # (num_chunks, embedding_dim)


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

✅ Embeddings shape: (567, 384)


In [None]:
import faiss

In [None]:
# Embedding width, read from the second axis of the embedding matrix.
dim = chunk_embeddings.shape[1]

# Inner-product index; equivalent to cosine similarity here because the
# embeddings were normalized when they were encoded.
index = faiss.IndexFlatIP(dim)

# Add the embeddings; row i of the index corresponds to all_chunks[i].
index.add(chunk_embeddings)

print("✅ FAISS index size:", index.ntotal)

✅ FAISS index size: 567


In [None]:
def retrieve_chunks(query, top_k=5):
    """
    Return the text chunks whose embeddings best match `query`.

    Args:
        query: Free-text search string.
        top_k: Maximum number of chunks to return.

    Returns:
        List of chunk strings in the order FAISS returned them. The list is
        shorter than `top_k` when the index holds fewer vectors than requested.
    """
    # Encode the query the same way the chunks were encoded (normalized),
    # so the inner-product search behaves as cosine similarity.
    query_emb = embed_model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    _scores, indices = index.search(query_emb, top_k)
    # FAISS pads the result with -1 when it finds fewer than top_k neighbours.
    # Indexing a Python list with -1 would silently return the LAST chunk, so
    # those placeholder ids are dropped.
    return [all_chunks[i] for i in indices[0] if i >= 0]


In [None]:
# Smoke-test retrieval: fetch the 3 best matches for a sample query.
results = retrieve_chunks("termination clause", top_k=3)

# Print a 600-character preview of each hit, numbered from 1.
for i, chunk in enumerate(results, 1):
    print(f"\n--- Retrieved Chunk {i} ---\n")
    print(chunk[:600])



--- Retrieved Chunk 1 ---

ITIES AND EXCHANGE COMMISSION.
Some sentences in the files contain irrelevant information such as footers or page numbers. Some sentences
may not be relevant to the corresponding category. Some sentences may correspond to a different category.
Because many legal clauses are very long and contain various sub-parts, sometimes only a sub-part of a
sentence is responsive to a category.
To address the foregoing limitations, annotators manually deleted the portion that is not responsive, replacing it
with the symbol "<omitted>" to indicate that the two text segments do not appear immediately next to

--- Retrieved Chunk 2 ---

y fully with all
provisions of this Agreement.
5. Term and Termination.
a. Term. This Agreement shall become effective on the Effective Date
and shall continue in effect until either Party informs the other Party
with thirty (30) day prior written notice of termination of this Agreement.
b. Termination. UDC may immediately terminate this Agr

In [None]:
def build_context(chunks):
    """Concatenate the given chunks into one string, separated by a blank line."""
    paragraph_break = "\n\n"
    return paragraph_break.join(chunks)


In [None]:
!pip -q install transformers accelerate

In [None]:
# Seq2seq checkpoint used to generate answers from the retrieved context.
model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Assign the result rather than leaving `model.to(device)` as the cell's last
# expression: the bare expression made the notebook print the entire module
# tree. `Module.to` returns the module itself, so `model` is unchanged.
model = model.to(device)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [None]:
def generate_answer(query, top_k=5):
    """
    Answer `query` with the seq2seq model, using retrieved chunks as context.

    Args:
        query: Natural-language question about the indexed documents.
        top_k: Number of chunks to request from `retrieve_chunks`.

    Returns:
        Tuple `(answer, chunks)`: the decoded model output and the list of
        chunks that were actually placed in the prompt. This is the retrieved
        list, minus any trailing chunks that had to be dropped so the prompt
        fits the input token budget.
    """
    max_input_tokens = 1024

    def _build_prompt(selected):
        # Same prompt layout as before: instructions, context, question, cue.
        context = "\n\n".join(selected)
        return f"""
You are a legal assistant.
Answer the question using ONLY the context below.
If the answer is not present, say "Not found in contract."

Context:
{context}

Question:
{query}

Answer:
"""

    def _exceeds_budget(text):
        # Truncating at budget + 1 lets us detect overflow without tokenizing
        # an unbounded sequence.
        ids = tokenizer(text, truncation=True, max_length=max_input_tokens + 1)["input_ids"]
        return len(ids) > max_input_tokens

    chunks = list(retrieve_chunks(query, top_k))
    prompt = _build_prompt(chunks)

    # The question and the "Answer:" cue sit at the END of the prompt, so
    # tokenizer truncation would cut them off first. Drop trailing chunks
    # (kept in retrieval order) until the whole prompt fits.
    while len(chunks) > 1 and _exceeds_budget(prompt):
        chunks = chunks[:-1]
        prompt = _build_prompt(chunks)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,          # last-resort guard if a single chunk is still too long
        max_length=max_input_tokens
    ).to(device)

    # Greedy decoding. `temperature` is not passed: it has no effect when
    # do_sample=False and only triggers an "invalid generation flags" warning.
    outputs = model.generate(
        **inputs,
        max_new_tokens=400,
        do_sample=False
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer, chunks


In [None]:
# End-to-end check: ask one question and show the answer with its sources.
query = "What are the termination conditions in this contract?"

answer, sources = generate_answer(query)

print("🤖 AI Answer:\n")
print(answer)

# Show a 400-character preview of every chunk returned alongside the answer.
print("\n📌 Source Chunks Used:\n")
for i, src in enumerate(sources, 1):
    print(f"--- Source {i} ---")
    print(src[:400])
    print()



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🤖 AI Answer:

a. Term. This Agreement shall become effective on the Effective Date and shall continue in effect until either Party informs the other Party with thirty (30) day prior written notice of termination of this Agreement. b. Termination. This Agreement may be terminated by either party at the expiration of its term or any renewal term upon thirty (30) days written notice to the other party. Company acknowledges that this Agreement shall not be terminated for MA’s failure to follow an operating plan, standard procedure, training manual, or substantial equivalent published in Paragraph 3 (k) of this Agreement, except that Company does reserve the right to terminate this Agreement for MA’s failure m”). If either party wishes to extend the Initial Term it shall provide notice to the other not later than 180 days, nor sooner than 270 days, before the end of the Initial Term. The parties shall then engage in discussions regarding renewal for a period of 30 days. If no agreement is r

In [None]:
# Phrases whose presence in a chunk is reported as a potential risk.
# flag_risk lowercases both the phrase and the chunk before comparing,
# so matching is case-insensitive substring matching.
RISK_KEYWORDS = [
    "immediately",
    "without notice",
    "sole discretion",
    "penalty",
    "liability",
    "indemnify",
    "terminate at any time"
]


In [None]:
def flag_risk(chunks, keywords=None, snippet_len=300):
    """
    Report which risk keywords occur in which chunks, with an excerpt of each hit.

    Matching is a case-insensitive substring test.

    Args:
        chunks: Iterable of text chunks to scan.
        keywords: Phrases to look for; defaults to the module-level
            `RISK_KEYWORDS` when omitted.
        snippet_len: Maximum length of the excerpt returned for each hit.

    Returns:
        List of `(keyword, excerpt)` tuples, ordered by chunk and then by
        keyword. When the first occurrence of the keyword lies within the
        first `snippet_len` characters, the excerpt is the start of the chunk;
        otherwise it is a window around that occurrence, so the excerpt shows
        the text that triggered the flag.
    """
    if keywords is None:
        keywords = RISK_KEYWORDS
    # Lowercase each keyword once instead of once per chunk.
    needles = [(word, word.lower()) for word in keywords]

    risks = []
    for chunk in chunks:
        lowered = chunk.lower()  # hoisted: one lowercase copy per chunk
        for word, needle in needles:
            pos = lowered.find(needle)
            if pos == -1:
                continue
            match_end = pos + len(needle)
            if match_end <= snippet_len:
                # Keyword already visible in the leading excerpt.
                start = 0
            else:
                # Centre the window on the match, clamped to the chunk bounds.
                # Offsets come from the lowercased copy; for the rare characters
                # whose lowercase form differs in length the window may be
                # shifted by a few characters.
                lead = max(0, (snippet_len - len(needle)) // 2)
                start = max(0, min(pos - lead, len(chunk) - snippet_len))
            risks.append((word, chunk[start:start + snippet_len]))
    return risks


In [None]:
# Scan the chunks returned with the last answer for risk keywords.
risks = flag_risk(sources)

# Print each (keyword, excerpt) pair, separated by a blank line.
print("⚠️ Risk Flags:\n")
for risk, text in risks:
    print(f"Risk Keyword: {risk}")
    print(text)
    print()


⚠️ Risk Flags:

Risk Keyword: immediately
y fully with all
provisions of this Agreement.
5. Term and Termination.
a. Term. This Agreement shall become effective on the Effective Date
and shall continue in effect until either Party informs the other Party
with thirty (30) day prior written notice of termination of this Agreement.
b. Terminat

Risk Keyword: immediately
m”). If either party wishes to extend the Initial Term it shall provide
notice to the other not later than 180 days, nor sooner than 270 days, before the end of the Initial Term. The parties shall then engage in
discussions regarding renewal for a period of 30 days. If no agreement is reached during



In [None]:
def legal_ai_pipeline(query):
    """
    Run retrieval-augmented answering and risk flagging for one question.

    Args:
        query: Question to answer.

    Returns:
        Dict with the keys "query" (the input), "answer" (text returned by
        `generate_answer`) and "risk_flags" (result of `flag_risk` on the
        chunks returned by `generate_answer`).
    """
    answer, sources = generate_answer(query)
    result = {"query": query, "answer": answer}
    result["risk_flags"] = flag_risk(sources)
    return result


In [None]:
# Run the full pipeline on a sample question.
result = legal_ai_pipeline("Can the agreement be terminated without notice?")

print("Answer:\n", result["answer"])
# Each risk flag is a (keyword, excerpt) tuple; print them one per line.
print("\nRisks:")
for r in result["risk_flags"]:
    print(r)


Answer:
 Yes

Risks:
('immediately', 'y fully with all\nprovisions of this Agreement.\n5. Term and Termination.\na. Term. This Agreement shall become effective on the Effective Date\nand shall continue in effect until either Party informs the other Party\nwith thirty (30) day prior written notice of termination of this Agreement.\nb. Terminat')
('immediately', 'm”). If either party wishes to extend the Initial Term it shall provide\nnotice to the other not later than 180 days, nor sooner than 270 days, before the end of the Initial Term. The parties shall then engage in\ndiscussions regarding renewal for a period of 30 days. If no agreement is reached during')
('immediately', ' the other Party\npursuant to Section 2.3 hereof. Each Party shall fully cooperate in this effort. NCM shall be obligated to restore all premises from which it removes\nNCM Property or Equipment to its previous condition, reasonable wear and tear excepted. In addition, any and all licenses granted by ')


In [None]:
# Persist the index to disk. `faiss.write_index` does not return the index, so
# its result must not be assigned back to `index` — doing so replaced the live
# index with None and broke every later search.
faiss.write_index(index, "cuad_faiss.index")



In [None]:
import faiss
import numpy as np

# Build a fresh inner-product index from the chunk embeddings and save it.
dim = chunk_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
# NOTE(review): the float32 cast is presumably there because FAISS expects
# float32 input — confirm.
index.add(chunk_embeddings.astype("float32"))

faiss.write_index(index, "cuad_faiss.index")
print("✅ Saved FAISS index")


✅ Saved FAISS index


In [None]:
import pickle

# Save the chunk texts alongside the index: search results are positions into
# this list, so the two files must be kept together.
# Only unpickle this file from a trusted source.
with open("all_chunks.pkl", "wb") as f:
    pickle.dump(all_chunks, f)

print("✅ Saved chunks")


✅ Saved chunks


In [None]:
import os
# List the working directory to confirm both artifacts were written.
os.listdir()


['.config', 'cuad_faiss.index', 'all_chunks.pkl', 'sample_data']

In [None]:
from google.colab import files
# Download the saved index and chunk list from the Colab runtime.
files.download("cuad_faiss.index")
files.download("all_chunks.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>