In [1]:
#loading pytorch
!pip install -q sentence-transformers
import torch
import numpy as np
from sentence_transformers import SentenceTransformer

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())




Torch version: 2.9.0+cpu
CUDA available: False


In [2]:
#taking input
clauses = [
    {
        "clause_id": "1.1",
        "section": "Definitions",
        "text": "This agreement shall mean the employment contract entered into between the Employer and the Employee.",
        "position": 1,
        "page_no": 1,
        "layout_type": "paragraph",
        "font_size": 12,
        "language": "en"
    },
    {
        "clause_id": "2.1",
        "section": "Commencement",
        "text": "The employment shall commence on 1st July 2025 and shall continue until terminated in accordance with this agreement.",
        "position": 2,
        "page_no": 1,
        "layout_type": "paragraph",
        "font_size": 11,
        "language": "en"
    },
    {
        "clause_id": "3.1",
        "section": "Duties",
        "text": "The Employee shall perform such duties as may be assigned by the Employer from time to time.",
        "position": 4,
        "page_no": 2,
        "layout_type": "paragraph",
        "font_size": 11,
        "language": "en"
    },
    {
        "clause_id": "3.2",
        "section": "Duties",
        "text": "The Employer may, at its sole discretion, modify the duties and responsibilities of the Employee.",
        "position": 5,
        "page_no": 2,
        "layout_type": "paragraph",
        "font_size": 10,
        "language": "en"
    },
    {
        "clause_id": "4.1",
        "section": "Compensation",
        "text": "The Employee shall be paid a monthly salary as agreed between the parties.",
        "position": 7,
        "page_no": 3,
        "layout_type": "paragraph",
        "font_size": 11,
        "language": "en"
    },
    {
        "clause_id": "4.2",
        "section": "Compensation",
        "text": "The Employer reserves the right to revise the salary structure at any time without prior notice.",
        "position": 8,
        "page_no": 3,
        "layout_type": "paragraph",
        "font_size": 10,
        "language": "en"
    },
    {
        "clause_id": "5.1",
        "section": "Confidentiality",
        "text": "The Employee shall not disclose any confidential information during or after the term of employment.",
        "position": 10,
        "page_no": 4,
        "layout_type": "paragraph",
        "font_size": 11,
        "language": "en"
    },
    {
        "clause_id": "6.1",
        "section": "Termination",
        "text": "Either party may terminate this agreement by providing thirty (30) days written notice to the other party.",
        "position": 12,
        "page_no": 5,
        "layout_type": "paragraph",
        "font_size": 11,
        "language": "en"
    },

    {
        "clause_id": "7.1",
        "section": "Governing Law",
        "text": "This agreement shall be governed by and construed in accordance with the laws of India.",
        "position": 15,
        "page_no": 6,
        "layout_type": "paragraph",
        "font_size": 11,
        "language": "en"
    }
]


In [3]:
#verifying the input
print("Number of clauses:", len(clauses))
print("First clause sample:\n", clauses[0])


Number of clauses: 9
First clause sample:
 {'clause_id': '1.1', 'section': 'Definitions', 'text': 'This agreement shall mean the employment contract entered into between the Employer and the Employee.', 'position': 1, 'page_no': 1, 'layout_type': 'paragraph', 'font_size': 12, 'language': 'en'}


In [4]:
#loading Sentence-BERT model for NLP
device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer(
    "all-mpnet-base-v2",
    device=device
)

print("Sentence-BERT model loaded on:", device)
print("Embedding dimension:", model.get_sentence_embedding_dimension())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence-BERT model loaded on: cpu
Embedding dimension: 768


In [5]:
#creating embeddings
texts = [c["text"] for c in clauses]

with torch.no_grad():
    embeddings = model.encode(
        texts,
        convert_to_tensor=True,
        normalize_embeddings=True
    )

print("Embeddings shape:", embeddings.shape)
print("Embedding vector (first clause, first 5 values):")
print(embeddings[0][:5])


Embeddings shape: torch.Size([9, 768])
Embedding vector (first clause, first 5 values):
tensor([-0.0097, -0.0481,  0.0190, -0.0580, -0.0200])


In [6]:
positions = np.array([c["position"] for c in clauses], dtype=np.float32)
positions_norm = positions / positions.max()
positions_tensor = torch.tensor(positions_norm).unsqueeze(1)

print("Original positions:", positions)
print("Normalized positions:", positions_norm)
print("Position tensor shape:", positions_tensor.shape)


Original positions: [ 1.  2.  4.  5.  7.  8. 10. 12. 15.]
Normalized positions: [0.06666667 0.13333334 0.26666668 0.33333334 0.46666667 0.53333336
 0.6666667  0.8        1.        ]
Position tensor shape: torch.Size([9, 1])


In [7]:
sections = list(set(c["section"] for c in clauses))
section_to_id = {sec: i for i, sec in enumerate(sections)}

section_ids = torch.tensor(
    [section_to_id[c["section"]] for c in clauses],
    dtype=torch.float32
).unsqueeze(1)

print("Section to ID mapping:", section_to_id)
print("Section ID tensor:", section_ids.squeeze().tolist())
print("Section ID tensor shape:", section_ids.shape)


Section to ID mapping: {'Duties': 0, 'Confidentiality': 1, 'Definitions': 2, 'Governing Law': 3, 'Commencement': 4, 'Termination': 5, 'Compensation': 6}
Section ID tensor: [2.0, 4.0, 0.0, 0.0, 6.0, 6.0, 1.0, 5.0, 3.0]
Section ID tensor shape: torch.Size([9, 1])


In [8]:
X = torch.cat(
    [embeddings, positions_tensor, section_ids],
    dim=1
)

print("Final node feature matrix X shape:", X.shape)
print("First node feature vector length:", X[0].shape[0])
print("Last 5 features of first node:", X[0][-5:])


Final node feature matrix X shape: torch.Size([9, 770])
First node feature vector length: 770
Last 5 features of first node: tensor([-0.0118, -0.0445, -0.0199,  0.0667,  2.0000])


In [9]:
node_metadata = {
    i: {
        "clause_id": clauses[i]["clause_id"],
        "section": clauses[i]["section"],
        "text": clauses[i]["text"]
    }
    for i in range(len(clauses))
}

print("Node metadata:")
for k, v in node_metadata.items():
    print(f"Node {k} →", v)


Node metadata:
Node 0 → {'clause_id': '1.1', 'section': 'Definitions', 'text': 'This agreement shall mean the employment contract entered into between the Employer and the Employee.'}
Node 1 → {'clause_id': '2.1', 'section': 'Commencement', 'text': 'The employment shall commence on 1st July 2025 and shall continue until terminated in accordance with this agreement.'}
Node 2 → {'clause_id': '3.1', 'section': 'Duties', 'text': 'The Employee shall perform such duties as may be assigned by the Employer from time to time.'}
Node 3 → {'clause_id': '3.2', 'section': 'Duties', 'text': 'The Employer may, at its sole discretion, modify the duties and responsibilities of the Employee.'}
Node 4 → {'clause_id': '4.1', 'section': 'Compensation', 'text': 'The Employee shall be paid a monthly salary as agreed between the parties.'}
Node 5 → {'clause_id': '4.2', 'section': 'Compensation', 'text': 'The Employer reserves the right to revise the salary structure at any time without prior notice.'}
Node 6 

In [10]:
#making graph
import itertools
from sklearn.metrics.pairwise import cosine_similarity

num_nodes = len(clauses)

node_id_map = {
    i: clauses[i]["clause_id"]
    for i in range(num_nodes)
}

print("Node ID Mapping:")
for k, v in node_id_map.items():
    print(f"Node {k} -> Clause {v}")


Node ID Mapping:
Node 0 -> Clause 1.1
Node 1 -> Clause 2.1
Node 2 -> Clause 3.1
Node 3 -> Clause 3.2
Node 4 -> Clause 4.1
Node 5 -> Clause 4.2
Node 6 -> Clause 5.1
Node 7 -> Clause 6.1
Node 8 -> Clause 7.1


In [11]:
edge_src = []
edge_dst = []
edge_labels = []  # for explainability only


In [12]:
for i in range(num_nodes - 1):
    edge_src.append(i)
    edge_dst.append(i + 1)
    edge_labels.append("sequential")

    edge_src.append(i + 1)
    edge_dst.append(i)
    edge_labels.append("sequential")
print("Sequential edges added:")
for i in range(len(edge_labels)):
    if edge_labels[i] == "sequential":
        print(f"{edge_src[i]} -> {edge_dst[i]}")


Sequential edges added:
0 -> 1
1 -> 0
1 -> 2
2 -> 1
2 -> 3
3 -> 2
3 -> 4
4 -> 3
4 -> 5
5 -> 4
5 -> 6
6 -> 5
6 -> 7
7 -> 6
7 -> 8
8 -> 7


In [13]:
for i, j in itertools.combinations(range(num_nodes), 2):
    if clauses[i]["section"] == clauses[j]["section"]:
        edge_src.extend([i, j])
        edge_dst.extend([j, i])
        edge_labels.extend(["same_section", "same_section"])
print("Section-based edges:")
for i, label in enumerate(edge_labels):
    if label == "same_section":
        print(f"{edge_src[i]} -> {edge_dst[i]}")


Section-based edges:
2 -> 3
3 -> 2
4 -> 5
5 -> 4


In [14]:
embedding_np = embeddings.cpu().numpy()

sim_matrix = cosine_similarity(embedding_np)

print("Cosine similarity matrix:")
print(np.round(sim_matrix, 3))


Cosine similarity matrix:
[[1.    0.501 0.513 0.432 0.635 0.411 0.465 0.402 0.611]
 [0.501 1.    0.353 0.243 0.478 0.379 0.303 0.429 0.369]
 [0.513 0.353 1.    0.59  0.531 0.338 0.435 0.146 0.312]
 [0.432 0.243 0.59  1.    0.384 0.559 0.454 0.177 0.19 ]
 [0.635 0.478 0.531 0.384 1.    0.533 0.333 0.294 0.447]
 [0.411 0.379 0.338 0.559 0.533 1.    0.367 0.306 0.279]
 [0.465 0.303 0.435 0.454 0.333 0.367 1.    0.21  0.197]
 [0.402 0.429 0.146 0.177 0.294 0.306 0.21  1.    0.368]
 [0.611 0.369 0.312 0.19  0.447 0.279 0.197 0.368 1.   ]]


In [15]:
SIM_THRESHOLD = 0.6

for i in range(num_nodes):
    for j in range(i + 1, num_nodes):
        if sim_matrix[i, j] >= SIM_THRESHOLD:
            edge_src.extend([i, j])
            edge_dst.extend([j, i])
            edge_labels.extend(["semantic", "semantic"])
print("Semantic similarity edges:")
for i, label in enumerate(edge_labels):
    if label == "semantic":
        print(f"{edge_src[i]} -> {edge_dst[i]}")


Semantic similarity edges:
0 -> 4
4 -> 0
0 -> 8
8 -> 0


In [16]:
edge_index = torch.tensor(
    [edge_src, edge_dst],
    dtype=torch.long
)

print("Edge index shape:", edge_index.shape)
print("Edge index tensor:")
print(edge_index)


Edge index shape: torch.Size([2, 24])
Edge index tensor:
tensor([[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 4, 5, 0, 4, 0, 8],
        [1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 3, 2, 5, 4, 4, 0, 8, 0]])


In [17]:
#printing final graph
print("\nFinal Graph Edges:")
for i in range(len(edge_labels)):
    print(
        f"Node {edge_src[i]} ({node_id_map[edge_src[i]]}) "
        f"-> Node {edge_dst[i]} ({node_id_map[edge_dst[i]]}) "
        f"| type: {edge_labels[i]}"
    )



Final Graph Edges:
Node 0 (1.1) -> Node 1 (2.1) | type: sequential
Node 1 (2.1) -> Node 0 (1.1) | type: sequential
Node 1 (2.1) -> Node 2 (3.1) | type: sequential
Node 2 (3.1) -> Node 1 (2.1) | type: sequential
Node 2 (3.1) -> Node 3 (3.2) | type: sequential
Node 3 (3.2) -> Node 2 (3.1) | type: sequential
Node 3 (3.2) -> Node 4 (4.1) | type: sequential
Node 4 (4.1) -> Node 3 (3.2) | type: sequential
Node 4 (4.1) -> Node 5 (4.2) | type: sequential
Node 5 (4.2) -> Node 4 (4.1) | type: sequential
Node 5 (4.2) -> Node 6 (5.1) | type: sequential
Node 6 (5.1) -> Node 5 (4.2) | type: sequential
Node 6 (5.1) -> Node 7 (6.1) | type: sequential
Node 7 (6.1) -> Node 6 (5.1) | type: sequential
Node 7 (6.1) -> Node 8 (7.1) | type: sequential
Node 8 (7.1) -> Node 7 (6.1) | type: sequential
Node 2 (3.1) -> Node 3 (3.2) | type: same_section
Node 3 (3.2) -> Node 2 (3.1) | type: same_section
Node 4 (4.1) -> Node 5 (4.2) | type: same_section
Node 5 (4.2) -> Node 4 (4.1) | type: same_section
Node 0 (1.1)

In [18]:
#step3:GNN
!pip install -q torch-geometric
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
print("PyTorch version:", torch.__version__)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hPyTorch version: 2.9.0+cpu


In [19]:
num_nodes = X.shape[0]

data = Data(
    x=X,
    edge_index=edge_index
)

print(data)
print("Number of nodes:", data.num_nodes)
print("Number of edges:", data.num_edges)


Data(x=[9, 770], edge_index=[2, 24])
Number of nodes: 9
Number of edges: 24


In [20]:
y = torch.tensor([0, 2], dtype=torch.long)  # example
data.y = y

print("Node labels:", data.y.tolist())


Node labels: [0, 2]


In [21]:
class GATNodeClassifier(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_classes):
        super().__init__()
        self.gat1 = GATConv(
            in_channels,
            hidden_channels,
            heads=4,
            dropout=0.3
        )
        self.gat2 = GATConv(
            hidden_channels * 4,
            num_classes,
            heads=1,
            concat=False
        )

    def forward(self, x, edge_index):
        x = self.gat1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.gat2(x, edge_index)
        return x


In [22]:
model = GATNodeClassifier(
    in_channels=X.shape[1],
    hidden_channels=64,
    num_classes=3
)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = torch.nn.CrossEntropyLoss()

print(model)


GATNodeClassifier(
  (gat1): GATConv(770, 64, heads=4)
  (gat2): GATConv(256, 3, heads=1)
)


In [23]:
# =========================
# WEAK LABEL GENERATOR
# =========================

LABEL_MAP = {
    "Neutral": 0,
    "Pro": 1,
    "Con": 2
}

def weak_label_clause(clause):
    """
    Generates a weak (noisy) label for a legal clause
    without manual annotation.
    """
    text = clause["text"].lower()
    section = clause["section"].lower()

    # Strong negative (unfavorable) signals
    con_patterns = [
        "sole discretion",
        "without notice",
        "at any time",
        "for any reason",
        "reserves the right",
        "immediately"
    ]

    # Strong positive (favorable) signals
    pro_patterns = [
        "mutual",
        "thirty (30) days",
        "prior written notice",
        "severance",
        "compensation",
        "by either party"
    ]

    # Always-neutral sections
    neutral_sections = [
        "definitions",
        "interpretation",
        "governing law"
    ]

    # Section-based neutral rule
    if section in neutral_sections:
        return LABEL_MAP["Neutral"]

    # Check negative signals
    for pat in con_patterns:
        if pat in text:
            return LABEL_MAP["Con"]

    # Check positive signals
    for pat in pro_patterns:
        if pat in text:
            return LABEL_MAP["Pro"]

    # Default
    return LABEL_MAP["Neutral"]


In [24]:
weak_labels = [weak_label_clause(c) for c in clauses]
data.y = torch.tensor(weak_labels, dtype=torch.long)

print("Weak labels generated:")
for i, lbl in enumerate(weak_labels):
    print(
        f"Clause {clauses[i]['clause_id']} → "
        f"{list(LABEL_MAP.keys())[list(LABEL_MAP.values()).index(lbl)]}"
    )


Weak labels generated:
Clause 1.1 → Neutral
Clause 2.1 → Neutral
Clause 3.1 → Neutral
Clause 3.2 → Con
Clause 4.1 → Neutral
Clause 4.2 → Con
Clause 5.1 → Neutral
Clause 6.1 → Pro
Clause 7.1 → Neutral


In [25]:
from collections import Counter

counts = Counter(weak_labels)
total = sum(counts.values())

class_weights = torch.tensor(
    [total / counts[i] for i in range(3)],
    dtype=torch.float
)

criterion = torch.nn.CrossEntropyLoss(weight=class_weights)


In [26]:
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.003,
    weight_decay=1e-4
)


In [27]:
def train_weakly_supervised_gnn(model, data, optimizer, criterion, epochs=80):
    model.train()

    for epoch in range(1, epochs + 1):
        optimizer.zero_grad()

        logits = model(data.x, data.edge_index)
        loss = criterion(logits, data.y)

        loss.backward()
        optimizer.step()

        if epoch % 10 == 0 or epoch == 1:
            print(
                f"Epoch {epoch:02d} | "
                f"Weak-Loss: {loss.item():.4f}"
            )


In [28]:
train_weakly_supervised_gnn(
    model=model,
    data=data,
    optimizer=optimizer,
    criterion=criterion,
    epochs=80
)


Epoch 01 | Weak-Loss: 1.1166
Epoch 10 | Weak-Loss: 0.4585
Epoch 20 | Weak-Loss: 0.2233
Epoch 30 | Weak-Loss: 0.1274
Epoch 40 | Weak-Loss: 0.0304
Epoch 50 | Weak-Loss: 0.1052
Epoch 60 | Weak-Loss: 0.0268
Epoch 70 | Weak-Loss: 0.0059
Epoch 80 | Weak-Loss: 0.0558


In [29]:
data.y = None  # inference mode
model.eval()

with torch.no_grad():
    logits = model(data.x, data.edge_index)
    probs = torch.softmax(logits, dim=1)
    preds = torch.argmax(probs, dim=1)


In [30]:
INV_LABEL_MAP = {0: "Neutral", 1: "Pro", 2: "Con"}

print("\nFinal GNN Predictions:\n")

for i in range(data.num_nodes):
    conf = probs[i][preds[i]].item()
    label = INV_LABEL_MAP[preds[i].item()]

    flag = "⚠️ Review" if conf < 0.6 else ""

    print(
        f"Clause {clauses[i]['clause_id']} | "
        f"{label} | "
        f"Confidence: {conf:.2f} {flag}"
    )



Final GNN Predictions:

Clause 1.1 | Neutral | Confidence: 1.00 
Clause 2.1 | Neutral | Confidence: 1.00 
Clause 3.1 | Neutral | Confidence: 1.00 
Clause 3.2 | Con | Confidence: 1.00 
Clause 4.1 | Neutral | Confidence: 1.00 
Clause 4.2 | Con | Confidence: 0.98 
Clause 5.1 | Neutral | Confidence: 0.98 
Clause 6.1 | Pro | Confidence: 1.00 
Clause 7.1 | Neutral | Confidence: 1.00 


In [31]:
def is_amendable_clause(clause, label, confidence):
    non_amendable_sections = [
        "definitions",
        "interpretation",
        "governing law"
    ]

    if clause["section"].lower() in non_amendable_sections:
        return False

    if label == "Con":
        return True

    if confidence < 0.6:  # Needs review
        return True

    return False


In [32]:
print(is_amendable_clause(clauses[0], "Neutral", 0.77))  # Definitions
print(is_amendable_clause(clauses[3], "Con", 0.92))      # Risky clause


False
True


In [33]:
INV_LABEL_MAP = {0: "Neutral", 1: "Pro", 2: "Con"}

print("\nGNN OUTPUT SUMMARY:\n")

for i in range(len(clauses)):
    print(
        f"Clause {clauses[i]['clause_id']} | "
        f"Section: {clauses[i]['section']} | "
        f"Label: {INV_LABEL_MAP[preds[i].item()]} | "
        f"Confidence: {probs[i][preds[i]].item():.2f}"
    )



GNN OUTPUT SUMMARY:

Clause 1.1 | Section: Definitions | Label: Neutral | Confidence: 1.00
Clause 2.1 | Section: Commencement | Label: Neutral | Confidence: 1.00
Clause 3.1 | Section: Duties | Label: Neutral | Confidence: 1.00
Clause 3.2 | Section: Duties | Label: Con | Confidence: 1.00
Clause 4.1 | Section: Compensation | Label: Neutral | Confidence: 1.00
Clause 4.2 | Section: Compensation | Label: Con | Confidence: 0.98
Clause 5.1 | Section: Confidentiality | Label: Neutral | Confidence: 0.98
Clause 6.1 | Section: Termination | Label: Pro | Confidence: 1.00
Clause 7.1 | Section: Governing Law | Label: Neutral | Confidence: 1.00


In [34]:
clauses_for_genai = []

for i in range(len(clauses)):
    label = INV_LABEL_MAP[preds[i].item()]
    confidence = probs[i][preds[i]].item()

    if is_amendable_clause(clauses[i], label, confidence):
        clauses_for_genai.append({
            "index": i,
            "clause_id": clauses[i]["clause_id"],
            "section": clauses[i]["section"],
            "text": clauses[i]["text"],
            "label": label,
            "confidence": confidence
        })


In [35]:
print("\nCLAUSES SENT TO GENAI:\n")

for c in clauses_for_genai:
    print(
        f"Clause {c['clause_id']} | "
        f"Label: {c['label']} | "
        f"Confidence: {c['confidence']:.2f}"
    )



CLAUSES SENT TO GENAI:

Clause 3.2 | Label: Con | Confidence: 1.00
Clause 4.2 | Label: Con | Confidence: 0.98


In [37]:
!pip install -q transformers sentencepiece
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
device = "cpu"
print("Running on:", device)


Running on: cpu


In [38]:
model_id = "google/flan-t5-large"

tokenizer = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

gen_model = gen_model.to(device)
print("FLAN-T5-LARGE loaded on CPU")


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

FLAN-T5-LARGE loaded on CPU


In [39]:
def build_cpu_legal_prompt(clause_text):
    return (
        "You are a legal contract drafter. "
        "Rewrite the clause below to be fairer to the employee.\n\n"

        "STRICT RULES:\n"
        "- Output ONLY the rewritten legal clause.\n"
        "- Do NOT give advice or explanations.\n"
        "- Do NOT mention lawyers, attorneys, courts, or external help.\n"
        "- Do NOT suggest adding separate clauses.\n"
        "- Do NOT repeat the original wording.\n"
        "- Use formal contract language only.\n\n"

        f"Original clause: {clause_text}\n\n"
        "Rewritten clause:"
    )


In [40]:
def generate_cpu_amendment(prompt):
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )

    outputs = gen_model.generate(
        **inputs,
        max_new_tokens=120,
        min_new_tokens=50,
        do_sample=True,          # REQUIRED
        temperature=0.6,
        top_p=0.9,
        repetition_penalty=1.4,
        no_repeat_ngram_size=4
    )

    text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "Rewritten clause:" in text:
        text = text.split("Rewritten clause:")[-1]

    return text.strip()


In [41]:
print("\nCPU-BASED GENAI AMENDMENTS:\n")

cpu_amendments = []

for c in clauses_for_genai:
    print("=" * 70)
    print(f"Clause {c['clause_id']} | Section: {c['section']}")
    print(f"GNN Label: {c['label']} | Confidence: {c['confidence']:.2f}")

    print("\nORIGINAL CLAUSE:")
    print(c["text"])

    prompt = build_cpu_legal_prompt(c["text"])
    amended = generate_cpu_amendment(prompt)

    print("\nREWRITTEN CLAUSE:")
    print(amended)

    cpu_amendments.append({
        "clause_id": c["clause_id"],
        "original": c["text"],
        "amended": amended
    })



CPU-BASED GENAI AMENDMENTS:

Clause 3.2 | Section: Duties
GNN Label: Con | Confidence: 1.00

ORIGINAL CLAUSE:
The Employer may, at its sole discretion, modify the duties and responsibilities of the Employee.

REWRITTEN CLAUSE:
The Employer may, at its sole discretion, terminate the employment of the Employee. Upon termination, the Employee will be entitled to terminate the contract with the Employer. If the Employer terminates the contract, the Employer shall have no further obligation to pay the employee.
Clause 4.2 | Section: Compensation
GNN Label: Con | Confidence: 0.98

ORIGINAL CLAUSE:
The Employer reserves the right to revise the salary structure at any time without prior notice.

REWRITTEN CLAUSE:
The Employer reserves the right to revise the salary structure at any time without prior notice. The Employer reserves all rights not expressly granted in this contract. The Employer shall not be liable for any loss, damages, or inconvenience caused by any modification of the salary 