In [1]:
import json
import random

def infer_risk_from_question(question: str) -> str:
    q = question.lower()
    for k, v in RISK_KEYWORDS.items():
        if k in q:
            return v
    return "medium"  # safe default


def build_risk_dataset(cuad_json, output_path, max_samples=8000):
    samples = []

    with open(cuad_json, "r") as f:
        data = json.load(f)

    for doc in data["data"]:
        for para in doc.get("paragraphs", []):
            for qa in para.get("qas", []):
                question = qa.get("question", "")
                answers = qa.get("answers", [])

                if not answers:
                    continue

                clause_text = answers[0].get("text", "").strip()
                if len(clause_text) < 50:
                    continue

                risk = infer_risk_from_question(question)

                samples.append({
                    "instruction": "Assess the risk level of the contract clause.",
                    "input": clause_text,
                    "output": f"This clause is {risk} risk based on its legal implications."
                })

                if len(samples) >= max_samples:
                    break

        if len(samples) >= max_samples:
            break

    random.shuffle(samples)

    with open(output_path, "w") as f:
        for s in samples:
            f.write(json.dumps(s) + "\n")

    print(f"Saved {len(samples)} samples to {output_path}")


In [2]:
RISK_KEYWORDS = {
    "terminate": "high",
    "termination": "high",
    "liability": "high",
    "indemnif": "medium",
    "payment": "medium",
    "confidential": "low",
    "governing law": "low",
    "jurisdiction": "low"
}

build_risk_dataset(
    cuad_json="/content/drive/MyDrive/CUAD_v1/CUAD_v1/CUAD_v1.json",
    output_path="/content/drive/MyDrive/CUAD_v1/risk_train.jsonl",
    max_samples=8000
)


Saved 4961 samples to /content/drive/MyDrive/CUAD_v1/risk_train.jsonl


In [3]:
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/CUAD_v1/risk_train.jsonl"
)["train"]

dataset = dataset.train_test_split(test_size=0.1, seed=42)

dataset["train"].to_json("/content/train.jsonl")
dataset["test"].to_json("/content/val.jsonl")

print(len(dataset["train"]), len(dataset["test"]))


Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

4464 497


In [4]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)

MODEL_NAME = "HuggingFaceTB/SmolLM-135M-Instruct"

# ---------------------------------------------------------
# LOAD DATA
# ---------------------------------------------------------
dataset = load_dataset(
    "json",
    data_files={
        "train": "/content/train.jsonl",
        "validation": "/content/val.jsonl"
    }
)

def format_prompt(ex):
    return {
        "text": (
            "### Instruction:\n"
            f"{ex['instruction']}\n\n"
            "### Clause:\n"
            f"{ex['input']}\n\n"
            "### Assessment:\n"
            f"{ex['output']}"
        )
    }

dataset = dataset.map(
    format_prompt,
    remove_columns=dataset["train"].column_names
)

# ---------------------------------------------------------
# TOKENIZER
# ---------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(ex):
    enc = tokenizer(
        ex["text"],
        truncation=True,
        max_length=256,
        padding="max_length"
    )
    enc["labels"] = enc["input_ids"].copy()
    return enc

dataset = dataset.map(tokenize, batched=True)

# ---------------------------------------------------------
# MODEL (FP32 — STABLE)
# ---------------------------------------------------------
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    device_map="auto"
)
model.gradient_checkpointing_enable()
model.config.use_cache = False

# ---------------------------------------------------------
# TRAINING ARGS
# ---------------------------------------------------------
training_args = TrainingArguments(
    output_dir="/kaggle/working/risk-smollm",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,   # effective batch = 16
    num_train_epochs=4,
    learning_rate=5e-5,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=250,
    save_steps=250,
    save_total_limit=2,
    report_to="none",
    remove_unused_columns=False
)


# ---------------------------------------------------------
# TRAINER
# ---------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"]
)

trainer.train()


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4464 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

Map:   0%|          | 0/4464 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss,Validation Loss
250,0.6775,0.676198
500,0.5865,0.641116
750,0.5498,0.625875
1000,0.5377,0.622981


TrainOutput(global_step=1116, training_loss=0.6220930870288589, metrics={'train_runtime': 1995.858, 'train_samples_per_second': 8.947, 'train_steps_per_second': 0.559, 'total_flos': 2912822646276096.0, 'train_loss': 0.6220930870288589, 'epoch': 4.0})

In [5]:
model.save_pretrained("/content/drive/MyDrive/CUAD_v1/risk-smollm")
tokenizer.save_pretrained("/content/drive/MyDrive/CUAD_v1/risk-smollm")


('/content/drive/MyDrive/CUAD_v1/risk-smollm/tokenizer_config.json',
 '/content/drive/MyDrive/CUAD_v1/risk-smollm/special_tokens_map.json',
 '/content/drive/MyDrive/CUAD_v1/risk-smollm/chat_template.jinja',
 '/content/drive/MyDrive/CUAD_v1/risk-smollm/vocab.json',
 '/content/drive/MyDrive/CUAD_v1/risk-smollm/merges.txt',
 '/content/drive/MyDrive/CUAD_v1/risk-smollm/added_tokens.json',
 '/content/drive/MyDrive/CUAD_v1/risk-smollm/tokenizer.json')

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_DIR = "/content/drive/MyDrive/CUAD_v1/risk-smollm"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    torch_dtype=torch.float32,   # safe for testing
    device_map="auto"
)

model.eval()
print("✅ Model and tokenizer loaded successfully")


`torch_dtype` is deprecated! Use `dtype` instead!


✅ Model and tokenizer loaded successfully


In [3]:
prompt = """### Instruction:
Assess the risk level of the contract clause.

### Clause:
The company may terminate this agreement at any time without prior notice.

### Assessment:
"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=80,
        do_sample=False
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


### Instruction:
Assess the risk level of the contract clause.

### Clause:
The company may terminate this agreement at any time without prior notice.

### Assessment:
This clause is high risk based on its legal implications.


In [4]:
clauses = [
    "The customer may cancel the agreement with 30 days notice.",
    "The company disclaims all liability for damages.",
    "This agreement shall be governed by the laws of California.",
]

for c in clauses:
    prompt = f"""### Instruction:
Assess the risk level of the contract clause.

### Clause:
{c}

### Assessment:
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=60)
    print("\nClause:", c)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Clause: The customer may cancel the agreement with 30 days notice.
### Instruction:
Assess the risk level of the contract clause.

### Clause:
The customer may cancel the agreement with 30 days notice.

### Assessment:
This clause is high risk based on its legal implications.

Clause: The company disclaims all liability for damages.
### Instruction:
Assess the risk level of the contract clause.

### Clause:
The company disclaims all liability for damages.

### Assessment:
This clause is high risk based on its legal implications.

Clause: This agreement shall be governed by the laws of California.
### Instruction:
Assess the risk level of the contract clause.

### Clause:
This agreement shall be governed by the laws of California.

### Assessment:
This clause is low risk based on its legal implications.
