In [2]:
import json
import os

CUAD_JSON = "/content/drive/MyDrive/CUAD_v1/CUAD_v1/CUAD_v1.json"
OUTPUT_JSONL = "/content/drive/MyDrive/CUAD_v1/cuad_instructions.jsonl"

def load_cuad(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)["data"]

def convert_to_instructions(cuad_data):
    samples = []

    for doc in cuad_data:
        context = doc["paragraphs"][0]["context"]

        for qa in doc["paragraphs"][0]["qas"]:
            instruction = qa["question"]
            input_text = context

            if qa["is_impossible"]:
                output = "The contract does not contain this clause."
            else:
                answers = [a["text"] for a in qa["answers"]]
                output = " ".join(set(answers))

            samples.append({
                "instruction": instruction,
                "input": input_text,
                "output": output
            })

    return samples

def save_jsonl(samples, path):
    with open(path, "w", encoding="utf-8") as f:
        for s in samples:
            f.write(json.dumps(s) + "\n")

if __name__ == "__main__":
    cuad_data = load_cuad(CUAD_JSON)
    instructions = convert_to_instructions(cuad_data)
    save_jsonl(instructions, OUTPUT_JSONL)

    print(f"Saved {len(instructions)} instruction samples")


Saved 20910 instruction samples


In [5]:
import json
import math

INPUT = "/content/drive/MyDrive/CUAD_v1/cuad_instructions.jsonl"
TRAIN = "/content/drive/MyDrive/CUAD_v1/train.jsonl"
VAL = "/content/drive/MyDrive/CUAD_v1/val.jsonl"

MAX_CHARS = 12500     # ~3400 tokens
OVERLAP = 800         # preserves clause continuity

train, val = [], []

def chunk_text(text, max_chars, overlap):
    chunks = []
    start = 0
    while start < len(text):
        end = start + max_chars
        chunk = text[start:end]
        chunks.append(chunk)
        start = end - overlap
    return chunks

with open(INPUT, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        item = json.loads(line)

        if not item["output"].strip():
            continue

        chunks = chunk_text(item["input"], MAX_CHARS, OVERLAP)

        for j, chunk in enumerate(chunks):
            new_item = {
                "instruction": item["instruction"],
                "input": chunk,
                "output": item["output"]
            }

            # 10% validation
            if (i + j) % 10 == 0:
                val.append(new_item)
            else:
                train.append(new_item)

def write(path, data):
    with open(path, "w", encoding="utf-8") as f:
        for x in data:
            f.write(json.dumps(x) + "\n")

write(TRAIN, train)
write(VAL, val)

print("Train:", len(train))
print("Val:", len(val))


Train: 94685
Val: 10521
