In [1]:
!pip install transformers peft accelerate datasets sentencepiece --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

# import os
# os.chdir('/content/drive/My Drive/_NLP/_NLP_Project/ModelTraining')

Mounted at /content/drive


## Qwen

In [3]:
# --- Imports ---
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [18]:
# --- 2. Load Dataset ---
dataset = load_dataset("NebulaSense/Legal_Clause_Instructions")
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# --- 3. Load GPT-2 Model and Tokenizer (Fix pad_token) ---
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token  # ✅ Fix
gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=gpt2_tokenizer.eos_token_id)


In [19]:
# --- 4. Preprocessing Function ---
max_seq_length = 512

def preprocess_function(example):
    instruction = example['Instruction']
    input_field = example.get('Input', None)
    output_field = example['Output']

    if input_field:
        input_text = instruction + "\n" + input_field
    else:
        input_text = instruction

    # For Causal LM: input = prompt + expected output
    full_text = input_text + "\n" + output_field

    model_inputs = gpt2_tokenizer(
        full_text,
        max_length=max_seq_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    return {
        "input_ids": model_inputs["input_ids"].squeeze(0),
        "attention_mask": model_inputs["attention_mask"].squeeze(0),
        "labels": model_inputs["input_ids"].squeeze(0).clone()
    }

# --- 5. Tokenize Datasets ---
tokenized_train = train_dataset.map(preprocess_function, remove_columns=train_dataset.column_names)
tokenized_test = test_dataset.map(preprocess_function, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/4557 [00:00<?, ? examples/s]

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

In [20]:
!pip install bert_score



In [23]:
# --- 6. Define Training Arguments ---
training_args = TrainingArguments(
    output_dir="./gpt2_legal_outputs",
    eval_steps=500,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",
    fp16=True if torch.cuda.is_available() else False,
)

# --- 7. Define Trainer ---
trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=gpt2_tokenizer,
)

# --- 8. Train ---
trainer.train()

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.248
200,0.6837
300,0.6156
400,0.5909
500,0.5687
600,0.5426
700,0.5255
800,0.5092
900,0.5217
1000,0.4887


TrainOutput(global_step=3420, training_loss=0.47800258711764687, metrics={'train_runtime': 232.2804, 'train_samples_per_second': 58.856, 'train_steps_per_second': 14.724, 'total_flos': 3572123369472000.0, 'train_loss': 0.47800258711764687, 'epoch': 3.0})

In [24]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# 1. Load dataset
dataset = load_dataset("NebulaSense/Legal_Clause_Instructions")
test_dataset = dataset["test"]

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# 3. Prepare input texts
input_texts = []
reference_outputs = []

for example in test_dataset:
    instr_type = example["Instruction_Type"]
    instruction = example["Instruction"]
    input_field = example.get("Input", None)

    if instr_type not in ["generation", "modification"]:
        continue

    prompt = get_instruction_prompt(instr_type, instruction, input_field)

    if prompt and example["Output"]:
        input_texts.append(prompt)
        reference_outputs.append(example["Output"])


In [26]:
from tqdm import tqdm

batch_size = 16
generated_texts = []


batches = [input_texts[i:i+batch_size] for i in range(0, len(input_texts), batch_size)]

for batch in tqdm(batches, desc="Batch Generating..."):

    inputs = gpt2_tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            no_repeat_ngram_size=4,
            pad_token_id=gpt2_tokenizer.eos_token_id,
            eos_token_id=gpt2_tokenizer.eos_token_id
        )


    batch_preds = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True)


    for idx, pred_text in enumerate(batch_preds):
        prompt = batch[idx]
        if pred_text.lower().startswith(prompt.lower().strip()):
            pred_text = pred_text[len(prompt):].strip()
        generated_texts.append(pred_text)

Batch Generating...:   0%|          | 0/32 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batch Generating...:   3%|▎         | 1/32 [00:15<07:55, 15.34s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batch Generating...:   6%|▋         | 2/32 [00:29<07:23, 14.78s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batch Generating...:   9%|▉         | 3/32 [00:44<07:02, 14.55s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Batch Generating...:  12%|█▎        | 4/32 [00:5

In [27]:
from bert_score import score

P, R, F1 = score(generated_texts, reference_outputs, lang="en", verbose=True)

print(f"Precision: {P.mean().item():.4f}")
print(f"Recall: {R.mean().item():.4f}")
print(f"F1: {F1.mean().item():.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/16 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/8 [00:00<?, ?it/s]



done in 19.02 seconds, 26.65 sentences/sec
Precision: 0.7315
Recall: 0.7936
F1: 0.7608
