In [None]:
# Environment check: print the CUDA toolkit version reported by nvcc, then the
# CUDA version the installed PyTorch build reports via torch.version.cuda.
!nvcc --version
!python -c "import torch; print('PyTorch CUDA:', torch.version.cuda)"

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
PyTorch CUDA: 12.1


In [None]:
# !pip uninstall -y torch torchvision torchaudio
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121  # CUDA 12.1 is closest available (12.5 not yet supported)

In [None]:
# %pip install transformers datasets evaluate rouge_score peft nltk

In [None]:
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import evaluate
import numpy as np

In [None]:
# Load model & tokenizer
# Both objects are created from the same checkpoint name so the tokenizer's
# vocabulary matches the model being fine-tuned.
model_name = "google-t5/t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Apply LoRA
# Adapters are attached to the modules named "q", "v" and "k"; bias terms are
# left out of the adapter (bias="none").
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q", "v", "k"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)
# Rebinds `model` to the PEFT-wrapped model; every later cell that uses
# `model` therefore sees the LoRA-wrapped version.
model = get_peft_model(model, lora_config)
# Prints the trainable vs. total parameter counts (about 2.84% trainable in
# the recorded run).
model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 62,276,096 || trainable%: 2.8413


In [None]:
# Load dataset & tokenize
dataset = load_dataset("AjayMukundS/Legal_Text_Summarization-llama2")


def tokenize_function(examples, max_input_length=512, max_target_length=512):
    """Tokenize a batch of documents and their reference summaries.

    No padding is applied here on purpose. Padding every label sequence to a
    fixed length with the tokenizer's pad id leaves those pad positions in
    the labels, so they are scored by the loss like real tokens. Leaving the
    sequences unpadded lets the seq2seq data collator pad each batch
    dynamically and fill label padding with -100, which the loss ignores.

    Args:
        examples: Batch dict from ``datasets.map(batched=True)``; the
            ``"text"`` and ``"summary"`` columns are read.
        max_input_length: Truncation length (in tokens) for the encoder input.
        max_target_length: Truncation length (in tokens) for the labels.

    Returns:
        The tokenizer output for the inputs with an added ``"labels"`` entry
        holding the token ids of the summaries.
    """
    # Tokenize inputs (encoder)
    model_inputs = tokenizer(
        examples["text"],
        max_length=max_input_length,
        truncation=True,
    )

    # Tokenize labels (decoder); `text_target` marks these as target text
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Apply tokenization
tokenized_ds = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7773 [00:00<?, ? examples/s]

In [None]:
# Data collator for seq2seq
# Built from the tokenizer and the (LoRA-wrapped) model; it is handed to the
# trainer below to assemble batches from the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [None]:
# Metrics (ROUGE)
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays as handed
            over by the trainer. ``predictions`` may also be a tuple whose
            first element holds the generated ids.

    Returns:
        Dict of the ROUGE metrics plus ``"gen_len"`` (mean number of non-pad
        tokens per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a real token id: it cannot be
    # decoded and must not be counted as generated content. Map it back to
    # the pad id in BOTH arrays before decoding or measuring lengths.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Training configuration for the LoRA fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="legal-t5-lora",
    run_name="legal-lora-t5-small",
    eval_strategy="epoch",
    # NOTE(review): the recorded run's training loss only moved from ~3.71 to
    # ~3.55 over 4 epochs at this rate; consider tuning it for adapter training.
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    # Bound evaluation-time generation explicitly, using the same 150-token
    # limit as the standalone ROUGE evaluation later in this notebook, so the
    # per-epoch ROUGE numbers are measured under comparable settings.
    generation_max_length=150,
    fp16=True,
    push_to_hub=False,
    # The PEFT wrapper hides the base model's forward signature, so the
    # trainer cannot infer the label column on its own (it logs
    # "No label_names provided ..."); name it explicitly.
    label_names=["labels"],
)

# Evaluate on the "test" split after every epoch, scoring generated
# summaries with compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,3.418691,0.0112,0.0044,0.0091,0.0092,4.7


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,3.418691,0.0112,0.0044,0.0091,0.0092,4.7
2,3.708100,3.377537,0.0153,0.0064,0.0129,0.0127,5.8
3,3.596800,3.358495,0.0236,0.0101,0.0198,0.0198,8.8
4,3.548800,3.352857,0.0291,0.0132,0.0245,0.0245,10.5


TrainOutput(global_step=1944, training_loss=3.5981584085849088, metrics={'train_runtime': 1332.3658, 'train_samples_per_second': 23.336, 'train_steps_per_second': 1.459, 'total_flos': 4377057744125952.0, 'train_loss': 3.5981584085849088, 'epoch': 4.0})

In [None]:
# Persist the fine-tuned (LoRA-wrapped) model and the tokenizer into the same
# directory that later cells load from.
# NOTE(review): `model` is the PEFT wrapper here, so this presumably writes the
# adapter weights rather than a full standalone checkpoint — confirm.
model.save_pretrained("legal-t5-lora")
tokenizer.save_pretrained("legal-t5-lora")

('legal-t5-lora/tokenizer_config.json',
 'legal-t5-lora/special_tokens_map.json',
 'legal-t5-lora/spiece.model',
 'legal-t5-lora/added_tokens.json',
 'legal-t5-lora/tokenizer.json')

In [None]:
# Sample input for a quick smoke test of the saved model. The string already
# starts with the "summarize: " task prefix.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [None]:
from transformers import pipeline

# Smoke test: summarize the sample text with the saved fine-tuned model.
summarizer = pipeline("summarization", model="legal-t5-lora")
# Pass explicit length bounds: with the defaults the pipeline warns that its
# max_length (200) exceeds the length of this short input (103 tokens).
summarizer(text, max_length=60, min_length=20)

Device set to use cuda:0
Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "the inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs . it's the most aggressive action on tackling the climate crisis in American history . no one making under $400,000 per year will pay a penny more in taxes ."}]

In [None]:
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
from rouge_score import rouge_scorer
import torch
import nltk

# Initialize
# NOTE(review): nothing else in this cell calls nltk directly — confirm the
# punkt download is actually needed.
nltk.download('punkt')
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()

# Load dataset
# Only the first 10% of the test split is loaded. This rebinds `dataset`,
# which an earlier cell bound to the full dataset used for tokenization.
dataset = load_dataset("AjayMukundS/Legal_Text_Summarization-llama2", split="test[:10%]")

# =============================================
# 1. Evaluate Pre-trained Model (Baseline)
# =============================================
pretrained_model_name = "google-t5/t5-small"

# Load with memory optimization
# Half precision is requested only when CUDA is available; otherwise float32.
# NOTE(review): verify that float16 inference is numerically stable for this
# checkpoint before trusting the scores.
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True
)
# Rebinds the notebook-level `tokenizer` (same checkpoint name as before).
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

# Initialize ROUGE
# Scores ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_model(model, tokenizer, dataset, max_samples=50):
    """Generate summaries for up to ``max_samples`` examples and average ROUGE.

    Each example's ``"text"`` is tokenized (truncated to 1024 tokens),
    summarized with beam search, and scored against its ``"summary"`` using
    the notebook-level ``scorer``.

    Args:
        model: Seq2seq model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``.
        dataset: Iterable of dicts with ``"text"`` and ``"summary"`` keys.
        max_samples: Maximum number of examples to score (limits memory/time).
            ``None`` scores the whole dataset.

    Returns:
        Dict with the mean F-measure for ``'rouge1'``, ``'rouge2'`` and
        ``'rougeL'``.

    Raises:
        ValueError: If no example was scored (empty dataset or
            ``max_samples`` of 0), instead of failing with a division by zero.
    """
    from itertools import islice

    metric_names = ('rouge1', 'rouge2', 'rougeL')
    # Send inputs to wherever the model actually lives rather than relying on
    # a notebook-level `device` variable that may be stale or undefined.
    target_device = model.device

    # Keep running sums instead of storing every per-sample score object.
    totals = dict.fromkeys(metric_names, 0.0)
    n_scored = 0

    for example in islice(dataset, max_samples):
        inputs = tokenizer(example["text"],
                           truncation=True,
                           max_length=1024,
                           return_tensors="pt").to(target_device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=150,
                min_length=50,
                num_beams=4,
                early_stopping=True
            )

        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        ref = example["summary"]

        sample_scores = scorer.score(ref, pred)
        for name in metric_names:
            totals[name] += sample_scores[name].fmeasure
        n_scored += 1

        # Release cached GPU blocks between samples to keep peak memory low.
        torch.cuda.empty_cache()

    if n_scored == 0:
        raise ValueError(
            "evaluate_model scored no examples: the dataset is empty or max_samples is 0"
        )

    # Calculate averages
    return {name: totals[name] / n_scored for name in metric_names}

# Score the untouched checkpoint first so it serves as the baseline.
print("Evaluating pre-trained model...")
pretrained_scores = evaluate_model(pretrained_model, tokenizer, dataset)
print(f"Pre-trained ROUGE: {pretrained_scores}")

# Drop the baseline model and release cached GPU memory before loading the
# next one.
del pretrained_model
torch.cuda.empty_cache()

# ---------------------------------------------
# 2. Fine-tuned model
# ---------------------------------------------
finetuned_path = "legal-t5-lora"  # directory written by the save cell

# Same loading options as the baseline: half precision on GPU, float32 on CPU.
finetuned_dtype = torch.float16 if device == "cuda" else torch.float32
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(
    finetuned_path,
    torch_dtype=finetuned_dtype,
    device_map="auto",
    low_cpu_mem_usage=True,
)
finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_path)

print("\nEvaluating fine-tuned model...")
finetuned_scores = evaluate_model(finetuned_model, finetuned_tokenizer, dataset)
print(f"Fine-tuned ROUGE: {finetuned_scores}")

# ---------------------------------------------
# 3. Side-by-side report
# ---------------------------------------------
print("\nPerformance Comparison:")
for metric in ('rouge1', 'rouge2', 'rougeL'):
    baseline_value = pretrained_scores[metric]
    finetuned_value = finetuned_scores[metric]
    delta = finetuned_value - baseline_value
    print(f"{metric.upper():<7}: {baseline_value:.3f} → {finetuned_value:.3f} ({delta:+.3f})")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Evaluating pre-trained model...
Pre-trained ROUGE: {'rouge1': 0.14013292474655317, 'rouge2': 0.05397039884115993, 'rougeL': 0.09792356565974421}

Evaluating fine-tuned model...
Fine-tuned ROUGE: {'rouge1': 0.19945795550302123, 'rouge2': 0.09179483394193164, 'rougeL': 0.1272012410494623}

Performance Comparison:
ROUGE1 : 0.140 → 0.199 (+0.059)
ROUGE2 : 0.054 → 0.092 (+0.038)
ROUGEL : 0.098 → 0.127 (+0.029)
