# Install required libraries (if not already installed)

In [1]:
!pip install transformers datasets torch nltk torchvision bitsandbytes torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install --upgrade accelerate>=0.26.0
!pip install transformers[torch]

Looking in indexes: https://download.pytorch.org/whl/cu121


# Import necessary libraries

In [2]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset
import nltk

# Download sentence tokenizer

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shouv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load the tokenizer and model

In [4]:
model_name = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

print("Model and Tokenizer Loaded Successfully!")

Model and Tokenizer Loaded Successfully!


# Define a Function for Abstractive Summarization

In [5]:
def generate_summary(text, max_input=1024, max_output=200):
    """
    Generate abstractive summary using BART model.
    
    Args:
        text (str): The legal text to summarize.
        max_input (int): Max token length for input text.
        max_output (int): Max token length for summary output.

    Returns:
        str: The generated summary.
    """
    # Tokenize input text
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=max_input, truncation=True).to(model.device)
    
    # Generate summary
    summary_ids = model.generate(inputs, max_length=max_output, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    
    # Decode and return summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test the function

In [6]:
sample_text = "In a landmark case, the Supreme Court ruled that freedom of speech does not include the right to incite violence. \
This decision overturned previous rulings and set a new precedent in constitutional law."

print("Generated Summary:", generate_summary(sample_text))

Generated Summary: summarize: In a landmark case, the Supreme Court ruled that freedom of speech does not include the right to incite violence. This decision overturned previous rulings and set a new precedent in constitutional law. The Supreme Court affirmed the First Amendment's First Amendment rights.


# Load a legal dataset (example: 'legal_trec' from Hugging Face datasets)

In [7]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", "3.0.0") 
# Rename columns to match "text" and "labels"
dataset = dataset.rename_columns({"article": "text", "highlights": "labels"})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 11490
    })
})


# Extract legal texts and their summaries

In [8]:
dataset["train"] = dataset["train"].select(range(20000))  # Use 100 samples for training
dataset["validation"] = dataset["validation"].select(range(1000))  # 20 for validation
dataset["test"] = dataset["test"].select(range(1000))  # 20 for testing

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 1000
    })
})


In [9]:
print(f"CUDA version: {torch.version.cuda}")

CUDA version: 12.1


# Fine-Tune BART on Legal Documents

In [10]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoModel
import torch
import time
from torch.utils.data import DataLoader
torch._dynamo.config.suppress_errors = True

# def preprocess_data(examples):
#     # Tokenize inputs
#     model_inputs = tokenizer(
#         examples["text"],
#         max_length=1024,
#         truncation=True,
#         padding="max_length"  # Changed from False to max_length
#     )
    
#     # Tokenize targets with the tokenizer
#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(
#             examples["labels"],
#             max_length=200,
#             truncation=True,
#             padding="max_length"  # Changed from False to max_length
#         )
    
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

def preprocess_data(examples):
    # Tokenize inputs and truncate/pad
    inputs = tokenizer(
        examples["text"],
        max_length=1024,
        truncation=True,
        padding="max_length"
    )
    
    # Tokenize targets (summaries)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["labels"],
            max_length=200,
            truncation=True,
            padding="max_length"
        )
    
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels["input_ids"]
    }
    
# Tokenize dataset
# tokenized_dataset = dataset.map(
#     preprocess_data,
#     batched=True,
#     remove_columns=["text", "labels"],
#     load_from_cache_file=True
# )

tokenized_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=["text", "labels", "id"],  # Remove all non-tensor columns
    load_from_cache_file=False
)


# Training Arguments Optimized for Your Setup
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_legal_summarizer",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    generation_max_length=128,
    generation_num_beams=1,
    num_train_epochs=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=True,
    optim="adamw_bnb_8bit",
    gradient_checkpointing=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=0,
    logging_steps=50,
    save_total_limit=2,
    predict_with_generate=True,
    remove_unused_columns=False  # Add this line
)

# Dynamic Padding Collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True  # Enable dynamic padding
)

# Enable Flash Attention & Memory-Efficient Training
torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(True)
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.cuda.empty_cache()

# Optionally, create a custom DataLoader for training if you want to experiment with prefetch_factor
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=training_args.per_device_train_batch_size,
    num_workers=12,
    prefetch_factor=4,
    pin_memory=True,
    collate_fn=data_collator  # Use the data collator for dynamic padding
)
# Create Trainer using the custom DataLoader for training (if needed)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Start Training
trainer.train()

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,0.6187,0.483969
2,0.5613,0.47499
3,0.5074,0.479372
4,0.4775,0.478905




TrainOutput(global_step=5000, training_loss=0.6373593921661377, metrics={'train_runtime': 3039.7854, 'train_samples_per_second': 13.159, 'train_steps_per_second': 1.645, 'total_flos': 2.43894583296e+16, 'train_loss': 0.6373593921661377, 'epoch': 4.0})

# Load the fine tuned model

In [2]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

# Path to your saved model
model_path = "./bart_legal_summarizer/checkpoint-5000"  # Replace with your actual path

# Load tokenizer and model
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

In [3]:
def summarize_text(text, max_input_length=1024, max_output_length=200):
    # Tokenize input
    inputs = tokenizer(
        "summarize: " + text,  # Add task prefix (optional, depends on your training)
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ).to(device)
    
    # Generate summary
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_output_length,
        num_beams=10,          # Beam search for better quality
        early_stopping=True,  # Stop early if plausible summary is found
        length_penalty=1.0    # Encourage longer summaries (adjust as needed)
    )
    
    # Decode and return
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [4]:
from nltk.tokenize import word_tokenize

def count_words(text):
    """Count words in a text string."""
    return len(word_tokenize(text))

def calculate_summarization_ratio(original_text, summary_text):
    """Calculate compression ratio of summary vs original text."""
    original_length = count_words(original_text)
    summary_length = count_words(summary_text)
    ratio = summary_length / original_length
    return ratio

In [5]:
# Example legal text
legal_text = """
The general principles governing the exercise of the discretion to award indemnity costs after rejection by an 
unsuccessful party of a so called Calderbank letter were set out in the judgment of the Full Court in Black v 
Lipovac [1998] FCA 699 ; (1998) 217 ALR 386. In summary those principles are: 1. Mere refusal of a "Calderbank offer" 
does not itself warrant an order for indemnity costs. In this connection it may be noted that Jessup J in Dais Studio 
Pty Ltd v Bullet Creative Pty Ltd [2008] FCA 42 said that (at [6]): if the rejection of such an offer is to ground a 
claim for indemnity costs, it must be by reason of some circumstance other than that the offer happened to comply with 
the Calderbank principle. 2. To obtain an order for indemnity costs the offeror must show that the refusal to accept it
was unreasonable. 3. The reasonableness of the conduct of the offeree is to be viewed in the light of the circumstances 
that existed when the offer was rejected.
"""

# Generate summary using your trained model
summary = summarize_text(legal_text)

# Calculate ratio
ratio = calculate_summarization_ratio(legal_text, summary)
print(f"Summarization Ratio: {ratio:.2f} (Summary is {ratio*100:.1f}% the length of the original)")
summary = summarize_text(legal_text)
print("Generated Summary:", summary)

Summarization Ratio: 0.33 (Summary is 32.7% the length of the original)
Generated Summary: The general principles governing the exercise of the discretion to award indemnity costs after rejection by an unsuccessful party of a so called Calderbank letter were set out in the judgment of the Full Court in Black v Lipovac .
In summary those principles are: 1. Mere refusal of a "Calderbank offer" 
does not itself warrant an order for indemnity .
