In [None]:
pip install transformers datasets torch kagglehub pandas scikit-learn


In [None]:
import kagglehub
import os
import pandas as pd

# Fetch the legal document dataset through kagglehub; the call returns the
# local directory that holds the downloaded files.
dataset_path = kagglehub.dataset_download("yuhi345432/legal-document-dataset")
print(f"Path to dataset files: {dataset_path}")

# Show the top-level entries of the download directory as a quick sanity check.
files = os.listdir(dataset_path)
print(f"Dataset Files: {files}")


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Locate the IN-Abs split relative to the directory returned by
# `kagglehub.dataset_download` in the download cell (`dataset_path`), instead of
# hard-coding the cache location: the absolute path depends on the user's home
# directory and on the dataset version that was downloaded.
# A separate name is used so that re-running this cell stays idempotent.
in_abs_dir = os.path.join(dataset_path, "dataset", "IN-Abs")

# Judgement/summary folders for the training split
train_judgement_path = os.path.join(in_abs_dir, "train-data", "judgement")
train_summary_path = os.path.join(in_abs_dir, "train-data", "summary")

# Judgement/summary folders for the test split
test_judgement_path = os.path.join(in_abs_dir, "test-data", "judgement")
test_summary_path = os.path.join(in_abs_dir, "test-data", "summary")

def load_text_files(folder_path):
    """Read every regular file directly inside ``folder_path`` as UTF-8 text.

    Args:
        folder_path: Directory to scan (sub-directories are not descended into).

    Returns:
        A dict mapping each file name to its full text, with keys inserted in
        sorted file-name order. If ``folder_path`` does not exist, a warning is
        printed and an empty dict is returned.
    """
    if not os.path.exists(folder_path):
        print(f"⚠️ Warning: Path does not exist: {folder_path}")
        return {}

    contents_by_name = {}
    for entry_name in sorted(os.listdir(folder_path)):
        entry_path = os.path.join(folder_path, entry_name)
        if not os.path.isfile(entry_path):
            continue  # skip directories and other non-file entries
        with open(entry_path, "r", encoding="utf-8") as handle:
            contents_by_name[entry_name] = handle.read()
    return contents_by_name

# Load training data as {filename: text}
train_judgements = load_text_files(train_judgement_path)
train_summaries = load_text_files(train_summary_path)

# Load test data as {filename: text}
test_judgements = load_text_files(test_judgement_path)
test_summaries = load_text_files(test_summary_path)

# Pair each judgement with the summary that has the same file name.
# The intersection is sorted because set iteration order for strings changes
# between interpreter runs (hash randomization); without sorting, the row order
# -- and therefore the seeded train/validation split and `test_df.iloc[0]` --
# would differ on every kernel restart.
train_files = sorted(set(train_judgements) & set(train_summaries))
train_data = [(train_judgements[f], train_summaries[f]) for f in train_files]

test_files = sorted(set(test_judgements) & set(test_summaries))
test_data = [(test_judgements[f], test_summaries[f]) for f in test_files]

# Convert to DataFrames with one (document, summary) pair per row
train_df = pd.DataFrame(train_data, columns=["document", "summary"])
test_df = pd.DataFrame(test_data, columns=["document", "summary"])

# Split train data into Train (80%) and Validation (20%)
if len(train_df) > 2:
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
else:
    # Too few rows to split: keep everything for training and leave the
    # validation frame empty (same columns, zero rows).
    val_df = pd.DataFrame(columns=["document", "summary"])

# Print dataset sizes
print(f" Total Train Documents: {len(train_df)}")
print(f"Total Validation Documents: {len(val_df)}")
print(f" Total Test Documents: {len(test_df)}")


In [None]:
from transformers import PegasusTokenizer

# Checkpoint identifier for the pretrained PEGASUS weights; the tokenizer that
# matches this checkpoint is loaded here and reused by the cells below.
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)

def tokenize_function(df):
    """Tokenize the ``document`` and ``summary`` columns of ``df``.

    Documents are truncated/padded to 512 tokens and summaries to 128 tokens,
    using the module-level ``tokenizer``.

    Args:
        df: DataFrame with ``document`` and ``summary`` text columns.

    Returns:
        The tokenizer output for the documents with an added ``"labels"`` entry
        holding the summary token ids, or ``None`` when ``df`` is empty.
        Padding positions in ``labels`` are set to -100.
    """
    if df.empty:
        return None  # Nothing to tokenize

    model_inputs = tokenizer(
        list(df["document"]), truncation=True, padding="max_length", max_length=512
    )

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=list(df["summary"]), truncation=True, padding="max_length", max_length=128
    )

    # The summaries are padded to a fixed length. Replace the pad token id with
    # -100 so the cross-entropy loss ignores padding instead of training the
    # model to emit pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Run the tokenizer over each split in turn (train, validation, test).
# An empty split yields None; the test encodings are optional and only used
# for test-set evaluation.
train_encodings, val_encodings, test_encodings = (
    tokenize_function(split_df) for split_df in (train_df, val_df, test_df)
)

print(" Tokenization complete.")


In [None]:
import torch

class LegalDataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict-like object of tokenized encodings.

    ``encodings`` maps each field name (it must include ``"input_ids"``) to a
    sequence holding one entry per sample.
    """

    def __init__(self, encodings):
        """Store ``encodings``; raise ``ValueError`` if it is ``None``."""
        if encodings is None:
            raise ValueError("Encodings cannot be None. Ensure tokenization is done correctly.")
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the length of ``input_ids``."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Build ``{field: tensor}`` for the sample at position ``idx``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

# Wrap each split's encodings in a LegalDataset; a split without encodings
# (e.g. an empty split) is represented by None instead.
train_dataset = None if not train_encodings else LegalDataset(train_encodings)
val_dataset = None if not val_encodings else LegalDataset(val_encodings)
test_dataset = None if not test_encodings else LegalDataset(test_encodings)  # optional test split

# Report how many samples each split holds (0 when the split is missing).
print(" PyTorch datasets created:")
print(
    "\n".join(
        f"   - {split_label} samples: {len(split_dataset) if split_dataset else 0}"
        for split_label, split_dataset in (
            ("Training", train_dataset),
            ("Validation", val_dataset),
            ("Test", test_dataset),
        )
    )
)

In [None]:
from transformers import PegasusForConditionalGeneration, Trainer, TrainingArguments

import inspect

# Load the pretrained PEGASUS model that will be fine-tuned
model_name = "google/pegasus-xsum"
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword is rejected by recent versions. The package is
# installed unpinned, so pick whichever keyword the installed version accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Define training parameters
training_args = TrainingArguments(
    output_dir="./pegasus_legal_summarizer",  # Where model checkpoints will be saved
    save_strategy="epoch",  # Save a checkpoint after each epoch
    per_device_train_batch_size=4,  # Adjust to the available GPU memory
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.01,  # Regularization against overfitting
    logging_dir="./logs",  # Logging directory for TensorBoard
    logging_steps=500,  # Log after every 500 steps
    save_total_limit=2,  # Limit how many checkpoints are kept on disk
    report_to="none",  # Disable external reporting (set to "wandb" for Weights & Biases)
    load_best_model_at_end=True,  # Reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",
    # Evaluate at the end of every epoch; this must match `save_strategy`
    # because `load_best_model_at_end` is enabled.
    **{_eval_strategy_kwarg: "epoch"},
)

# Ensure datasets exist before training
if train_dataset is None or val_dataset is None:
    raise ValueError("Training and validation datasets must not be None. Check tokenization and dataset loading.")

# Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start training
trainer.train()


In [None]:
import torch

def summarize_text(text, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Generate a summary of ``text`` with a PEGASUS model.

    Args:
        text: Document to summarize; it is truncated/padded to 512 tokens.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the model and inputs are moved to. The default is
            decided once, when the function is defined: "cuda" if available,
            otherwise "cpu".

    Returns:
        The decoded summary string with special tokens removed.

    Note:
        ``model`` is moved to ``device`` and switched to evaluation mode as a
        side effect.
    """
    # Move model to the appropriate device
    model.to(device)
    # Put the model in inference mode so dropout is disabled: the model may
    # still be in training mode (e.g. right after fine-tuning), which would
    # make generation noisy and non-deterministic.
    model.eval()

    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Generate summary without tracking gradients
    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            max_length=128,
            num_beams=5,  # Beam search width
            early_stopping=True,
            repetition_penalty=2.5,  # Penalize repeated tokens
            length_penalty=1.0,  # Length normalization used by beam search
        )

    # Decode the first (only) generated sequence
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize the first test document as a quick qualitative check
if len(test_df) > 0:
    example_doc = test_df["document"].iloc[0]
    generated_summary = summarize_text(example_doc, model, tokenizer)

    # Only a 500-character preview is shown; printing the whole judgement
    # floods the cell output.
    print("\n🔹 Original Document (First 500 chars):\n", example_doc[:500])
    print("\n📝 Generated Summary:\n", generated_summary)
else:
    print("⚠ No test documents available. Check dataset loading.")


In [None]:
import os
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Directory the fine-tuned model and tokenizer are written to
save_dir = "./fine_tuned_pegasus_legal_2"

# Ensure directory exists
os.makedirs(save_dir, exist_ok=True)

# Save model and tokenizer side by side so they can be reloaded together
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(" Fine-tuned PEGASUS model saved successfully!")

# Reload both from disk to confirm the saved artifacts are usable for inference
model = PegasusForConditionalGeneration.from_pretrained(save_dir)
tokenizer = PegasusTokenizer.from_pretrained(save_dir)
print(" Fine-tuned PEGASUS model reloaded successfully!")

# Verify reloading by summarizing the first document of the test set
if len(test_df) > 0:
    example_doc = test_df["document"].iloc[0]
    generated_summary = summarize_text(example_doc, model, tokenizer)

    # The preview length matches the "First 500 chars" label
    print("\n Original Document (First 500 chars):\n", example_doc[:500])
    print("\n Reloaded Model - Generated Summary:\n", generated_summary)
else:
    print(" No test documents available. Check dataset loading.")


In [None]:
# Summarize the first test document with the model currently bound to `model`
# (the reloaded fine-tuned model once the save/reload cell has run).
if len(test_df) > 0:
    example_doc = test_df["document"].iloc[0]
    generated_summary = summarize_text(example_doc, model, tokenizer)

    print("\n Original Document (First 500 chars):\n", example_doc[:500])
    print("\n Reloaded Model - Generated Summary:\n", generated_summary)
else:
    # The guard checks the test split, so the message names it accordingly.
    print("⚠ No test documents available. Check dataset loading.")