In [18]:
from datasets import load_dataset
import pandas as pd
import re
from transformers import PegasusTokenizer
from summa import summarizer
import spacy

nlp = spacy.load("en_core_web_sm")

# Load and merge datasets
dataset1 = load_dataset("ninadn/indian-legal")['train']  # Has 'Text' and 'Summary'
dataset2 = load_dataset("Yashaswat/Indian-Legal-Text-ABS")['train']  # Has 'judgement' and 'summary'

# Convert to pandas DataFrames
df1 = pd.DataFrame(dataset1)
df2 = pd.DataFrame(dataset2)

# Standardize column names
df1 = df1.rename(columns={'Text': 'Case', 'Summary': 'Summary'})
df2 = df2.rename(columns={'judgement': 'Case', 'summary': 'Summary'})

# Merge the DataFrames
merged_df = pd.concat([df1, df2], ignore_index=True)

# Convert back to Hugging Face dataset
from datasets import Dataset
ds = Dataset.from_pandas(merged_df)

# Take a subset for speed (optional, recommended for 2-day sprint)
ds = ds.select(range(min(1000, len(ds))))  # Use first 1000 samples or less

# Clean raw legal text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\\n', ' ', text)
    return text.strip()

# Extract citations and entities
def extract_legal_features(text):
    doc = nlp(text)
    citations = [ent.text for ent in doc.ents if ent.label_ in ["LAW", "ORG", "PERSON"]]
    return {"citations": citations, "cleaned_text": clean_text(text)}

# TextRank-based extractive summary with citation preservation
def extractive_summary_textrank(text, ratio=0.3):
    summary = summarizer.summarize(text, ratio=ratio)
    if not summary.strip():
        summary = text[:int(len(text) * ratio)]  # Fallback to truncation
    return summary

# Preprocessing with legal features
def preprocess_row(example):
    raw_case = example["Case"]
    summary = clean_text(example["Summary"])
    
    # Extract legal features
    legal_data = extract_legal_features(raw_case)
    extractive = extractive_summary_textrank(legal_data["cleaned_text"], ratio=0.3)
    
    return {
        "input_text": extractive,
        "target_summary": summary,
        "citations": legal_data["citations"]
    }

# Apply preprocessing
processed_dataset = ds.map(preprocess_row, remove_columns=ds.column_names)  # Fixed here

# Load PEGASUS tokenizer
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Tokenize with citation metadata
def tokenize_function(example):
    inputs = tokenizer(
        example["input_text"], truncation=True, padding="max_length", max_length=1024
    )
    targets = tokenizer(
        example["target_summary"], truncation=True, padding="max_length", max_length=128
    )
    inputs["labels"] = targets["input_ids"]
    inputs["citations"] = example["citations"]  # Preserve citations for later use
    return inputs

tokenized_dataset = processed_dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Save to disk
import torch
torch.save(tokenized_dataset, "tokenized_dataset_pegasus.pt")

print("✅ Extractive preprocessing + tokenization for PEGASUS complete.")
print(tokenized_dataset[0])  # Fixed here too: no 'train' key

Map: 100%|██████████| 1000/1000 [15:49<00:00,  1.05 examples/s]
Map: 100%|██████████| 1000/1000 [00:06<00:00, 149.82 examples/s]


✅ Extractive preprocessing + tokenization for PEGASUS complete.
{'input_ids': tensor([18539,   135,   109,  ...,  3467,   640,     1]), 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1]), 'labels': tensor([  139,  1395,   732,   115,  2132,   113,  9951,   593,  1035,   141,
         1201, 27030,   113,   109,   672,   113, 29674, 15536,  2046,   108,
        38876,   108,   117,   142,   198, 28039,  1395,   146,   270,   114,
         1863,  1395,   194,   373,   109,  1021,   110,   273,   113,  1201,
          950,  6806,   143,  9757,   158,   113,   109,  2128, 12315,  1035,
         2046,   108,  1925, 53968,   108,   111,   109,   713,   113,   253,
         1395,   246,  1923,   129, 26174,   115,  6506,   109,  1643,   135,
          253,   593,   118,   109,  2578,   113,  1201,   950,   113,   109,
         2128, 12315,  1035,  2046,   107,   139,  1395,   115,  2132,   113,
         3290,  7701, 29192,  1431,   593,  1035,   732,   141,   109, 29674,
         5227,  2046, 

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from rouge_score import rouge_scorer
import torch

# Load tokenizer + model
model_name = "google/pegasus-xsum"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Load tokenized dataset
tokenized_dataset = torch.load("tokenized_dataset_pegasus.pt", weights_only=False)

# Split dataset manually into train and eval
train_size = int(0.8 * len(tokenized_dataset))
train_dataset = tokenized_dataset.select(range(train_size))
eval_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

# Custom compute metrics for ROUGE
def compute_metrics(pred):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    predictions = [tokenizer.decode(p, skip_special_tokens=True) for p in pred.predictions]
    labels = [tokenizer.decode(l, skip_special_tokens=True) for l in pred.label_ids]
    scores = [scorer.score(t, p) for t, p in zip(labels, predictions)]
    
    return {
        "rouge1": sum(s["rouge1"].fmeasure for s in scores) / len(scores),
        "rouge2": sum(s["rouge2"].fmeasure for s in scores) / len(scores),
        "rougeL": sum(s["rougeL"].fmeasure for s in scores) / len(scores),
    }

# Training config
training_args = Seq2SeqTrainingArguments(
    output_dir="./checkpoints_pegasus",
    per_device_train_batch_size=1,  # Minimal batch size
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # Simulate batch size of 4
    predict_with_generate=True,
    eval_steps=50,  # Adjusted for smaller dataset
    learning_rate=5e-5,
    num_train_epochs=2,
    logging_dir="./logs",
    save_total_limit=1,
    save_steps=50,
    logging_steps=25,
    fp16=False,  # M4 MPS uses FP32
    do_eval=True,
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Save the fine-tuned model
trainer.save_model("./pegasus_finetuned")
tokenizer.save_pretrained("./pegasus_finetuned")

print("✅ Fine-tuning complete. Model saved to ./pegasus_finetuned")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Seq2SeqTrainer(


RuntimeError: MPS backend out of memory (MPS allocated: 8.56 GB, other allocations: 504.36 MB, max allowed: 9.07 GB). Tried to allocate 16.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
    from transformers import PegasusForConditionalGeneration, Trainer, TrainingArguments

    # Load the model and tokenizer
    model_name = "google/pegasus-xsum"
    model = PegasusForConditionalGeneration.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir='./results',          # output directory for model checkpoints
        evaluation_strategy="epoch",     # evaluate each `logging_steps`
        learning_rate=5e-5,              # learning rate
        per_device_train_batch_size=4,   # batch size for training
        per_device_eval_batch_size=4,    # batch size for evaluation
        weight_decay=0.01,               # strength of weight decay
        save_total_limit=3,              # limit the total amount of checkpoints
        num_train_epochs=3,              # number of training epochs
        report_to="tensorboard",         # enable logging to TensorBoard
    )

    # Set up TensorBoard to monitor training (run in a separate terminal)
    # tensorboard --logdir=./results/runs


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],  # Assuming this split exists
        eval_dataset=tokenized_dataset['validation']  # Assuming this split exists
    )

    # Train the model
    trainer.train()

    # Save the model after training
    model.save_pretrained('./final_model')

    # Train the model
    trainer.train()

    # Save the model after training
    model.save_pretrained('./final_model')

    # Evaluate the model on the test set (assuming a test split exists)
    results = trainer.evaluate(tokenized_dataset['test'])
    print(results)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: __init__() got an unexpected keyword argument 'evaluation_strategy'

In [17]:
import os
import shutil
from transformers.file_utils import default_cache_path

# Get the path to the cache directory
cache_dir = default_cache_path

# Check if the cache directory exists
if os.path.exists(cache_dir):
    # Remove the cache directory and all its contents
    shutil.rmtree(cache_dir)
    print(f"Removed the entire cache directory: {cache_dir}")
else:
    print("Cache directory not found.")


Removed the entire cache directory: /Users/puneetkohli/.cache/huggingface/hub


In [16]:
import os
from transformers.file_utils import default_cache_path

# Get the path to the cache directory
cache_dir = default_cache_path

# List all files in the cache directory
for filename in os.listdir(cache_dir):
    print(filename)



models--google--pegasus-xsum
datasets--Yashaswat--Indian-Legal-Text-ABS
.locks
datasets--ninadn--indian-legal
version.txt
datasets--d0r1h--ILC


In [None]:
cd ~/.cache/huggingface/transformers/
