In [None]:
from google.colab import drive
import zipfile

# Mount Google Drive
drive.mount('/content/drive')

# Extract the dataset
zip_path = '/content/drive/MyDrive/IN-Abs.zip'
extract_path = '/content/legal_data'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

Mounted at /content/drive


In [None]:
# === STEP 1: Load and Preprocess Data ===
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import os

MAX_INPUT_LEN = 300
MAX_TARGET_LEN = 100

def load_data(judgments_path, summaries_path):
    data_pairs = []
    for filename in os.listdir(judgments_path):
        judgment_file = os.path.join(judgments_path, filename)
        summary_file = os.path.join(summaries_path, filename)

        if os.path.isfile(judgment_file) and os.path.isfile(summary_file):
            with open(judgment_file, 'r', encoding='utf-8') as f1, \
                 open(summary_file, 'r', encoding='utf-8') as f2:
                judgment = f1.read().strip()
                summary = f2.read().strip()
                summary = "<start> " + summary + " <end>"
                data_pairs.append((judgment, summary))
    return data_pairs

train_data = load_data("/content/legal_data/IN-Abs/train-data/judgement",
                       "/content/legal_data/IN-Abs/train-data/summary")
val_data = load_data("/content/legal_data/IN-Abs/test-data/judgement",
                     "/content/legal_data/IN-Abs/test-data/summary")


In [None]:
#LSTM
# Tokenization
tokenizer = Tokenizer(oov_token='<OOV>')
all_texts = [j for j, s in train_data] + [s for j, s in train_data]
tokenizer.fit_on_texts(all_texts)

input_sequences = tokenizer.texts_to_sequences([j for j, s in train_data])
target_sequences = tokenizer.texts_to_sequences([s for j, s in train_data])

input_sequences = [seq[:MAX_INPUT_LEN] for seq in input_sequences]
target_sequences = [seq[:MAX_TARGET_LEN] for seq in target_sequences]

encoder_input_data = pad_sequences(input_sequences, maxlen=MAX_INPUT_LEN, padding='post')
decoder_input_data = pad_sequences(target_sequences, maxlen=MAX_TARGET_LEN, padding='post')

# Decoder target is shifted decoder_input
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

# === STEP 2: Build Seq2Seq Model ===
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128
latent_dim = 256

encoder_inputs = Input(shape=(None,))
decoder_inputs = Input(shape=(None,))

embedding_layer = Embedding(vocab_size, embedding_dim)
enc_emb = embedding_layer(encoder_inputs)
dec_emb = embedding_layer(decoder_inputs)

encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
decoder_outputs, _, _ = LSTM(latent_dim, return_sequences=True, return_state=True)(dec_emb, initial_state=[state_h, state_c])
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# === STEP 3: Train ===
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=32, epochs=15, validation_split=0.1)

# === STEP 4: Inference Models ===
encoder_model = Model(encoder_inputs, [state_h, state_c])

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(1,))
dec_emb_inf = embedding_layer(decoder_inputs_single)
decoder_lstm_inf, state_h_inf, state_c_inf = LSTM(latent_dim, return_sequences=True, return_state=True)(
    dec_emb_inf, initial_state=decoder_states_inputs)
decoder_outputs_inf = decoder_dense(decoder_lstm_inf)

decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs_inf, state_h_inf, state_c_inf]
)

# === STEP 5: Decoding Function ===
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index.get('<start>', 1)

    decoded_sentence = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')

        if sampled_word == '<end>' or len(decoded_sentence) > MAX_TARGET_LEN:
            break
        decoded_sentence.append(sampled_word)
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_sentence)

# === STEP 6: Summarization Function ===
def summarize_text(input_text):
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=MAX_INPUT_LEN, padding='post')
    return decode_sequence(input_seq)

# === STEP 7: Test Summarization ===
for i, (judgment, ref_summary) in enumerate(val_data[:5]):
    print(f"--- Sample {i+1} ---")
    print("Original Judgment:\n", judgment[:500], "...")
    print("\nGenerated Summary:\n", summarize_text(judgment))
    print("\nReference Summary:\n", ref_summary[:500], "...\n")

Epoch 1/15
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 501ms/step - loss: 8.0960 - val_loss: 6.4751
Epoch 2/15
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 550ms/step - loss: 6.4417 - val_loss: 6.4744
Epoch 3/15
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 554ms/step - loss: 6.3701 - val_loss: 6.0534
Epoch 4/15
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 556ms/step - loss: 5.9212 - val_loss: 5.7706
Epoch 5/15
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 559ms/step - loss: 5.6416 - val_loss: 5.5668
Epoch 6/15
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 558ms/step - loss: 5.4138 - val_loss: 5.3902
Epoch 7/15
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 561ms/step - loss: 5.2104 - val_loss: 5.2680
Epoch 8/15
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 558ms/step - loss: 5.0791 - val_loss: 5.1797
Epoch 9/

In [None]:
# === STEP 8: Evaluate with ROUGE ===
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = []

for judgment, ref_summary in val_data[:100]:  # Use a reasonable subset (e.g., 100)
    input_seq = tokenizer.texts_to_sequences([judgment])
    input_seq = pad_sequences(input_seq, maxlen=MAX_INPUT_LEN, padding='post')
    decoded_summary = decode_sequence(input_seq)

    # Clean reference summary
    ref_summary_clean = ref_summary.replace('<start>', '').replace('<end>', '').strip()
    decoded_summary_clean = decoded_summary.strip()

    # Score
    score = scorer.score(ref_summary_clean, decoded_summary_clean)
    scores.append(score)

# Compute average ROUGE scores
avg_rouge1 = sum([s['rouge1'].fmeasure for s in scores]) / len(scores)
avg_rouge2 = sum([s['rouge2'].fmeasure for s in scores]) / len(scores)
avg_rougeL = sum([s['rougeL'].fmeasure for s in scores]) / len(scores)

print(f"Average ROUGE-1 Score: {avg_rouge1:.4f}")
print(f"Average ROUGE-2 Score: {avg_rouge2:.4f}")
print(f"Average ROUGE-L Score: {avg_rougeL:.4f}")

Average ROUGE-1 Score: 0.0663
Average ROUGE-2 Score: 0.0013
Average ROUGE-L Score: 0.0519


In [None]:
#T5

In [None]:
from datasets import Dataset
import pandas as pd

# Convert lists of tuples into DataFrames
train_df = pd.DataFrame(train_data, columns=["judgment", "summary"])
val_df = pd.DataFrame(val_data, columns=["judgment", "summary"])

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

#TOKENIZATION USING T5 TOKENIZER
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["judgment"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LEN, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=MAX_TARGET_LEN, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tokenized = train_dataset.map(preprocess_function, batched=True)
val_tokenized = val_dataset.map(preprocess_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/7030 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
#WITHOUT FINE TUNING T5
from transformers import T5ForConditionalGeneration

# Load pre-trained T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small").to("cuda")

# Generate predictions (zero-shot)
def generate_summaries(dataset):
    inputs = tokenizer(["summarize: " + x for x in dataset["judgment"]],
                       return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_LEN).to("cuda")

    outputs = model.generate(**inputs, max_length=MAX_TARGET_LEN)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Generate predictions
zero_shot_preds = generate_summaries(val_dataset)

# Extract ground truth
references = val_dataset["summary"]


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# View Sample Predictions
for i in range(3):
    print(f"\nJudgment: {val_dataset[i]['judgment'][:300]}...")
    print(f"          Reference Summary: {references[i]}")
    print(f"          Zero-Shot Prediction: {zero_shot_preds[i]}")



Judgment: Appeal No. 390 of 1966.
Appeal by special leave from the Award, dated January 11, 13, 1964 of the Industrial Tribunal, Orissa, Cuttack in Industrial Dispute Case No. 8 of 1962.
D.L. Sengupta, Janardan Sharma, Anil Das Chowdhury and S.K. Nandy, for the appellants.
H.R. Gokhale, K. Gobind Das, N.C. Sh...
          Reference Summary: <start> An industrial dispute regarding fixation of wages and bonus, between the respondent paper mills and its workmen, the appellants, was referred to the Industrial Tribunal in October, 1962, and the reference included a dispute about bonus payable for the years 1962 63 and 1963 64.
The Tribunal held: (1 ) that there were in the region no ' other concerns in the same line of business which could be compared with the respondent but that there were three collieries, a steel plant, a cement factory and an aluminum company in the region which were comparable with the resportdent, that as the minimum wage in those industries which was about Rs. 95 wa

In [None]:
import evaluate

rouge = evaluate.load("rouge")
# Compute ROUGE scores
zero_shot_rouge = rouge.compute(predictions=zero_shot_preds, references=references)

# Print ROUGE scores (evaluate library returns float scores)
for metric, score in zero_shot_rouge.items():
    print(f"{metric.upper()} - F1: {score:.4f}")



ROUGE1 - F1: 0.0804
ROUGE2 - F1: 0.0340
ROUGEL - F1: 0.0600
ROUGELSUM - F1: 0.0729


In [None]:
!pip install rouge_score
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
#FINE TUNE T5

In [None]:
!pip install numpy==1.26

Collecting numpy==1.26
  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you 

In [None]:
from datasets import Dataset
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm import tqdm
import evaluate

# === 1. Prepare Dataset ===
train_data = [("judgment text 1", "summary 1"), ("judgment text 2", "summary 2")]
val_data = [("judgment val 1", "summary val 1")]

train_df = pd.DataFrame(train_data, columns=["judgment", "summary"])
val_df = pd.DataFrame(val_data, columns=["judgment", "summary"])

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# === 2. Tokenization ===
MAX_INPUT_LEN = 512
MAX_TARGET_LEN = 150

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["judgment"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LEN, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(examples["summary"], max_length=MAX_TARGET_LEN, truncation=True, padding="max_length", return_tensors="pt")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tokenized = train_dataset.map(preprocess_function, batched=True)
val_tokenized = val_dataset.map(preprocess_function, batched=True)

# === 3. Convert to TensorDatasets ===
train_dataset_tensors = TensorDataset(
    torch.tensor(train_tokenized["input_ids"]),
    torch.tensor(train_tokenized["attention_mask"]),
    torch.tensor(train_tokenized["labels"])
)

val_dataset_tensors = TensorDataset(
    torch.tensor(val_tokenized["input_ids"]),
    torch.tensor(val_tokenized["attention_mask"]),
    torch.tensor(val_tokenized["labels"])
)

# === 4. Model Setup ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 3e-4

train_loader = DataLoader(train_dataset_tensors, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset_tensors, batch_size=BATCH_SIZE)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

#loss=cross entropy loss
# === 5. Training Loop ===
model.train()
for epoch in range(EPOCHS):
    print(f"\n Epoch {epoch + 1}")
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=loss.item())

# === 6. Evaluation ===
model.eval()
rouge = evaluate.load("rouge")

def generate_summaries(df):
    texts = ["summarize: " + x for x in df["judgment"]]
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_LEN).to(device)
    output_ids = model.generate(**inputs, max_length=MAX_TARGET_LEN)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds

references = val_df["summary"].tolist()
predictions = generate_summaries(val_df)

for i in range(min(3, len(val_df))):
    print(f"\n Judgment: {val_df.iloc[i]['judgment']}")
    print(f" Reference: {references[i]}")
    print(f" Prediction: {predictions[i]}")

scores = rouge.compute(predictions=predictions, references=references)
for metric, value in scores.items():
    print(f"{metric.upper()} - F1: {value:.4f}")


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]


 Epoch 1


Epoch 1: 100%|██████████| 1/1 [00:08<00:00,  8.48s/it, loss=14.9]



 Epoch 2


Epoch 2: 100%|██████████| 1/1 [00:05<00:00,  5.98s/it, loss=9.69]



 Epoch 3


Epoch 3: 100%|██████████| 1/1 [00:09<00:00,  9.57s/it, loss=7.94]



 Judgment: judgment val 1
 Reference: summary val 1
 Prediction: val 1:
ROUGE1 - F1: 0.8000
ROUGE2 - F1: 0.6667
ROUGEL - F1: 0.8000
ROUGELSUM - F1: 0.8000


In [None]:
val_df = pd.DataFrame(val_data, columns=["judgment", "summary"])

In [None]:
# === 7. Test Inference from val_df ===
def summarize_judgment(judgment_text):
    model.eval()
    input_text = "summarize: " + judgment_text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_LEN).to(device)
    output_ids = model.generate(**inputs, max_length=MAX_TARGET_LEN)
    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary

# Run summarization on first 3 test judgments from val_df
print("\n Summarizing First 3 Judgments from Test Set:")
for i in range(3):
    original = val_df.iloc[i]['judgment']
    reference = val_df.iloc[i]['summary']
    prediction = summarize_judgment(original)

    print(f"\n ------------------------------------------------Judgment {i+1}:\n{original}")
    print(f" --------------------------------------------------Reference Summary:\n{reference}")
    print(f" --------------------------------------------------Predicted Summary:\n{prediction}")



💬 Summarizing First 3 Judgments from Test Set:

 ------------------------------------------------Judgment 1:
il Appeal No.18 of 1955.
Appeal from the judgment and decree dated March 20, 1651, of the Mysore High Court in R.A. No. 155 of 1947 48, arising out of the judgment and decree dated December 19, 1947, of the Court of Sub_Judge, Mysore, in 0.
section Suit No. 44 of 1946 47.
section K. Venkataranga Iyengar and K. Keshava Iyengar, for the appellant.
A. V. Viswanatha Sastri and K. R. Choudhry, for respondent No. 1. 1958.
November 13.
The Judgment of the Court was delivered by GAJENDRAGADKAR, J.
This appeal arises from a suit brought by the appellant in the court of the Subordinate Judge, Mysore, as the sole executor of the will alleged to have been executed by one Lakshmamma on August 22, 1945, (exhibit A).
In this suit the appellant claimed a declaration that the said Lakshmamma was the owner of the properties mentioned in the schedule attached to the plaint and as such was entitle

In [None]:
#BART
!pip install transformers datasets evaluate




In [None]:
#LEGAL BERT

In [None]:
!pip install transformers datasets rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=bcb37c793272a1be8ab4ed65048dd6b9697b865d090fa0ac738ba8827d3a5793
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

def preprocess_data(data, tokenizer, max_input_len=512, max_target_len=128):
    input_ids, attention_masks, labels = [], [], []

    for judgment, summary in data:
        encoded_input = tokenizer(judgment,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=max_input_len,
                                  return_tensors='pt')

        encoded_summary = tokenizer(summary,
                                    padding='max_length',
                                    truncation=True,
                                    max_length=max_target_len,
                                    return_tensors='pt')

        input_ids.append(encoded_input['input_ids'][0])
        attention_masks.append(encoded_input['attention_mask'][0])
        labels.append(encoded_summary['input_ids'][0])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }

train_encodings = preprocess_data(train_data, tokenizer)
val_encodings = preprocess_data(val_data, tokenizer)

In [None]:
from torch.utils.data import Dataset

class LegalSummaryDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

train_dataset = LegalSummaryDataset(train_encodings)
val_dataset = LegalSummaryDataset(val_encodings)

In [None]:
from transformers import EncoderDecoderModel, TrainingArguments, Trainer, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "nlpaueb/legal-bert-base-uncased", "nlpaueb/legal-bert-base-uncased"
)

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

from transformers import EncoderDecoderModel, TrainingArguments, Trainer, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "nlpaueb/legal-bert-base-uncased", "nlpaueb/legal-bert-base-uncased"
)

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

training_args = TrainingArguments(
    output_dir="./legalbert_summarizer",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='epoch'
)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnish522005[0m ([33mnish522005-manipal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Epoch,Training Loss,Validation Loss
1,3.1681,3.057305
2,2.8119,2.894051
3,2.6233,2.858277


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


TrainOutput(global_step=10545, training_loss=3.028087808872185, metrics={'train_runtime': 4577.4285, 'train_samples_per_second': 4.607, 'train_steps_per_second': 2.304, 'total_flos': 1.29377469791232e+16, 'train_loss': 3.028087808872185, 'epoch': 3.0})

In [None]:
trainer.evaluate()

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


{'eval_loss': 2.858276605606079,
 'eval_runtime': 4.6163,
 'eval_samples_per_second': 21.662,
 'eval_steps_per_second': 10.831,
 'epoch': 3.0}

In [None]:
import torch

# Set model to evaluation mode
model.eval()

# Prepare to store predictions and labels
all_preds = []
all_labels = []

for batch in val_dataset:
    # Assuming batch contains 'input_ids', 'attention_mask', and 'labels'
    input_ids = torch.tensor(batch['input_ids']).unsqueeze(0).to(model.device)
    attention_mask = torch.tensor(batch['attention_mask']).unsqueeze(0).to(model.device)
    labels = batch['labels']

    # Generate predictions
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
        pred = outputs[0].cpu().numpy()
        all_preds.append(pred)
        all_labels.append(labels)
pred_str = [tokenizer.decode(pred, skip_special_tokens=True) for pred in all_preds]
label_str = [tokenizer.decode(label, skip_special_tokens=True) for label in all_labels]
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge1 = []
rouge2 = []
rougeL = []

for pred, label in zip(pred_str, label_str):
    scores = scorer.score(label, pred)
    rouge1.append(scores['rouge1'].fmeasure)
    rouge2.append(scores['rouge2'].fmeasure)
    rougeL.append(scores['rougeL'].fmeasure)

print("ROUGE-1:", sum(rouge1)/len(rouge1))
print("ROUGE-2:", sum(rouge2)/len(rouge2))
print("ROUGE-L:", sum(rougeL)/len(rougeL))

  input_ids = torch.tensor(batch['input_ids']).unsqueeze(0).to(model.device)
  attention_mask = torch.tensor(batch['attention_mask']).unsqueeze(0).to(model.device)
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


ROUGE-1: 0.15537418941455447
ROUGE-2: 0.04547265705269766
ROUGE-L: 0.13121445960783995
