<a href="https://colab.research.google.com/github/Prestigious526/Legal-Document-Summarization-using-Transformers/blob/main/Hybrid_SBERT_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================================================================
# 1. SETUP: Install all necessary libraries
# ==============================================================================
print("Installing libraries...")
!pip install transformers datasets pandas matplotlib seaborn rouge-score sacrebleu bert-score sentencepiece torch tqdm evaluate --quiet

import os
import datasets
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BartTokenizer, BartForConditionalGeneration
)
from datasets import load_dataset
import evaluate
from tqdm.notebook import tqdm
from google.colab import drive

print("--- All libraries installed and imported successfully! ---")

Installing libraries...
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
--- All libraries installed and imported successfully! ---


In [None]:
# ==============================================================================
# 2. CONNECT TO YOUR GOOGLE DRIVE
# ==============================================================================
# The test split is read from Drive further down, so a failed mount aborts the run.
print("\n[STEP 1] Connecting to Google Drive...")
try:
    drive.mount('/content/drive', force_remount=True)
except Exception as mount_error:
    print(f"Error mounting drive: {mount_error}")
    raise SystemExit("Google Drive mount failed. Please fix connection issues and restart.")
else:
    print("Google Drive successfully mounted!")

# ==============================================================================
# 3. LOAD YOUR PRE-PROCESSED HYBRID DATA
# ==============================================================================
# Root folder of the saved hybrid dataset; adjust if your Drive layout differs.
hybrid_data_path = "/content/drive/MyDrive/Hybrid_Dataset_Saved_SBERT"

print(f"\n[STEP 2] Loading your condensed TEST data from: {hybrid_data_path}")

# Evaluation only needs the "test" sub-directory of the saved dataset.
hybrid_test_path = f"{hybrid_data_path}/test"
try:
    loaded_hybrid_test_dataset = datasets.load_from_disk(hybrid_test_path)
    print("--- Condensed test data loaded successfully! ---")
    print(loaded_hybrid_test_dataset)
except Exception as load_error:
    print(f"Error loading data: {load_error}")
    print("Please check that the path is correct.")
    raise SystemExit("Failed to load hybrid test data.")


[STEP 1] Connecting to Google Drive...
Mounted at /content/drive
Google Drive successfully mounted!

[STEP 2] Loading your condensed TEST data from: /content/drive/MyDrive/Hybrid_Dataset_Saved_SBERT
--- Condensed test data loaded successfully! ---
Dataset({
    features: ['text', 'summary'],
    num_rows: 3269
})


In [None]:
# ==============================================================================
# 4. DATA PREPARATION CLASS (CORRECTED)
# ==============================================================================
class BillSumDataset(Dataset):
    """Map-style dataset that tokenizes (document, summary) pairs on access.

    Each item is a dict with three 1-D tensors:
      * ``input_ids``      - document token ids, padded/truncated to ``max_input_len``
      * ``attention_mask`` - mask matching ``input_ids``
      * ``labels``         - summary token ids, padded/truncated to
                             ``max_output_len``, with pad positions set to -100

    Args:
        docs: Indexable collection of source documents (each is coerced with ``str``).
        summaries: Indexable collection of reference summaries, aligned with ``docs``.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and returning a
            mapping with ``"input_ids"`` and ``"attention_mask"`` tensors when
            called with ``return_tensors="pt"``.
        max_input_len: Length every document is padded/truncated to.
        max_output_len: Length every summary is padded/truncated to.
    """

    def __init__(self, docs, summaries, tokenizer,
                 max_input_len=1024,
                 max_output_len=128):
        self.docs = docs
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len

    def __len__(self):
        """Return the number of documents."""
        return len(self.docs)

    def _encode(self, text, max_length):
        """Tokenize one string to exactly ``max_length`` tokens as a (1, L) batch."""
        return self.tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Tokenize and return the ``idx``-th (document, summary) pair."""
        inputs = self._encode(str(self.docs[idx]), self.max_input_len)
        targets = self._encode(str(self.summaries[idx]), self.max_output_len)

        # Drop ONLY the leading batch axis. A bare ``.squeeze()`` would also
        # collapse the sequence axis whenever its length is 1, yielding a 0-d
        # tensor that cannot be stacked into a (batch, seq) batch.
        labels = targets["input_ids"].squeeze(0)
        # Pad positions are replaced by -100; the evaluation code later maps
        # -100 back to the pad id before decoding.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels
        }

In [None]:
# ==============================================================================
# 5. LOAD YOUR FINAL TRAINED MODEL
# ==============================================================================
print("\n[STEP 3] Loading your FINAL trained BART model from Google Drive...")

# Checkpoint directory of the final saved epoch; the tokenizer and the model
# weights are both read from this one location.
model_path = "/content/drive/MyDrive/My_ML_Project/sbert_hybrid_bart_model_epoch_4"

# Prefer the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BartTokenizer.from_pretrained(model_path)
# `.to(device)` returns the module itself, so loading and moving can be chained.
model = BartForConditionalGeneration.from_pretrained(model_path).to(device)

# Wrap the 'text' / 'summary' columns of the hybrid test split in the PyTorch
# dataset, using the dataset's default length limits.
test_dataset = BillSumDataset(
    docs=loaded_hybrid_test_dataset['text'],
    summaries=loaded_hybrid_test_dataset['summary'],
    tokenizer=tokenizer,
)

print(f"--- Model from {model_path} loaded successfully on {device} ---")


[STEP 3] Loading your FINAL trained BART model from Google Drive...
--- Model from /content/drive/MyDrive/My_ML_Project/sbert_hybrid_bart_model_epoch_4 loaded successfully on cuda ---


In [None]:
# ==============================================================================
# 6. EVALUATION
# ==============================================================================
print("\n[STEP 4] Starting Final Evaluation on the SBERT-HYBRID model...")
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")
bertscore = evaluate.load("bertscore")

# Switch the model to evaluation mode before generating.
model.eval()
preds, refs = [], []
# Small batch size, as in the original setup, to stay on the safe side of memory.
test_loader = DataLoader(test_dataset, batch_size=8)

# Decoding options shared by predictions and references.
decode_options = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating Hybrid Model"):
        # Beam-search generation from the encoder inputs on the chosen device.
        gen_ids = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )
        batch_preds = tokenizer.batch_decode(gen_ids, **decode_options)

        # Map the -100 label positions back to the pad id so they can be
        # decoded (and then dropped as special tokens).
        # NOTE(review): references are decoded from the tokenized labels, which
        # BillSumDataset truncates to max_output_len tokens, so metrics are
        # computed against truncated references rather than the raw summaries
        # - confirm this is intended.
        labels = batch["labels"]
        labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
        batch_refs = tokenizer.batch_decode(labels, **decode_options)

        preds += batch_preds
        refs += batch_refs

# Compute metrics (sacreBLEU expects a list of reference lists per prediction).
rouge_scores = rouge.compute(predictions=preds, references=refs)
bleu_score = bleu.compute(
    predictions=preds,
    references=[[reference] for reference in refs],
)
bert_scores = bertscore.compute(predictions=preds, references=refs, lang="en")

# BERTScore returns per-example F1 values; report their arithmetic mean.
f1_values = bert_scores['f1']

print("\n\n=== FINAL EVALUATION RESULTS (SBERT-HYBRID MODEL) ===")
print("ROUGE:", rouge_scores)
print("BLEU:", bleu_score)
print(f"BERTScore F1 mean: {sum(f1_values) / len(f1_values):.4f}")

print("\n--- Evaluation Complete! ---")


[STEP 4] Starting Final Evaluation on the SBERT-HYBRID model...


Evaluating Hybrid Model:   0%|          | 0/409 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




=== FINAL EVALUATION RESULTS (SBERT-HYBRID MODEL) ===
ROUGE: {'rouge1': np.float64(0.511111008577326), 'rouge2': np.float64(0.3175068622218377), 'rougeL': np.float64(0.4013790951998616), 'rougeLsum': np.float64(0.427999994781588)}
BLEU: {'score': 27.026513109820186, 'counts': [159103, 96610, 74292, 59811], 'totals': [276480, 273211, 269942, 266673], 'precisions': [57.54593460648148, 35.360948131663804, 27.521467574516006, 22.428592320932378], 'bp': 0.8073037671753159, 'sys_len': 276480, 'ref_len': 335662}
BERTScore F1 mean: 0.8913

--- Evaluation Complete! ---
