In [1]:
!pip install transformers datasets scikit-learn




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import evaluate

# ===== CONFIG =====
# Pretrained checkpoints: a legal-domain BERT acts as the stand-in "LNN"
# encoder, and T5 is the summary generator.
lnn_model_name = "nlpaueb/legal-bert-base-uncased"  # proxy for LNN
t5_model_name = "t5-small"

# Embedding widths on each side of the projection layer.
lnn_dim, t5_dim = 768, 512

# Token budgets for the source judgement and the target summary.
max_len_input, max_len_output = 512, 128
batch_size = 2

# Run on the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ===== DATASET =====
class LegalSummaryDataset(Dataset):
    """Paired (judgement, summary) texts tokenized for the encoder proxy and T5.

    Args:
        judgements: Sequence of source judgement texts.
        summaries: Sequence of reference summaries, aligned index-by-index
            with ``judgements``.
        lnn_tokenizer: Tokenizer callable applied to each judgement.
        t5_tokenizer: Tokenizer callable applied to each summary.
        max_input_length: Optional token budget for judgements. When ``None``
            the module-level ``max_len_input`` is used.
        max_output_length: Optional token budget for summaries. When ``None``
            the module-level ``max_len_output`` is used.

    Raises:
        ValueError: If ``judgements`` and ``summaries`` differ in length.
    """

    def __init__(self, judgements, summaries, lnn_tokenizer, t5_tokenizer,
                 max_input_length=None, max_output_length=None):
        if len(judgements) != len(summaries):
            raise ValueError(
                f"judgements ({len(judgements)}) and summaries ({len(summaries)}) "
                "must have the same length"
            )
        self.judgements = judgements
        self.summaries = summaries
        self.lnn_tokenizer = lnn_tokenizer
        self.t5_tokenizer = t5_tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        """Return the number of (judgement, summary) pairs."""
        return len(self.judgements)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return a dict of per-example tensors.

        Keys: ``lnn_input_ids``, ``lnn_attention_mask``, ``labels`` and
        ``decoder_attention_mask``. Padded label positions are set to -100 so
        that the loss skips them.
        """
        # Fall back to the notebook-level limits lazily so that later changes
        # to those globals are still honoured.
        input_limit = max_len_input if self.max_input_length is None else self.max_input_length
        output_limit = max_len_output if self.max_output_length is None else self.max_output_length

        # Tokenize for LNN (will be encoded in training loop)
        lnn_inputs = self.lnn_tokenizer(
            self.judgements[idx],
            return_tensors="pt",
            max_length=input_limit,
            truncation=True,
            padding="max_length"
        )

        # Tokenize target for T5
        target_tokens = self.t5_tokenizer(
            self.summaries[idx],
            max_length=output_limit,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # squeeze(0) removes only the leading batch dimension; a bare squeeze()
        # would also collapse a length-1 sequence dimension.
        target_mask = target_tokens["attention_mask"].squeeze(0)
        # Positions the tokenizer marked as padding must not contribute to the
        # loss, so replace them with the ignore index.
        labels = target_tokens["input_ids"].squeeze(0).masked_fill(target_mask == 0, -100)

        return {
            "lnn_input_ids": lnn_inputs["input_ids"].squeeze(0),
            "lnn_attention_mask": lnn_inputs["attention_mask"].squeeze(0),
            "labels": labels,
            "decoder_attention_mask": target_mask
        }

# ===== LOAD DATA =====
def load_data(judgement_dir, summary_dir):
    """Read same-named text files from two directories into aligned lists.

    Args:
        judgement_dir: Directory containing one judgement per file.
        summary_dir: Directory containing a summary file with the same name
            as each judgement file.

    Returns:
        A ``(judgements, summaries)`` tuple of lists of file contents, ordered
        by file name so repeated runs produce the same ordering.

    Raises:
        FileNotFoundError: If a judgement file has no matching summary file.
    """
    judgements, summaries = [], []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for fname in sorted(os.listdir(judgement_dir)):
        judgement_path = os.path.join(judgement_dir, fname)
        # Skip sub-directories (e.g. .ipynb_checkpoints) that cannot be opened as text.
        if not os.path.isfile(judgement_path):
            continue
        with open(judgement_path, "r", encoding="utf-8") as jf, \
             open(os.path.join(summary_dir, fname), "r", encoding="utf-8") as sf:
            judgements.append(jf.read())
            summaries.append(sf.read())
    return judgements, summaries

# Training split: judgement texts and their reference summaries.
train_judgements, train_summaries = load_data(
    judgement_dir="IN-Abs/train-data/judgement",
    summary_dir="IN-Abs/train-data/summary",
)

# Held-out split, used only for the ROUGE evaluation.
test_judgements, test_summaries = load_data(
    judgement_dir="IN-Abs/test-data/judgement",
    summary_dir="IN-Abs/test-data/summary",
)


# ===== LOAD MODELS =====
# Encoder side: tokenizer and weights for the LNN proxy.
lnn_tokenizer = AutoTokenizer.from_pretrained(lnn_model_name)
lnn_model = AutoModel.from_pretrained(lnn_model_name)
lnn_model = lnn_model.to(device)

# Generator side: T5 tokenizer and conditional-generation model.
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)
t5_model = t5_model.to(device)

# Linear map from the encoder embedding width to the T5 embedding width.
projection = nn.Linear(in_features=lnn_dim, out_features=t5_dim)
projection = projection.to(device)

# ===== DATALOADERS =====
# Wrap the training texts in the dataset, then batch and shuffle them.
train_dataset = LegalSummaryDataset(
    judgements=train_judgements,
    summaries=train_summaries,
    lnn_tokenizer=lnn_tokenizer,
    t5_tokenizer=t5_tokenizer,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# ===== TRAINING =====
num_epochs = 10
# Only T5 and the projection layer are handed to the optimizer; the encoder
# proxy is run under no_grad below and is not updated.
optimizer = torch.optim.Adam(list(t5_model.parameters()) + list(projection.parameters()), lr=5e-5)

for epoch in range(num_epochs):
    t5_model.train()
    total_loss = 0

    for batch in train_loader:
        lnn_input_ids = batch["lnn_input_ids"].to(device)
        lnn_attention_mask = batch["lnn_attention_mask"].to(device)
        labels = batch["labels"].to(device)
        decoder_attention_mask = batch["decoder_attention_mask"].to(device)

        # Get LNN embeddings
        with torch.no_grad():
            lnn_outputs = lnn_model(input_ids=lnn_input_ids, attention_mask=lnn_attention_mask)
            lnn_embeddings = lnn_outputs.last_hidden_state  # (batch, seq_len, 768)

        # Project to T5 dimension
        projected_embeddings = projection(lnn_embeddings)  # (batch, seq_len, 512)

        # Inputs are padded to max length, so the encoder mask must be passed
        # along; otherwise T5 attends to the embeddings of padding positions.
        outputs = t5_model(
            inputs_embeds=projected_embeddings,
            attention_mask=lnn_attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    # Guard against an empty loader so the summary line never divides by zero.
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}/{num_epochs} - Avg Loss: {avg_loss:.4f}")

# ===== EVALUATION =====
rouge = evaluate.load("rouge")
t5_model.eval()
predictions = []

with torch.no_grad():
    for judgement in test_judgements:
        # Same tokenization settings as training: truncate and pad to max_len_input.
        lnn_inputs = lnn_tokenizer(judgement, return_tensors="pt", max_length=max_len_input, truncation=True, padding="max_length").to(device)
        lnn_outputs = lnn_model(**lnn_inputs)
        lnn_embeddings = lnn_outputs.last_hidden_state
        projected_embeddings = projection(lnn_embeddings)

        # Forward the encoder mask so generation ignores the padded positions
        # instead of attending to their projected embeddings.
        output_ids = t5_model.generate(
            inputs_embeds=projected_embeddings,
            attention_mask=lnn_inputs["attention_mask"],
            max_length=max_len_output
        )
        summary = t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(summary)

# Compute ROUGE
rouge_scores = rouge.compute(predictions=predictions, references=test_summaries)
print("ROUGE Scores:", rouge_scores)


RuntimeError: Failed to import transformers.models.t5.modeling_t5 because of the following error (look up to see its traceback):
cannot import name 'float8_e4m3b11fnuz' from 'tensorflow.python.framework.dtypes' (C:\Users\asus\AppData\Local\Programs\Python\Python312\Lib\site-packages\tensorflow\python\framework\dtypes.py)

In [2]:
!pip install numpy==1.26.4 scipy==1.11.4 --force-reinstall

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp310-cp310-win_amd64.whl.metadata (61 kB)
Collecting scipy==1.11.4
  Downloading scipy-1.11.4-cp310-cp310-win_amd64.whl.metadata (60 kB)
Using cached numpy-1.26.4-cp310-cp310-win_amd64.whl (15.8 MB)
Downloading scipy-1.11.4-cp310-cp310-win_amd64.whl (44.1 MB)
   ---------------------------------------- 0.0/44.1 MB ? eta -:--:--
   - -------------------------------------- 1.6/44.1 MB 9.3 MB/s eta 0:00:05
   -- ------------------------------------- 2.9/44.1 MB 7.3 MB/s eta 0:00:06
   --- ------------------------------------ 3.9/44.1 MB 6.5 MB/s eta 0:00:07
   ---- ----------------------------------- 5.0/44.1 MB 6.0 MB/s eta 0:00:07
   ----- ---------------------------------- 6.0/44.1 MB 5.9 MB/s eta 0:00:07
   ------ --------------------------------- 7.1/44.1 MB 5.7 MB/s eta 0:00:07
   ------- -------------------------------- 8.4/44.1 MB 5.8 MB/s eta 0:00:07
   -------- ------------------------------- 9.7/44.1 MB 5.8 MB/s eta 0:00:06


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.45.1 requires packaging<25,>=20, but you have packaging 25.0 which is incompatible.

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
!pip install evaluate




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
!pip install rouge_score




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
