In [2]:
import datasets
# from datasets import DatasetDict
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Load dataset
dataset = datasets.load_dataset("coastalcph/lex_glue", "scotus")

# Tokenizer setup
tokenizer_legalbert = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")

class TextClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(TextClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(outputs.pooler_output)

model_legalbert = TextClassifier("nlpaueb/legal-bert-base-uncased", num_labels=13)
model_roberta = TextClassifier("roberta-base", num_labels=13)
metrics_data = []

def tokenize_with_chunks(texts, tokenizer, max_length=512, stride=256):
    input_ids_list, attention_mask_list = [], []
    
    for text in texts:
        encoding = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            stride=stride,
            return_overflowing_tokens=True,
            return_tensors="pt"
        )
        input_ids_list.append(encoding["input_ids"])
        attention_mask_list.append(encoding["attention_mask"])
    
    return input_ids_list, attention_mask_list

def train_model(model, tokenizer, dataset, model_name, epochs=10, batch_size=1, lr=2e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # Prepare encodings with chunking
    train_input_ids, train_attention_masks = tokenize_with_chunks(dataset["train"]["text"], tokenizer)
    val_input_ids, val_attention_masks = tokenize_with_chunks(dataset["validation"]["text"], tokenizer)
    
    train_labels = torch.tensor(dataset["train"]["label"])
    val_labels = torch.tensor(dataset["validation"]["label"])
    
    train_data = list(zip(train_input_ids, train_attention_masks, train_labels))
    val_data = list(zip(val_input_ids, val_attention_masks, val_labels))

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds, all_labels = [], []

        for chunked_inputs, chunked_masks, label in train_data:
            optimizer.zero_grad()
            logits_chunks = []
            
            for i in range(chunked_inputs.size(0)):  # Loop over chunks
                input_ids = chunked_inputs[i].unsqueeze(0).to(device)
                attention_mask = chunked_masks[i].unsqueeze(0).to(device)
                logits = model(input_ids, attention_mask)
                logits_chunks.append(logits)

            # Average logits over all chunks
            avg_logits = torch.stack(logits_chunks).mean(dim=0)
            loss = criterion(avg_logits, label.unsqueeze(0).to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        with torch.no_grad():
            for chunked_inputs, chunked_masks, label in val_data:
                logits_chunks = []
                for i in range(chunked_inputs.size(0)):
                    input_ids = chunked_inputs[i].unsqueeze(0).to(device)
                    attention_mask = chunked_masks[i].unsqueeze(0).to(device)
                    logits = model(input_ids, attention_mask)
                    logits_chunks.append(logits)

                avg_logits = torch.stack(logits_chunks).mean(dim=0)
                pred = torch.argmax(avg_logits, dim=1)
                all_preds.append(pred.item())
                all_labels.append(label.item())

        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
        metrics_data.append({"Model": model_name, "Epoch": epoch+1, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1": f1})
        print(f"[{model_name}] Epoch {epoch+1}: Loss = {total_loss:.4f}, Accuracy = {accuracy:.4f}")

train_model(model_legalbert, tokenizer_legalbert, dataset, "legal-bert-base-uncased")
train_model(model_roberta, tokenizer_roberta, dataset, "roberta-base")

def train_tfidf_svm(dataset):
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(dataset["train"]["text"])
    X_val_tfidf = vectorizer.transform(dataset["validation"]["text"])
    svm_model = SVC(kernel="linear", probability=True, random_state=42)
    svm_model.fit(X_train_tfidf, dataset["train"]["label"])
    y_pred = svm_model.predict(X_val_tfidf)
    accuracy = accuracy_score(dataset["validation"]["label"], y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(dataset["validation"]["label"], y_pred, average='weighted')
    metrics_data.append({"Model": "TFIDF+SVM", "Epoch": 10, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1": f1})
    print(f"[TFIDF+SVM] Accuracy = {accuracy:.4f}")
    return svm_model, vectorizer

train_tfidf_svm(dataset)
metrics_df = pd.DataFrame(metrics_data)
print(metrics_df)
plt.figure(figsize=(10, 5))
sns.lineplot(data=metrics_df, x="Epoch", y="Accuracy", hue="Model", marker="o")
plt.title("Accuracy per Epoch")
plt.show()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 19.38 MiB is free. Including non-PyTorch memory, this process has 39.46 GiB memory in use. Of the allocated memory 38.93 GiB is allocated by PyTorch, and 37.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='1,2,3,4,5'

import tensorflow  as tf
tf.config.list_physical_devices('GPU')

2025-04-08 13:23:03.819655: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:4', device_type='GPU')]

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'  # Use a single GPU to avoid OOM

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import datasets

# Load dataset
dataset = datasets.load_dataset("coastalcph/lex_glue", "scotus")

# Tokenizers
tokenizer_legalbert = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")

# Model class
class TextClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(TextClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token output
        return self.fc(cls_token)

# Dataset with chunking
class ChunkedTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=384, stride=128):
        self.samples = []
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride

        for idx, text in enumerate(texts):
            encoding = tokenizer(
                text,
                truncation=True,
                padding="max_length",
                max_length=max_length,
                stride=stride,
                return_overflowing_tokens=True,
                return_tensors="pt"
            )
            for i in range(encoding['input_ids'].size(0)):
                self.samples.append({
                    'input_ids': encoding['input_ids'][i],
                    'attention_mask': encoding['attention_mask'][i],
                    'label': labels[idx],
                    'chunk_group': idx
                })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        item = self.samples[idx]
        return (
            item['input_ids'],
            item['attention_mask'],
            item['label'],
            item['chunk_group']
        )

# Training function
def train_model(model, tokenizer, dataset, model_name, epochs=5, batch_size=2, lr=2e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    metrics_data = []

    train_dataset = ChunkedTextDataset(dataset["train"]["text"], dataset["train"]["label"], tokenizer)
    val_dataset = ChunkedTextDataset(dataset["validation"]["text"], dataset["validation"]["label"], tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        group_logits = {}
        group_labels = {}

        for input_ids, attention_mask, labels, group_ids in train_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)

            for i in range(len(group_ids)):
                group_id = group_ids[i].item()
                if group_id not in group_logits:
                    group_logits[group_id] = []
                    group_labels[group_id] = labels[i].item()
                group_logits[group_id].append(logits[i])

        for group_id, logits_list in group_logits.items():
            avg_logits = torch.stack(logits_list).mean(dim=0)
            label = torch.tensor([group_labels[group_id]]).to(device)
            loss = criterion(avg_logits.unsqueeze(0), label)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        torch.cuda.empty_cache()

        # Validation
        model.eval()
        all_preds, all_labels = [], []
        val_logits = {}
        val_labels = {}

        with torch.no_grad():
            for input_ids, attention_mask, labels, group_ids in val_loader:
                input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
                logits = model(input_ids, attention_mask)

                group_id = group_ids.item()
                if group_id not in val_logits:
                    val_logits[group_id] = []
                    val_labels[group_id] = labels.item()
                val_logits[group_id].append(logits.squeeze(0))

            for group_id, logits_list in val_logits.items():
                avg_logits = torch.stack(logits_list).mean(dim=0)
                pred = torch.argmax(avg_logits)
                all_preds.append(pred.item())
                all_labels.append(val_labels[group_id])

        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
        metrics_data.append({"Model": model_name, "Epoch": epoch + 1, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1": f1})
        print(f"[{model_name}] Epoch {epoch+1}: Loss = {total_loss:.4f}, Accuracy = {accuracy:.4f}")

    return metrics_data

# Run training
metrics_data = []
model_legalbert = TextClassifier("nlpaueb/legal-bert-base-uncased", num_labels=13)
metrics_data += train_model(model_legalbert, tokenizer_legalbert, dataset, "legal-bert-base-uncased")

model_roberta = TextClassifier("roberta-base", num_labels=13)
metrics_data += train_model(model_roberta, tokenizer_roberta, dataset, "roberta-base")

# TF-IDF + SVM
def train_tfidf_svm(dataset):
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(dataset["train"]["text"])
    X_val = vectorizer.transform(dataset["validation"]["text"])
    svm = SVC(kernel="linear", probability=True)
    svm.fit(X_train, dataset["train"]["label"])
    y_pred = svm.predict(X_val)
    accuracy = accuracy_score(dataset["validation"]["label"], y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(dataset["validation"]["label"], y_pred, average='weighted')
    return {"Model": "TFIDF+SVM", "Epoch": 10, "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1": f1}

metrics_data.append(train_tfidf_svm(dataset))

# Plot
metrics_df = pd.DataFrame(metrics_data)
print(metrics_df)
plt.figure(figsize=(10, 5))
sns.lineplot(data=metrics_df[metrics_df["Model"] != "TFIDF+SVM"], x="Epoch", y="Accuracy", hue="Model", marker="o")
plt.title("Accuracy per Epoch")
plt.show()


2025-04-08 11:46:23.282342: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
import os
import json
import datasets
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Load dataset
dataset = datasets.load_dataset("coastalcph/lex_glue", "scotus")

# Tokenizer setup
tokenizer_legalbert = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")

# Chunking and disk caching
def save_chunks_to_disk(dataset_split, tokenizer, split_name, max_length=384, stride=128):
    os.makedirs(f"chunks/{split_name}", exist_ok=True)
    for i, example in enumerate(dataset_split):
        encoding = tokenizer(
            example["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            stride=stride,
            return_overflowing_tokens=True,
            return_tensors="pt"
        )
        label = example["label"]
        for j in range(encoding["input_ids"].size(0)):
            chunk = {
                "input_ids": encoding["input_ids"][j].tolist(),
                "attention_mask": encoding["attention_mask"][j].tolist(),
                "label": label
            }
            with open(f"chunks/{split_name}/{i}_{j}.json", "w") as f:
                json.dump(chunk, f)

# Save chunks once for both models
print("Chunking and saving to disk...")
save_chunks_to_disk(dataset["train"], tokenizer_legalbert, "train")
save_chunks_to_disk(dataset["validation"], tokenizer_legalbert, "validation")

# Dataset class to load from disk
class ChunkedDataset(torch.utils.data.Dataset):
    def __init__(self, split_name):
        self.paths = [f"chunks/{split_name}/" + f for f in os.listdir(f"chunks/{split_name}") if f.endswith(".json")]

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        with open(self.paths[idx], "r") as f:
            item = json.load(f)
        return {
            "input_ids": torch.tensor(item["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(item["attention_mask"], dtype=torch.long),
            "label": torch.tensor(item["label"], dtype=torch.long)
        }

class TextClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(TextClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        return self.fc(cls_output)

metrics_data = []

def train_model(model, tokenizer, split_prefix, model_name, epochs=3, batch_size=8, lr=2e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    train_dataset = ChunkedDataset(f"/train")
    val_dataset = ChunkedDataset(f"/validation")
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Evaluation
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)
                outputs = model(input_ids, attention_mask)
                preds = torch.argmax(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        acc = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
        metrics_data.append({"Model": model_name, "Epoch": epoch+1, "Accuracy": acc, "Precision": precision, "Recall": recall, "F1": f1})
        print(f"[{model_name}] Epoch {epoch+1}: Loss={total_loss:.4f}, Accuracy={acc:.4f}")

# Train LegalBERT
train_model(TextClassifier("nlpaueb/legal-bert-base-uncased", num_labels=13), tokenizer_legalbert, "chunks", "LegalBERT")

# Train RoBERTa
train_model(TextClassifier("roberta-base", num_labels=13), tokenizer_roberta, "chunks", "RoBERTa")

# TF-IDF + SVM Baseline
def train_tfidf_svm(dataset):
    vectorizer = TfidfVectorizer(min_df=2, max_df=0.95)
    X_train = vectorizer.fit_transform(dataset["train"]["text"])
    X_val = vectorizer.transform(dataset["validation"]["text"])
    clf = SVC(kernel="linear")
    clf.fit(X_train, dataset["train"]["label"])
    preds = clf.predict(X_val)
    acc = accuracy_score(dataset["validation"]["label"], preds)
    precision, recall, f1, _ = precision_recall_fscore_support(dataset["validation"]["label"], preds, average='weighted')
    metrics_data.append({"Model": "TFIDF+SVM", "Epoch": 10, "Accuracy": acc, "Precision": precision, "Recall": recall, "F1": f1})
    print(f"[TFIDF+SVM] Accuracy={acc:.4f}")

train_tfidf_svm(dataset)

# Plot
metrics_df = pd.DataFrame(metrics_data)
print(metrics_df)
plt.figure(figsize=(10, 5))
sns.lineplot(data=metrics_df, x="Epoch", y="Accuracy", hue="Model", marker="o")
plt.title("Model Accuracy Over Epochs")
plt.show()


Chunking and saving to disk...


2025-04-08 13:56:27.159963: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
