In [5]:
import datasets
import re
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import requests
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

# Load dataset
dataset = datasets.load_dataset("coastalcph/lex_glue", "scotus")

# Load legal dictionary from GitHub
legal_dict_url = "https://raw.githubusercontent.com/Victorambrose/BERT_Optimize/main/US_legal_dict.txt"
response = requests.get(legal_dict_url)
legal_terms = set(response.text.splitlines())

# Ensure necessary NLTK resources are downloaded
nltk_data_path = os.path.expanduser("~/nltk_data")  # Store in home directory
nltk.data.path.append(nltk_data_path)

nltk.download('wordnet', download_dir=nltk_data_path)
nltk.download('omw-1.4', download_dir=nltk_data_path)
nltk.download('stopwords', download_dir=nltk_data_path)
nltk.download('punkt', download_dir=nltk_data_path)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)
nltk.download('maxent_ne_chunker', download_dir=nltk_data_path)
nltk.download('words', download_dir=nltk_data_path)

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    tokens = word_tokenize(text.lower())  # Tokenize & lowercase
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))  # Load stopwords once
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words or word in legal_terms]
    return ' '.join(tokens)

# Apply preprocessing safely
dataset = dataset.map(lambda x: {"text": preprocess_text(x["text"])} )

# Tokenization
tokenizer_legalbert = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer_legalbert(examples["text"], truncation=True, padding=True, return_tensors="pt")

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define model class
class TextClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(TextClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(outputs.pooler_output)

# Initialize models
model_legalbert = TextClassifier("nlpaueb/legal-bert-base-uncased", num_labels=13)
model_roberta = TextClassifier("roberta-base", num_labels=13)

# Training function
def train_model(model, tokenizer, dataset, epochs=3, batch_size=8, lr=2e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        dataset["train"]["text"], dataset["train"]["label"], test_size=0.1, random_state=42)
    
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt")
    
    train_dataset = torch.utils.data.TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], torch.tensor(train_labels))
    val_dataset = torch.utils.data.TensorDataset(val_encodings["input_ids"], val_encodings["attention_mask"], torch.tensor(val_labels))
    
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

    return model

# Evaluation function
def evaluate_model(model, tokenizer, dataset):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    test_texts, test_labels = dataset["test"]["text"], dataset["test"]["label"]
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
    test_dataset = torch.utils.data.TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"], torch.tensor(test_labels))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8)
    
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    acc = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Train and evaluate both models
trained_legalbert = train_model(model_legalbert, tokenizer_legalbert, dataset)
trained_roberta = train_model(model_roberta, tokenizer_roberta, dataset)

legalbert_results = evaluate_model(trained_legalbert, tokenizer_legalbert, dataset)
roberta_results = evaluate_model(trained_roberta, tokenizer_roberta, dataset)

print("Legal-BERT Results:", legalbert_results)
print("RoBERTa Results:", roberta_results)

# Determine best model
best_model = "Legal-BERT" if legalbert_results["accuracy"] > roberta_results["accuracy"] else "RoBERTa"
print(f"Best performing model: {best_model}")

[nltk_data] Downloading package wordnet to /home/srmist5/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/srmist5/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/srmist5/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/srmist5/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/srmist5/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/srmist5/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/srmist5/nltk_data...
[nltk_data]   Package words is already up-to-date!


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 1.238406051671526
Epoch 2, Loss: 0.6958643584570071
Epoch 3, Loss: 0.46629333054087724
Epoch 1, Loss: 1.450737627663155
Epoch 2, Loss: 0.8909384776601571
Epoch 3, Loss: 0.6953454813250005


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Legal-BERT Results: {'accuracy': 0.75, 'precision': 0.7436293084805153, 'recall': 0.75, 'f1': 0.7361039401937158}
RoBERTa Results: {'accuracy': 0.6664285714285715, 'precision': 0.6709629188741265, 'recall': 0.6664285714285715, 'f1': 0.6456191849879634}
Best performing model: Legal-BERT


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]= '3,4'
import tensorflow as tf
tf.config.list_physical_devices('GPU')

2025-03-28 11:13:45.282723: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

In [10]:
pip install nltk


[33mDEPRECATION: Loading egg at /raid/home/srmist5/miniconda3/envs/tamil/lib/python3.12/site-packages/mask_rcnn-2.1-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/srmist5/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/srmist5/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/srmist5/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/srmist5/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/srmist5/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/srmist5/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/srmist5/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [7]:
import nltk
nltk.data.path.append("/usr/local/nltk_data")  # Adjust path as needed

nltk.download('stopwords', download_dir='/usr/local/nltk_data')
nltk.download('punkt', download_dir='/usr/local/nltk_data')
nltk.download('wordnet', download_dir='/usr/local/nltk_data')
nltk.download('omw-1.4', download_dir='/usr/local/nltk_data')
nltk.download('averaged_perceptron_tagger', download_dir='/usr/local/nltk_data')
nltk.download('maxent_ne_chunker', download_dir='/usr/local/nltk_data')
nltk.download('words', download_dir='/usr/local/nltk_data')


[nltk_data] Downloading package stopwords to /usr/local/nltk_data...


PermissionError: [Errno 13] Permission denied: '/usr/local/nltk_data'

In [6]:
nltk.data.path.append("/usr/local/nltk_data")


In [8]:
pip install --upgrade nltk


[33mDEPRECATION: Loading egg at /raid/home/srmist5/miniconda3/envs/tamil/lib/python3.12/site-packages/mask_rcnn-2.1-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
Note: you may need to restart the kernel to use updated packages.


In [9]:
import nltk
print(nltk.data.path)


['/home/srmist5/nltk_data', '/home/srmist5/miniconda3/envs/tamil/nltk_data', '/home/srmist5/miniconda3/envs/tamil/share/nltk_data', '/home/srmist5/miniconda3/envs/tamil/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/usr/local/nltk_data', '/usr/local/nltk_data', '/usr/local/nltk_data', '/usr/local/nltk_data']


In [4]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/srmist5/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import datasets
import re
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import requests
import os
import random
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report, roc_curve, auc
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Load dataset
dataset = datasets.load_dataset("coastalcph/lex_glue", "scotus")

# Load legal dictionary from GitHub
legal_dict_url = "https://raw.githubusercontent.com/Victorambrose/BERT_Optimize/main/US_legal_dict.txt"
response = requests.get(legal_dict_url)
legal_terms = set(response.text.splitlines())

# Load abbreviation mapping safely
abbr_dict_url = "https://raw.githubusercontent.com/Victorambrose/BERT_Optimize/main/legal_abbr.txt"
response = requests.get(abbr_dict_url)

# Process lines carefully to avoid errors
abbr_dict = {}
for line in response.text.splitlines():
    if "=" in line:  # Ensure valid format
        key, value = line.split("=", 1)  # Split only at the first "="
        abbr_dict[key.strip()] = value.strip()  # Remove any extra spaces


# Ensure necessary NLTK resources are downloaded
nltk_data_path = os.path.expanduser("~/nltk_data")
nltk.data.path.append(nltk_data_path)

nltk.download('wordnet', download_dir=nltk_data_path)
nltk.download('stopwords', download_dir=nltk_data_path)
nltk.download('punkt', download_dir=nltk_data_path)

# Load summarization model
summarizer = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-led-base-16384")
summarizer_tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-led-base-16384")

def preprocess_text(text):
    text = re.sub(r'\\s+', ' ', text)  # Remove extra spaces
    for abbr, full_form in abbr_dict.items():
        text = text.replace(abbr, full_form)  # Replace abbreviations
    tokens = word_tokenize(text.lower())  # Tokenize & lowercase
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))  # Load stopwords once
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words or word in legal_terms]
    return ' '.join(tokens)

def summarize_text(text):
    inputs = summarizer_tokenizer(text, return_tensors="pt", max_length=16384, truncation=True)
    summary_ids = summarizer.generate(**inputs)
    return summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Apply preprocessing and summarization
dataset = dataset.map(lambda x: {"text": summarize_text(preprocess_text(x["text"]))})

# Tokenization
tokenizer_legalbert = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer_legalbert(examples["text"], truncation=True, padding=True, return_tensors="pt")

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define model class
class TextClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(TextClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(outputs.pooler_output)

# Initialize models
model_legalbert = TextClassifier("nlpaueb/legal-bert-base-uncased", num_labels=13)
model_roberta = TextClassifier("roberta-base", num_labels=13)

def train_model(model, tokenizer, dataset, epochs=3, batch_size=8, lr=2e-5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        dataset["train"]["text"], dataset["train"]["label"], test_size=0.1, random_state=42)
    
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt")
    
    train_dataset = torch.utils.data.TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], torch.tensor(train_labels))
    val_dataset = torch.utils.data.TensorDataset(val_encodings["input_ids"], val_encodings["attention_mask"], torch.tensor(val_labels))
    
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")
    return model

def plot_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.show()
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label=1)
    plt.plot(fpr, tpr, label='ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

def evaluate_model(model, tokenizer, dataset):
    test_texts, test_labels = dataset["test"]["text"], dataset["test"]["label"]
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
    model.eval()
    predictions = torch.argmax(model(test_encodings["input_ids"], test_encodings["attention_mask"]), dim=1)
    plot_metrics(test_labels, predictions.cpu().numpy())

trained_legalbert = train_model(model_legalbert, tokenizer_legalbert, dataset)
evaluate_model(trained_legalbert, tokenizer_legalbert, dataset)

random_sample = random.choice(dataset["test"]["text"])
print("Random Sample Input:", random_sample)


[nltk_data] Downloading package wordnet to /home/srmist5/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/srmist5/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/srmist5/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2025-03-28 15:05:05.828508: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Input ids are automatically padded from 2883 to 3072 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3723 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 4665 to 5120 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 8387 to 9216 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 10029 to 10240 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1005 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 6672 to 7168 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 5461 to 6144 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3358 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3379 to 4096 

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES']= '4,5,6,7'