# This notebook deals with SORTING | workflow (1): Deduplicate --> Sort | workflow (2): Normalize --> Deduplicate --> Sort

In [1]:
# Restrict this process to the listed GPU ids. The variable is set before
# TensorFlow is imported so the restriction is already in place when the
# framework enumerates devices.
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '2,3,4,5,6'

import tensorflow as tf
# Bare expression: the notebook displays the list of GPUs TensorFlow can see,
# which serves as a sanity check that the mask above took effect.
tf.config.list_physical_devices('GPU')

2025-04-09 16:35:52.230461: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744196752.249377  754579 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744196752.255243  754579 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744196752.269313  754579 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744196752.269328  754579 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744196752.269330  754579 computation_placer.cc:177] computation placer alr

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:4', device_type='GPU')]

In [2]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, set_seed
)
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch

## Choose three models: legal_BERT, legal_longformer, legal_Roberta
## Datasets:

1) Original SCOTUS
2) Deduplicate and sort
3) Normalize, deduplicate, and sort the words based on TF-IDF

In [None]:

# ----- Model checkpoints -----
# Short model key (used in result labels and output paths) -> checkpoint id
# passed to `from_pretrained`.
model_names = dict(
    legal_BERT="nlpaueb/legal-bert-base-uncased",
    legal_longformer="lexlms/legal-longformer-base",
    legal_Roberta="lexlms/legal-roberta-base",
)

# ----- Shared hyper-parameters -----
learning_rate = 3e-5  # optimizer learning rate for every transformer run
epochs = 20           # training epochs per transformer run (also scales the SVM iteration cap)
seed = 42             # single seed reused everywhere a seed is accepted

set_seed(seed)

# ----- Load Dataset -----
# Three variants of the same classification task; all are indexed by split name below.
original_dataset = load_dataset("coastalcph/lex_glue", "scotus")
dedup_and_sort = load_dataset("victorambrose11/normalized_scotus")
norm_dedup_sort = load_dataset("victorambrose11/scotus_normalize_deduplicate_sort")

# Class names come from the original dataset's label feature; their count sizes
# the classification head of every model.
label_list = original_dataset["train"].features["label"].names
num_labels = len(label_list)

# ----- Sequence Classification Metric -----
def compute_f1(pred):
    """Compute micro- and macro-averaged F1 from an evaluation prediction object.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``. ``predictions``
            is either a 2-D array of per-class scores (one row per example) or a
            tuple whose first element is that array.

    Returns:
        dict with keys ``"micro_f1"`` and ``"macro_f1"`` (values as returned by
        ``f1_score``, i.e. fractions, not percentages).
    """
    scores = pred.predictions
    # When the prediction payload is a tuple of arrays, the class scores are
    # taken from its first element; argmax over the tuple itself would fail.
    if isinstance(scores, tuple):
        scores = scores[0]

    preds = np.argmax(scores, axis=1)
    labels = pred.label_ids

    return {
        "micro_f1": f1_score(labels, preds, average="micro", zero_division=0),
        "macro_f1": f1_score(labels, preds, average="macro", zero_division=0),
    }

results = {}

# ----- Training Function -----
def train_transformer_model(model_key, dataset, dataset_label, max_length=512):
    """Fine-tune one checkpoint on one dataset variant and record its validation F1.

    Args:
        model_key: Key into the module-level ``model_names`` mapping.
        dataset: Dataset object indexed by split name; the ``"train"`` and
            ``"validation"`` splits are used and must provide ``"text"`` and
            ``"label"`` columns.
        dataset_label: Short dataset name. Used in the ``results`` key, in the
            stored ``"Dataset"`` field, and in the output/log directory names.
        max_length: Token length every example is padded and truncated to.
            Defaults to 512, the value previously hard-coded.

    Returns:
        The dict stored in ``results`` for this run: ``"Micro F1"`` and
        ``"Macro F1"`` as percentages rounded to two decimals, plus ``"Dataset"``.

    Side effects:
        Writes checkpoints and logs under ``./results_<run>`` and ``./logs_<run>``,
        prints the raw evaluation metrics, and adds an entry to the module-level
        ``results`` dict.
    """
    model_checkpoint = model_names[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

    # Preprocessing
    def preprocess(examples):
        """Tokenize a batch of texts to a fixed length of ``max_length`` tokens."""
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

    encoded_dataset = dataset.map(preprocess, batched=True)
    encoded_dataset = encoded_dataset.rename_column("label", "labels")
    encoded_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # One directory per (model, dataset) pair. Keying the path on the model alone
    # made the three dataset runs of a model share checkpoints and logs.
    run_tag = "".join(
        ch if (ch.isalnum() or ch in "-_.") else "_"
        for ch in f"{model_key}_{dataset_label}"
    )

    training_args = TrainingArguments(
        output_dir=f"./results_{run_tag}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        num_train_epochs=epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        load_best_model_at_end=True,
        # Must match a key returned by compute_f1.
        metric_for_best_model="macro_f1",
        seed=seed,
        logging_dir=f"./logs_{run_tag}",
        logging_steps=50,
        warmup_steps=500,
        lr_scheduler_type="linear"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        compute_metrics=compute_f1,
        tokenizer=tokenizer,
    )

    trainer.train()

    # Scores are measured on the validation split (the trainer's eval_dataset).
    metrics = trainer.evaluate()
    print(f"DEBUG: Evaluation Metrics: {metrics}")

    entry = {
        "Micro F1": round(metrics.get("eval_micro_f1", 0.0) * 100, 2),
        "Macro F1": round(metrics.get("eval_macro_f1", 0.0) * 100, 2),
        "Dataset": dataset_label
    }
    results[f"{model_key} ({dataset_label})"] = entry
    return entry
# ----- Train legal_BERT, legal_longformer and legal_Roberta on all three datasets -----
# Each entry: (model key, dataset, results label, banner printed before the run).
# The unmodified dataset is labelled "Original" for every model, matching the
# label already used for the BERT runs and the TF-IDF + SVM baseline, so the
# "Dataset" column of the results table has one name per dataset.
transformer_runs = [
    ("legal_BERT", original_dataset, "Original", 'Training Legal BERT original dataset'),
    ("legal_BERT", dedup_and_sort, "dedup_and_sort", 'Training Legal BERT dedup_and_sort'),
    ("legal_BERT", norm_dedup_sort, "norm_dedup_sort", 'Training Legal BERT norm_dedup_sort'),

    ("legal_longformer", original_dataset, "Original", 'Training legal_longformer original_dataset'),
    ("legal_longformer", dedup_and_sort, "dedup_and_sort", 'Training legal_longformer dedup_and_sort'),
    ("legal_longformer", norm_dedup_sort, "norm_dedup_sort", 'Training legal_longformer norm_dedup_sort'),

    ("legal_Roberta", original_dataset, "Original", 'Training legal_Roberta original_dataset'),
    ("legal_Roberta", dedup_and_sort, "dedup_and_sort", 'Training legal_Roberta dedup_and_sort'),
    ("legal_Roberta", norm_dedup_sort, "norm_dedup_sort", 'Training legal_Roberta norm_dedup_sort'),
]

for run_model_key, run_dataset, run_label, run_banner in transformer_runs:
    print('-'*50)
    print(run_banner)
    print('-'*50)
    train_transformer_model(run_model_key, run_dataset, run_label)



# ----- Train TF-IDF + SVM -----
def train_svm(dataset, dataset_label=""):
    """Fit a TF-IDF + linear SVM baseline and record its validation F1.

    Args:
        dataset: Dataset object indexed by split name; the ``"train"`` and
            ``"validation"`` splits are used and must provide ``"text"`` and
            ``"label"`` columns.
        dataset_label: Short dataset name used in the ``results`` key and the
            stored ``"Dataset"`` field.

    Returns:
        The dict stored in ``results`` for this run: ``"Micro F1"`` and
        ``"Macro F1"`` as percentages rounded to two decimals, plus ``"Dataset"``.

    Side effects:
        Prints a banner and adds an entry to the module-level ``results`` dict.
    """
    print(f"\n🚀 Training TF-IDF + SVM on {dataset_label}...")

    train_split = dataset["train"]
    val_split = dataset["validation"]
    train_texts, train_labels = train_split["text"], train_split["label"]
    val_texts, val_labels = val_split["text"], val_split["label"]

    # The vocabulary is fitted on the training texts only and reused for validation.
    vectorizer = TfidfVectorizer(max_features=10000)
    X_train = vectorizer.fit_transform(train_texts)
    X_val = vectorizer.transform(val_texts)

    # `max_iter` is an upper bound on solver iterations, not a number of epochs;
    # it is derived from `epochs` only to scale the budget with the other runs.
    # `random_state` pins the solver's randomness to the notebook-wide seed.
    clf = LinearSVC(max_iter=epochs * 100, random_state=seed)
    clf.fit(X_train, train_labels)
    preds = clf.predict(X_val)

    # zero_division=0 mirrors compute_f1, so classes that are never predicted
    # score 0 without emitting a warning.
    entry = {
        "Micro F1": round(f1_score(val_labels, preds, average="micro", zero_division=0) * 100, 2),
        "Macro F1": round(f1_score(val_labels, preds, average="macro", zero_division=0) * 100, 2),
        "Dataset": dataset_label
    }
    results[f"tfidf_svm ({dataset_label})"] = entry
    return entry

## SVM on all three datasets
# Each entry: (banner printed before the run, dataset, results label).
svm_runs = (
    ('Training SVM OG', original_dataset, "Original"),
    ('Training SVM dedup_and_sort', dedup_and_sort, "dedup_and_sort"),
    ('Training SVM norm_dedup_sort', norm_dedup_sort, "norm_dedup_sort"),
)

for svm_banner, svm_dataset, svm_label in svm_runs:
    print('-'*50)
    print(svm_banner)
    print('-'*50)
    train_svm(svm_dataset, svm_label)


# ----- Print Table -----
# One row per "<model> (<dataset>)" key collected in `results`.
results_df = pd.DataFrame.from_dict(results, orient="index")

print("\n📋 Results Summary:\n")
print(results_df)

# ----- Plot Chart -----
# Explicit figure/axes so every styling call targets this chart.
fig, ax = plt.subplots(figsize=(14, 8))
results_df.plot(kind="bar", ax=ax, ylim=(0, 100))
ax.set_title("Model Comparison: Micro and Macro F1 Scores")
ax.set_ylabel("F1 Score (%)")
# The index holds long "<model> (<dataset>)" labels; drawn horizontally they
# overlap, so slant them and anchor each label's end under its bar group.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()
# results_df.to_excel('/home/srmist5/victor/BERT_Optimize/new_final/revised_results_variant_1_performance.xlsx',index=False)

README.md:   0%|          | 0.00/811 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/16.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1400 [00:00<?, ? examples/s]

--------------------------------------------------
Training Legal BERT original dataset
--------------------------------------------------


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
