# This notebook deals with SORTING | workflow (1) : Deduplicate --> Sort | workflow (2) : Normalize --> Deduplicate --> Sort

In [1]:
# `os` is needed to set the GPU mask below.
import os 
# Restrict this process to physical GPUs 2-6. The variable is set before
# TensorFlow is imported so the framework only enumerates these devices.
os.environ['CUDA_VISIBLE_DEVICES'] = '2,3,4,5,6'

import tensorflow as tf
# Sanity check: list the GPUs TensorFlow can see (last expression of the cell,
# so the notebook displays the returned list).
tf.config.list_physical_devices('GPU')

2025-04-10 15:47:04.836590: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744280224.857199 3037679 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744280224.863434 3037679 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744280224.877359 3037679 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744280224.877377 3037679 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744280224.877379 3037679 computation_placer.cc:177] computation placer alr

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:4', device_type='GPU')]

In [2]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, set_seed
)
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch

## Choose three models: legal_BERT, legal_longformer, legal_Roberta
## datasets :

1) Original SCOTUS 
2) Dedup and sort 
3) Normalize, deduplicate, and sort the words based on TF-IDF

In [None]:

# ----- Hyperparameters shared by every run in this notebook -----
seed = 42
epochs = 10
learning_rate = 1e-5

# Short model key -> Hugging Face checkpoint id.
model_names = dict(
    legal_BERT="nlpaueb/legal-bert-base-uncased",
    legal_BERT_small="nlpaueb/legal-bert-small-uncased",
    legal_longformer="lexlms/legal-longformer-base",
    legal_Roberta="lexlms/legal-roberta-base",
)

# Seed the RNGs before any data loading or model creation happens.
set_seed(seed)

# ----- Load Dataset -----
# "scotus" configuration of the LexGLUE benchmark.
dataset = load_dataset("coastalcph/lex_glue", "scotus")

# Class names come from the train split's "label" feature; their count sizes
# the classification head.
label_list = dataset["train"].features["label"].names
num_labels = len(label_list)

# ----- Sequence Classification Metric -----
def compute_f1(pred):
    """Compute micro- and macro-averaged F1 for one evaluation pass.

    Parameters
    ----------
    pred : object exposing ``predictions`` and ``label_ids``
        ``predictions`` holds per-class scores with one row per example
        (classes on axis 1), either directly as an array or as a tuple whose
        first element is that array. ``label_ids`` holds the integer gold
        labels.

    Returns
    -------
    dict
        ``{"micro_f1": float, "macro_f1": float}``. With ``zero_division=0``
        an undefined per-class score is reported as 0 instead of triggering
        a warning.
    """
    # Function-scope imports keep this callback usable on its own.
    import numpy as np
    from sklearn.metrics import f1_score

    scores = pred.predictions
    # The evaluation loop may hand over a tuple when the model emits more
    # than one output; the class scores are assumed to come first.
    # NOTE(review): confirm the ordering for the checkpoints used here.
    if isinstance(scores, tuple):
        scores = scores[0]

    preds = np.argmax(scores, axis=1)
    labels = pred.label_ids

    return {
        "micro_f1": f1_score(labels, preds, average="micro", zero_division=0),
        "macro_f1": f1_score(labels, preds, average="macro", zero_division=0),
    }

# Shared accumulator: one row per "<model> (<dataset>)" run, filled by the
# training functions and turned into a summary table afterwards.
results = {}

# ----- Training Function -----
def train_transformer_model(model_key, dataset, dataset_label):
    """Fine-tune one checkpoint from ``model_names`` and record its validation F1.

    Parameters
    ----------
    model_key : str
        Key into the module-level ``model_names`` mapping.
    dataset : mapping of splits
        Must provide "train" and "validation" splits with "text" and "label"
        columns; it is tokenized here, the caller's object is not modified.
    dataset_label : str
        Short tag for the dataset variant. It is used in the ``results`` key
        and in the output/log directory names, so runs of the same model on
        different variants do not share a checkpoint directory.

    Returns
    -------
    dict
        The row stored in ``results``: "Micro F1" and "Macro F1" as
        percentages rounded to two decimals, plus "Dataset".

    Raises
    ------
    KeyError
        If ``model_key`` is not present in ``model_names``.

    Notes
    -----
    Every model, including the Longformer checkpoint, is fed inputs padded
    or truncated to 512 tokens.
    """
    import inspect

    model_checkpoint = model_names[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

    # Preprocessing
    def preprocess(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    encoded_dataset = dataset.map(preprocess, batched=True)
    encoded_dataset = encoded_dataset.rename_column("label", "labels")
    encoded_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # One directory per (model, dataset variant); whitespace in the label is
    # collapsed to underscores so the tag is a single path component.
    run_tag = "_".join(f"{model_key}_{dataset_label}".split())

    # The per-epoch evaluation option was renamed across transformers
    # releases; use whichever name the installed version accepts.
    args_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_kw = "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir=f"./results_{run_tag}",
        save_strategy="epoch",
        save_total_limit=1,
        num_train_epochs=epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        seed=seed,
        logging_dir=f"./logs_{run_tag}",
        logging_steps=50,
        warmup_steps=500,
        lr_scheduler_type="linear",
        **{eval_strategy_kw: "epoch"},
    )

    # Same story for the tokenizer argument of Trainer.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kw = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        compute_metrics=compute_f1,
        **{tokenizer_kw: tokenizer},
    )

    trainer.train()

    # load_best_model_at_end=True: this evaluates the checkpoint with the
    # best validation macro F1, not necessarily the last epoch.
    metrics = trainer.evaluate()
    print(f"DEBUG: Evaluation Metrics: {metrics}")
    row = {
        "Micro F1": round(metrics.get("eval_micro_f1", 0.0) * 100, 2),
        "Macro F1": round(metrics.get("eval_macro_f1", 0.0) * 100, 2),
        "Dataset": dataset_label
    }
    results[f"{model_key} ({dataset_label})"] = row
    return row


# Fine-tune each selected checkpoint on the original dataset, printing a
# banner before every run so the long training logs stay navigable.
for model_key in ("legal_BERT", "legal_longformer", "legal_Roberta"):
    print('-'*50)
    print(f'Training {model_key}')
    print('-'*50)
    train_transformer_model(model_key, dataset, "OG")



# ----- Train TF-IDF + SVM -----
def train_svm(dataset, dataset_label=""):
    """Fit a TF-IDF + linear SVM baseline and record its validation F1.

    Parameters
    ----------
    dataset : mapping of splits
        Must provide "train" and "validation" splits with "text" and "label"
        columns.
    dataset_label : str, optional
        Short tag for the dataset variant; used in the ``results`` key and in
        the row's "Dataset" field.

    Returns
    -------
    dict
        The row stored in ``results``: "Micro F1" and "Macro F1" as
        percentages rounded to two decimals, plus "Dataset".
    """
    print(f"\n🚀 Training TF-IDF + SVM on {dataset_label}...")

    train_texts = dataset["train"]["text"]
    train_labels = dataset["train"]["label"]
    val_texts = dataset["validation"]["text"]
    val_labels = dataset["validation"]["label"]

    # The vocabulary and IDF weights are learned on the training split only;
    # the validation split is merely transformed.
    vectorizer = TfidfVectorizer(max_features=10000)
    X_train = vectorizer.fit_transform(train_texts)
    X_val = vectorizer.transform(val_texts)

    # max_iter caps the solver's iterations (scaled from `epochs`); it is not
    # an epoch count. random_state ties the solver to the notebook-wide seed
    # so repeated runs give the same model.
    clf = LinearSVC(max_iter=epochs * 100, random_state=seed)
    clf.fit(X_train, train_labels)
    preds = clf.predict(X_val)

    model_name = f"tfidf_svm ({dataset_label})"
    # zero_division=0 matches the metric used for the transformer runs.
    row = {
        "Micro F1": round(f1_score(val_labels, preds, average="micro", zero_division=0) * 100, 2),
        "Macro F1": round(f1_score(val_labels, preds, average="macro", zero_division=0) * 100, 2),
        "Dataset": dataset_label
    }
    results[model_name] = row
    return row

## TF-IDF + SVM baseline; only the original dataset is trained in this cell.
# NOTE(review): the label passed here is "Original" while the transformer
# runs use "OG" -- confirm whether the two should match.
banner = '-'*50
print(banner)
print('Training SVM OG')
print(banner)
train_svm(dataset, "Original")



# ----- Print Table -----
# One row per recorded run; the run name becomes the index.
results_df = pd.DataFrame.from_dict(results, orient="index")

print("\n📋 Results Summary:\n")
print(results_df)

# ----- Plot Chart -----
# Grouped bars for the numeric score columns, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(14, 8))
results_df.plot(kind="bar", ax=ax, ylim=(0, 100), rot=0)
ax.set_title("Model Comparison: Micro and Macro F1 Scores")
ax.set_ylabel("F1 Score (%)")
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()
# Optional export, kept disabled:
# results_df.to_excel('/home/srmist5/victor/BERT_Optimize/new_final/revised_results_variant_1_performance.xlsx',index=False)

README.md:   0%|          | 0.00/811 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1400 [00:00<?, ? examples/s]

--------------------------------------------------
Training Legal BERT original dataset
--------------------------------------------------


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1
1,2.2798,2.00419,0.405714,0.111831
2,1.2907,1.266581,0.647857,0.378878
3,0.9061,0.984531,0.695,0.452076
4,0.6897,0.836641,0.738571,0.604348
5,0.4741,0.843505,0.745,0.655031
6,0.2984,0.834876,0.766429,0.686413
7,0.1846,0.872613,0.776429,0.696071
8,0.112,0.979551,0.774286,0.698415
9,0.0675,1.033457,0.766429,0.689449
10,0.0392,1.100302,0.776429,0.70805




DEBUG: Evaluation Metrics: {'eval_loss': 1.2057017087936401, 'eval_micro_f1': 0.7871428571428571, 'eval_macro_f1': 0.7187258867162242, 'eval_runtime': 4.1509, 'eval_samples_per_second': 337.277, 'eval_steps_per_second': 8.432, 'epoch': 20.0}
--------------------------------------------------
Training Legal BERT dedup_and_sort
--------------------------------------------------


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1
1,2.1599,1.902084,0.440714,0.134928
2,1.397,1.390161,0.627857,0.355136
3,1.0717,1.237594,0.629286,0.362497
4,0.8971,1.091375,0.675714,0.465638
5,0.6789,1.032162,0.695714,0.548376
6,0.5233,1.056544,0.712143,0.546608
7,0.3986,1.05833,0.717143,0.569711
8,0.2479,1.150014,0.707857,0.598516
9,0.197,1.177957,0.719286,0.613777
10,0.1241,1.265226,0.72,0.621169




DEBUG: Evaluation Metrics: {'eval_loss': 1.4170739650726318, 'eval_micro_f1': 0.7321428571428571, 'eval_macro_f1': 0.643125976839279, 'eval_runtime': 4.3267, 'eval_samples_per_second': 323.569, 'eval_steps_per_second': 8.089, 'epoch': 20.0}
--------------------------------------------------
Training Legal BERT norm_dedup_sort
--------------------------------------------------


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1
1,2.1593,1.895596,0.442143,0.132168
2,1.39,1.384931,0.625,0.353831
3,1.0698,1.2266,0.636429,0.365991
4,0.8926,1.101609,0.682857,0.484806
5,0.6897,1.028835,0.703571,0.566915
6,0.5189,1.084986,0.707143,0.54287
7,0.3996,1.065645,0.707143,0.569042
8,0.2627,1.192724,0.690714,0.558562
9,0.1992,1.175649,0.718571,0.603475
10,0.1474,1.280041,0.712143,0.617409




DEBUG: Evaluation Metrics: {'eval_loss': 1.308088779449463, 'eval_micro_f1': 0.74, 'eval_macro_f1': 0.6419052927236286, 'eval_runtime': 4.0452, 'eval_samples_per_second': 346.089, 'eval_steps_per_second': 8.652, 'epoch': 20.0}
--------------------------------------------------
Training legal_longformer original_dataset
--------------------------------------------------


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at lexlms/legal-longformer-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Initializing global attention on CLS token...
Initializing global attention on CLS token...
Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1
1,2.2031,2.147526,0.203571,0.055014
2,1.143,1.146523,0.667857,0.38786
3,0.87,0.913445,0.727857,0.592984
4,0.6844,0.832653,0.731429,0.640014
5,0.4832,0.821564,0.757857,0.663207
6,0.3197,0.873743,0.780714,0.707209
7,0.2098,0.948407,0.78,0.695889
8,0.1508,1.079141,0.767857,0.685682
9,0.0895,1.12892,0.770714,0.701341
10,0.0626,1.17729,0.777143,0.700987




KeyboardInterrupt: 