### Testing Legal-BERT

In [None]:
# Mount Google Drive inside the Colab runtime, then make the project folder
# the working directory so relative paths used later resolve against it.
import os
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
PROJECT_DIR = '/content/drive/MyDrive/_NLP/_NLP_Project/ModelTraining'

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)

Mounted at /content/drive


In [None]:
!pip install transformers datasets evaluate accelerate sentence-transformers nlpaug scikit-learn pandas torch

Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nlpaug
  Using cached nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cach

In [None]:
import pandas as pd
from datasets import Dataset, ClassLabel

# Read the labelled examples (this file also contains the 'unknown' class)
df = pd.read_json('contract_type_unknow.json')

# Show how many examples each class has
print("Class distribution:")
label_counts = df['label'].value_counts()
print(label_counts)

# Build both directions of the label <-> integer id mapping
label_list = df['label'].unique()
label2id = {name: idx for idx, name in enumerate(label_list)}
id2label = {idx: name for name, idx in label2id.items()}

# Convert to a HuggingFace Dataset and re-type 'label' as a ClassLabel feature
dataset = Dataset.from_pandas(df)
features = dataset.features.copy()
features['label'] = ClassLabel(names=[name for name in label_list])
dataset = dataset.cast(features)

Class distribution:
label
unknown                           500
Non-Disclosure Agreement (NDA)    300
Service Agreement                 300
Employment Contract               300
Name: count, dtype: int64


Casting the dataset:   0%|          | 0/1400 [00:00<?, ? examples/s]

In [None]:
!pip install nlpaug transformers[torch] nltk --upgrade -q
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
import torch
import pandas as pd
from nlpaug.augmenter.word import SynonymAug, ContextualWordEmbsAug
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [None]:
# from nlpaug.augmenter.word import SynonymAug, ContextualWordEmbsAug
# from sklearn.model_selection import train_test_split
# import nltk

# # Download the missing NLTK resource
# nltk.download('averaged_perceptron_tagger_eng')

# # Initialize augmenters
# synonym_aug = SynonymAug(aug_src='wordnet')
# context_aug = ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")

# def augment_text(text, label, n=2):
#     augmented = []
#     # Skip augmentation for Employment Contracts (overrepresented)
#     if label == "Employment Contract":
#         return [(text, label)]

#     for _ in range(n):
#         # Apply both augmentations
#         aug_text = synonym_aug.augment(text)
#         aug_text = context_aug.augment(aug_text)
#         augmented.append((aug_text, label))
#     return augmented

# # Apply augmentation to under-represented classes
# augmented_data = []
# for _, row in df.iterrows():
#     if row['label'] != "Employment Contract":  # Only augment NDAs and Service Agreements
#         augmented_data.extend(augment_text(row['text'], row['label']))
#     else:
#         augmented_data.append((row['text'], row['label']))

# # Create balanced dataset
# balanced_df = pd.DataFrame(augmented_data, columns=['text', 'label'])
# print("\nBalanced class distribution:")
# print(balanced_df['label'].value_counts())



# Word-level augmenters shared by the augmentation functions below.

# WordNet synonym replacement; aug_p=0.3 limits it to ~30% of the words
synonym_aug = SynonymAug(aug_src='wordnet', aug_p=0.3)

# Contextual substitution driven by bert-base-uncased
context_aug_settings = {
    'model_path': 'bert-base-uncased',
    'action': "substitute",
    'device': device,     # run the model on the device detected earlier
    'batch_size': 16,     # texts processed together
    'aug_p': 0.2,         # lower rate than the synonym pass, for speed
}
context_aug = ContextualWordEmbsAug(**context_aug_settings)

def _first_text(augmented, fallback):
    """Reduce an augmenter result to a single plain string.

    nlpaug augmenters return a list of augmented texts (one entry per
    requested variant). Storing that list as the example text later turns
    into the literal string "['...']" once the column is stringified, so the
    first variant is unwrapped here.

    Args:
        augmented: Value returned by ``augment()`` (a str, or a list/tuple of str).
        fallback: Text to use when ``augmented`` holds no usable string.

    Returns:
        A plain ``str``.
    """
    if isinstance(augmented, str):
        return augmented
    if isinstance(augmented, (list, tuple)) and augmented and isinstance(augmented[0], str):
        return augmented[0]
    return fallback


def augment_batch(texts_labels):
    """Augment a batch of (text, label) pairs, one pair at a time.

    Pairs labelled "Employment Contract" are passed through untouched. Every
    other text goes through the synonym augmenter and then the contextual
    augmenter (module-level ``synonym_aug`` / ``context_aug``).

    Args:
        texts_labels: Iterable of ``(text, label)`` tuples.

    Returns:
        List of ``(text, label)`` tuples, same length and order as the input.
        ``text`` is always a plain string. If augmentation raises, the error
        is printed and the original text is kept for that pair.
    """
    results = []
    for text, label in texts_labels:
        if label == "Employment Contract":
            results.append((text, label))
            continue

        # Apply augmentations sequentially, unwrapping the list each augmenter returns
        try:
            synonym_text = _first_text(synonym_aug.augment(text), text)
            aug_text = _first_text(context_aug.augment(synonym_text), synonym_text)
            results.append((aug_text, label))
        except Exception as e:
            # Best effort: one failing example must not abort the whole batch
            print(f"Error augmenting: {text[:50]}... - {str(e)}")
            results.append((text, label))  # Fallback to original

    return results

def parallel_augment(df, n_augments=2, batch_size=32, max_workers=4):
    """Build an augmented frame by running ``augment_batch`` on a thread pool.

    Every row whose label is not "Employment Contract" is queued
    ``n_augments`` times and replaced by the augmented results; the original
    text of those rows is NOT kept. "Employment Contract" rows are appended
    unchanged after the augmented rows.

    Args:
        df: DataFrame with 'text' and 'label' columns.
        n_augments: Number of augmented variants to request per eligible row.
        batch_size: Number of (text, label) pairs handed to each
            ``augment_batch`` call. Must be >= 1.
        max_workers: Number of worker threads.

    Returns:
        DataFrame with columns ['text', 'label'].

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    is_employment = df['label'] == "Employment Contract"

    # One work item per requested variant; variants of a row stay adjacent
    to_augment = df[~is_employment]
    inputs = [
        (text, label)
        for text, label in zip(to_augment['text'], to_augment['label'])
        for _ in range(n_augments)
    ]

    augmented_data = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # This bar only tracks job submission; the work itself is awaited below
        futures = [
            executor.submit(augment_batch, inputs[start:start + batch_size])
            for start in tqdm(range(0, len(inputs), batch_size), desc="Augmenting")
        ]

        # Collect in submission order so the output order is deterministic
        for future in tqdm(futures, desc="Collecting results"):
            augmented_data.extend(future.result())

    # Add original Employment contracts
    employment = df[is_employment]
    augmented_data.extend(zip(employment['text'], employment['label']))

    return pd.DataFrame(augmented_data, columns=['text', 'label'])

# Run the augmentation pipeline: two augmented variants per eligible row
balanced_df = parallel_augment(df, n_augments=2)

# Report the resulting class sizes.
# NOTE(review): "Employment Contract" rows are skipped by the augmenter, so
# that class ends up the smallest -- confirm this is the intended balance.
class_sizes = balanced_df['label'].value_counts()
print("\nBalanced class distribution:")
print(class_sizes)

Augmenting: 100%|██████████| 69/69 [00:00<00:00, 100.15it/s]
Collecting results: 100%|██████████| 69/69 [01:39<00:00,  1.45s/it]


Balanced class distribution:
label
unknown                           1000
Non-Disclosure Agreement (NDA)     600
Service Agreement                  600
Employment Contract                300
Name: count, dtype: int64





In [None]:
def _as_plain_text(value):
    """Coerce one 'text' cell to a plain string.

    Augmenters may hand back a list of variants instead of a string. Calling
    ``str()`` on such a list would bake the brackets and quotes into the
    training text ("['...']"), so the first variant is unwrapped instead.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return str(value[0]) if value else ""
    return str(value)


# 1. Clean the DataFrame first
# .copy() makes the column assignment below act on an independent frame
balanced_df = balanced_df.dropna().copy()  # Remove any null values
balanced_df['text'] = balanced_df['text'].map(_as_plain_text)

# 2. Verify consistent types
assert all(isinstance(x, str) for x in balanced_df['text']), "Non-string values found in text column"
assert all(isinstance(x, str) for x in balanced_df['label']), "Non-string values found in label column"

# 3. Convert to Dataset from plain Python lists
def safe_dataset_creation(df):
    """Build a Dataset holding only the 'text' and 'label' columns of ``df``."""
    return Dataset.from_dict({
        'text': df['text'].values.tolist(),
        'label': df['label'].values.tolist()
    })

dataset = safe_dataset_creation(balanced_df)

In [None]:
from datasets import Dataset, ClassLabel, DatasetDict
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

# Split data (stratified so both splits keep the class proportions).
# random_state fixes the shuffle so the split, and therefore the reported
# metrics, can be reproduced.
# NOTE(review): balanced_df was augmented BEFORE this split, so variants of
# the same source text can land in both train and test; the test scores may
# be optimistic. Consider splitting first and augmenting only the train part.
train_df, test_df = train_test_split(
    balanced_df,
    test_size=0.2,
    stratify=balanced_df['label'],
    random_state=42,
)

# Convert to HuggingFace Dataset; preserve_index=False keeps the shuffled
# pandas index from being added as an extra column
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df, preserve_index=False),
    'test': Dataset.from_pandas(test_df, preserve_index=False)
})

# Convert labels to IDs
label2id = {label: idx for idx, label in enumerate(balanced_df['label'].unique())}
id2label = {idx: label for label, idx in label2id.items()}

dataset = dataset.map(lambda x: {'label': label2id[x['label']]})

# Cast the integer ids to a ClassLabel whose names are listed in id order.
# (class_encode_column would stringify the ids and sort them as text, which
# reorders classes once there are ten or more and drops the real names.)
dataset = dataset.cast_column("label", ClassLabel(names=list(label2id)))

# Tokenization with the Legal-BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, padded/truncated to 256 tokens."""
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    return encoded

# batched=True passes several rows to tokenize_function per call
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/2000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/2000 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/500 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
import torch
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Per-class loss weights computed from the training labels using
# scikit-learn's 'balanced' mode
train_labels = tokenized_datasets["train"]["label"]
present_classes = np.unique(train_labels)
balanced_weights = compute_class_weight('balanced', classes=present_classes, y=train_labels)

# Keep the weights on the GPU when one is available, on the CPU otherwise
weights_device = 'cuda' if torch.cuda.is_available() else 'cpu'
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(weights_device)

# Trainer variant with a class-weighted loss
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level
    ``class_weights`` tensor."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its "labels" entry is popped (the dict is
                modified) and the rest is forwarded to ``model``.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra arguments supplied by the caller; accepted and
                ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        num_classes = self.model.config.num_labels
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = criterion(outputs.logits.view(-1, num_classes), targets.view(-1))
        if return_outputs:
            return loss, outputs
        return loss

# Load model
# model = AutoModelForSequenceClassification.from_pretrained(
#     "nlpaueb/legal-bert-base-uncased",
#     num_labels=len(label2id),
#     id2label=id2label,
#     label2id=label2id,
#     hidden_dropout_prob=0.3,
#     attention_probs_dropout_prob=0.2
# )

# Dropout settings passed through to the model configuration (added regularization)
dropout_overrides = dict(
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    classifier_dropout=0.1,
)

# Legal-BERT with a sequence-classification head sized to the label set
model = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/legal-bert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    **dropout_overrides,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Print the installed transformers version so the run records which release was used
import transformers
print(transformers.__version__)

4.51.3


In [None]:
# pip install transformers==4.38.0

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)

#     # Calculate metrics
#     accuracy = accuracy_score(labels, predictions)
#     f1 = f1_score(labels, predictions, average='weighted')

#     # Per-class metrics
#     report = classification_report(labels, predictions, target_names=list(label2id.keys()), output_dict=True)

#     return {
#         'accuracy': accuracy,
#         'f1': f1,
#         'NDA_precision': report['Non-Disclosure Agreement (NDA)']['precision'],
#         'Service_recall': report['Service Agreement']['recall'],
#         'Employment_f1': report['Employment Contract']['f1-score'],
#         'unknow': report['unknown']['f1-score']
#     }

# training_args = TrainingArguments(
#     output_dir="./legal_bert_contracts",
#     eval_strategy="steps",
#     eval_steps=200,
#     save_strategy="steps",
#     save_steps=200,
#     learning_rate=3e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=10,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     metric_for_best_model="f1",
#     greater_is_better=True,
#     logging_dir='./logs',
#     logging_steps=50,
#     report_to="none"
# )


def compute_metrics(eval_pred):
    """Compute overall and per-class metrics for the Trainer.

    Args:
        eval_pred: ``(logits, labels)`` pair; predictions are the argmax of
            ``logits`` over the last axis.

    Returns:
        Dict with 'accuracy', 'f1_weighted' and, for every class in the
        module-level ``label2id``, '<name>_precision', '<name>_recall' and
        '<name>_f1'. If metric computation raises, the error is printed and
        a dict with zeroed 'accuracy'/'f1_weighted' plus an 'error' message
        is returned instead.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Order the class names by integer id so names and ids line up regardless
    # of the dict's insertion order
    class_names = sorted(label2id, key=label2id.get)
    class_ids = [label2id[name] for name in class_names]

    try:
        # Calculate overall metrics
        accuracy = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average='weighted')

        # Generate classification report. Passing labels= keeps the report
        # aligned with target_names even when a class is missing from this
        # evaluation set (otherwise the name/label counts can disagree).
        report = classification_report(
            labels,
            predictions,
            labels=class_ids,
            target_names=class_names,
            output_dict=True,
            zero_division=0  # Handle cases where division by zero might occur
        )

        metrics = {
            'accuracy': accuracy,
            'f1_weighted': f1,
        }

        # Add metrics for each class dynamically
        for class_name in class_names:
            if class_name in report:
                class_report = report[class_name]
                metrics[f'{class_name}_precision'] = class_report['precision']
                metrics[f'{class_name}_recall'] = class_report['recall']
                metrics[f'{class_name}_f1'] = class_report['f1-score']

        return metrics

    except Exception as e:
        # Best effort: a metrics failure is reported but does not stop training
        print(f"Error computing metrics: {str(e)}")
        return {
            'accuracy': 0,
            'f1_weighted': 0,
            'error': str(e)
        }

# Training configuration.
training_args = TrainingArguments(
    output_dir='./new_results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    # Mixed precision only when a GPU is present, so the notebook still runs
    # on the CPU fallback used elsewhere in this file
    fp16=torch.cuda.is_available(),
    logging_steps=100,
    # Optimisation schedule: 8 x 2 accumulated steps = effective batch of 16
    # per device, with a 100-step learning-rate warmup
    gradient_accumulation_steps=2,
    warmup_steps=100,
    logging_dir='./logs',
    report_to="none"
)

# Trainer with the class-weighted loss; the test split doubles as the
# evaluation set during training
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model (metrics come from compute_metrics)
trainer.train()

# Evaluate on the evaluation split configured on the trainer
results = trainer.evaluate()
print("Final evaluation results:", results)

# Predict on the test split and take the highest-scoring class per example
predictions = trainer.predict(tokenized_datasets["test"])
preds = np.argmax(predictions.predictions, axis=-1)

# Per-class precision / recall / F1 as a text table
print("\nClassification Report:")
true_ids = tokenized_datasets["test"]["label"]
class_names = list(label2id.keys())
report_text = classification_report(true_ids, preds, target_names=class_names)
print(report_text)

Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,Non-disclosure agreement (nda) Precision,Non-disclosure agreement (nda) Recall,Non-disclosure agreement (nda) F1,Service agreement Precision,Service agreement Recall,Service agreement F1,Unknown Precision,Unknown Recall,Unknown F1,Employment contract Precision,Employment contract Recall,Employment contract F1
1,0.0123,0.005848,0.998,0.998,1.0,0.991667,0.995816,0.991736,1.0,0.995851,1.0,1.0,1.0,1.0,1.0,1.0
2,0.0393,0.010399,0.994,0.994004,0.983471,0.991667,0.987552,0.991667,0.991667,0.991667,1.0,1.0,1.0,1.0,0.983333,0.991597
3,0.0245,0.053614,0.994,0.993994,0.983607,1.0,0.991736,1.0,0.983333,0.991597,0.995025,1.0,0.997506,1.0,0.983333,0.991597
4,0.003,0.000778,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.0044,6.6e-05,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Final evaluation results: {'eval_loss': 6.55238181934692e-05, 'eval_accuracy': 1.0, 'eval_f1_weighted': 1.0, 'eval_Non-Disclosure Agreement (NDA)_precision': 1.0, 'eval_Non-Disclosure Agreement (NDA)_recall': 1.0, 'eval_Non-Disclosure Agreement (NDA)_f1': 1.0, 'eval_Service Agreement_precision': 1.0, 'eval_Service Agreement_recall': 1.0, 'eval_Service Agreement_f1': 1.0, 'eval_unknown_precision': 1.0, 'eval_unknown_recall': 1.0, 'eval_unknown_f1': 1.0, 'eval_Employment Contract_precision': 1.0, 'eval_Employment Contract_recall': 1.0, 'eval_Employment Contract_f1': 1.0, 'eval_runtime': 2.1399, 'eval_samples_per_second': 233.658, 'eval_steps_per_second': 14.954, 'epoch': 5.0}

Classification Report:
                                precision    recall  f1-score   support

Non-Disclosure Agreement (NDA)       1.00      1.00      1.00       120
             Service Agreement       1.00      1.00      1.00       120
                       unknown       1.00      1.00      1.00       200
    

In [None]:
# # Train
# trainer.train()

# # Evaluate
# results = trainer.evaluate()
# print("Final evaluation results:", results)

# # Generate classification report
# predictions = trainer.predict(tokenized_datasets["test"])
# preds = np.argmax(predictions.predictions, axis=-1)

# print("\nClassification Report:")
# print(classification_report(
#     tokenized_datasets["test"]["label"],
#     preds,
#     target_names=list(label2id.keys())
# ))

Epoch,Training Loss,Validation Loss,Accuracy,F1,Nda Precision,Service Recall,Employment F1,Unknow
1,1.0626,0.049398,0.986,0.985988,1.0,1.0,1.0,1.0
2,0.0974,0.014589,0.998,0.997996,0.991736,1.0,0.991597,1.0
3,0.0318,0.000632,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0021,0.00091,1.0,1.0,1.0,1.0,1.0,1.0
5,0.0111,0.005051,0.998,0.998,1.0,1.0,1.0,1.0


Final evaluation results: {'eval_loss': 0.0006317264051176608, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_NDA_precision': 1.0, 'eval_Service_recall': 1.0, 'eval_Employment_f1': 1.0, 'eval_unknow': 1.0, 'eval_runtime': 1.9922, 'eval_samples_per_second': 250.978, 'eval_steps_per_second': 16.063, 'epoch': 5.0}

Classification Report:
                                precision    recall  f1-score   support

Non-Disclosure Agreement (NDA)       1.00      1.00      1.00       120
             Service Agreement       1.00      1.00      1.00       120
                       unknown       1.00      1.00      1.00       200
           Employment Contract       1.00      1.00      1.00        60

                      accuracy                           1.00       500
                     macro avg       1.00      1.00      1.00       500
                  weighted avg       1.00      1.00      1.00       500



In [None]:
# Get misclassified examples
# preds must line up row-for-row with test_df for the comparison to be valid
assert len(preds) == len(test_df), "Prediction count does not match the test set size"

test_df = test_df.reset_index(drop=True)
test_df['predicted'] = [id2label[int(p)] for p in preds]

# Ensure both columns have the same data type (string)
test_df['label'] = test_df['label'].astype(str)
test_df['predicted'] = test_df['predicted'].astype(str)

misclassified = test_df[test_df['label'] != test_df['predicted']]

# Print only if there are misclassified examples
if not misclassified.empty:
    print("\nSample misclassified examples:")
    # sample() raises ValueError when asked for more rows than exist, so cap
    # the sample size at the number of misclassified rows
    sample_size = min(5, len(misclassified))
    print(misclassified[['text', 'label', 'predicted']].sample(sample_size))
else:
    print("\nNo misclassified examples found.")

# Save for manual inspection (the file is written even when there are no rows)
misclassified.to_csv("misclassified_examples.csv", index=False)


No misclassified examples found.


In [None]:
# Write the fine-tuned model and its tokenizer into the same folder
save_dir = "./best_legal_bert_contract_classifier_unknow"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)



('./best_legal_bert_contract_classifier_unknow/tokenizer_config.json',
 './best_legal_bert_contract_classifier_unknow/special_tokens_map.json',
 './best_legal_bert_contract_classifier_unknow/vocab.txt',
 './best_legal_bert_contract_classifier_unknow/added_tokens.json',
 './best_legal_bert_contract_classifier_unknow/tokenizer.json')

In [None]:
# Load for inference
from transformers import pipeline
from transformers import AutoTokenizer

import torch

# Tokenization: load the tokenizer files saved next to the fine-tuned model,
# so inference uses exactly the artefacts written by the save cell and does
# not need to download anything
MODEL_DIR = "./best_legal_bert_contract_classifier_unknow"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

classifier = pipeline(
    "text-classification",
    model=MODEL_DIR,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1  # first GPU if present, else CPU
)

# Test with new examples
test_queries = [
    "I need an non disclosure agreement to protect my software idea",  # Should be NDA
    "Create an agreement for a consultant to audit our financesr",      # Should be Service
    "Create an employment agreement for a full-time engineer",  # Should be Employment
    "Blah Blah Blah"  # Off-topic input; expected to fall into 'unknown'
]

for query in test_queries:
    result = classifier(query)[0]  # top prediction for this query
    print(f"\nQuery: {query}")
    print(f"Predicted: {result['label']} (Confidence: {result['score']:.2f})")

Device set to use cuda:0



Query: I need an non disclosure agreement to protect my software idea
Predicted: Non-Disclosure Agreement (NDA) (Confidence: 1.00)

Query: Create an agreement for a consultant to audit our financesr
Predicted: Service Agreement (Confidence: 1.00)

Query: Create an employment agreement for a full-time engineer
Predicted: Employment Contract (Confidence: 1.00)

Query: Blah Blah Blah
Predicted: unknown (Confidence: 1.00)


In [None]:
# Second batch of unseen prompts for the classifier
test_queries = [
    "I need nda between Company A and B organization",  # Should be NDA
    "Create an agreement for a consultant to audit our development team",      # Should be Service
    "We need employee agreement to hire software engineer",  # Should be Employment
    "How are you?"  # Should be lower score
]

# Lazily classify each prompt and print its top prediction
top_predictions = (classifier(text)[0] for text in test_queries)
for query, result in zip(test_queries, top_predictions):
    print(f"\nQuery: {query}")
    print(f"Predicted: {result['label']} (Confidence: {result['score']:.2f})")


Query: I need nda between Company A and B organization
Predicted: Non-Disclosure Agreement (NDA) (Confidence: 1.00)

Query: Create an agreement for a consultant to audit our development team
Predicted: Service Agreement (Confidence: 1.00)

Query: We need employee agreement to hire software engineer
Predicted: Employment Contract (Confidence: 1.00)

Query: How are you?
Predicted: unknown (Confidence: 1.00)
