# 🧠 Legal-BERT Trainer with LEDGAR Dataset
This notebook uses Hugging Face's `Trainer` API to fine-tune Legal-BERT on the LEDGAR contract clause classification task.

In [None]:
!pip install transformers datasets huggingface_hub accelerate

In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub without storing a secret in the
# notebook. The token is read from the HF_TOKEN environment variable; when it
# is unset, `token=None` makes `login` fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Load LEDGAR dataset and model
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Fetch the `ledgar` configuration of the `lex_glue` benchmark.
dataset = load_dataset(path="lex_glue", name="ledgar")

# Tokenizer that matches the Legal-BERT checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="nlpaueb/legal-bert-base-uncased"
)

def tokenize(example):
    """Tokenize the ``text`` field of ``example`` with the notebook's ``tokenizer``.

    ``example`` is whatever ``Dataset.map`` passes in (a batch of rows when
    ``batched=True``). Truncation is enabled and padding uses the
    ``"max_length"`` strategy, so every encoded sequence has the same length.

    Returns the tokenizer's output unchanged.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize every split; `batched=True` passes `tokenize` a batch of rows per call.
encoded_dataset = dataset.map(tokenize, batched=True)

# Size the classification head from the dataset's own label schema instead of
# a hard-coded count, and register the label names on the model config so
# predictions can be mapped back to clause types.
# NOTE(review): assumes the `label` column is a `ClassLabel` feature, which
# exposes `num_classes` and `names` — confirm if the dataset source changes.
label_feature = dataset["train"].features["label"]
id2label = dict(enumerate(label_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/legal-bert-base-uncased",
    num_labels=label_feature.num_classes,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Setup Trainer API
import inspect

# Newer `transformers` releases renamed `evaluation_strategy` to
# `eval_strategy` and reject the old keyword, while older releases only know
# the old one. Pick whichever name the installed version accepts so this cell
# runs regardless of which version the unpinned install resolved to.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; log training loss every 10 steps.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    **{_eval_strategy_key: "epoch"},
)

import numpy as np


def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Parameters
    ----------
    eval_pred
        The ``(predictions, label_ids)`` pair that ``Trainer`` passes to
        ``compute_metrics``.
        NOTE(review): assumes predictions are per-class logits with the class
        axis last, as produced by a sequence-classification head — confirm.

    Returns
    -------
    dict
        ``{"accuracy": <fraction of examples whose argmax matches the label>}``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Without `compute_metrics`, the per-epoch evaluation reports only the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Run fine-tuning with the arguments configured in `training_args`.
trainer.train()

---
**Generated and managed by BitstandBytes | Legal NLP Automation Pipeline**
- Author: Shaun Muldowney
- Dataset: LEDGAR (LexGLUE)
- Trainer: Hugging Face `Trainer` with Legal-BERT

In [None]:
# 🧠 Load classification training dataset
!pip install datasets
from datasets import load_dataset
dataset = load_dataset('json', data_files='legalbert_training_dataset.jsonl', split='train')

# Preview dataset
print(dataset[0])

In [None]:
# 🧠 Load NER tagging dataset
# Read the JSON Lines NER file through the generic `json` loader and keep the
# `train` split it is requested with.
ner_dataset = load_dataset(
    path="json",
    data_files="legalbert_ner_dataset.jsonl",
    split="train",
)

# Show the first record to sanity-check the file's contents.
print(ner_dataset[0])