In [6]:
# Install required libraries if not already installed
!pip install transformers datasets torch scikit-learn pandas numpy

import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the LegalBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Sample Dataset (Replace this with real legal data)
data = {
    "text": [
        "The court ruled that the contract was invalid due to fraud.",
        "Plaintiff seeks damages for breach of contract under Section 5.",
        "The patent application was rejected based on prior art.",
        "This case involves intellectual property rights infringement.",
        "The defendant was found guilty of securities fraud."
    ],
    "label": ["Contract Law", "Contract Law", "Patent Law", "IP Law", "Securities Law"]
}

df = pd.DataFrame(data)

# Convert labels to numerical values
df["label_num"] = df["label"].astype("category").cat.codes

# Train-Test Split
train_texts, test_texts, train_labels, test_labels = train_test_split(df["text"], df["label_num"], test_size=0.2, random_state=42)

# Tokenization
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Convert data to Hugging Face Dataset format
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})

# Tokenize dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Load LegalBERT model
model = AutoModelForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=len(df["label"].unique()))
model.to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Train model
trainer.train()

# Evaluate model
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# Compute accuracy and F1-score
acc = accuracy_score(test_labels, preds)
f1 = f1_score(test_labels, preds, average="weighted")

print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")


Using device: cpu


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.5419,1.164743
2,1.4648,1.247092
3,1.3932,1.154899


Accuracy: 1.0000
F1 Score: 1.0000


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Define paths
MODEL_PATH = "./legalbert_model"

# Save the trained model and tokenizer
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

print("Model saved successfully!")


Model saved successfully!


In [8]:
from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the trained model and tokenizer
MODEL_PATH = "./legalbert_model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize FastAPI app
app = FastAPI(title="Legal Document Classifier API")

# Define request schema
class TextInput(BaseModel):
    text: str

# API endpoint for prediction
@app.post("/predict")
def predict(input_data: TextInput):
    # Tokenize input text
    inputs = tokenizer(input_data.text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get predicted label
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    
    return {"predicted_label": predicted_class}

# Root endpoint
@app.get("/")
def home():
    return {"message": "Welcome to the Legal Document Classifier API!"}


In [14]:
import nest_asyncio
from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import uvicorn

# Apply nest_asyncio to allow asynchronous event loops in Jupyter
nest_asyncio.apply()

# Load the trained model and tokenizer
MODEL_PATH = "./legalbert_model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize FastAPI app
app = FastAPI(title="Legal Document Classifier API")

# Define request schema
class TextInput(BaseModel):
    text: str

# API endpoint for prediction
@app.post("/predict")
def predict(input_data: TextInput):
    # Tokenize input text
    inputs = tokenizer(input_data.text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get predicted label
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    
    return {"predicted_label": predicted_class}

# Root endpoint
@app.get("/")
def home():
    return {"message": "Welcome to the Legal Document Classifier API!"}

# Run the FastAPI app
uvicorn.run(app, host="0.0.0.0", port=8001)


INFO:     Started server process [27432]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [27432]


KeyboardInterrupt: 

In [17]:
import os

MODEL_PATH = r"C:\Users\cfranklin2019\OneDrive - Florida Atlantic University\Documents\GitHub\LegalML\Legal Document Classification\Notebooks\legalbert_model"

if os.path.exists(MODEL_PATH):
    print("Path exists:", MODEL_PATH)
    print("Files:", os.listdir(MODEL_PATH))
else:
    print("Path does NOT exist. Check the directory.")



Path exists: C:\Users\cfranklin2019\OneDrive - Florida Atlantic University\Documents\GitHub\LegalML\Legal Document Classification\Notebooks\legalbert_model
Files: ['config.json', 'model.safetensors', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'vocab.txt']


In [18]:
import os
from transformers import AutoTokenizer

MODEL_PATH = os.path.abspath(r"C:\Users\cfranklin2019\OneDrive - Florida Atlantic University\Documents\GitHub\LegalML\Legal Document Classification\Notebooks\legalbert_model")

print("Using model path:", MODEL_PATH)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)


Using model path: C:\Users\cfranklin2019\OneDrive - Florida Atlantic University\Documents\GitHub\LegalML\Legal Document Classification\Notebooks\legalbert_model


In [19]:
from transformers import AutoModel

model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
