In [None]:
!pip install transformers datasets seqeval

import gc
import os
from shutil import rmtree

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from seqeval.metrics import precision_score, recall_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)

In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Load the "train" split and keep a fixed, seeded random sample of 2000 examples
dataset = (
    load_dataset("dell-research-harvard/newswire")["train"]
    .shuffle(seed=42)
    .select(range(2000))
)

# Hold out 20% of the sample for evaluation (seeded, so the split is repeatable)
split = dataset.train_test_split(test_size=0.2, seed=42)
train_data = split["train"]
test_data = split["test"]

# Label mappings: the position of a tag in label_names is its integer class id
label_names = [
    "O",
    "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
    "B-MISC", "I-MISC",
]
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = dict(enumerate(label_names))

In [None]:
# Tokenize and align labels
def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize pre-split words and attach one label id per produced token.

    Args:
        examples: batch mapping with "ner_words" (word lists) and
            "ner_labels" (one tag string per word).
        tokenizer: callable tokenizer whose result exposes
            ``word_ids(batch_index=...)`` and supports item assignment.

    Returns:
        The tokenizer output with an added "labels" entry. Tokens without a
        source word get -100, the first token of a word gets that word's tag,
        and further tokens of the same word get the tag with a leading "B-"
        turned into "I-". Tag strings are converted through the module-level
        ``label2id`` mapping.
    """
    encoding = tokenizer(
        examples["ner_words"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    all_label_ids = []
    for batch_idx, word_labels in enumerate(examples["ner_labels"]):
        token_label_ids = []
        last_word = None
        for word_idx in encoding.word_ids(batch_index=batch_idx):
            # Tokens that map to no word are masked out with -100
            if word_idx is None:
                token_label_ids.append(-100)
                last_word = word_idx
                continue

            tag = word_labels[word_idx]
            # A continuation piece of the same word must not open a new entity
            if word_idx == last_word and tag.startswith("B-"):
                tag = "I-" + tag.split("-", 1)[1]
            token_label_ids.append(label2id[tag])
            last_word = word_idx
        all_label_ids.append(token_label_ids)

    encoding["labels"] = all_label_ids
    return encoding

In [None]:
# Metrics
def compute_metrics(eval_pred):
    """Score predictions with the seqeval precision / recall / F1 functions.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each
            position is the argmax of ``logits`` along axis 2.

    Returns:
        Dict with keys "precision", "recall" and "f1". Positions whose gold
        label is -100 are dropped before scoring; the remaining ids are turned
        into tag strings through the module-level ``id2label`` mapping.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=2)

    true_labels = []
    pred_labels = []
    for gold_row, pred_row in zip(labels, preds):
        # Keep only the positions that carry a real gold label
        kept = [(gold, pred) for gold, pred in zip(gold_row, pred_row) if gold != -100]
        true_labels.append([id2label[gold] for gold, _ in kept])
        pred_labels.append([id2label[pred] for _, pred in kept])

    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, pred_labels) for name, scorer in scorers}

# Checkpoints to compare (Hugging Face model ids)
models = [
    "dbmdz/bert-large-cased-finetuned-conll03-english",
    "Jean-Baptiste/roberta-large-ner-english",
    "elastic/distilbert-base-uncased-finetuned-conll03-english",
    "dell-research-harvard/historical_newspaper_ner",
]

# Learning rates tried for every checkpoint
learning_rates = [
    1e-5,
    2e-5,
    3e-5,
    5e-5,
    1e-4,
]

In [None]:
# Learning-rate sweep: fine-tune every checkpoint once per learning rate for a
# single epoch and record the evaluation metrics of each run in `results`.
results = []

for model_name in models:
    # The tokenizer, the tokenized datasets and the collator depend only on the
    # checkpoint, not on the learning rate, so build them once per model.
    tok = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
    train_tok = train_data.map(lambda x: tokenize_and_align_labels(x, tok), batched=True)
    eval_tok = test_data.map(lambda x: tokenize_and_align_labels(x, tok), batched=True)
    collator = DataCollatorForTokenClassification(tok)

    for lr in learning_rates:
        print(f"\n>>> {model_name} | learning_rate={lr}")

        # Reload the checkpoint for every run so each learning rate starts
        # from the same weights.
        # NOTE(review): for checkpoints whose head already has
        # len(label_names) outputs the pretrained classifier is presumably
        # kept as-is; confirm that the checkpoint's own label order matches
        # label_names, otherwise the head starts out permuted.
        model = AutoModelForTokenClassification.from_pretrained(
            model_name, num_labels=len(label_names),
            id2label=id2label, label2id=label2id,
            ignore_mismatched_sizes=True
        ).to(device)

        # Train arguments
        args = TrainingArguments(
            output_dir="./tmp",
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=1,
            learning_rate=lr,
            weight_decay=0.01,
            logging_steps=5000,
            save_strategy="no",
            fp16=(device == "cuda"),  # mixed precision only when running on GPU
            report_to=[]
        )

        trainer = Trainer(
            model=model, args=args,
            train_dataset=train_tok,
            eval_dataset=eval_tok,
            tokenizer=tok,
            data_collator=collator,
            compute_metrics=compute_metrics
        )

        # Train and evaluate
        trainer.train()
        m = trainer.evaluate()
        print("F1:", m["eval_f1"])
        results.append({
            "model": model_name, "learning_rate": lr,
            "f1": m["eval_f1"], "precision": m["eval_precision"],
            "recall": m["eval_recall"], "loss": m["eval_loss"]
        })

        # Drop the per-run objects and force a collection before emptying the
        # CUDA cache: memory still referenced through a cycle is not released
        # by `del` alone.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()
        rmtree("./tmp", ignore_errors=True)
        rmtree("./results", ignore_errors=True)
        rmtree("./logs", ignore_errors=True)

    # Per-model objects are no longer needed once all learning rates are done
    del tok, train_tok, eval_tok, collator

# Collect all runs in one table: grouped by model, best F1 first within a model
df = pd.DataFrame(results).sort_values(by=["model", "f1"], ascending=[True, False])
print(df)

In [None]:
# Short display names used in the legend instead of the full model ids
short_names = {
    "dbmdz/bert-large-cased-finetuned-conll03-english": "BERT-Large",
    "Jean-Baptiste/roberta-large-ner-english":           "RoBERTa-Large",
    "elastic/distilbert-base-uncased-finetuned-conll03-english": "DistilBERT",
    "dell-research-harvard/historical_newspaper_ner":    "Custom Hist"
}
df['model_short'] = df['model'].map(short_names)

# Define our colour palette (one colour per model column of the pivot table)
colors = ['#274472', '#C0D2C1', '#CFA15A', '#E777C2']

# One row per learning rate, one column per model, cells hold the F1 score
pivot_df = df.pivot(index='learning_rate', columns='model_short', values='f1')

# Create the figure explicitly and hand its axes to pandas. Without `ax=`,
# DataFrame.plot opens its own figure, so a preceding plt.figure(figsize=...)
# only leaves a stray empty figure and its size is never applied to the plot.
fig, ax = plt.subplots(figsize=(10, 6))
pivot_df.plot(ax=ax, marker='o', color=colors)
ax.set_title('F1 Score vs. Learning Rate for Each Model')
ax.set_xlabel('Learning Rate')
ax.set_ylabel('F1 Score')
ax.set_xscale('log')  # learning rates span an order of magnitude
ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()