In [None]:
# Hub checkpoint id used for the tokenizer below; keeping it in one constant
# means switching checkpoints is a one-line change.
model_name = "roberta-base"

ds = load_dataset("surrey-nlp/PLOD-CW-25")

# Tag vocabulary: a tag's position in this list is its integer class id.
label_list = ['O', 'B-AC', 'B-LF', 'I-LF']
label_to_id = {label: i for i, label in enumerate(label_list)}


def _encode_ner_tags(example):
    """Return the example's ``ner_tags`` with every tag mapped through ``label_to_id``."""
    return {"ner_tags": [label_to_id[tag] for tag in example["ner_tags"]]}


# Replace the tag column of each split with integer ids so it can be used as labels.
for split in ["train", "validation", "test"]:
    ds[split] = ds[split].map(_encode_ner_tags)

# Load the tokenizer from `model_name` rather than a repeated literal so the two
# cannot drift apart. add_prefix_space=True is needed because the inputs are
# passed pre-split into words (is_split_into_words=True) further down.
tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)

# When True, every sub-token of a word gets the word's label; when False only
# the first sub-token is labelled and the rest are masked with -100.
label_all_tokens = True

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Reads the module-level ``tokenizer`` and ``label_all_tokens``.

    Args:
        examples: batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of per-word label ids per
            sentence).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of label ids per sentence, one id per produced token:

        * tokens without a word id (special tokens) get -100, the value the
          loss conventionally ignores;
        * the first sub-token of a word gets that word's label;
        * later sub-tokens of the same word get the same label (unchanged,
          including any ``B-`` tag) when ``label_all_tokens`` is true,
          otherwise -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_seen = None
        for current in encoded.word_ids(batch_index=row):
            if current is None:
                # Special token: never contributes to the loss.
                tag = -100
            elif current == last_seen and not label_all_tokens:
                # Continuation sub-token while only first sub-tokens are labelled.
                tag = -100
            else:
                # First sub-token of a word, or a continuation that inherits the label.
                tag = word_tags[current]
            row_labels.append(tag)
            last_seen = current
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

# Tokenize every split and attach sub-token-aligned labels.
tokenized_datasets = ds.map(tokenize_and_align_labels, batched=True)

# Store the tag names in the model config so saved checkpoints and downstream
# inference report 'B-AC', 'I-LF', ... instead of anonymous LABEL_<n> ids.
id_to_label = {i: label for i, label in enumerate(label_list)}
model = RobertaForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

args = TrainingArguments(
    # Must NOT be the bare checkpoint id: anything the Trainer writes would
    # create a local "./roberta-base" folder, and from_pretrained("roberta-base")
    # would then resolve to that folder instead of the Hub on the next run.
    output_dir=f"{model_name}-finetuned-plod",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=10,
    # Schedule-free AdamW is paired with a constant learning-rate schedule.
    optim="schedule_free_adamw",
    lr_scheduler_type="constant",
)

# Collates variable-length examples (and their labels) into batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    """Turn raw model outputs into overall precision/recall/F1/accuracy.

    Reads the module-level ``label_list`` and ``metric``.

    Args:
        p: a pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores with the class dimension on axis 2; ``labels`` holds the
            gold label ids, with -100 marking positions to ignore.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from
        the ``overall_*`` entries of ``metric.compute``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions with a real gold label (-100 marks masked tokens).
        kept = [(guess, gold) for guess, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[guess] for guess, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Wire the model, training arguments, data and metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune on the training split.
trainer.train()

# Final pass over the validation split; as the cell's last expression, the
# returned metrics dict is displayed.
trainer.evaluate()




In [None]:
# Test Results
# Score the test split, then reduce the per-class scores to predicted class ids.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# One filtering pass: per sentence, keep (predicted tag, gold tag) pairs for
# every position whose gold id is not the -100 ignore marker.
kept_tag_pairs = [
    [
        (label_list[pred_id], label_list[gold_id])
        for pred_id, gold_id in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    for pred_row, gold_row in zip(predictions, labels)
]
true_predictions = [[pred_tag for pred_tag, _gold_tag in sentence] for sentence in kept_tag_pairs]
true_labels = [[gold_tag for _pred_tag, gold_tag in sentence] for sentence in kept_tag_pairs]

# `results` and the two tag lists above are reused by the plotting cells below.
results = metric.compute(predictions=true_predictions, references=true_labels)
print("Below is the results on the test data")
print(results)


In [None]:
# Class wise bar chart
# Per-class scores are the dict-valued entries of `results`; entries whose
# value is not a dict are skipped. A dedicated name is used instead of
# rebinding `labels`, which still holds the gold label ids from the test cell.
class_names = [key for key, value in results.items() if isinstance(value, dict)]  # e.g., ['AC', 'LF']
precision = [results[name]['precision'] for name in class_names]
recall = [results[name]['recall'] for name in class_names]
f1 = [results[name]['f1'] for name in class_names]

x = range(len(class_names))
width = 0.2  # bar width; also the offset between the three bars of a class

# Explicit figure/axes so every call below targets this chart unambiguously.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar([pos - width for pos in x], precision, width, label='Precision')
ax.bar(x, recall, width, label='Recall')
ax.bar([pos + width for pos in x], f1, width, label='F1 Score')

ax.set_xticks(list(x))
ax.set_xticklabels(class_names)
ax.set_ylim(0, 1)
ax.set_ylabel('Score')
# NOTE(review): if `metric` is seqeval, these are entity-level scores rather
# than token-level ones; confirm that the title matches the metric in use.
ax.set_title('Token-level Metrics by Class')
ax.legend()
ax.grid(True, axis='y')
fig.tight_layout()
plt.show()


In [None]:
print("Below is the Confusion Matrix of the results")
# Flatten the per-sentence tag lists into one long sequence each.
true_labels_flat = [tag for sentence in true_labels for tag in sentence]
pred_labels_flat = [tag for sentence in true_predictions for tag in sentence]

# Axis order: every tag that occurs in either sequence, sorted alphabetically.
labels = sorted(set(true_labels_flat) | set(pred_labels_flat))
cm = confusion_matrix(true_labels_flat, pred_labels_flat, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax, cmap="Blues", xticks_rotation=45)
ax.set_title("Token-Level Confusion Matrix on Test Set")
ax.grid(False)  # grid lines would cut through the matrix cells
plt.show()