In [None]:
!pip install -U transformers datasets seqeval -q

In [None]:
import numpy as np
from datasets import load_dataset  # load dataset from Hugging Face
from seqeval.metrics import classification_report # evaluate NER performance
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score  # scalar NER metrics
from transformers import AutoTokenizer, AutoModelForTokenClassification  # load correct tokenizer for model and BERT for NER
from transformers import TrainingArguments, Trainer  # TrainingArguments: configure training , Trainer: handles training loop
from transformers import set_seed  # seed python / numpy / torch RNGs in one call

In [None]:
# Load the CoNLL-2003 NER corpus from Hugging Face. The columns used later in
# this notebook are "tokens" (pre-split words) and "ner_tags" (one integer tag per word).
dataset = load_dataset("lhoestq/conll2003")
print(dataset)  # print a summary of the loaded dataset object

In [None]:
# Tokenizer that matches the bert-base-uncased checkpoint: it splits words into
# sub-word pieces and adds the special tokens [CLS] and [SEP].
# NOTE(review): an uncased checkpoint discards capitalization, which is often a
# useful signal for NER — consider whether a cased checkpoint fits better.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Inspect the column schema of the training split.
# NOTE(review): use this output to confirm that the integer encoding of
# "ner_tags" matches the order of label_list defined below.
print(dataset["train"].features)

In [None]:
# Tag inventory in BIO format: "O" (outside any entity) followed by a
# B-(egin) / I-(nside) pair for every entity type. The position of a tag in
# this list is the integer id used to index it elsewhere in the notebook, so
# the order matters.
label_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in ("PER", "ORG", "LOC", "MISC")
    for prefix in ("B", "I")
]

# Tokenize and align labels.
# The dataset has one label per word, but BERT may split a word into several word pieces
# (illustratively: ["California"] with labels [B-LOC] -> ["cal", "##ifornia"]), so the labels must be re-aligned to the pieces.
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to word pieces.

    Args:
        examples: a batch mapping with
            "tokens"   - list of sentences, each a list of words, and
            "ner_tags" - list of per-sentence tag-id lists (one id per word).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry that
        holds one label id per produced token. Only the first piece of every
        word carries the word's tag; special tokens and continuation pieces
        get -100 so that the loss ignores them.
    """
    ignore_id = -100  # label value skipped by the PyTorch loss

    encoding = tokenizer(
        examples["tokens"],
        truncation=True,            # drop anything beyond the maximum length
        is_split_into_words=True,   # sentences arrive as lists of words
    )

    def align(sentence_idx, word_tags):
        """Return the token-level label ids for one sentence of the batch."""
        # word_ids maps every token to the index of the word it came from, e.g.
        #   words  ["John", "Washington"]
        #   tokens ["[CLS]", "john", "wash", "##ington", "[SEP]"]
        #   ids    [None, 0, 1, 1, None]
        aligned = []
        last_seen = None
        for current in encoding.word_ids(batch_index=sentence_idx):
            # A token starts a word when it belongs to a word (not [CLS]/[SEP])
            # and that word differs from the previous token's word.
            starts_word = current is not None and current != last_seen
            aligned.append(word_tags[current] if starts_word else ignore_id)
            last_seen = current
        return aligned

    encoding["labels"] = [
        align(sentence_idx, word_tags)
        for sentence_idx, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoding


# Apply the tokenization/alignment to every split. The raw columns are dropped
# afterwards so that only the tokenizer outputs and "labels" remain.
original_columns = dataset["train"].column_names

tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,                     # the function receives whole batches
    num_proc=4,                       # number of worker processes
    remove_columns=original_columns,  # discard the untokenized columns
)

In [None]:
# The token-classification head on top of BERT is freshly (randomly)
# initialised, and this happens before the Trainer exists. Seed the RNGs here
# so the initial weights are the same on every Restart & Run All.
set_seed(42)

# Store the tag names in the model config so that a saved checkpoint (or a
# pipeline built from it) reports "B-PER", ... instead of generic "LABEL_n".
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_list),  # one output logit per tag
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Hyper-parameters for the fine-tuning run, gathered in one place so they are
# easy to tweak before the TrainingArguments object is built.
training_config = {
    "output_dir": "./ner_results",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 1,
    "weight_decay": 0.01,  # regularization
    "logging_dir": "./logs",
}

training_args = TrainingArguments(**training_config)

In [None]:
def compute_metrics(p):
    """Compute seqeval NER metrics for one evaluation pass.

    Args:
        p: pair ``(predictions, labels)`` where ``predictions`` holds one score
            per label for every token of every sentence and ``labels`` holds
            the gold label ids, with -100 marking positions to ignore.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy" as computed by
        seqeval. The full per-entity classification report is also printed.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)  # highest-scoring label per token

    # Drop ignored positions (-100) and map the remaining ids to tag names,
    # e.g. [0, 0, 1, 2, -100, -100] -> ["O", "O", "B-PER", "I-PER"].
    true_labels = [
        [label_list[gold_id] for gold_id in sentence_gold if gold_id != -100]
        for sentence_gold in label_ids
    ]

    # Keep a prediction only where the gold label is not ignored, so both
    # sequences stay aligned token by token.
    true_predictions = [
        [
            label_list[pred_id]
            for pred_id, gold_id in zip(sentence_pred, sentence_gold)
            if gold_id != -100
        ]
        for sentence_pred, sentence_gold in zip(predicted_ids, label_ids)
    ]

    print(classification_report(true_labels, true_predictions))

    # Return the scalar scores so that they show up in the evaluation results
    # instead of being visible only in the printed report.
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }

In [None]:
from transformers import DataCollatorForTokenClassification

# The collator pads the inputs of each batch dynamically and pads the labels
# alongside them, so tokens and labels stay aligned.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Work on small subsets to keep the run short: 2000 shuffled training
# sentences and the first 500 validation sentences.
train_subset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
eval_subset = tokenized_datasets["validation"].select(range(500))

# Trainer: the high-level training engine from Transformers.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model on the training subset passed to the Trainer.
trainer.train()  # per batch: pad using data_collator, forward pass, compute loss, backpropagation, update weights

In [None]:
# Evaluate on the eval_dataset passed to the Trainer; this calls
# compute_metrics, which prints the seqeval classification report.
trainer.evaluate()

In [None]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # model and inputs must live on the same device
model.eval()      # disable dropout so predictions are deterministic, even if the evaluate cell was skipped

text = "John works at Google in California"
# return_tensors="pt": PyTorch tensors; .to(device): move them next to the model.
tokens = tokenizer(text, return_tensors="pt", truncation=True).to(device)

with torch.no_grad():  # inference only, no gradients needed
    outputs = model(**tokens)

predictions = outputs.logits.argmax(dim=2)  # highest-scoring label id per token
predicted_labels = [label_list[p.item()] for p in predictions[0]]  # ids -> tag names

# Report one label per *word*, read from the word's first piece. This mirrors
# training, where special tokens and continuation pieces were labelled -100 and
# therefore never supervised. The encoding's own word_ids are used rather than
# tokenizing the text a second time, so words and predictions cannot drift out
# of alignment.
previous_word_idx = None
for position, word_idx in enumerate(tokens.word_ids(batch_index=0)):
    is_word_start = word_idx is not None and word_idx != previous_word_idx
    previous_word_idx = word_idx
    if not is_word_start:
        continue  # special token or continuation piece
    span = tokens.word_to_chars(0, word_idx)  # character span of the word in `text`
    print(f"{text[span.start:span.end]}: {predicted_labels[position]}")