<a href="https://colab.research.google.com/github/Thashmila-Dewmini/Face-Recognition/blob/main/Named_Entity_Recognition_(NER)_with_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers datasets seqeval -q

In [None]:
import os

import numpy as np
from datasets import load_dataset  # load dataset from Hugging Face
from transformers import AutoTokenizer, AutoModelForTokenClassification  # load correct tokenizer for model and BERT for NER
from transformers import TrainingArguments, Trainer  # TrainingArguments: configure training , Trainer: handles training loop
from seqeval.metrics import classification_report # evaluate NER performance

In [None]:
# Load the CoNLL-2003 corpus from the Hugging Face Hub. Each row carries the word
# tokens together with POS, chunk and NER tag sequences.
dataset = load_dataset("lhoestq/conll2003")
print(dataset)  # show the available splits and their row counts



dataset_infos.json: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/281k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [None]:
# Load the tokenizer that belongs to the bert-base-uncased checkpoint. It splits words
# into subword pieces and adds the special [CLS] and [SEP] tokens.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Inspect the column schema of the training split (the tag columns are integer lists).
print(dataset["train"].features)

{'id': Value('string'), 'tokens': List(Value('string')), 'pos_tags': List(Value('int64')), 'chunk_tags': List(Value('int64')), 'ner_tags': List(Value('int64'))}


In [None]:
# Tag names indexed by the integer ids stored in `ner_tags`
# (position in this list == label id).
label_list = [
    "O",       # 0: token is outside any entity
    "B-PER",   # 1: beginning of a person name
    "I-PER",   # 2: inside a person name
    "B-ORG",   # 3: beginning of an organisation
    "I-ORG",   # 4: inside an organisation
    "B-LOC",   # 5: beginning of a location
    "I-LOC",   # 6: inside a location
    "B-MISC",  # 7: beginning of a miscellaneous entity
    "I-MISC",  # 8: inside a miscellaneous entity
]

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the NER labels to subwords.

    The dataset has one label per word, but the tokenizer may split a word into
    several subword tokens (e.g. "Washington" -> "wash", "##ington"), so the
    word-level labels have to be spread over the token sequence.

    Example:
        words:      ["John", "Washington"]
        tokens:     ["[CLS]", "john", "wash", "##ington", "[SEP]"]
        word_ids:   [None,    0,      1,      1,          None]
        labels out: [-100,    y[0],   y[1],   -100,       -100]

    Args:
        examples: batch with "tokens" (one word list per sentence) and
            "ner_tags" (one integer label list per sentence).

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding
        one aligned label list per sentence.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,           # cut off input that exceeds the maximum length
        is_split_into_words=True,  # the input is already a list of words
    )

    aligned_batch = []
    for row, word_labels in enumerate(examples["ner_tags"]):
        aligned = []
        last_word = None  # word index of the previous token, used to spot continuations
        for word in tokenized_inputs.word_ids(batch_index=row):
            # Only the first subword of a real word keeps that word's label.
            # Special tokens (word is None) and continuation subwords get -100,
            # the value the loss function ignores.
            is_first_subword = word is not None and word != last_word
            aligned.append(word_labels[word] if is_first_subword else -100)
            last_word = word
        aligned_batch.append(aligned)

    tokenized_inputs["labels"] = aligned_batch
    return tokenized_inputs


# Run tokenization + label alignment over every split of the dataset.
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,  # the function receives batches of examples rather than single rows
    num_proc=4, # number of worker processes used for the mapping
    remove_columns=dataset["train"].column_names  # drop the original columns so only the function's outputs remain
)

Map (num_proc=4):   0%|          | 0/14041 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3250 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
# Store the id <-> tag-name mapping in the model config so that saved checkpoints
# report real tag names (e.g. "B-PER") instead of generic LABEL_n placeholders.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

# Pretrained BERT encoder with a freshly initialised token-classification head
# that has one output per NER tag.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mBertForTokenClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
bert.pooler.dense.weight                   | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
bert.pooler.dense.bias                     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were 

In [None]:
# `logging_dir` is deprecated (transformers warns it will be removed in v5.2 and asks
# for the TENSORBOARD_LOGGING_DIR environment variable instead), so the TensorBoard
# log location is configured through the environment.
os.environ["TENSORBOARD_LOGGING_DIR"] = "./logs"

training_args = TrainingArguments(
    output_dir="./ner_results",      # where checkpoints are written
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,               # regularization
)

`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


In [None]:
def compute_metrics(p):
    """Print the seqeval report and return entity-level scores for the Trainer.

    Args:
        p: pair of (predictions, labels). `predictions` holds per-token class
            scores (the class axis is axis 2); `labels` holds the gold label
            ids, with -100 marking positions that must be skipped.

    Returns:
        dict with the micro-averaged entity-level "precision", "recall" and
        "f1", so that `trainer.evaluate()` reports them alongside the loss.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)  # highest-scoring class for each token

    true_labels = []
    true_predictions = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions with a real label (-100 = special token or
        # continuation subword) and convert the ids to tag names.
        kept = [
            (label_list[pred], label_list[gold])
            for pred, gold in zip(pred_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([pred_tag for pred_tag, _ in kept])
        true_labels.append([gold_tag for _, gold_tag in kept])

    # Human-readable per-entity breakdown.
    print(classification_report(true_labels, true_predictions))

    # Same report as a dict, so the overall scores can be handed back to the Trainer.
    report = classification_report(true_labels, true_predictions, output_dict=True)
    overall = report["micro avg"]
    return {
        "precision": float(overall["precision"]),
        "recall": float(overall["recall"]),
        "f1": float(overall["f1-score"]),
    }

In [None]:
from transformers import DataCollatorForTokenClassification

# The collator pads each batch dynamically and pads the labels with it, so tokens
# and labels stay aligned.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Small fixed subsets keep the demo fast: 2000 shuffled training examples and the
# first 500 validation examples.
train_subset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
eval_subset = tokenized_datasets["validation"].select(range(500))

# Trainer is the high-level training engine from Transformers.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model. For every batch the Trainer pads with data_collator, runs the
# forward pass, computes the loss, backpropagates and updates the weights.
trainer.train()

Step,Training Loss


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=250, training_loss=0.37764437866210937, metrics={'train_runtime': 41.3757, 'train_samples_per_second': 48.338, 'train_steps_per_second': 6.042, 'total_flos': 43284097225008.0, 'train_loss': 0.37764437866210937, 'epoch': 1.0})

In [None]:
# Evaluate on the eval_dataset given to the Trainer; compute_metrics prints the
# seqeval classification report.
trainer.evaluate()

              precision    recall  f1-score   support

         LOC       0.74      0.80      0.77       268
        MISC       0.56      0.32      0.41        85
         ORG       0.55      0.77      0.64       195
         PER       0.97      0.98      0.97       332

   micro avg       0.76      0.81      0.78       880
   macro avg       0.71      0.72      0.70       880
weighted avg       0.77      0.81      0.78       880



{'eval_loss': 0.18937650322914124,
 'eval_runtime': 1.6998,
 'eval_samples_per_second': 294.145,
 'eval_steps_per_second': 37.062,
 'epoch': 1.0}

In [None]:
import torch

# Use the GPU when one is available. Model and inputs must live on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # switch off dropout so inference is deterministic

text = "John works at Google in California"
# return_tensors="pt" returns PyTorch tensors; .to(device) moves them next to the model.
tokens = tokenizer(text, return_tensors="pt", truncation=True).to(device)
word_ids = tokens.word_ids(batch_index=0)  # word index per token, None for [CLS]/[SEP]

with torch.no_grad():  # inference only, no gradients needed
    outputs = model(**tokens)

predictions = outputs.logits.argmax(dim=2)  # highest-scoring label id per token

predicted_labels = [label_list[p.item()] for p in predictions[0]]  # label ids -> tag names

# Report one label per word, taken from the word's first subword. Continuation
# subwords were masked with -100 during training, so their predictions carry no
# meaning and must not be shown.
previous_word_idx = None
for word_idx, label in zip(word_ids, predicted_labels):
    if word_idx is not None and word_idx != previous_word_idx:
        span = tokens.word_to_chars(word_idx)  # character span of the word in `text`
        print(f"{text[span.start:span.end]}: {label}")
    previous_word_idx = word_idx

john: B-PER
works: O
at: O
google: B-ORG
in: O
california: B-LOC
