<h1> Named Entity Recognition Application </h1>

<h2> Check for GPU </h2>

In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [4]:
device

'cuda:0'

<h2> The Dataset </h2>

In [8]:
from datasets import load_dataset
dataset = load_dataset('conll2003')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [10]:
dataset['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [11]:
label_list = dataset["train"].features["ner_tags"].feature.names
dataset = dataset.remove_columns(['pos_tags', 'chunk_tags'])

<h2> Tokenizing the Data </h2>

In [12]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
dataset["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [14]:
example_row = dataset["train"][2]
tokenized_input = tokenizer(example_row["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(f"Original Tokens: {example_row['tokens']}")
print(f"Updated Tokens: {tokens}")

Original Tokens: ['BRUSSELS', '1996-08-22']
Updated Tokens: ['[CLS]', 'brussels', '1996', '-', '08', '-', '22', '[SEP]']


In [15]:
from transformers import PreTrainedTokenizer

def tokenize_and_align_labels(examples, tokenizer: PreTrainedTokenizer, label_all_word_parts: bool=False):

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            # Default behavior only labels the first token of a given word
            # But sometimes you may want to label all the word parts for very specific use cases.
            elif label_all_word_parts or (word_idx != previous_word_idx):
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [16]:
tokenized_ds = dataset.map(lambda examples: tokenize_and_align_labels(examples, tokenizer), batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [17]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [18]:
import numpy as np
example = tokenized_ds['train'][2]

print(example.keys())
print(f"input_ids: {example['input_ids']}")
print(f"labels: {example['labels']}")
print(f"attention: {example['attention_mask']}")
decoded_ids = [tokenizer.decode(id) for id in example["input_ids"]]
translated_labels = [label_list[label] if label != -100 else label for label in example['labels']]
print("\ntranslated ids and labels:")
for i in np.arange(len(example['input_ids'])):
    print(decoded_ids[i], translated_labels[i])


dict_keys(['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'])
input_ids: [101, 9371, 2727, 1011, 5511, 1011, 2570, 102]
labels: [-100, 5, 0, -100, -100, -100, -100, -100]
attention: [1, 1, 1, 1, 1, 1, 1, 1]

translated ids and labels:
[CLS] -100
brussels B-LOC
1996 O
- -100
08 -100
- -100
22 -100
[SEP] -100


<h2> Building the Trainer </h2>

In [19]:
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate

In [20]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [21]:
# Metric used for evaluating NER performance
seqeval = evaluate.load("seqeval")

# Uses the label_list we created earlier
# Create function to display metrics when training the model
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [22]:
label2id = {label: index for index, label in enumerate(label_list)}
id2label = {value: key for key, value in label2id.items()}

print(f"label2id: {label2id}")
print(f"id2label: {id2label}")

label2id: {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


In [23]:
def model_init():
    return AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(id2label.keys()),
        id2label=id2label,
        label2id=label2id
    )

In [24]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
training_args = TrainingArguments(
    output_dir="named_entity-recognition-fine_tuned-conll2003-distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_strategy="epoch",
    eval_strategy='epoch',
    metric_for_best_model="eval_loss",
    save_strategy="epoch",
    push_to_hub=True,
    remove_unused_columns=False,
    load_best_model_at_end=True
)


In [29]:
trainer = Trainer(
    model=None,
    args=training_args,
    train_dataset=tokenized_ds["train"].remove_columns(["id", "tokens", "ner_tags"]),
    eval_dataset=tokenized_ds["validation"].remove_columns(["id", "tokens", "ner_tags"]),
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    model_init=model_init,
    data_collator=data_collator,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
trainer.train()

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1474,0.056292,0.909408,0.920734,0.915036,0.983782
2,0.0393,0.050692,0.92355,0.935207,0.929342,0.986118
3,0.0244,0.050361,0.931484,0.938068,0.934764,0.986877


TrainOutput(global_step=2634, training_loss=0.07035736758870072, metrics={'train_runtime': 331.1429, 'train_samples_per_second': 127.205, 'train_steps_per_second': 7.954, 'total_flos': 510122266253334.0, 'train_loss': 0.07035736758870072, 'epoch': 3.0})

In [31]:
import pandas as pd

train_logs = []
valid_logs = []
for index, x in enumerate(trainer.state.log_history):
    if 'loss' in x.keys():
        train_logs.append(x)
    elif 'eval_loss' in x.keys():
        valid_logs.append(x)
    else:
        pass

train_logs = pd.DataFrame(train_logs)
valid_logs = pd.DataFrame(valid_logs)
logs = train_logs.merge(valid_logs, on=["epoch", "step"])

display(logs)


Unnamed: 0,loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_precision,eval_recall,eval_f1,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second
0,0.1474,2.036511,1.3e-05,1.0,878,0.056292,0.909408,0.920734,0.915036,0.983782,6.253,519.747,32.624
1,0.0393,0.038726,7e-06,2.0,1756,0.050692,0.92355,0.935207,0.929342,0.986118,5.7248,567.701,35.634
2,0.0244,1.368961,0.0,3.0,2634,0.050361,0.931484,0.938068,0.934764,0.986877,6.5091,499.301,31.341


In [32]:
trainer.evaluate()

{'eval_loss': 0.05036058649420738,
 'eval_precision': 0.9314839572192514,
 'eval_recall': 0.9380679905755638,
 'eval_f1': 0.9347643803454637,
 'eval_accuracy': 0.986877458042911,
 'eval_runtime': 5.6243,
 'eval_samples_per_second': 577.851,
 'eval_steps_per_second': 36.271,
 'epoch': 3.0}

<h2> Using the Model </h2>

In [38]:
from transformers import pipeline

repo_name = "chescore/named_entity-recognition-fine_tuned-conll2003-distilbert"
aggregation_strategy = "average"
pipe = pipeline(model=repo_name, task='token-classification', device=0)

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [39]:
for index, row in enumerate(tokenized_ds["test"]):
    if index < 5:
        text = " ".join(row["tokens"])
        othertags = [id2label[id] for id in row["ner_tags"]]

        # Note that this is the only line you actually need to use the pipe. The rest is for context.
        pipe_output = pipe(text)

        print(f"Row: {index}")
        print(f"Text: {text}")
        print(f"Actual NER tags: {othertags}")
        for output in pipe_output:
            print(output)
        print("\n")

Row: 0
Transaction: SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .
Actual NER tags: ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']
{'entity': 'B-LOC', 'score': 0.99847, 'index': 3, 'word': 'japan', 'start': 9, 'end': 14}
{'entity': 'B-LOC', 'score': 0.9982443, 'index': 8, 'word': 'china', 'start': 31, 'end': 36}


Row: 1
Transaction: Nadim Ladki
Actual NER tags: ['B-PER', 'I-PER']
{'entity': 'B-PER', 'score': 0.8343791, 'index': 1, 'word': 'nad', 'start': 0, 'end': 3}
{'entity': 'B-PER', 'score': 0.47137254, 'index': 2, 'word': '##im', 'start': 3, 'end': 5}
{'entity': 'I-PER', 'score': 0.9894666, 'index': 3, 'word': 'lad', 'start': 6, 'end': 9}
{'entity': 'I-PER', 'score': 0.8628099, 'index': 4, 'word': '##ki', 'start': 9, 'end': 11}


Row: 2
Transaction: AL-AIN , United Arab Emirates 1996-12-06
Actual NER tags: ['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O']
{'entity': 'B-LOC', 'score': 0.9942059, 'index': 1, 'word': 'al', 'start': 0, 'end': 2}
{'entity

In [40]:
from evaluate import evaluator

task_evaluator = evaluator("token-classification")
pipe = pipeline(
    task="token-classification", model=repo_name, device=0
)

results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=tokenized_ds["test"],
    metric="seqeval",
    input_column="tokens",
    label_column="ner_tags",
    join_by=" "
)
results

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

{'LOC': {'precision': 0.90625,
  'recall': 0.9214628297362111,
  'f1': 0.9137931034482758,
  'number': 1668},
 'MISC': {'precision': 0.7560321715817694,
  'recall': 0.8034188034188035,
  'f1': 0.7790055248618785,
  'number': 702},
 'ORG': {'precision': 0.855457227138643,
  'recall': 0.8729680915111379,
  'f1': 0.8641239570917759,
  'number': 1661},
 'PER': {'precision': 0.9723791588198368,
  'recall': 0.9579468150896723,
  'f1': 0.9651090342679127,
  'number': 1617},
 'overall_precision': 0.8900523560209425,
 'overall_recall': 0.9029745042492918,
 'overall_f1': 0.8964668658815257,
 'overall_accuracy': 0.9791105846882739,
 'total_time_in_seconds': 187.5491923630002,
 'samples_per_second': 18.41116965898068,
 'latency_in_seconds': 0.054314854434694526}

In [44]:
ebikes_text = "Hi, I need a pickup from Jomo Kenyatta International Airport to Westgate Shopping Mall in Westlands, Nairobi. I'll be at the Terminal 1E arrivals gate in 15 minutes."
ebikes_output = pipe(ebikes_text)
print(ebikes_text)
for output in ebikes_output:
    print(output)

Hi, I need a pickup from Jomo Kenyatta International Airport to Westgate Shopping Mall in Westlands, Nairobi. I'll be at the Terminal 1E arrivals gate in 15 minutes.
{'entity': 'B-LOC', 'score': 0.64509803, 'index': 8, 'word': 'jo', 'start': 25, 'end': 27}
{'entity': 'I-LOC', 'score': 0.55681837, 'index': 9, 'word': '##mo', 'start': 27, 'end': 29}
{'entity': 'I-LOC', 'score': 0.5327879, 'index': 10, 'word': 'kenya', 'start': 30, 'end': 35}
{'entity': 'I-ORG', 'score': 0.53992647, 'index': 11, 'word': '##tta', 'start': 35, 'end': 38}
{'entity': 'I-LOC', 'score': 0.44267473, 'index': 12, 'word': 'international', 'start': 39, 'end': 52}
{'entity': 'B-LOC', 'score': 0.92699814, 'index': 15, 'word': 'west', 'start': 64, 'end': 68}
{'entity': 'I-LOC', 'score': 0.8915621, 'index': 16, 'word': '##gate', 'start': 68, 'end': 72}
{'entity': 'B-LOC', 'score': 0.99030006, 'index': 20, 'word': 'west', 'start': 90, 'end': 94}
{'entity': 'I-LOC', 'score': 0.97965187, 'index': 21, 'word': '##lands', 's

In [45]:
ebikes_text_2 = "I need a ride to pick me up from Kenyatta National Hospital and drop me off at The Sarit Centre in Westlands, Nairobi. I’ll be waiting at the main entrance in 10 minutes."
ebikes_output_2 = pipe(ebikes_text_2)
print(ebikes_text_2)
for output in ebikes_output_2:
    print(output)

I need a ride to pick me up from Kenyatta National Hospital and drop me off at The Sarit Centre in Westlands, Nairobi. I’ll be waiting at the main entrance in 10 minutes.
{'entity': 'B-LOC', 'score': 0.9611432, 'index': 10, 'word': 'kenya', 'start': 33, 'end': 38}
{'entity': 'I-LOC', 'score': 0.92154497, 'index': 11, 'word': '##tta', 'start': 38, 'end': 41}
{'entity': 'I-LOC', 'score': 0.8892494, 'index': 12, 'word': 'national', 'start': 42, 'end': 50}
{'entity': 'I-LOC', 'score': 0.8958392, 'index': 13, 'word': 'hospital', 'start': 51, 'end': 59}
{'entity': 'B-LOC', 'score': 0.89922255, 'index': 20, 'word': 'sar', 'start': 83, 'end': 86}
{'entity': 'I-LOC', 'score': 0.8050661, 'index': 21, 'word': '##it', 'start': 86, 'end': 88}
{'entity': 'I-LOC', 'score': 0.8505431, 'index': 22, 'word': 'centre', 'start': 89, 'end': 95}
{'entity': 'B-LOC', 'score': 0.99062425, 'index': 24, 'word': 'west', 'start': 99, 'end': 103}
{'entity': 'I-LOC', 'score': 0.9846762, 'index': 25, 'word': '##lands'

In [46]:
ebikes_text_3 = "Please arrange a pickup for me at The Hub Karen and drop me off at Lavington Mall. I'll be ready at the main entrance in 20 minutes."
ebikes_output_3 = pipe(ebikes_text_3)
print(ebikes_text_3)
for output in ebikes_output_3:
    print(output)

Please arrange a pickup for me at The Hub Karen and drop me off at Lavington Mall. I'll be ready at the main entrance in 20 minutes.
{'entity': 'B-LOC', 'score': 0.6238979, 'index': 10, 'word': 'karen', 'start': 42, 'end': 47}
{'entity': 'B-LOC', 'score': 0.9769778, 'index': 16, 'word': 'la', 'start': 67, 'end': 69}
{'entity': 'I-LOC', 'score': 0.94736916, 'index': 17, 'word': '##vington', 'start': 69, 'end': 76}
{'entity': 'I-LOC', 'score': 0.49357557, 'index': 18, 'word': 'mall', 'start': 77, 'end': 81}
