In [1]:
# dataset 
from datasets import load_dataset

# Load the dataset registered under the name "conll2003".
dataset = load_dataset("conll2003")
# Keep handles to the two splits used in the rest of the notebook.
train = dataset["train"]
test = dataset["test"]
# Tag names of the "ner_tags" feature; position i is the name for integer tag i.
label_names = train.features["ner_tags"].feature.names

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [2]:
# Show the first training example to see which fields a record carries.
train[0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [3]:
# Show the feature definition of "ner_tags" (the list of tag names behind the integer ids).
train.features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [4]:
# tokenizer 
from transformers import AutoTokenizer

# Tokenizer for the "bert-base-cased" checkpoint; the same checkpoint name is
# used for the model later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")



In [16]:
# Compare the integer NER tags of example 2 with its words, side by side.
train[2]["ner_tags"], train[2]['tokens']

([5, 0], ['BRUSSELS', '1996-08-22'])

In [74]:
# Index of the training example to inspect.
k = 0
# The example is already split into words, hence is_split_into_words=True.
inp = tokenizer(
            train[k]["tokens"], truncation=True, is_split_into_words=True
        )
# Display token ids, the source words, the token -> word index mapping and the
# word-level tags together. word_ids() can repeat an index (one word producing
# several tokens) and contains None for tokens that come from no word, which is
# why the word-level tags need re-aligning before training.
inp['input_ids'], train[k]["tokens"],  inp.word_ids(), train[k]["ner_tags"]

([101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102],
 ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None],
 [3, 0, 7, 0, 0, 0, 7, 0, 0])

In [51]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level NER labels to one label per tokenizer output token.

    Args:
        labels: Word-level label ids, indexable by word index.
        word_ids: For each produced token, the index of the word it came
            from, or ``None`` for tokens that belong to no word.

    Returns:
        A list with exactly one entry per element of ``word_ids``:

        * ``-100`` for ``None`` positions and for word indices that have no
          entry in ``labels`` (the same sentinel ``compute_metrics`` filters
          out);
        * the word's own label for the first token of a word;
        * for every further token of the same word, the word's label with
          odd ids bumped by one. In this notebook's tag list the odd ids are
          the ``B-*`` tags and the next even id is the matching ``I-*`` tag.
    """
    ignore_index = -100
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token without a source word: never labelled, and it ends any
            # word that was in progress.
            current_word = None
            new_labels.append(ignore_index)
            continue

        # A word index with no label falls back to the ignore sentinel for
        # *every* token of that word. Only lookup failures are tolerated;
        # anything else (e.g. KeyboardInterrupt) propagates.
        try:
            label = labels[word_id]
        except LookupError:
            label = ignore_index

        # Continuation token of the current word: turn B-XXX into I-XXX.
        # The sentinel -100 is even, so it is left untouched here.
        if word_id == current_word and label % 2 == 1:
            label += 1

        current_word = word_id
        new_labels.append(label)

    return new_labels

def preprocess_function(examples):
    """Tokenize a batch of pre-split sentences and attach per-token labels.

    Args:
        examples: A batch mapping with a "tokens" entry (one word list per
            sentence) and a parallel "ner_tags" entry (one tag list per
            sentence).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry
        holding, for each sentence, the tags re-aligned to the produced
        tokens by ``align_labels_with_tokens``.
    """
    encoding = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )
    # word_ids(row) gives the token -> word index mapping of sentence `row`.
    encoding["labels"] = [
        align_labels_with_tokens(word_labels, encoding.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoding

In [52]:
# Tokenize both splits in batches. The original columns are dropped so only
# the fields returned by preprocess_function remain.
tokenized_train_dataset = train.map(
    preprocess_function,
    batched=True,
    remove_columns=train.column_names,
)

# Each split removes its *own* columns rather than borrowing the train
# split's column list.
tokenized_test_dataset = test.map(
    preprocess_function,
    batched=True,
    remove_columns=test.column_names,
)

In [71]:
# Spot-check example 200: compare the aligned token-level labels with the
# original word-level record.
tokenized_train_dataset[200], train[200]

({'input_ids': [101, 157, 11612, 15824, 1592, 102],
  'token_type_ids': [0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1],
  'labels': [-100, 3, 4, 4, 4, -100]},
 {'id': '200',
  'tokens': ['THAWRA'],
  'pos_tags': [38],
  'chunk_tags': [11],
  'ner_tags': [3]})

In [88]:
# model
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import evaluate
import numpy as np
import torch
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Two-way mapping between integer tag ids and tag names.
ID2LABEL = dict(enumerate(label_names))
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

# Token-classification model on top of "bert-base-cased", sized to the tag set.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    id2label=ID2LABEL,
    label2id=LABEL2ID,
    num_labels=len(ID2LABEL),
)
model = model.to(device)

# Evaluate, checkpoint and log once per epoch; outputs go to "ner_training".
train_args = TrainingArguments(
    output_dir="ner_training",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
)

# Metric object used by compute_metrics.
eval_comp = evaluate.load("seqeval")
def compute_metrics(eval_preds):
    """Turn raw evaluation outputs into overall precision/recall/F1/accuracy.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted tag id of each
            position is the argmax of ``logits`` over the last axis;
            ``labels`` holds the reference tag ids, with ``-100`` marking
            positions to skip.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from the "overall_*" entries computed by ``eval_comp``.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Reference tag names, skipping the -100 positions.
    reference_tags = []
    for label_row in labels:
        reference_tags.append(
            [label_names[tag_id] for tag_id in label_row if tag_id != -100]
        )

    # Predicted tag names at the same (non -100) positions.
    predicted_tags = []
    for pred_row, label_row in zip(predicted_ids, labels):
        kept = []
        for pred_id, tag_id in zip(pred_row, label_row):
            if tag_id != -100:
                kept.append(label_names[pred_id])
        predicted_tags.append(kept)

    scores = eval_comp.compute(predictions=predicted_tags, references=reference_tags)
    return dict(
        precision=scores["overall_precision"],
        recall=scores["overall_recall"],
        f1=scores["overall_f1"],
        accuracy=scores["overall_accuracy"],
    )
# Wire the model, training arguments, tokenized splits and metric function
# into a single Trainer.
trainer = Trainer(model = model,
                args = train_args,
                data_collator = DataCollatorForTokenClassification(tokenizer),
                train_dataset = tokenized_train_dataset,
                # NOTE(review): the per-epoch evaluation set is the *test* split.
                # If the dataset offers a separate validation split, evaluating
                # on that would keep the test split held out — confirm intent.
                eval_dataset = tokenized_test_dataset,
                compute_metrics = compute_metrics,
                tokenizer=tokenizer,
            )

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [89]:
# Run training with the arguments configured in train_args (3 epochs, with
# evaluation, checkpointing and logging at the end of each epoch).
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1579,0.155726,0.855965,0.900673,0.87775,0.966862
2,0.0449,0.156902,0.873743,0.907932,0.89051,0.970423
3,0.0251,0.171829,0.888316,0.915368,0.901639,0.972471


Checkpoint destination directory ner_training\checkpoint-878 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ner_training\checkpoint-1756 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ner_training\checkpoint-2634 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=2634, training_loss=0.07597705751272794, metrics={'train_runtime': 297.116, 'train_samples_per_second': 141.773, 'train_steps_per_second': 8.865, 'total_flos': 1054683418795902.0, 'train_loss': 0.07597705751272794, 'epoch': 3.0})

In [90]:
from transformers import pipeline

# Checkpoint directory under the training output_dir ("ner_training").
# NOTE(review): the step number 2634 is hard-coded; it matches the final
# global_step reported by the training run above and will differ if the
# dataset size, batch size or epoch count changes.
model_checkpoint = "ner_training/checkpoint-2634"
# Token-classification pipeline loaded from the saved checkpoint.
# aggregation_strategy="simple" yields grouped entities ('entity_group' in
# the output) rather than one result per token.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
# Smoke-test the fine-tuned model on an ad-hoc sentence.
token_classifier("ajay lives in Chennai and keeps a samsung phone")

[{'entity_group': 'LOC',
  'score': 0.9971277,
  'word': 'Chennai',
  'start': 14,
  'end': 21}]