Token classification is a generic task encompasses any problem formulated as "adding label to each token in sentence" such as :    
* NER
* POS
* Chunking

# Token classification

In [None]:
!pip install datasets
!pip install git+https://github.com/huggingface/transformers

In [None]:
#@ Loading datasets
from datasets import load_dataset
raw_datasets = load_dataset("conll2003")

In [None]:
# inspecting datasets
print("Raw datasets\n",raw_datasets)
print("\n\ntraining data\n",raw_datasets["train"][0])

In [4]:
# Inspecting ner feautres
ner_features = raw_datasets["train"].features["ner_tags"]
label_names = ner_features.feature.names                                            # returns list of ner labels
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
# decoding words with correspondings labels
words = raw_datasets["train"]["tokens"][5]                                        # words
labels = raw_datasets["train"]["ner_tags"][5]                                     # ner_tags
line1 = ""
line2 = ""
for word,label in zip(words,labels):
  full_label = label_names[label]                                                 # returns full label namer of ner tags
  max_length = max(len(word), len(full_label))                                    # max length among tag and words
  line1 += word + " "*(max_length - len(word) + 1)                                # making alignment of word with corresponding ner tags
  line2 += full_label + " "*(max_length - len(full_label) + 1 )

print(line1)                                                                      # printing results
print(line2)

" We do n't support any such recommendation because we do n't see any grounds for it , " the Commission 's chief spokesman Nikolaus van   der   Pas   told a news briefing . 
O O  O  O   O       O   O    O              O       O  O  O   O   O   O       O   O  O O O   B-ORG      O  O     O         B-PER    I-PER I-PER I-PER O    O O    O        O 


In [None]:
pos_labels = raw_datasets["train"].features["pos_tags"].feature.names             # returning pos labels
pos_labels

In [7]:
# decoding words with correspondings pos tags
words = raw_datasets["train"]["tokens"][5]                                      # words
labels = raw_datasets["train"]["pos_tags"][5]                                   # pos labels
line01 = ""
line02 = ""
for word,label in zip(words,labels):                                            # labeling corresponding words with pos labels
  full_label = pos_labels[label]                                                # return full label for pos tag number
  max_length = max(len(word), len(full_label))                                  # max length for alignment
  line01 += word + " "*(max_length-len(word) + 1)                               # aligning words with its pos labels
  line02 += full_label + " "*(max_length -len(word) + 1)

print(line01)                                                                   # printing results
print(line02)

" We  do  n't support any such recommendation because we  do  n't see any grounds for it  , " the Commission 's  chief spokesman Nikolaus van der Pas told a  news briefing . 
" PRP  VBP  RB VB DT JJ NN IN PRP  VBP  RB VB DT NNS IN PRP  , " DT NNP POS  JJ NN NNP NNP FW NNP VBD DT  NN NN . 


In [8]:
# processing data
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"                                            # selecting model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)                     # loading tokenizer


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [9]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)              # tokenizing pretokenized so is_split = True]
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [10]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [11]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [42]:
#@ aligning labels and tokens
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id                                              # Start of a new word!
            label = -100 if word_id is None else labels[word_id]                # changes special tokens to -100
            new_labels.append(label)
        elif word_id is None:

            new_labels.append(-100)                                             # Special token
        else:
            label = labels[word_id]                                             # Same word as previous token
            if label % 2 == 1:                                                  # If the label is B-XXX we change it to I-XXX
                label += 1
            new_labels.append(label)

    return new_labels

In [43]:
labels = raw_datasets["train"][0]["ner_tags"]                                   # getting ner labels
word_ids = inputs.word_ids()                                                    # getting word ids
print(labels)
print(align_labels_with_tokens(labels, word_ids))                               # aligning everything

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [44]:
#@ Tokenizing and aligning labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True           # tokenizing pretokenize tokens
    )
    all_labels = examples["ner_tags"]                                           # returning lables of ner
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)                                 # getting word_ids
        new_labels.append(align_labels_with_tokens(labels, word_ids))           # aliging labels and tokens since token is increased in tokenizing

    tokenized_inputs["labels"] = new_labels                                     # updating label which is aligned
    return tokenized_inputs                                                     # return updated algined tokenized inputs

In [45]:
#@ Applying final preprocessing to datasets
tokenized_datasets = raw_datasets.map(                                          # mapping tokenizing fucntions
    tokenize_and_align_labels,                                                  # tokenizing functions that aligns labels also
    batched=True,                                                               # batching prevent loading all data at once making efficient use of ram
    remove_columns=raw_datasets["train"].column_names,                          # removing all columns
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets["train"][0]

In [47]:
#@ Padding all values of tokenzied datasets
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)         # makes dynamic padding i.e makes all value equal length

In [48]:
#@ lets see the differnce of using padding or collator
# Before using collator
for i in range(2):
  print(f'Before Collator {tokenized_datasets["train"][i]["labels"]}')

batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
# After using collator
print(f'After Collator {batch["labels"]}')

Before Collator [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
Before Collator [-100, 1, 2, -100]
After Collator tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])


In [None]:
!pip install seqeval
!pip install evaluate

In [50]:
#@ Setting up evaluation metrics
import evaluate
metric = evaluate.load("seqeval")                                 # seqeval is popular for token classification

In [51]:
labels = raw_datasets["train"][0]["ner_tags"]
print(labels)
labels = [label_names[i] for i in labels]
labels

[3, 0, 7, 0, 0, 0, 7, 0, 0]


['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [None]:
# lets check the performance of evaluatio on our datasets
predictions = labels.copy()                                            # copying labels
predictions[2] = "0"                                                   # changing to decrease accuracy test
metric.compute(predictions =[predictions], references=[labels])        #  evaluation

In [53]:
#@ customizing the evaluation by removing values where 100 is kept for evaluations
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [54]:
# creating mapping values for id2label and label2id
id2label = {id:label for id,l in enumerate(label_names)}
label2id = {v:k for k,v in id2label.items()}

In [55]:
#@ loading models for token classifications
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,                                                        # specifying which model to use
    id2label=id2label,                                                       # mapping of id to label
    label2id=label2id,                                                       # mapping of label to id
)

In [56]:
model.config.num_labels                                                     # inspecting number of labels model have

9

In [57]:
#@ Finetuning models
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install accelerate
!pip install torch

In [59]:
# creating training arguments for models
from transformers import TrainingArguments
import accelerate
import torch

args = TrainingArguments(
    "bert-finetuned-ner",                                      # path of directory
    evaluation_strategy="epoch",                               # return metrics in every epochs
    save_strategy="epoch",                                     # save weights in every epochs
    learning_rate=2e-5,                                        # learning rate
    num_train_epochs=3,                                        # specifying epochs number
    weight_decay=0.01,                                         # weight decay
    push_to_hub=True                                           # pushes update at every epcohs to hub
)

In [60]:
#@ Training model
from transformers import Trainer
trainer = Trainer(
    model=model,                                              # token classification model
    args=args,                                                # training args
    train_dataset=tokenized_datasets["train"],                # training datasets
    eval_dataset=tokenized_datasets["validation"],            # validation datasets
    data_collator=data_collator,                              # dynamic padding
    compute_metrics=compute_metrics,                          # calculating evaluation
    tokenizer=tokenizer,                                      # specifyig tokenizer
)
trainer.train()                                               # calling triaining cycle

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0273,0.076095,0.925008,0.942444,0.933645,0.984886
2,0.0183,0.067054,0.93634,0.950522,0.943377,0.986519
3,0.0077,0.073097,0.934445,0.952373,0.943324,0.986666


Checkpoint destination directory bert-finetuned-ner/checkpoint-1756 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-ner/checkpoint-3512 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-ner/checkpoint-5268 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=5268, training_loss=0.017605114929044256, metrics={'train_runtime': 607.2607, 'train_samples_per_second': 69.366, 'train_steps_per_second': 8.675, 'total_flos': 921792849708600.0, 'train_loss': 0.017605114929044256, 'epoch': 3.0})

In [61]:
# push once trainining is complete
trainer.push_to_hub(commit_message="Training complete")

CommitInfo(commit_url='https://huggingface.co/Utshav/bert-finetuned-ner/commit/1ed7228626d7a11cfa1b43318b8eb9b021db0692', commit_message='Training complete', commit_description='', oid='1ed7228626d7a11cfa1b43318b8eb9b021db0692', pr_url=None, pr_revision=None, pr_num=None)

In [62]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "utshav/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)


In [73]:
token_classifier("My name is Baikuntha parsad , I am working at Nayasansar")

[{'entity_group': 'PER',
  'score': 0.9909994,
  'word': 'Baikuntha parsad',
  'start': 11,
  'end': 27},
 {'entity_group': 'ORG',
  'score': 0.99254155,
  'word': 'Nayasansar',
  'start': 46,
  'end': 56}]