# Inside the token-classification pipeline

In [1]:
from transformers import pipeline

# Build a token-classification pipeline without naming a model; the log output
# below reports which default checkpoint was selected.
token_classifier = pipeline("token-classification")
# Run it on a sample sentence; the result below has one entry per labelled token.
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClass

[{'entity': 'I-PER',
  'score': 0.99938285,
  'index': 4,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': 0.99815494,
  'index': 5,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.99590707,
  'index': 6,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.99923277,
  'index': 7,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': 0.9738931,
  'index': 12,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': 0.976115,
  'index': 13,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': 0.9887976,
  'index': 14,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': 0.9932106,
  'index': 16,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [2]:
# Rebuild the pipeline with aggregation_strategy="simple": in the output below the
# per-token entries are merged into one entry per entity ('entity_group').
token_classifier = pipeline("token-classification", aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796019,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

# Let’s see how to obtain these results without using the pipeline() function!

How can we do the classification without using the pipeline?

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Same checkpoint name that the pipeline log reported as its default.
model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
# Tokenize the sentence as PyTorch tensors and run a single forward pass;
# `outputs.logits` is what the following cells post-process.
inputs = tokenizer(example, return_tensors="pt")
outputs = model(**inputs)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# First line: input shape  -> a batch with 1 sequence of 19 tokens.
# Second line: output shape -> one set of logits per input token, and the
# model has 9 different labels.
print(inputs["input_ids"].shape, outputs.logits.shape, sep="\n")

torch.Size([1, 19])
torch.Size([1, 19, 9])


In [13]:
import torch

# Predicted class id per token: argmax over the label axis of the raw logits.
# Softmax does not change the ordering, so taking it first is unnecessary here.
predictions = outputs.logits.argmax(dim=-1)[0].tolist()
# Per-token label probabilities: softmax over the label axis, keeping the
# first (and only) sequence of the batch as nested Python lists.
probabilities = torch.softmax(outputs.logits, dim=-1)[0].tolist()
print(predictions)

[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]


In [11]:
# The same argmax left as a tensor, batch dimension included, to inspect the raw result.
outputs.logits.argmax(dim=-1)

tensor([[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]])

In [10]:
# Sanity check: taking the argmax over the probabilities instead of the logits
# should give exactly the same class ids.
import torch

predictions_soft = torch.argmax(torch.tensor(probabilities), dim=-1).tolist()
print(predictions_soft)  # expected to match `predictions`

[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]


In [7]:
# Token strings the tokenizer produced for `example`, special tokens included.
inputs.tokens()

['[CLS]',
 'My',
 'name',
 'is',
 'S',
 '##yl',
 '##va',
 '##in',
 'and',
 'I',
 'work',
 'at',
 'Hu',
 '##gging',
 'Face',
 'in',
 'Brooklyn',
 '.',
 '[SEP]']

In [6]:
# Mapping from class id (the values found in `predictions`) to label name.
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [60]:
# Bind `tokens` in this cell: it was otherwise only assigned by a later cell,
# so displaying it here failed with NameError on a fresh kernel (Restart & Run All).
tokens = inputs.tokens()
tokens

['[CLS]',
 'My',
 'name',
 'is',
 'S',
 '##yl',
 '##va',
 '##in',
 'and',
 'I',
 'work',
 'at',
 'Hu',
 '##gging',
 'Face',
 'in',
 'Brooklyn',
 '.',
 '[SEP]']

In [62]:
# Word index for every token: the sub-word pieces of one word share an index,
# and the special tokens map to None.
inputs.word_ids()

[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]

In [65]:
# Character span of word 3 in `example`; slicing the text with it recovers the
# whole word rather than a single sub-word token.
start, end = inputs.word_to_chars(3)
example[start:end]

'Sylvain'

In [58]:
# Character span of word 3 on its own.
# NOTE(review): the output recorded below, CharSpan(start=8, end=10), disagrees with
# the 'Sylvain' slice obtained from the same call in the previous cell and with the
# offsets (11..18) printed further down; it looks stale from an earlier run, so
# re-run this cell to confirm.
inputs.word_to_chars(3)

CharSpan(start=8, end=10)

In [37]:
# Word index of the token at position 1, i.e. the first token after [CLS].
inputs.word_ids()[1]

0

In [72]:
# Rebuild the pipeline's per-token output by hand and add "from_word": the full
# source word that each labelled (sub-word) token belongs to.
results = []
tokens = inputs.tokens()
classes = model.config.id2label  # class id -> label name
token_id = inputs.word_ids()     # word index per token (None for special tokens)

for idx, pred in enumerate(predictions):
    label = classes[pred]
    if label == 'O':
        # Only tokens tagged as part of an entity are reported.
        continue

    id_token = token_id[idx]
    # Only None has to be excluded: word index 0 is a real word. The previous
    # test (`!= None or != 0`) was always true, so a labelled special token
    # would have reached word_to_chars(None).
    if id_token is not None:
        start, end = inputs.word_to_chars(id_token)
        from_word = example[start:end]
    else:
        # Reset explicitly so the value from an earlier token is never reused.
        from_word = None

    results.append({"entity": label,
                    "score": probabilities[idx][pred],
                    # NOTE(review): this stores the predicted class id, whereas the
                    # pipeline's "index" is the token position (idx) — confirm intent.
                    "index": pred,
                    "word": tokens[idx],
                    "from_word": from_word})

results

[{'entity': 'I-PER',
  'score': 0.9993828535079956,
  'index': 4,
  'word': 'S',
  'from_word': 'Sylvain'},
 {'entity': 'I-PER',
  'score': 0.9981548190116882,
  'index': 4,
  'word': '##yl',
  'from_word': 'Sylvain'},
 {'entity': 'I-PER',
  'score': 0.995907187461853,
  'index': 4,
  'word': '##va',
  'from_word': 'Sylvain'},
 {'entity': 'I-PER',
  'score': 0.9992327690124512,
  'index': 4,
  'word': '##in',
  'from_word': 'Sylvain'},
 {'entity': 'I-ORG',
  'score': 0.9738929867744446,
  'index': 6,
  'word': 'Hu',
  'from_word': 'Hugging'},
 {'entity': 'I-ORG',
  'score': 0.9761149883270264,
  'index': 6,
  'word': '##gging',
  'from_word': 'Hugging'},
 {'entity': 'I-ORG',
  'score': 0.9887974858283997,
  'index': 6,
  'word': 'Face',
  'from_word': 'Face'},
 {'entity': 'I-LOC',
  'score': 0.99321049451828,
  'index': 8,
  'word': 'Brooklyn',
  'from_word': 'Brooklyn'}]

In [78]:
# Entries 4..7 of the list built above: the ORG and LOC tokens.
results[4:8]

[{'entity': 'I-ORG',
  'score': 0.9738929867744446,
  'index': 6,
  'word': 'Hu',
  'from_word': 'Hugging'},
 {'entity': 'I-ORG',
  'score': 0.9761149883270264,
  'index': 6,
  'word': '##gging',
  'from_word': 'Hugging'},
 {'entity': 'I-ORG',
  'score': 0.9887974858283997,
  'index': 6,
  'word': 'Face',
  'from_word': 'Face'},
 {'entity': 'I-LOC',
  'score': 0.99321049451828,
  'index': 8,
  'word': 'Brooklyn',
  'from_word': 'Brooklyn'}]

In [83]:
# What if we ask the plain `inputs` encoding (tokenized without
# return_offsets_mapping=True) for "offset_mapping"?
# The lookup is expected to fail; catch the KeyError and print it so the
# demonstration stays visible without aborting a Restart & Run All.
try:
    print(inputs["offset_mapping"])
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'offset_mapping'

This is not possible because our tokenizer was not called with `return_offsets_mapping=True`, so the encoding has no "offset_mapping" key.

In [10]:
# Per-token entities again, this time with each token's character offsets.
# Re-tokenize with return_offsets_mapping=True so "offset_mapping" is available.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
offsets = inputs_with_offsets["offset_mapping"]
tokens = inputs_with_offsets.tokens()
results = []

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label == "O":
        # Tokens outside any entity are not reported.
        continue
    start, end = offsets[idx]
    results.append({"entity": label,
                    "score": probabilities[idx][pred],
                    "word": tokens[idx],
                    "start": start,
                    "end": end})

print(results)

[{'entity': 'I-PER', 'score': 0.9993828535079956, 'word': 'S', 'start': 11, 'end': 12}, {'entity': 'I-PER', 'score': 0.9981548190116882, 'word': '##yl', 'start': 12, 'end': 14}, {'entity': 'I-PER', 'score': 0.995907187461853, 'word': '##va', 'start': 14, 'end': 16}, {'entity': 'I-PER', 'score': 0.9992327690124512, 'word': '##in', 'start': 16, 'end': 18}, {'entity': 'I-ORG', 'score': 0.9738929867744446, 'word': 'Hu', 'start': 33, 'end': 35}, {'entity': 'I-ORG', 'score': 0.9761149883270264, 'word': '##gging', 'start': 35, 'end': 40}, {'entity': 'I-ORG', 'score': 0.9887974858283997, 'word': 'Face', 'start': 41, 'end': 45}, {'entity': 'I-LOC', 'score': 0.99321049451828, 'word': 'Brooklyn', 'start': 49, 'end': 57}]


# Grouping entities

In [82]:
results = []
# Re-tokenize asking for the offset mapping, then print each token's
# (start, end) offsets next to the token strings for comparison.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
print(inputs_with_offsets["offset_mapping"])
print(tokens)

[(0, 0), (0, 2), (3, 7), (8, 10), (11, 12), (12, 14), (14, 16), (16, 18), (19, 22), (23, 24), (25, 29), (30, 32), (33, 35), (35, 40), (41, 45), (46, 48), (49, 57), (57, 58), (0, 0)]
['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in', 'Brooklyn', '.', '[SEP]']


In [120]:
import numpy as np

# Group consecutive tokens of the same entity type into a single entity,
# mirroring the pipeline output with aggregation_strategy="simple":
# label without the B-/I- prefix, mean token score, and the text span.
results_gp = []
inputs_with_offset = tokenizer(example, return_offsets_mapping=True)
tokens_gp = inputs_with_offset.tokens()
offsets = inputs_with_offset['offset_mapping']

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label_raw = model.config.id2label[pred]
    if label_raw == "O":
        idx += 1
        continue

    # Strip the "B-" / "I-" prefix to get the entity type.
    label_B_or_I = label_raw[2:]

    # The current token always opens the group, whether it is tagged B- or I-.
    # Previously a B- token never entered the inner loop, which left the score
    # list empty (mean -> nan) and `end` pointing at a stale value.
    start, end = offsets[idx]
    all_scores = [probabilities[idx][pred]]
    idx += 1

    # Extend the group with every following token tagged I-<same type>.
    while (idx < len(predictions)
            and model.config.id2label[predictions[idx]] == f"I-{label_B_or_I}"):
        # Score each token with its own predicted class, not the first token's.
        all_scores.append(probabilities[idx][predictions[idx]])
        end = offsets[idx][1]
        idx += 1

    # The entity score is the mean of the scores of its tokens.
    score = np.mean(all_scores).item()

    results_gp.append({"entity": label_B_or_I,
                       "score": score,
                       "word": example[start:end],
                       "start": start,
                       "end": end})
    # No extra `idx += 1` here: idx already points at the first token after the
    # group, and advancing again would skip it (dropping the start of an
    # immediately adjacent entity).

results_gp

[{'entity': 'PER',
  'score': 0.998169407248497,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity': 'ORG',
  'score': 0.9796018203099569,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity': 'LOC',
  'score': 0.99321049451828,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]