In [1]:
import pickle

In [2]:
from transformers import pipeline
ner = pipeline("ner", aggregation_strategy='simple')

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Device set to use cpu


In [3]:
with open('ner_train.pkl', 'rb') as f:
  corpus_train = pickle.load(f)

with open('ner_test.pkl', 'rb') as f:
  corpus_test = pickle.load(f)

In [4]:
corpus_test[0]

[('CRICKET', 'O'),
 ('-', 'O'),
 ('LEICESTERSHIRE', 'B-ORG'),
 ('TAKE', 'O'),
 ('OVER', 'O'),
 ('AT', 'O'),
 ('TOP', 'O'),
 ('AFTER', 'O'),
 ('INNINGS', 'O'),
 ('VICTORY', 'O'),
 ('.', 'O')]

In [5]:
inputs = []
targets = []

for sentence_tag_pairs in corpus_test:
  tokens = []
  target = []
  for token, tag in sentence_tag_pairs:
    tokens.append(token)
    target.append(tag)
  inputs.append(tokens)
  targets.append(target)

In [6]:
inputs[9]

['He',
 'was',
 'well',
 'backed',
 'by',
 'England',
 'hopeful',
 'Mark',
 'Butcher',
 'who',
 'made',
 '70',
 'as',
 'Surrey',
 'closed',
 'on',
 '429',
 'for',
 'seven',
 ',',
 'a',
 'lead',
 'of',
 '234',
 '.']

In [7]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenizer = TreebankWordDetokenizer()

In [8]:
detokenizer.detokenize(inputs[9])

'He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven, a lead of 234.'

In [9]:
targets[9]

['O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [10]:
ner(detokenizer.detokenize(inputs[9]))

[{'entity_group': 'LOC',
  'score': 0.99967515,
  'word': 'England',
  'start': 22,
  'end': 29},
 {'entity_group': 'PER',
  'score': 0.99974275,
  'word': 'Mark Butcher',
  'start': 38,
  'end': 50},
 {'entity_group': 'ORG',
  'score': 0.9996264,
  'word': 'Surrey',
  'start': 66,
  'end': 72}]

In [11]:
def compute_prediction(tokens, input_, ner_result):
  # map hugging face ner result to list of tags for later performance assessment
  # tokens is the original tokenized sentence
  # input_ is the detokenized string

  predicted_tags = []
  state = 'O' # keep track of state, so if O --> B, if B --> I, if I --> I
  current_index = 0

  # keep track of last group since the group may change
  # between consecutive entities
  # e.g. we want B-MISC -> B-PER -> I-PER
  # not          B-MISC -> I-PER -> I-PER
  last_group = None

  for token in tokens:
    # find the token in the input_ (should be at or near the start)
    index = input_.find(token)
    assert(index >= 0)
    current_index += index # where we are currently pointing to

    # print(token, current_index) # debug

    # check if this index belongs to an entity and assign label
    tag = 'O'
    for entity in ner_result:
      group = entity['entity_group']
      if current_index >= entity['start'] and current_index < entity['end']:
        # then this token belongs to an entity
        if state == 'O':
          state = 'B'
        elif last_group != group:
          state = 'B'
        else:
          state = 'I'
        tag = f"{state}-{group}"
        last_group = group
        break
    if tag == 'O':
      # reset the state
      state = 'O'
      last_group = None
    predicted_tags.append(tag)

    # remove the token from input_
    input_ = input_[index + len(token):]

    # update current_index
    current_index += len(token)

  # sanity check
  # print("len(predicted_tags)", len(predicted_tags))
  # print("len(tokens)", len(tokens))
  assert(len(predicted_tags) == len(tokens))
  return predicted_tags

In [12]:
input_ = detokenizer.detokenize(inputs[9])
ner_result = ner(input_)
ptags = compute_prediction(inputs[9], input_, ner_result)

In [13]:
# TMP
input2 = detokenizer.detokenize(inputs[11])
ner_result2 = ner(input2)
ptags2 = compute_prediction(inputs[11], input2, ner_result2)
ptags2

['B-MISC',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [14]:
from sklearn.metrics import accuracy_score, f1_score

In [15]:
accuracy_score(targets[9], ptags)

1.0

In [16]:
for targ, pred in zip(targets[9], ptags):
  print(targ, pred)

O O
O O
O O
O O
O O
B-LOC B-LOC
O O
B-PER B-PER
I-PER I-PER
O O
O O
O O
O O
B-ORG B-ORG
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O


In [17]:
# # get detokenized inputs to pass into ner model
# detok_inputs = []
# for tokens in inputs:
#   text = detokenizer.detokenize(tokens)
#   detok_inputs.append(text)
# # 17 min on CPU, 3 min on GPU
# ner_results = ner(detok_inputs)
# predictions = []
# for tokens, text, ner_result in zip(inputs, detok_inputs, ner_results):
#   pred = compute_prediction(tokens, text, ner_result)
#   predictions.append(pred)
# # https://stackoverflow.com/questions/11264684/flatten-list-of-lists
# def flatten(list_of_lists):
#   flattened = [val for sublist in list_of_lists for val in sublist]
#   return flattened
# # flatten targets and predictions
# flat_predictions = flatten(predictions)
# flat_targets = flatten(targets)
# accuracy_score(flat_targets, flat_predictions)
# 0.9920892614676191
# f1_score(flat_targets, flat_predictions, average='macro')