In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datasets
conll = datasets.load_dataset("conll2003")
CONLL_NER_TAGS = conll['train'].features['ner_tags'].feature.names
print(CONLL_NER_TAGS)
conll["test"][2]

Found cached dataset conll2003 (/Users/azatsultanov/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


{'id': '2',
 'tokens': ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06'],
 'pos_tags': [22, 6, 22, 22, 23, 11],
 'chunk_tags': [11, 0, 11, 12, 12, 12],
 'ner_tags': [5, 0, 5, 6, 6, 0]}

In [3]:
from transformers import (pipeline, 
        AutoModelForTokenClassification, AutoTokenizer, 
        BertForTokenClassification, BertTokenizer)

# Load pretrained model and tokenizer for English NER task (dslim/bert-base-NER)
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

In [5]:
tokens = "His name is , as we know , Jerry Arahamson".split()

NER = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy=None)
model_output = NER(" ".join(tokens))
model_output

[{'entity': 'B-PER',
  'score': 0.9994836,
  'index': 9,
  'word': 'Jerry',
  'start': 27,
  'end': 32},
 {'entity': 'I-PER',
  'score': 0.99950564,
  'index': 10,
  'word': 'Ara',
  'start': 33,
  'end': 36},
 {'entity': 'I-PER',
  'score': 0.9956499,
  'index': 11,
  'word': '##ham',
  'start': 36,
  'end': 39},
 {'entity': 'I-PER',
  'score': 0.6499991,
  'index': 12,
  'word': '##son',
  'start': 39,
  'end': 42}]

In [6]:
def unite_entities(entities):
    if len(entities) <= 1:
        return entities

    united_result = []
    cur_entity = {key: entities[0][key] for key in ['entity', 'word', 'start', 'end']}
    for entity in entities[1:]:
        if entity['word'].startswith('##'):
            cur_entity['word'] += entity['word'].lstrip('#')
            cur_entity['end'] = entity['end']
        else:
            united_result.append(cur_entity)
            cur_entity = {key: entity[key] for key in ['entity', 'word', 'start', 'end']}
    united_result.append(cur_entity)
    return united_result

entities = unite_entities(model_output)
entities


[{'entity': 'B-PER', 'word': 'Jerry', 'start': 27, 'end': 32},
 {'entity': 'I-PER', 'word': 'Arahamson', 'start': 33, 'end': 42}]

In [56]:
def convert_entities_to_bio(tokens, entities):
    bio_tags = []

    cur_entity_idx = 0
    for token in tokens:
        if token == entities[cur_entity_idx]['word']:
            bio_tags.append(entities[cur_entity_idx]['entity'])
            cur_entity_idx += 1
        else:
            bio_tags.append('O')
    return bio_tags

bio_tags = convert_entities_to_bio(tokens, entities)
bio_tags

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER']

In [55]:


def postprocessing_model(model_output):
    entities = unite_entities(model_output)
    bio_tags = convert_entities_to_bio(tokens, entities)
    return bio_tags

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER']

In [25]:
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.training import biluo_tags_to_offsets, offsets_to_biluo_tags

In [None]:
offsets_to_biluo_tags()

In [22]:
from itertools import islice

sample = conll['train'][1]

iob_tags = list(map(lambda x: CONLL_NER_TAGS[x], sample['ner_tags']))
words = sample['tokens']

print(iob_tags)

biluo_tags = [iob_to_biluo(i) for i in iob_tags]

doc = Doc(Vocab(), words=words)
biluo_tags_to_offsets(doc, biluo_tags)


['B-PER', 'I-PER']


ValueError: [E177] Ill-formed IOB input detected: B

In [None]:
biluo_tags_to_offsets()

In [38]:
import typing as tp
import torch

start_end_dct = {1:2, 3:4, 5:6, 7:8}


def tokenize_and_preserve_tags(example: tp.Dict[str, tp.Any],
                               tokenizer: BertTokenizer,
                               label2id: tp.Dict[str, int],
                               tokenizer_params={}) -> tp.Dict[str, tp.Any]:
    # write your own function to split each pair of word-token to same number of pieces.

    encoded = tokenizer(example["tokens"], is_split_into_words=True, truncation=True, **tokenizer_params)
    encoded.update(example)

    labels = []
    label = example["ner_tags"]
    # for i, label in enumerate(examples["ner_tags"]):
    #   print(label)
    word_ids = encoded.word_ids(batch_index=0)
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        # Special tokens have a word id that is None. We set the label to -100 so they are automatically
        # ignored in the loss function.
        if word_idx is None:
            label_ids.append(0)
        # We set the label for the first token of each word.
        elif word_idx != previous_word_idx:
            label_ids.append(label[word_idx])
        # For the other tokens in a word, we set the label to either the current label or -100, depending on
        # the label_all_tokens flag.
        else:
            # print(label[word_idx])
            if label[word_idx] in start_end_dct:
              label_ids.append(start_end_dct[label[word_idx]])
            else:
              label_ids.append(label[word_idx])
        previous_word_idx = word_idx

    label_ids[0] = 0
    label_ids[-1] = 0
    
    # <YOUR CODE HERE>
    encoded['labels'] = [label2id[CONLL_NER_TAGS[i]] for i in label_ids]
    encoded['text_labels'] = [CONLL_NER_TAGS[i] for i in label_ids]
    
    # assert len(encoded['labels']) == len(encoded["input_ids"])
    return encoded
    

In [39]:
test_sentence = "His name is Jerry Abrahamson"
test_example = {"tokens": test_sentence.split(" "), "ner_tags": [0, 0, 0, 1, 2]}
test_result = tokenize_and_preserve_tags(test_example, tokenizer, model.config.label2id, {'return_tensors': 'pt'})
test_result

{'input_ids': tensor([[ 101, 1230, 1271, 1110, 5466, 7752, 2142,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]]), 'tokens': ['His', 'name', 'is', 'Jerry', 'Abrahamson'], 'ner_tags': [0, 0, 0, 1, 2], 'labels': [0, 0, 0, 0, 3, 4, 4, 0], 'text_labels': ['O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O']}

In [40]:
conll_prepared = conll.map(lambda x: tokenize_and_preserve_tags(x, tokenizer, model.config.label2id, tokenizer_params={'return_tensors': 'pt'}))
conll_prepared['train'][1]


Loading cached processed dataset at /Users/azatsultanov/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-574d598a9a4387b0.arrow
Loading cached processed dataset at /Users/azatsultanov/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-293a8d4b2cdbd114.arrow
Loading cached processed dataset at /Users/azatsultanov/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-9d19ba3e83272a3d.arrow


{'id': '1',
 'tokens': ['Peter', 'Blackburn'],
 'pos_tags': [22, 22],
 'chunk_tags': [11, 12],
 'ner_tags': [1, 2],
 'input_ids': [[101, 1943, 14428, 102]],
 'token_type_ids': [[0, 0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1]],
 'labels': [0, 3, 4, 0],
 'text_labels': ['O', 'B-PER', 'I-PER', 'O']}

In [41]:
def preprocessing_dataset(dataset):
    tokens = []
    labels = []
    for i in dataset:
        tokens.append(i['tokens'])
        labels.append(i['text_labels'])
        
    return {'X': tokens, 'y_true': labels}

In [42]:
from sbe_vallib import NerSampler

sampler = NerSampler(train=preprocessing_dataset(conll_prepared['train']),
                     oos=preprocessing_dataset(conll_prepared['test']))
sampler.set_seed(1, bootstrap=True)


In [58]:
import numpy as np

class ModelWrapper():
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.id2label = model.config.id2label
        self.NER_hugface = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy='first')
    
    def predict(self, X):
        with torch.no_grad():
            encoded = self.tokenizer(X, is_split_into_words=True,
                                    truncation=True,
                                    padding=True,
                                    return_tensors='pt')
            model_output = model(**encoded)
            print(model_output['offset_mapping'])
            ner_entities = self.NER_hugface.postprocess(model_output)
            return ner_entities
    
    # def _logit2label(self, logits):
    #     idxs = torch.argmax(logits, dim=-1)
    #     unique_vals, inv_idxs = torch.unique(idxs, return_inverse=True, dim=-1)
    #     unique_vals = np.array([model.config.id2label[int(i)] for i in unique_vals[0]])
    #     labels = unique_vals[inv_idxs].reshape(idxs.shape)
    #     return labels


wrapped_model = ModelWrapper(model, tokenizer)

In [46]:
tok_output = tokenizer(['Lyuis', 'Arbrams'], is_split_into_words=True,
                                    truncation=True,
                                    padding=True,
                                    return_tensors='pt')


In [49]:
model(**tok_output).logits.shape

torch.Size([1, 8, 9])