In [1]:
from datasets import load_dataset

# Load the dataset registered under the name "conll2003"; the repr in the
# following cell shows the result is a DatasetDict with train/validation/test splits.
raw_datasets = load_dataset("conll2003")

  from .autonotebook import tqdm as notebook_tqdm
Downloading builder script: 100%|██████████| 9.57k/9.57k [00:00<00:00, 10.1MB/s]
Downloading metadata: 100%|██████████| 3.73k/3.73k [00:00<00:00, 11.3MB/s]
Downloading readme: 100%|██████████| 12.3k/12.3k [00:00<00:00, 5.77MB/s]
Downloading data: 100%|██████████| 983k/983k [00:00<00:00, 3.57MB/s]
Generating train split: 100%|██████████| 14041/14041 [00:00<00:00, 19109.69 examples/s]
Generating validation split: 100%|██████████| 3250/3250 [00:00<00:00, 18922.98 examples/s]
Generating test split: 100%|██████████| 3453/3453 [00:00<00:00, 21757.35 examples/s]


In [2]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [21]:
def get_labels(indexes, labels):
    """Translate a sequence of label indexes into the labels they point at.

    Args:
        indexes: Iterable of keys/positions valid for ``labels``.
        labels: Indexable container (e.g. the list of NER tag names).

    Returns:
        A new list with ``labels[i]`` for every ``i`` in ``indexes``, in order.
    """
    decoded = []
    for position in indexes:
        decoded.append(labels[position])
    return decoded

In [31]:
# String names of the NER classes; the integer ids stored in `ner_tags`
# are used as indexes into this list (see get_labels).
ner_labels = raw_datasets['train'].features['ner_tags'].feature.names
ner_labels

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [26]:
raw_datasets['train']['tokens'][3]

['The',
 'European',
 'Commission',
 'said',
 'on',
 'Thursday',
 'it',
 'disagreed',
 'with',
 'German',
 'advice',
 'to',
 'consumers',
 'to',
 'shun',
 'British',
 'lamb',
 'until',
 'scientists',
 'determine',
 'whether',
 'mad',
 'cow',
 'disease',
 'can',
 'be',
 'transmitted',
 'to',
 'sheep',
 '.']

In [27]:
# Decode the integer NER tags of training example 3 into their string names.
get_labels(indexes=raw_datasets['train']['ner_tags'][3],labels=ner_labels)

['O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-MISC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-MISC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

# Tokenizer

In [45]:
from transformers import AutoTokenizer

# Checkpoint name handed to AutoTokenizer.from_pretrained to select the tokenizer.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [46]:
tokenizer.is_fast

True

In [49]:
# The example's "tokens" field is already a list of words, so the input is
# flagged as pre-split via is_split_into_words=True.
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
# Display the produced tokens; the output includes [CLS]/[SEP] and a '##' continuation piece.
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [50]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [56]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Integer label per word, indexable by word id.
        word_ids: For each token, the id of the word it came from, or
            ``None`` for tokens that belong to no word (special tokens).

    Returns:
        A list the same length as ``word_ids``:
          * ``-100`` for tokens whose word id is ``None``
            (presumably the value the loss is told to ignore; confirm
            against the training setup),
          * the word's own label for the first token of each word,
          * for later tokens of the same word, the word's label with odd
            ids bumped by one. In the label list used in this notebook,
            odd ids are ``B-XXX`` and the following even id is the
            matching ``I-XXX``.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        # Tokens without a word never carry a real label.
        if word_id is None:
            previous_word = None
            aligned.append(-100)
            continue

        # First token of a word keeps that word's label untouched.
        if word_id != previous_word:
            previous_word = word_id
            aligned.append(labels[word_id])
            continue

        # Continuation token: a "begin" label becomes its "inside" twin.
        tag = labels[word_id]
        aligned.append(tag + 1 if tag % 2 == 1 else tag)

    return aligned

In [54]:
5%2

1

In [59]:
# Sanity check on the first training example: word-level tags vs. token-aligned tags.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
# The aligned list gains -100 at both ends (special tokens) and one extra
# entry for the word that was split into two tokens.
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [62]:
labels

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [63]:
word_ids

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [65]:
labels[7]

0

In [73]:
raw_datasets['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [80]:
def align_labels_with_tokens(labels, word_ids):
    """Produce one label per token from one label per word.

    NOTE(review): a function with this exact name is already defined in
    an earlier cell; running this cell rebinds the name.

    Args:
        labels: Integer labels indexed by word id.
        word_ids: Word id of each token; ``None`` marks a token that is
            not part of any word.

    Returns:
        List of labels aligned with ``word_ids``. Tokens with no word get
        ``-100``; the first token of a word gets the word's label; each
        further token of that word gets the same label, incremented by
        one when it is odd (odd ids are the ``B-`` tags in this
        notebook's label list, and id + 1 is the corresponding ``I-`` tag).
    """
    aligned = []
    last_seen = None
    for word in word_ids:
        is_new_word = word != last_seen
        if is_new_word:
            last_seen = word

        if word is None:
            tag = -100
        elif is_new_word:
            tag = labels[word]
        else:
            tag = labels[word]
            # Continuation of the same word: B-XXX -> I-XXX.
            if tag % 2 == 1:
                tag += 1
        aligned.append(tag)

    return aligned

In [81]:
# Re-run the same check after redefining align_labels_with_tokens; the
# printed result matches the earlier run.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [82]:
def align_labels_with_tokens_ignore_subsequent(labels, word_ids):
    """Label only the first token of every word; mask everything else.

    Args:
        labels: Integer labels indexed by word id.
        word_ids: Word id of each token; ``None`` marks a token that is
            not part of any word.

    Returns:
        List aligned with ``word_ids`` holding the word's label on the
        first token of each word and ``-100`` on every other position
        (tokens with no word, and second-or-later tokens of a word).
    """
    masked = []
    previous_word = None
    for word_id in word_ids:
        starts_word = word_id != previous_word
        if starts_word:
            previous_word = word_id
        # Only a real word's first token keeps its label.
        keep_label = starts_word and word_id is not None
        masked.append(labels[word_id] if keep_label else -100)
    return masked

In [83]:
# Same first training example, this time masking every non-first token of a word.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
# Compared with align_labels_with_tokens, the second token of the split
# word now shows -100 instead of repeating the word's label.
print(align_labels_with_tokens_ignore_subsequent(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, -100, 0, -100]
