In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import create_optimizer

# Load CoNLL 2003 Dataset
![](images/conll2003.png)

In [None]:
# Load the dataset registered under the name 'conll2003'
conll = load_dataset('conll2003')
# Bare expression on the last line of the cell: shows an overview of the loaded object
conll

Die Daten liegen im CoNLL-Format vor und enthalten neben Tokens und Named-Entity-Labels auch POS-Tags und Phrasenannotationen (Chunks).
![](images/conll-format.png)

## Tokens des ersten Trainingssatzes

In [None]:
# 'tokens' field of the first example in the 'train' split
conll['train'][0]['tokens']

## POS Tags des ersten Trainingssatzes
Es wird das [Penn Treebank Tagset](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) genutzt.

In [None]:
# 'pos_tags' field of the first example in the 'train' split
conll['train'][0]['pos_tags']

In [None]:
# Feature definition of the 'pos_tags' column
# (presumably lists the tag names behind the values shown above -- verify in the output)
conll['train'].features['pos_tags']

Für die Named-Entity-Labels (`ner_tags`) wird das BIO-Encoding-Schema genutzt (B = Beginn, I = Inneres, O = außerhalb einer Entität).

In [None]:
# 'ner_tags' field of the first example in the 'train' split
conll['train'][0]['ner_tags']

In [None]:
# Feature definition of the 'ner_tags' column
# (presumably lists the label names behind the values shown above -- verify in the output)
conll['train'].features['ner_tags']

# Data preprocessing
Wir nutzen das Language Model [distilbert-base-cased](https://huggingface.co/distilbert-base-cased).

In [None]:
# Checkpoint identifier; reused further down when the classification model is loaded
model_name = 'distilbert-base-cased'
# Tokenizer loaded for the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)

Die Tokenizer von Language Models zerlegen Text in Tokens und Subtokens. Liegt der zu verarbeitende Text bereits tokenisiert vor, sollte diese Tokenisierung für NER beibehalten werden. Eine abweichende Tokenisierung würde dazu führen, dass die Tokengrenzen nicht mehr mit den Named-Entity-Labels übereinstimmen.
Ein erneutes Tokenisieren auf Wortebene verhindern wir durch den Parameter `is_split_into_words=True`.

In [None]:
# Tokenize the first training sentence, which is already a list of words;
# is_split_into_words=True marks the input as pre-split instead of a raw string
inputs = tokenizer(conll['train'][0]['tokens'], is_split_into_words=True)
# Show the token strings produced by the tokenizer for this sentence
inputs.tokens()

Die Named-Entity-Labels müssen an die Subtokens angepasst werden: Nur das erste Subtoken je Token wird mit dem Label des Tokens versehen; alle weiteren Subtokens sowie Spezialtokens erhalten das Label `-100` und werden dadurch beim Training ignoriert ([https://huggingface.co/docs/transformers/tasks/token_classification](https://huggingface.co/docs/transformers/tasks/token_classification)).

In [None]:
def tokenize_and_align_labels(sequence_batch):
    """Tokenize a batch of pre-split sentences and align the NER labels to subtokens.

    Args:
        sequence_batch: Mapping with a 'tokens' entry (one list of words per
            sentence) and an 'ner_tags' entry (one list of label ids per
            sentence, one id per word).

    Returns:
        The tokenizer output for the batch with an additional 'labels' entry.
        For every sentence it holds one label id per produced subtoken: the
        word's label on the first subtoken of each word and -100 on every
        other position (special tokens and continuation subtokens).
    """
    # Label id written to positions that must not contribute to the loss.
    ignored = -100

    # The sentences are already split into words; truncation=True asks the
    # tokenizer to cut sequences that are too long.
    encoded = tokenizer(sequence_batch['tokens'], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(sequence_batch['ner_tags']):
        # One entry per subtoken: the index of the word it came from, or None
        # for tokens that belong to no word (e.g. [CLS], [SEP]).
        current_ids = list(encoded.word_ids(batch_index=row))
        # The same sequence shifted right by one, so every subtoken can be
        # compared with its predecessor.
        previous_ids = [None] + current_ids[:-1]

        # A subtoken opens a new word when it belongs to a word and that word
        # differs from the predecessor's. Word ids are positions in the word
        # list (they index `word_labels`), not the word text, so identical
        # neighbouring words such as "really really" still get separate labels.
        aligned.append([
            word_labels[cur] if cur is not None and cur != prev else ignored
            for cur, prev in zip(current_ids, previous_ids)
        ])

    # Attach the aligned labels to the tokenizer output of the batch.
    encoded['labels'] = aligned
    return encoded

# Model and training definition
Load auto model for token classification. This adds a classification layer / head to the language model (usually a simple dense layer and dropout).

In [None]:
# Read the NER label inventory from the dataset instead of hard-coding its size,
# so the classification head always matches the data it is trained on.
ner_label_names = conll['train'].features['ner_tags'].feature.names
# Mappings between label ids and label names; stored in the model config so that
# predictions can be reported with readable tag names instead of bare ids.
id2label = dict(enumerate(ner_label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = TFAutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_label_names),
    id2label=id2label,
    label2id=label2id,
)

Specify training parameters and optimizer.

In [None]:
# Number of sequences per batch and number of passes over the training split
batch_size = 16
num_train_epochs = 1
# Total number of optimizer steps: full batches per epoch times the number of epochs
# (the integer division does not count a trailing partial batch)
num_train_steps = (len(conll['train']) // batch_size) * num_train_epochs
# create_optimizer returns the optimizer together with its learning-rate schedule;
# only the optimizer is used in the cells visible here
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

# Mark the layer named 'distilbert' as non-trainable, leaving the remaining layers trainable
# NOTE(review): presumably this has to be set before compile() to take effect -- confirm
model.get_layer('distilbert').trainable=False

# No loss argument is passed here
# NOTE(review): presumably the model then falls back to a built-in loss -- confirm
model.compile(optimizer=optimizer, metrics='acc')
model.summary()

Init collator to build batches, pad sequences in a batch etc.

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")


def _prepare_split(split_name, shuffle):
    """Tokenize one split of `conll` and turn it into a batched dataset for `model`.

    Args:
        split_name: Key of the split inside `conll` (e.g. 'train').
        shuffle: Passed through to `model.prepare_tf_dataset`.

    Returns:
        The dataset object returned by `model.prepare_tf_dataset`.
    """
    # Tokenization and label alignment are applied batch-wise to the whole split
    tokenized_split = conll[split_name].map(tokenize_and_align_labels, batched=True)
    return model.prepare_tf_dataset(
        tokenized_split,
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=data_collator,
    )


# Training batches are shuffled; validation batches keep their order
tf_train_set = _prepare_split('train', shuffle=True)
tf_validation_set = _prepare_split('validation', shuffle=False)

# Train model

In [None]:
# Train on the prepared training set for num_train_epochs epochs,
# with the prepared validation set passed as validation data
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_train_epochs)