In [20]:
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import create_optimizer

# Load CoNLL 2003 Dataset
![](images/conll2003.png)

In [21]:
# Load the CoNLL-2003 dataset by name via `datasets.load_dataset`.
conll = load_dataset('conll2003')
# Display the returned DatasetDict to inspect its splits, features and row counts.
conll

Found cached dataset conll2003 (C:/Users/Timo/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

Die Daten liegen im CoNLL-Format vor und enthalten neben Tokens und Named-Entity-Tags auch POS-Tags und Chunk-Tags (Phrasenannotationen).
![](images/conll-format.png)

## Tokens des ersten Trainingssatzes

In [22]:
# Word-level tokens of the training example at index 0.
conll['train'][0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

## POS Tags des ersten Trainingssatzes
Es wird das [Penn Treebank Tagset](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) genutzt.

In [23]:
# Inspect the 'id' field of the training example at index 10 (shown as a string).
conll['train'][10]['id']

'10'

In [24]:
# Show the feature definition of 'pos_tags', including the list of tag names
# that the integer-encoded POS tags refer to.
conll['train'].features['pos_tags']

Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None)

Es wird das BIO-Encoding-Schema genutzt.

In [25]:
# Print the integer-encoded NER tags of the first training example, followed by
# the feature definition that maps those integers to BIO tag names.
print(
    conll['train'][0]['ner_tags'],
    conll['train'].features['ner_tags'],
    sep='\n',
)

[3, 0, 7, 0, 0, 0, 7, 0, 0]
Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)


# Data preprocessing
Wir nutzen das Language Model [distilbert-base-cased](https://huggingface.co/distilbert-base-cased).

In [26]:
# Checkpoint name of the pre-trained language model; reused when loading the model.
model_name = 'distilbert-base-cased'
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Die Tokenizer von Sprachmodellen zerlegen Text in Tokens und Subtokens. Liegt der zu verarbeitende Text bereits tokenisiert vor, sollte diese Tokenisierung für NER beibehalten werden, da die Tokengrenzen sonst nicht mehr mit den Named-Entity-Labels übereinstimmen.
Mit dem Parameter `is_split_into_words=True` teilen wir dem Tokenizer mit, dass die Eingabe bereits in Wörter zerlegt ist. Die vorhandenen Wortgrenzen bleiben dadurch erhalten; einzelne Wörter können aber weiterhin in Subtokens zerlegt werden (z. B. `lamb` → `la`, `##mb`).

In [27]:
# Tokenize the already word-split first training sentence; is_split_into_words=True
# tells the tokenizer that the input is a list of words rather than a raw string.
inputs = tokenizer(conll['train'][0]['tokens'], is_split_into_words=True)
# Show the resulting sub-tokens, including the special tokens added by the tokenizer.
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

Die Named-Entity-Labels müssen an die Subtokens angepasst werden: Nur das erste Subtoken eines Tokens erhält das Label des Tokens; alle weiteren Subtokens sowie Spezialtokens wie `[CLS]` und `[SEP]` erhalten das Label `-100` und werden von der Loss-Funktion ignoriert ([https://huggingface.co/docs/transformers/tasks/token_classification](https://huggingface.co/docs/transformers/tasks/token_classification)).

In [28]:
def tokenize_and_align_labels(sequence_batch):
    """Tokenize a batch of pre-tokenized sequences and align NER labels to sub-tokens.

    Args:
        sequence_batch: batch with the keys 'tokens' (one list of words per
            sequence) and 'ner_tags' (one list of integer labels per sequence,
            one label per word).

    Returns:
        The tokenizer output for the batch, extended by a 'labels' entry that
        holds one label list per sequence, with one entry per sub-token.

    Label assignment per sub-token:
        * special tokens (e.g. [CLS], [SEP]) have no word id and get -100,
          so the loss function ignores them;
        * the first sub-token of a word gets the word's NER label;
        * every further sub-token of the same word gets -100.

    Note: word ids are positions in the word list (they are used to index the
    label list), not the token strings. Two identical consecutive words
    therefore have different word ids and each of them receives its own label.
    """
    # Long sequences are truncated to respect the maximum input length of the
    # language model (usually 512).
    encoded_batch = tokenizer(sequence_batch['tokens'], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for seq_pos, word_labels in enumerate(sequence_batch['ner_tags']):
        # word id of every sub-token of this sequence (None for special tokens)
        subtoken_word_ids = list(encoded_batch.word_ids(batch_index=seq_pos))
        # the same ids shifted right by one: the word id of the preceding sub-token
        preceding_word_ids = [None] + subtoken_word_ids
        aligned_labels.append([
            -100 if current is None or current == preceding else word_labels[current]
            for current, preceding in zip(subtoken_word_ids, preceding_word_ids)
        ])

    # attach the aligned labels to the tokenized batch
    encoded_batch['labels'] = aligned_labels
    return encoded_batch

In [29]:
# Apply the tokenization/label alignment to the whole training split; with
# batched=True the function receives a batch of sequences per call.
# NOTE(review): `a` is not referenced by any other cell visible here — confirm
# whether this result is still needed.
a=conll['train'].map(tokenize_and_align_labels, batched=True)

Loading cached processed dataset at C:/Users/Timo/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-1180f3410eff14d3.arrow


# Model and training definition
Load auto model for token classification. This adds a classification layer / head to the language model (usually a simple dense layer and dropout).

In [30]:
# Derive the label inventory from the dataset instead of hard-coding the number
# of classes, so the classification head always matches the 'ner_tags' feature.
ner_label_names = conll['train'].features['ner_tags'].feature.names
id2label = dict(enumerate(ner_label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Load the pre-trained language model with a token-classification head on top.
# Passing id2label/label2id stores the tag names in the model config, so that
# predictions can be reported as tag names instead of bare integers.
model = TFAutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(ner_label_names),
    id2label=id2label,
    label2id=label2id,
)

Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForTokenClassification: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier', 'dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Specify training parameters and optimizer.

In [31]:
# Training hyper-parameters.
batch_size = 1
num_train_epochs = 1
# Total number of optimizer steps: full batches per epoch (floor division) times epochs.
num_train_steps = (len(conll['train']) // batch_size) * num_train_epochs
# The result of create_optimizer is unpacked into the optimizer and its
# learning-rate schedule; no warm-up steps are used.
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

# Freeze the pre-trained 'distilbert' layer so that only the classification head
# is trained (the summary below reports 6,921 trainable parameters).
model.get_layer('distilbert').trainable=False

# No loss is passed to compile(), so the model's internal loss computation is used.
# NOTE(review): the 'acc' metric may also count positions labelled -100 (special
# tokens and non-first sub-tokens), which would distort the reported accuracy — confirm.
model.compile(optimizer=optimizer, metrics='acc')
model.summary()

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Model: "tf_distil_bert_for_token_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 65190912  
 nLayer)                                                         
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  6921      
                                                                 
Total params: 65,197,833
Trainable params: 6,921
Non-trainable params: 65,190,912
_________________________________________________________________


Init collator to build batches, pad sequences in a batch etc.

In [32]:
# The collator builds the batches: it pads the token sequences and their labels
# of a batch and returns TensorFlow tensors.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

# Both datasets use the shared `batch_size` setting instead of a hard-coded 1, so
# the batches stay consistent with `num_train_steps`, which is derived from the
# same variable and drives the learning-rate schedule.
tf_train_set = model.prepare_tf_dataset(
    conll['train'].map(tokenize_and_align_labels, batched=True),
    shuffle=True,  # reshuffle the training examples
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    conll['validation'].map(tokenize_and_align_labels, batched=True),
    shuffle=False,  # keep the validation order fixed
    batch_size=batch_size,
    collate_fn=data_collator,
)

Loading cached processed dataset at C:/Users/Timo/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-1180f3410eff14d3.arrow
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  tensor = as_tensor(value)
Loading cached processed dataset at C:/Users/Timo/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-698bbe1a45034a69.arrow


In [38]:
# Peek at a single training batch: example[0] is the dict of model inputs,
# example[1] the tensor of aligned labels.
for example in tf_train_set:
    # keep the batch contents as NumPy arrays for manual inspection
    ainput_ids = example[0]['input_ids'].numpy()
    aattention_mask = example[0]['attention_mask'].numpy()
    aattention_labels = example[1].numpy()
    # show the label tensor (-100 marks positions that carry no label)
    print(example[1])
    # one batch is enough, stop iterating
    break

tf.Tensor(
[[-100    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    7    8    0    0    0
     3    4    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 -100]], shape=(1, 47), dtype=int64)


# Train model

In [33]:
import os
# Tell XLA where the local CUDA toolkit is installed.
# NOTE(review): this is an absolute, machine-specific Windows path — adjust it per
# machine. It is also set after the TensorFlow-based model has been created;
# confirm that the flag still takes effect at this point.
os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8"
# Train on the training set for num_train_epochs epochs, with the validation set
# passed as validation data.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_train_epochs)

  526/14041 [>.............................] - ETA: 7:32 - loss: 1.9068 - acc: 0.3550


KeyboardInterrupt

