## Library

In [None]:
import tensorflow as tf
from datasets import load_dataset

from transformers import RobertaTokenizerFast, DataCollatorForTokenClassification

## Configuration

In [13]:
# Number of examples per batch when the tf.data pipelines are built.
BATCH_SIZE: int = 16
# Epoch count; not referenced in this part of the notebook -- presumably
# consumed by a later training cell (TODO confirm).
NUM_EPOCH: int = 2

## Dataset

In [5]:
# Load the CoNLL-2003 corpus; the result exposes the train / validation / test
# splits that the rest of the notebook indexes by name.
dataset = load_dataset(path='conll2003')

Downloading data: 100%|██████████| 983k/983k [00:00<00:00, 3.59MB/s]
Generating train split: 100%|██████████| 14041/14041 [00:02<00:00, 6093.76 examples/s]
Generating validation split: 100%|██████████| 3250/3250 [00:00<00:00, 5638.57 examples/s]
Generating test split: 100%|██████████| 3453/3453 [00:00<00:00, 6607.90 examples/s]


In [6]:
# Checkpoint whose tokenizer is used to encode the corpus.
model_id = 'roberta-base'
# add_prefix_space=True: the examples are fed as pre-split words
# (is_split_into_words=True in tokenizer_function), which the RoBERTa fast
# tokenizer only accepts when it was created with this flag.
tokenizer = RobertaTokenizerFast.from_pretrained(
    model_id,
    add_prefix_space=True,
)

In [7]:
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level labels to one label per token.

  Args:
    labels: sequence of integer labels, one per word.
    word_ids: for every token, the index of the word it belongs to, or
      None for tokens that belong to no word (e.g. special tokens).

  Returns:
    A list with one entry per element of ``word_ids``:
      * -100 for tokens whose word id is None,
      * the word's own label for the first token of a word,
      * for the following tokens of the same word, the word's label with
        odd values bumped by one (in a tag scheme where B-X is odd and
        I-X is the next even id, this turns B-X into I-X).
  """
  aligned = []
  previous_word = None

  for word_id in word_ids:
    # Tokens without a word get the "ignore" label and reset the tracker.
    if word_id is None:
      previous_word = None
      aligned.append(-100)
      continue

    word_label = labels[word_id]

    if word_id != previous_word:
      # First token of a new word keeps the label untouched.
      previous_word = word_id
      aligned.append(word_label)
    else:
      # Continuation token: odd label ids are shifted to the next even id.
      aligned.append(word_label + 1 if word_label % 2 == 1 else word_label)

  return aligned

In [8]:
def tokenizer_function(dataset):
  """Tokenize one example and attach token-aligned NER labels.

  Args:
    dataset: a single example providing 'tokens' (the pre-split words) and
      'ner_tags' (one label per word). Despite the name this is one record,
      since the function is mapped without batching.

  Returns:
    The tokenizer output with an extra 'labels' entry produced by
    align_labels_with_tokens.
  """
  encoding = tokenizer(
      dataset['tokens'],
      is_split_into_words=True,
      truncation=True,
  )
  # word_ids() maps every produced token back to the index of its source word.
  token_word_ids = encoding.word_ids()
  encoding['labels'] = align_labels_with_tokens(dataset['ner_tags'], token_word_ids)

  return encoding

In [10]:
# Tokenize every split example by example (map is not batched here) and drop
# the raw CoNLL columns that are no longer needed afterwards.
tokenized_dataset = dataset.map(
    tokenizer_function,
    remove_columns=['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
)

Map: 100%|██████████| 14041/14041 [00:09<00:00, 1529.69 examples/s]
Map: 100%|██████████| 3250/3250 [00:02<00:00, 1503.03 examples/s]
Map: 100%|██████████| 3453/3453 [00:01<00:00, 1850.59 examples/s]


In [12]:
# Collator used by to_tf_dataset to assemble batches; it is asked to return
# TensorFlow tensors.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors='tf')

In [14]:
# Batched and shuffled tf.data pipeline over the training split.
tf_train_dataset = tokenized_dataset['train'].to_tf_dataset(
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)

In [28]:
# Write the batched training pipeline to disk under Dataset/Train.
tf.data.experimental.save(tf_train_dataset, 'Dataset/Train')

In [15]:
# Batched tf.data pipeline over the validation split.
# shuffle=False: evaluation gains nothing from a random order, and a fixed
# order keeps the snapshot saved from this dataset identical between runs.
tf_val_dataset = tokenized_dataset['validation'].to_tf_dataset(
    collate_fn = data_collator,
    shuffle = False,
    batch_size = BATCH_SIZE
)

In [29]:
# Write the batched validation pipeline to disk under Dataset/Valid.
tf.data.experimental.save(tf_val_dataset, 'Dataset/Valid')