In [1]:
import os, re
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import RobertaTokenizer, RobertaConfig, TFRobertaForTokenClassification
from tokenizers import ByteLevelBPETokenizer
from tqdm.auto import tqdm

# Get the Data

In [2]:
data = pd.read_csv(
    "../input/entity-annotated-corpus/ner.csv", encoding = "ISO-8859-1", error_bad_lines=False, 
    usecols=['sentence_idx', 'word', 'tag']
)
data  = data[data['sentence_idx'] != 'prev-lemma'].dropna(subset=['sentence_idx']).reset_index(drop=True)
print(data.shape)
data.head()



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


(1050794, 3)


Unnamed: 0,sentence_idx,word,tag
0,1,Thousands,O
1,1,of,O
2,1,demonstrators,O
3,1,have,O
4,1,marched,O


In [3]:
data['sentence_idx'] = data['sentence_idx'].astype(int)

split_thresh = data['sentence_idx'].max() * 0.9
df_train, df_valid = data[data['sentence_idx'] < split_thresh], data[data['sentence_idx'] >= split_thresh]
len(df_train), len(df_valid)

(946852, 103942)

In [4]:
agg_func = lambda s: [ [w,t] for w,t in zip(s["word"].values.tolist(),s["tag"].values.tolist())]

x_train_grouped = df_train.groupby("sentence_idx").apply(agg_func)
x_valid_grouped = df_valid.groupby("sentence_idx").apply(agg_func)

x_train_sentences = [[s[0] for s in sent] for sent in x_train_grouped.values]
x_valid_sentences = [[s[0] for s in sent] for sent in x_valid_grouped.values]

x_train_tags = [[t[1] for t in tag] for tag in x_train_grouped.values]
x_valid_tags = [[t[1] for t in tag] for tag in x_valid_grouped.values]

In [5]:
tag_list = data['tag'].unique()
label_map = {label: i for i, label in enumerate(tag_list)}
label_map_inv = {i: label for i, label in enumerate(tag_list)}
num_labels = len(tag_list) + 1
num_labels

18

# Build Tokenizer

In [6]:
texts = [' '.join(sent) for sent in x_train_sentences]

os.mkdir('text_files')
for e, text in enumerate(texts):
    with open(f"text_files/train_{e+1:05}.txt", 'w') as f:
        f.write(re.sub(r'\s+', ' ', text))

In [7]:
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=[f"text_files/train_{e+1:05}.txt" for e in range(len(texts))], vocab_size=30_522,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>']
)
os.mkdir('tokenizer')
tokenizer.save_model('tokenizer')






['tokenizer/vocab.json', 'tokenizer/merges.txt']

# Initialise Model

In [8]:
pad_token_label_id = 0

In [9]:
tokenizer = RobertaTokenizer.from_pretrained('tokenizer')
config = RobertaConfig(
    vocab_size=30_522,
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1, 
    num_labels=num_labels,
)
model = TFRobertaForTokenClassification(config)

2022-07-13 04:13:03.175140: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-13 04:13:03.292412: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-13 04:13:03.293192: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-13 04:13:03.294967: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

# Prepare Data

In [10]:
from keras.preprocessing.sequence import pad_sequences
max_seq_length =512

def convert_to_input(sentences,tags):
    input_id_list,attention_mask_list,token_type_id_list=[],[],[]
    label_id_list=[]
    
    for x,y in tqdm(zip(sentences,tags),total=len(tags)):
        tokens = []
        label_ids = []

        for word, label in zip(x, y):
            word_tokens = tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

  
        special_tokens_count =  2
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]
            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        label_ids = [pad_token_label_id]+label_ids+[pad_token_label_id]
        inputs = tokenizer(' '.join(tokens), add_special_tokens=True, max_length=max_seq_length,
                          return_token_type_ids=True)

        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_masks = [1] * len(input_ids)

        attention_mask_list.append(attention_masks)
        input_id_list.append(input_ids)
        token_type_id_list.append(token_type_ids)

        label_id_list.append(label_ids)

    return input_id_list,token_type_id_list,attention_mask_list,label_id_list

In [11]:
(
    input_ids_train, token_ids_train, attention_masks_train, label_ids_train
) = convert_to_input(x_train_sentences,x_train_tags)
(
    input_ids_valid, token_ids_valid, attention_masks_valid, label_ids_valid
) = convert_to_input(x_valid_sentences,x_valid_tags)

post_pad_sequences = lambda x: pad_sequences(
    x, maxlen=max_seq_length,dtype="long",truncating="post",padding="post"
)

input_ids_train = post_pad_sequences(input_ids_train)
token_ids_train = post_pad_sequences(token_ids_train)
attention_masks_train = post_pad_sequences(attention_masks_train)
label_ids_train = post_pad_sequences(label_ids_train)

input_ids_valid = post_pad_sequences(input_ids_valid)
token_ids_valid = post_pad_sequences(token_ids_valid)
attention_masks_valid = post_pad_sequences(attention_masks_valid)
label_ids_valid = post_pad_sequences(label_ids_valid)

def example_to_features(input_ids,attention_masks,token_type_ids,y):
    return {
        "input_ids": input_ids, "attention_mask": attention_masks, "token_type_ids": token_type_ids
    }, y

train_data = tf.data.Dataset.from_tensor_slices(
    (input_ids_train, attention_masks_train, token_ids_train, label_ids_train)
).map(example_to_features).shuffle(1000).batch(8).repeat(5)
valid_data = tf.data.Dataset.from_tensor_slices(
    (input_ids_valid, attention_masks_valid, token_ids_valid, label_ids_valid)
).map(example_to_features).batch(1)

  0%|          | 0/30381 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/4796 [00:00<?, ?it/s]

# Train

In [12]:
model.compile(
    optimizer=tf.optimizers.Adam(2e-5), 
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
    metrics='accuracy'
)
model.fit(train_data, validation_data=valid_data, epochs=3)
model.save_pretrained('saved-model')

Epoch 1/3


2022-07-13 04:14:31.908477: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/3
Epoch 3/3


# Inference

In [13]:
query_idx = 7

def query_to_features(input_ids,attention_masks,token_type_ids):
    return {
        "input_ids": input_ids, "attention_mask": attention_masks, "token_type_ids": token_type_ids
    }

input_ids_query = input_ids_valid[query_idx:query_idx+1]
token_ids_query = token_ids_valid[query_idx:query_idx+1]
attention_masks_query = attention_masks_valid[query_idx:query_idx+1]
label_ids_query = label_ids_valid[query_idx:query_idx+1]

query_data = tf.data.Dataset.from_tensor_slices(
    (input_ids_query, attention_masks_query, token_ids_query)
).map(query_to_features).batch(1)
model = TFRobertaForTokenClassification.from_pretrained('saved-model')
logits = model.predict(query_data, verbose=1).logits
preds_proba = tf.nn.softmax(logits).numpy()
preds = preds_proba.argmax(axis=2)

Some layers from the model checkpoint at saved-model were not used when initializing TFRobertaForTokenClassification: ['dropout_19']
- This IS expected if you are initializing TFRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaForTokenClassification were initialized from the model checkpoint at saved-model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForTokenClassification for predictions without further training.




In [14]:
texts = [tokenizer.decode(token) for token in input_ids_query[0]]
tags = [label_map_inv[p] for p in preds[0]]
for token, tag in zip(texts, tags):
    print(f"{token:15} {tag}")
    if token == '</s>':
        break

<s>             O
Iraqi           B-gpe
 o              O
 f              O
ficial          O
s               O
 s              O
 a              O
y               O
 gun            O
 men            O
 h              O
 a              O
ve              O
 k              O
 ill            O
ed              O
 a              O
 member         O
 of             O
 the            O
 secular        O
 co             O
 al             B-gpe
ition           O
 led            O
 by             O
 for            B-per
 mer            O
 Iraqi          I-per
 prime          O
 m              O
 in             O
ister           O
 A              O
 y              O
ad              O
 All            O
 aw             O
i               O
.               O
</s>            O
