In [1]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from seqeval.metrics import f1_score, classification_report
import json
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook

tf.__version__

'2.4.1'

In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [3]:
from transformers import TFBertForTokenClassification

model = TFBertForTokenClassification.from_pretrained("monologg/kocharelectra-small-discriminator", num_labels=9, from_pt=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.hf_compute_loss)

You are using a model of type electra to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForTokenClassification: ['electra.encoder.layer.1.output.dense.bias', 'electra.encoder.layer.2.attention.self.query.weight', 'electra.encoder.layer.4.attention.self.query.bias', 'electra.encoder.layer.9.attention.self.value.bias', 'electra.encoder.layer.2.output.dense.bias', 'electra.encoder.layer.9.intermediate.dense.weight', 'electra.encoder.layer.8.attention.self.value.bias', 'electra.encoder.layer.6.attention.self.query.bias', 'electra.encoder.layer.10.attention.output.LayerNorm.bias', 'electra.encoder.layer.1.attention.output.dense.bias', 'electra.encoder.layer.2.attention.output.dense.bias', 'electra.encoder.layer.10.attention.self.query.bias', 'electra.encoder.layer.10.attention.output.dense.weight', 'electra.encoder.layer.10.output.LayerNorm

In [4]:

datapath ='/Users/eunbin/Desktop/Projects/saessack-server/extra/data/kinnaver/sent_tag.json'
data = open(datapath, encoding='utf-8')
df = pd.DataFrame(json.load(data))
# df.head()
train_ner_df = df.iloc[:13344, 0:]
test_ner_df = df.iloc[13344:, 0:]
print("학습 데이터 샘플 개수 :", len(train_ner_df))
print("테스트 데이터 샘플 개수 :", len(test_ner_df))

train_data_sentence = [sent.split() for sent in train_ner_df['sentence'].values]
test_data_sentence = [sent.split() for sent in test_ner_df['sentence'].values]
train_data_label = [tag.split() for tag in train_ner_df['tagging'].values]
test_data_label = [tag.split() for tag in test_ner_df['tagging'].values]
# print(train_data_sentence[0])
# print(train_data_label[0])

ner_label_path = '/Users/eunbin/Desktop/Projects/saessack-server/extra/data/tag/ner_label.txt'
labels = [label.strip() for label in open(ner_label_path, 'r', encoding='utf-8')]
print('개체명 태깅 정보 :', labels)

tag_to_index = {tag: index for index, tag in enumerate(labels)}
index_to_tag = {index: tag for index, tag in enumerate(labels)}
tag_size = len(tag_to_index)
print("개체명 태깅 정보의 개수 : ", tag_size)

학습 데이터 샘플 개수 : 13344
테스트 데이터 샘플 개수 : 3336
개체명 태깅 정보 : ['O', 'PLT-B', 'PLT-I', 'BUG-B', 'BUG-I', 'DIS-B', 'DIS-I', 'CTG-B', 'CTG-I']
개체명 태깅 정보의 개수 :  9


In [5]:
from tokenization_kocharelectra import KoCharElectraTokenizer

tokenizer = KoCharElectraTokenizer.from_pretrained("monologg/kocharelectra-base-discriminator")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'KoCharElectraTokenizer'.


# Tokenizing

In [6]:
tokenizer.get_vocab

<bound method KoCharElectraTokenizer.get_vocab of PreTrainedTokenizer(name_or_path='monologg/kocharelectra-base-discriminator', vocab_size=11568, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})>

# Model Input

In [7]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer,
                                 pad_token_id_for_segment=0, pad_token_id_for_label=-100):
    cls_token = '[CLS]'
    sep_token = '[SEP]'
    pad_token_id = 0

    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        tokens = []
        labels_ids = []
        for one_word, label_token in zip(example, label):
            tokens.append(one_word)
            labels_ids.append(tag_to_index[label_token])

        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            labels_ids = labels_ids[:(max_seq_len - special_tokens_count)]

        tokens += [sep_token]
        labels_ids += [pad_token_id_for_label]
        tokens = [cls_token] + tokens
        labels_ids = [pad_token_id_for_label] + labels_ids

        input_id = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        padding_count = max_seq_len - len(input_id)

        input_id = input_id + ([pad_token_id] * padding_count)
        attention_mask = attention_mask + ([0] * padding_count)
        token_type_id = [pad_token_id_for_segment] * max_seq_len
        label = labels_ids + ([pad_token_id_for_label] * padding_count)

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)
        assert len(label) == max_seq_len, "Error with labels length {} vs {}".format(len(label), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels


In [8]:
X_train, y_train = convert_examples_to_features(train_data_sentence, train_data_label, max_seq_len=512, tokenizer=tokenizer)
X_test, y_test = convert_examples_to_features(test_data_sentence, test_data_label, max_seq_len=512, tokenizer=tokenizer)

100%|██████████| 13344/13344 [00:02<00:00, 4543.65it/s]
100%|██████████| 3336/3336 [00:00<00:00, 4751.54it/s]


# Model Training

In [9]:
class F1score(tf.keras.callbacks.Callback):
    def __init__(self, X_test, y_test):
        self.X_test = X_test
        self.y_test = y_test

    def sequences_to_tags(self, label_ids, pred_ids):
      label_list = []
      pred_list = []

      for i in range(0, len(label_ids)):
        label_tag = []
        pred_tag = []

        for label_index, pred_index in zip(label_ids[i], pred_ids[i]):
          if label_index != -100:
            label_tag.append(index_to_tag[label_index])
            pred_tag.append(index_to_tag[pred_index])
        
        label_list.append(label_tag)
        pred_list.append(pred_tag)

      return label_list, pred_list

    def on_epoch_end(self, epoch, logs={}):
      y_predicted = self.model.predict(self.X_test)
      y_predicted = np.argmax(y_predicted.logits, axis = 2)

      label_list, pred_list = self.sequences_to_tags(self.y_test, y_predicted)

      score = f1_score(label_list, pred_list, suffix=True)
      print(' - f1: {:04.2f}'.format(score * 100))
      print(classification_report(label_list, pred_list, suffix=True))


In [10]:
f1_score_report = F1score(X_test, y_test)

model.compile
model.fit(
    X_train, y_train, epochs=7, batch_size=32,
    callbacks = [f1_score_report]
)

: 

: 