In [1]:
import numpy as np
import tensorflow as tf
import torch as th

from tqdm import tqdm
from typing import List

# 0. Data

## 0.1 Construct data
구두점이 찍혀있는 512 이내의 텍스트를 입력으로 받아, 구두점이 없는 텍스트(sent)와 sent 내에서의 구두점이 위치해야 하는 위치 인덱스(label)의 리스트를 리턴

In [89]:
def make_data(sentences: List[str]):
    """Split punctuated texts into period-free texts plus period positions.

    Every '.' character is removed from each input text. The label recorded
    for a removed period is the index, in the period-free text, at which the
    period has to be re-inserted to recover the original.

    Args:
        sentences: Texts that may contain '.' characters.

    Returns:
        A tuple ``(sents, labels)``: ``sents[i]`` is ``sentences[i]`` without
        periods, and ``labels[i]`` is the list of insertion indices for it.
    """
    stripped_texts, period_positions = [], []
    for text in sentences:
        chars = np.array(list(text))
        dot_idx = np.flatnonzero(chars == '.')
        # The k-th period moves k places left once the k periods before it are removed.
        shifted = dot_idx - np.arange(dot_idx.size)
        period_positions.append(shifted.tolist())
        stripped_texts.append(''.join(np.delete(chars, dot_idx, axis=0)))

    return stripped_texts, period_positions

def validate_data(sentences, x, labels):
    """Check that re-inserting periods into ``x`` reproduces ``sentences``.

    Args:
        sentences: The original texts, including their '.' characters.
        x: The period-free texts.
        labels: For each text in ``x``, the indices at which a '.' must be
            inserted to rebuild the matching entry of ``sentences``.

    Raises:
        ValueError: If the three inputs do not have the same length, or if
            any rebuilt text differs from its original.
    """
    x, labels = list(x), list(labels)
    # zip() would silently stop at the shortest input and report success
    # without having checked every sentence, so reject mismatches up front.
    if not (len(sentences) == len(x) == len(labels)):
        raise ValueError(
            f"length mismatch: {len(sentences)} sentences, "
            f"{len(x)} texts, {len(labels)} labels"
        )

    for i, (sent, label) in enumerate(zip(x, labels)):
        chars = list(sent)
        # Insert right-to-left so that earlier indices are not shifted.
        for idx in label[::-1]:
            chars.insert(idx, '.')
        if ''.join(chars) != sentences[i]:
            raise ValueError(f"{i}'th label has error")

    print('done')

In [90]:
# Toy corpus: two texts whose '.' characters are the punctuation to be predicted.
sentences = ["you. cut the longer texts. off and only use the first 100 tokens. the original implementation truncates longer. sequences automatically.",
             "now in my recently published paper there is a new method proposed called text guide. text guide is a text selection method that allows for improved performance when compared to naive or semi naive truncation methods."]
# x: period-free texts; labels: per-text insertion indices of the removed periods.
x, labels = make_data(sentences)

In [91]:
# Inspect the period-free texts and the period insertion indices.
print(x)
print(labels)

['you cut the longer texts off and only use the first 100 tokens the original implementation truncates longer sequences automatically', 'now in my recently published paper there is a new method proposed called text guide text guide is a text selection method that allows for improved performance when compared to naive or semi naive truncation methods']
[[3, 24, 62, 107, 131], [83, 214]]


In [92]:
# Round-trip check: re-inserting the periods must give back the original texts.
validate_data(sentences, x, labels)

done


# 1. Tokenize

In [41]:
from transformers import BertModel, BertTokenizerFast

# Checkpoint used for both the tokenizer and the model.
# Korean alternative: pretrained_model_name = "klue/bert-base"
pretrained_model_name = "bert-base-uncased"

In [42]:
# Fast tokenizer: construct_data_input relies on its char_to_token mapping.
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)

In [130]:
# Tokenize the period-free texts without special tokens, with padding and truncation.
# NOTE(review): construct_data_input repeats this exact call internally, so this
# global looks unused by the cells shown here — confirm before removing.
train_x = tokenizer(x, add_special_tokens=False, padding=True, truncation=True)

In [162]:
def binary_label(y_data, max_len):
    """Turn per-text lists of token positions into a 0/1 label matrix.

    Args:
        y_data: One iterable of token positions per text. ``None`` entries
            (positions that could not be mapped to a token) are skipped.
        max_len: Number of columns in the result, i.e. the padded length.

    Returns:
        An ``int32`` array of shape ``(len(y_data), max_len)`` holding 1 at
        every listed position and 0 elsewhere.

    Raises:
        IndexError: If a position is outside ``[-max_len, max_len)``.
    """
    # Allocate up front so an empty y_data still yields a (0, max_len) int32 array.
    b_data = np.zeros((len(y_data), max_len), dtype=np.int32)
    for row, label in zip(b_data, y_data):
        # A None inside an index list would make numpy raise; drop it instead.
        positions = [idx for idx in label if idx is not None]
        row[positions] = 1  # row is a view, so this writes into b_data

    return b_data

def construct_data_input(tokenizer, x, labels):
    """Build model inputs and per-token binary labels.

    Each period is attached to the token covering the last non-whitespace
    character before the period's insertion index.

    Args:
        tokenizer: A tokenizer whose output supports ``char_to_token``.
        x: List of period-free texts.
        labels: For each text, the character indices where a period belongs.

    Returns:
        A tuple ``(X_data, y_data)``. ``X_data`` is ``np.array([input_ids,
        attention_mask])``; ``y_data`` is the 0/1 matrix from ``binary_label``
        with one row per text and one column per padded token position.
    """
    train_x = tokenizer(x, add_special_tokens=False, padding=True, truncation=True)

    y_data = []
    for i in tqdm(range(len(labels))):
        text = x[i]
        token_positions = []
        for char_idx in labels[i]:
            # Step back over whitespace: spaces map to no token, so the label
            # goes on the last real character before the period.
            pos = char_idx - 1
            while pos >= 0 and text[pos].isspace():
                pos -= 1
            if pos < 0:
                # Period at the very start of the text: nothing precedes it.
                continue
            token_idx = train_x.char_to_token(i, pos)
            # None means the character has no token (e.g. dropped by truncation).
            if token_idx is not None:
                token_positions.append(token_idx)
        y_data.append(token_positions)

    X_data = np.array([train_x['input_ids'], train_x['attention_mask']])
    y_data = binary_label(y_data, len(train_x['input_ids'][0]))

    return X_data, y_data

In [163]:
# X_data stacks [input_ids, attention_mask]; y_data holds the per-token 0/1 labels.
X_data, y_data = construct_data_input(tokenizer, x, labels)

100%|██████████| 2/2 [00:00<00:00, 11949.58it/s]


In [151]:
# Show the label matrix: one row per text, one column per padded token position.
y_data

array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [164]:
# Sanity check: decode the tokens flagged with 1, i.e. the ones a period should follow.
for tokens, y in zip(X_data[0], y_data):
    y = np.flatnonzero(y == 1)
    flagged_ids = [tokens[pos] for pos in y]
    print(tokenizer.decode(flagged_ids))

you textss longer automatically
guide methods


# 2. Modeling

## Tensorflow

In [122]:
from tensorflow import keras

from keras import layers, models, initializers
from transformers import TFBertModel

In [168]:
class TFBertForSentenceTokenizing(models.Model):
    """BERT encoder with a per-token sigmoid head.

    For every token position the model emits one value in (0, 1); the labels
    built in this notebook mark positions that a period should follow.
    """

    def __init__(self, pretrained_model_name):
        """Load the pretrained encoder and create the one-unit output head.

        Args:
            pretrained_model_name: Checkpoint name passed to
                ``TFBertModel.from_pretrained`` (loaded from PyTorch weights).
        """
        super().__init__()
        self.bert = TFBertModel.from_pretrained(pretrained_model_name, from_pt=True)
        # Use the public initializer path; `initializers.initializers_v2` is a
        # private module. 0.02 is used as the standard deviation (BERT's
        # initializer range) rather than as the mean of the distribution.
        # NOTE(review): the attribute name `outputs` may collide with an
        # attribute Keras models manage themselves — confirm, consider renaming.
        self.outputs = layers.Dense(
            1,
            kernel_initializer=initializers.TruncatedNormal(stddev=0.02),
            activation='sigmoid',
            name='outputs',
        )

    def call(self, inputs, **kwargs):
        """Run the encoder and score every token position.

        Args:
            inputs: Indexable pair; ``inputs[0]`` is ``input_ids`` and
                ``inputs[1]`` is ``attention_mask``.
            **kwargs: Accepted for Keras compatibility; not used here.

        Returns:
            Sigmoid probabilities of shape ``(bs, seq_len, 1)``.
        """
        input_ids = inputs[0]
        attention_mask = inputs[1]
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_output = bert_out[0]        # (bs, seq_len, h_dim)
        # The head applies a sigmoid, so these are probabilities, not logits.
        probs = self.outputs(last_hidden_output)   # (bs, seq_len, 1)
        return probs

In [169]:
# Instantiate the tagger; the encoder weights are converted from the PyTorch checkpoint.
model = TFBertForSentenceTokenizing(pretrained_model_name)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [170]:
# Adam with a small fine-tuning learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=False: the model's output layer already applies a sigmoid.
loss = keras.losses.BinaryCrossentropy(from_logits=False)
model.compile(optimizer=optimizer, loss=loss)

In [171]:
# Pass input_ids and attention_mask as two separate inputs. X_data has shape
# (2, n_texts, seq_len); given as one array, Keras treats its first axis as the
# sample axis, which only lines up with y_data when there are exactly 2 texts.
model.fit([X_data[0], X_data[1]], y_data, epochs=1, verbose=1)



<keras.callbacks.History at 0x219f9ea57c0>