In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 27.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 57.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [3]:
from typing import Dict, List, Optional
import os
import json
import pickle
import random
import time
import warnings

from filelock import FileLock

import torch
from torch.utils.data.dataset import Dataset

from tokenizers import BertWordPieceTokenizer

from transformers import BertConfig, BertForPreTraining, BertTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import BertForMaskedLM, pipeline
from transformers.utils import logging
from transformers.tokenization_utils import PreTrainedTokenizer

In [4]:
!mkdir my_data

In [5]:
# 전체적인 동작 확인을 위한 작은 데이터셋

!curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" > /dev/null
!curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" -o my_data/wiki_20190620_small.txt

file="./my_data/wiki_20190620_small.txt"

awk: cannot open ./cookie (No such file or directory)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1323k  100 1323k    0     0  2575k      0 --:--:-- --:--:-- --:--:--  270M


In [6]:
# 큰 데이터셋

# !curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1_F5fziHjUM-jKr5Pwcx1we6g_J2o70kZ" > /dev/null
# !curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=1_F5fziHjUM-jKr5Pwcx1we6g_J2o70kZ" -o my_data/wiki_20190620.txt

# file="./my_data/wiki_20190620.txt"

## Tokenizer

In [7]:
!mkdir wordPieceTokenizer

In [8]:
# Initialize an empty tokenizer
wp_tokenizer = BertWordPieceTokenizer(
    clean_text=True,   # " ", "\t", "\n", "\r" 등의 공백 문자는 Token으로 하지 않고 제거. ["좋은"," ","예제"] -> ["좋은","예제"]
    handle_chinese_chars=True,  # 한자는 모두 char 단위로 분할
    strip_accents=False,    # True: [YehHamza] -> [Yep, Hamza]
    lowercase=False,    # Hello -> hello
)

wp_tokenizer.train(
    files=file,
    vocab_size=20000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    wordpieces_prefix="##"
)

wp_tokenizer.save_model("wordPieceTokenizer", "my_tokenizer")

['wordPieceTokenizer/my_tokenizer-vocab.txt']

In [9]:
print(wp_tokenizer.get_vocab_size())

20000


In [10]:
model_max_input_len = 512

In [11]:
tokenizer = BertTokenizerFast(
    vocab_file='./wordPieceTokenizer/my_tokenizer-vocab.txt',
    max_len=model_max_input_len,
    do_lower_case=False,
    )

In [12]:
print(tokenizer.tokenize("뷁은 [MASK] 조선 중기의 무신이다."))

['[UNK]', '[', 'M', '##AS', '##K', ']', '조선', '중', '##기의', '무신', '##이다', '.']


In [13]:
tokenizer.add_special_tokens({'mask_token':'[MASK]'})
print(tokenizer.tokenize("이순신은 [MASK] 중기의 무신이다."))

['이', '##순', '##신은', '[MASK]', '중', '##기의', '무신', '##이다', '.']


## Preprocess

In [14]:
logger = logging.get_logger(__name__)

In [15]:
class TextDatasetForNextSentencePrediction(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
        short_seq_probability=0.1,
        nsp_probability=0.5,
    ):
        # caching
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"

        self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True)
        self.short_seq_probability = short_seq_probability
        self.nsp_probability = nsp_probability

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory,
            "cached_nsp_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )

        self.tokenizer = tokenizer

        lock_path = cached_features_file + ".lock"

        # Input file format:
        # (1) One sentence per line. These should ideally be actual sentences, not
        # entire paragraphs or arbitrary spans of text. (Because we use the
        # sentence boundaries for the "next sentence prediction" task).
        # (2) Blank lines between documents. Document boundaries are needed so
        # that the "next sentence prediction" task doesn't span between documents.
        #
        # Example:
        # I am very happy.
        # Here is the second sentence.
        #
        # A new document.

        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            else: # 캐시가 없는 경우
                logger.info(f"Creating features from dataset file at {directory}")

                self.documents = [[]] # document 단위로 학습이 이뤄짐
                with open(file_path, encoding="utf-8") as f:
                    while True: 
                        line = f.readline() # 한줄씩 개행된 문장 
                        if not line:
                            break
                        line = line.strip()

                        # 이중 개행일 시, documents에 새로 document 추가
                        if not line and len(self.documents[-1]) != 0:
                            self.documents.append([])

                        # line 별로 document에 추가
                        tokens = tokenizer.tokenize(line)
                        tokens = tokenizer.convert_tokens_to_ids(tokens)
                        if tokens:
                            self.documents[-1].append(tokens)

                logger.info(f"Creating examples from {len(self.documents)} documents.")
                self.examples = []

                for doc_index, document in enumerate(self.documents):
                    self.create_examples_from_document(document, doc_index) 

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def create_examples_from_document(self, document: List[List[int]], doc_index: int):
        """Creates examples for a single document."""
        
        # 총 입력 길이를 block_size로 지정했지만, 
        # Tokenizing 과정에서 [CLS], 입력1 tokens,[SEP] ,입력2 tokens,[SEP]형태로 들어가므로 3만큼 빼줘야함.
        max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True)

        # We *usually* want to fill up the entire sequence since we are padding
        # to `block_size` anyways, so short sequences are generally wasted
        # computation. However, we *sometimes*
        # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
        # sequences to minimize the mismatch between pretraining and fine-tuning.
        # The `target_seq_length` is just a rough target however, whereas
        # `block_size` is a hard limit.

        # 기본적으로 입력 데이터는 max_num_tokens를 꽉 채우는 형식으로 만들어짐
        # 하지만 실제 입력 데이터는 max_len보다 짧은 데이터가 들어올 수 있음
        # positioin embedding 등도 고려한다면 이런 부분에서 짧은 길이의 데이터도 넣어주는 것이 학습에 좋음
        # 그래서 short_seq_probability 만큼의 데이터에서는 2 ~최대길이 사이의 random 값으로 짧은 길이의 데이터도 생성
        target_seq_length = max_num_tokens
        if random.random() < self.short_seq_probability:
            target_seq_length = random.randint(2, max_num_tokens)

        current_chunk = []  # a buffer stored current working segments
        current_length = 0
        i = 0

        # document 단위로 데이터 생성
        # 위에서 정한 target_seq_length을 꽉 채울 수 있게 문장_1+문장_2[SEP]문장_3+문장_4 형태로 생성
        while i < len(document):
            segment = document[i]
            current_chunk.append(segment)
            current_length += len(segment)
            if i == len(document) - 1 or current_length >= target_seq_length:
                if current_chunk:
                    # `a_end` is how many segments from `current_chunk` go into the `A`
                    # (first) sentence.
                    a_end = 1

                    if len(current_chunk) >= 2:
                        a_end = random.randint(1, len(current_chunk) - 1)
                    tokens_a = []
                    for j in range(a_end):
                        tokens_a.extend(current_chunk[j])

                    tokens_b = []
                    # 50%의 확률로 다음 문장을 이어서 넣거나, 다른 문서의 내용을 넣음 
                    if len(current_chunk) == 1 or random.random() < self.nsp_probability:
                        is_random_next = True
                        target_b_length = target_seq_length - len(tokens_a)

                        # This should rarely go for more than one iteration for large
                        # corpora. However, just to be careful, we try to make sure that
                        # the random document is not the same as the document
                        # we're processing.
                        for _ in range(10):
                            random_document_index = random.randint(0, len(self.documents) - 1)
                            if random_document_index != doc_index:
                                break

                        random_document = self.documents[random_document_index]
                        random_start = random.randint(0, len(random_document) - 1)
                        for j in range(random_start, len(random_document)):
                            tokens_b.extend(random_document[j])
                            if len(tokens_b) >= target_b_length:
                                break
                        # We didn't actually use these segments so we "put them back" so
                        # they don't go to waste.
                        num_unused_segments = len(current_chunk) - a_end
                        i -= num_unused_segments
                    # Actual next
                    else:
                        is_random_next = False
                        for j in range(a_end, len(current_chunk)):
                            tokens_b.extend(current_chunk[j])

                    def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
                        """Truncates a pair of sequences to a maximum sequence length."""
                        while True:
                            total_length = len(tokens_a) + len(tokens_b)
                            if total_length <= max_num_tokens:
                                break
                            trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
                            assert len(trunc_tokens) >= 1
                            # We want to sometimes truncate from the front and sometimes from the
                            # back to add more randomness and avoid biases.
                            if random.random() < 0.5:
                                del trunc_tokens[0]
                            else:
                                trunc_tokens.pop()

                    truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

                    assert len(tokens_a) >= 1
                    assert len(tokens_b) >= 1

                    # add special tokens
                    input_ids = self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
                    input_ids += [tokenizer.pad_token_id] * (self.block_size - len(input_ids))
                    # add token type ids, 0 for sentence a, 1 for sentence b
                    token_type_ids = self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)
                    token_type_ids += [tokenizer.pad_token_id] * (self.block_size - len(token_type_ids))

                    attention_mask = ([1] * len(input_ids)) + ([0] * (self.block_size - len(input_ids)))
                    
                    example = {
                        "input_ids": torch.tensor(input_ids, dtype=torch.long),
                        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
                        "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
                        "next_sentence_label": torch.tensor(1 if is_random_next else 0, dtype=torch.long),
                    }

                    self.examples.append(example)

                current_chunk = []
                current_length = 0

            i += 1

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

In [16]:
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path='/content/my_data/wiki_20190620_small.txt',
    block_size=model_max_input_len,
    overwrite_cache=False,
    short_seq_probability=0.1,
    nsp_probability=0.5,
)

data_collator = DataCollatorForLanguageModeling(    # MLM의 masking 작업을 해주는 기능
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [17]:
batch_size = 8

train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=True, collate_fn=data_collator)

In [18]:
for batch in train_dataloader:
  break

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [19]:
dataset.examples[0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label'])

In [20]:
# 기존 input_ids에 masking 처리가 되고, labels가 생긴 것을 확인할 수 있음

data_collator(dataset.examples).keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [21]:
tokenizer.decode(dataset.examples[1]['input_ids'])

'[CLS] 퇴임 이후 민간 자원을 적극 활용한 비영리 기구인 카터 재단을 설립한 뒤 민주주의 실현을 위해 제 3세계의 선거 감시 활동 및 기니 벌레에 의한 드라쿤쿠르스 질병 방재를 위해 힘썼다. 미국의 빈곤층 지원 활동, 사랑의 집짓기 운동, 국제 분쟁 중재 등의 활동도 했다. 카터는 카터 행정부 이후 미국이 북핵 위기, 코소보 전쟁, 이라크 전쟁과 같이 미국이 군사적 행동을 최후로 선택하는 전통적 사고를 버리고 군사적 행동을 선행하는 행위에 대해 깊은 유감을 표시 하며 미국의 군사적 활동에 강한 반대 입장을 보이고 있다. 특히 국제 분쟁 조정을 위해 북한의 김일성, 아이티의 세드라스 장군, 팔레인스타인의 하마스, 보스니아의 세르비아계 정권 같이 미국 정부에 대해 협상을 거부하면서 사태의 위기를 초래한 인물 및 단체를 직접 만나 분쟁의 원인을 근본적으로 해결하기 위해 힘썼다. 이 과정에서 미국 행정부와 갈등을 보이기도 했지만, 전직 대통령의 권한과 재야 유명 인사들의 활약으로 해결해 나갔다. 1978년에 채결된 캠프데이비드 협정의 이행이 지지부진 하자 중동 분쟁 분제를 해결하기 위해 1993년 퇴임 후 직접 이스라엘과 팔레인스타인의 오슬로 협정을 이끌어 내는 데도 성공했다. [SEP] 1993년 1차 북핵 위기 당시 북한에 대한 미국의 군사적 행동이 임박했으나, 미국 전직 대통령으로는 처음으로 북한을 방문하고 미국과 북 양국의 중재에 큰 기여를 해 위기를 해결했다는 평가를 받았다. 또한 이 때 김영삼 대통령과 김일성 주석의 만남을 주선했다. 하지만 그로부터 수주일 후 김일성이 갑자기 사망하여 김일성과 김영삼의 정상회담은 이루어지지 못했다. 미국의 관타나모 수용소 문제, 세계의 인권문제에서도 관심이 깊어 유엔에 유엔인권고등판무관의 제도를 시행하도록 노력하여 독재자들의 인권 유린에 대해 제약을 하고, 국제형사재판소를 만드는 데 기여하여 독재자들 같은 인권유린범죄자를 재판소로 회부하여 국제적인 처벌을 받게 하는 등 인권 신장에 크나 큰 기여를 했다. 2011년 4

In [22]:
tokenizer.decode(data_collator(dataset.examples)['input_ids'][0])

'[CLS] 제임스 얼 " 지미 " 카터 주니어는 민주당 [MASK] 미국 39번째 대통령 이다. 지미 카터는 조지아주 섬터 카운티 플레인스 [MASK]서 태어났다. 조지아 [MASK]를 졸업하였다. [MASK] 후 해군 [MASK] 들어가 전함 · 원자력 · 잠수함의 승무원으로 일하였다 [MASK] 1953년 미국 해군 대위로 예편 [MASK] 이후 땅콩 · 면화 [MASK] [UNK] [MASK] 돈을 벌었다. 그의 별 [MASK] " 땅콩 농부 " [MASK] 알려졌다. 1962년 조지아 주 상원 [MASK] 선거에서 낙선하나 그 선거가 부정선거 였 [MASK] 입증 [MASK] 되어 당선되고, 1966년 조지아 주 지사 선거에 낙선하지만 1970년 조지아 주 지사를 역임했다. 대통령이 되기 전화시켜 [MASK] [MASK] 두번 연임했으며, 1971년부터 [MASK] 조지아 [MASK] [MASK] [MASK]했다. 조지아 주지사로 지내면서, 미국에 [MASK] 흑인 등용법을 [MASK]. [MASK] 대통령 [MASK] 민주당 후보로 [MASK] 도덕 [MASK] 정책으로 내세워, 포드를 누르고 당선되었다 [MASK] 카터 대통령은 [MASK] 영어의 촉구했으나 공화당 [MASK] 반대로 무산되었다. 협회 이집트와 [MASK]을 조정하여, 캠프 데이 [MASK]드에서 안와르 사다트 대통령과 [UNK] 베긴 수상과 함께 중동 평화를 위한 캠프데이비드 협정을 [MASK]했다. [SEP] [MASK] 1979년 백악관에서 [MASK] 간의 평화조약으로 이끌어졌다. 또한 소련과 제2차 전략 무기 제한 협상에 조인했다. 카터는 1970년대 후반 [MASK] 대한민국 등 인권 후진국의 국민들의 인권을 지키기 위해 노력했으며, 취임 이후 계속해서 도덕정치 [MASK] 내세웠다. 그러나 주 이란 [MASK] 대사관 인질 사건에서 인질 구출 실패를 이유로 1980년 대통령 선거에서 공화당의 [MASK]널드 레이건 후보에게 져 결국 재선에 실패했다 [MASK] [MAS

In [24]:
print(dataset.examples[1]['input_ids'])

tensor([    2,  4277,  1935,  4097,  5825,  4400, 10178,  7946,  8030,  1040,
         5506,  2823,  1096,  7068,   307,  4065,  5463,  1096,  1964,   733,
         7193,  2806,  2328,  7219,  2195,   467,   182,  1252,   484,  1197,
         1014,  2931, 13064,  1820,  1369,  6925,  8301,  9210,  1071,  1964,
        16703,    17,  2506,   519,  1702,  1418,  2291,  2195,    15, 16210,
        19161,  1944,    15,  2363,  6168, 11118,  2029,  2195,  1082,  2032,
           17, 10314,  5506,  5237,  1935,  8589,  9244,  8229,    15, 19329,
         1041,  2323,    15,  7859, 15090,  1991,  8589,  7812,  6507, 13948,
         1036, 15024,  3001,  1008,  4233,  1071, 17970,  7812,  6507,  7345,
         1898, 19585,  1949,  4216, 18792,  2936,  2617,  2506,  7812,  5637,
         3135,  2703,  4750,  6688,  1889,    17,  2244,  2363,  6168, 13833,
         1964,  7836, 16394,    15,  2545,  1031,  1007,   556, 11464,  1034,
          721,  1358,    15, 19446, 15382,   936,  3367,    15, 

In [23]:
print(data_collator(dataset.examples))

{'input_ids': tensor([[    2,  4356,   638,  ...,  2433,     4,     3],
        [    2,  4277,  1935,  ...,     0,     0,     0],
        [    2,  2001,   699,  ...,   280,  5859,     3],
        ...,
        [    2, 14532,  1932,  ...,     4,    17,     3],
        [    2,    44,  3160,  ...,     0,     0,     0],
        [    2, 16182,  2635,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'next_sentence_label': tensor([0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,

## Train

In [20]:
config = BertConfig(    # https://huggingface.co/transformers/model_doc/bert.html#bertconfig
    vocab_size=tokenizer.vocab_size, 
    # hidden_size=512,
    # num_hidden_layers=12,    # layer num
    # num_attention_heads=8,    # transformer attention head number
    # intermediate_size=3072,   # transformer 내에 있는 feed-forward network의 dimension size
    # hidden_act="gelu",
    # hidden_dropout_prob=0.1,
    # attention_probs_dropout_prob=0.1,
    max_position_embeddings=model_max_input_len,    # 해당 모델에서 사용할 수 있는 최대 입력 길이
    # type_vocab_size=2,    # token type ids의 값 수 (BERT는 segmentA(0), segmentB(1))
    # pad_token_id=0,
    # position_embedding_type="absolute"
)

model = BertForPreTraining(config=config)
model.num_parameters()

102015010

In [27]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

epochs = 100

learning_rate = 1e-5
weight_decay = 1e-2
early_stopping_patience = 10

save_name = 'bert_pretraining'

In [28]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [23]:
model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(20000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [29]:
# class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'

In [30]:
import gc
from tqdm import tqdm

In [31]:
def train_step(batch, epoch, training):
    batch = {key: value.to(device) for key, value in batch.items()}

    if training is True:
        model.train()
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():

            loss = model(**batch)[0]

        loss.backward()
        optimizer.step()

        lr = optimizer.param_groups[0]["lr"]

        return loss, round(lr, 10)

    else:
        model.eval()
        with torch.no_grad():
            loss = model(**batch)[0]

        return loss

In [33]:
%%time
# train

loss_plot = []
lrs = []

check_list = []

best_loss = 100

best_epoch = 0
patience = 0

for epoch in range(epochs):
    gc.collect()
    total_loss, total_val_loss = 0, 0
    
    tqdm_dataset = tqdm(enumerate(train_dataloader), total=train_dataloader.__len__())
    training = True
    for batch_idx, batch in tqdm_dataset:
        batch_loss, lr = train_step(batch, epoch, training)
        total_loss += batch_loss
        
        tqdm_dataset.set_postfix({
            '%+10s' % 'Epoch': epoch + 1,
            '%10s' % GREEN + 'Loss' : '{:.4f}'.format(total_loss/(batch_idx+1)) + END,
            '%5s' % 'LR' : lr,
        })
            
    loss_plot.append(total_loss/(batch_idx+1))
    
    cur_loss = round(float((total_loss/(batch_idx+1)).detach().cpu()), 3)

    if cur_loss < best_loss:
        print(YELLOW + 'Best_loss is updated from {:>5} to {:>5} on epoch {}'.format(best_loss, cur_loss, epoch+1) + END)
        best_loss = cur_loss
        best_epoch = epoch+1
        torch.save(model.state_dict(), './'+save_name+'.ckpt')
        patience = 0
    else:
        patience += 1
    
    lrs.append(lr)
    
    if patience == early_stopping_patience:
        break

100%|██████████| 106/106 [00:58<00:00,  1.83it/s,      Epoch=1,      [92mLoss=9.6608[0m,    LR=1e-5]


[93mBest_loss is updated from   100 to 9.661 on epoch 1[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=2,      [92mLoss=9.4877[0m,    LR=1e-5]


[93mBest_loss is updated from 9.661 to 9.488 on epoch 2[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=3,      [92mLoss=9.3538[0m,    LR=1e-5]


[93mBest_loss is updated from 9.488 to 9.354 on epoch 3[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=4,      [92mLoss=9.2576[0m,    LR=1e-5]


[93mBest_loss is updated from 9.354 to 9.258 on epoch 4[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=5,      [92mLoss=9.1823[0m,    LR=1e-5]


[93mBest_loss is updated from 9.258 to 9.182 on epoch 5[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=6,      [92mLoss=9.1337[0m,    LR=1e-5]


[93mBest_loss is updated from 9.182 to 9.134 on epoch 6[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=7,      [92mLoss=9.1070[0m,    LR=1e-5]


[93mBest_loss is updated from 9.134 to 9.107 on epoch 7[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=8,      [92mLoss=9.0561[0m,    LR=1e-5]


[93mBest_loss is updated from 9.107 to 9.056 on epoch 8[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=9,      [92mLoss=9.0275[0m,    LR=1e-5]


[93mBest_loss is updated from 9.056 to 9.028 on epoch 9[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=10,      [92mLoss=9.0019[0m,    LR=1e-5]


[93mBest_loss is updated from 9.028 to 9.002 on epoch 10[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=11,      [92mLoss=8.9753[0m,    LR=1e-5]


[93mBest_loss is updated from 9.002 to 8.975 on epoch 11[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=12,      [92mLoss=8.9339[0m,    LR=1e-5]


[93mBest_loss is updated from 8.975 to 8.934 on epoch 12[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=13,      [92mLoss=8.9024[0m,    LR=1e-5]


[93mBest_loss is updated from 8.934 to 8.902 on epoch 13[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=14,      [92mLoss=8.8953[0m,    LR=1e-5]


[93mBest_loss is updated from 8.902 to 8.895 on epoch 14[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=15,      [92mLoss=8.8450[0m,    LR=1e-5]


[93mBest_loss is updated from 8.895 to 8.845 on epoch 15[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=16,      [92mLoss=8.7954[0m,    LR=1e-5]


[93mBest_loss is updated from 8.845 to 8.795 on epoch 16[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=17,      [92mLoss=8.7489[0m,    LR=1e-5]


[93mBest_loss is updated from 8.795 to 8.749 on epoch 17[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=18,      [92mLoss=8.6981[0m,    LR=1e-5]


[93mBest_loss is updated from 8.749 to 8.698 on epoch 18[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=19,      [92mLoss=8.6308[0m,    LR=1e-5]


[93mBest_loss is updated from 8.698 to 8.631 on epoch 19[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=20,      [92mLoss=8.5696[0m,    LR=1e-5]


[93mBest_loss is updated from 8.631 to  8.57 on epoch 20[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=21,      [92mLoss=8.5020[0m,    LR=1e-5]


[93mBest_loss is updated from  8.57 to 8.502 on epoch 21[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=22,      [92mLoss=8.4556[0m,    LR=1e-5]


[93mBest_loss is updated from 8.502 to 8.456 on epoch 22[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=23,      [92mLoss=8.3805[0m,    LR=1e-5]


[93mBest_loss is updated from 8.456 to  8.38 on epoch 23[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=24,      [92mLoss=8.3606[0m,    LR=1e-5]


[93mBest_loss is updated from  8.38 to 8.361 on epoch 24[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=25,      [92mLoss=8.3480[0m,    LR=1e-5]


[93mBest_loss is updated from 8.361 to 8.348 on epoch 25[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=26,      [92mLoss=8.3238[0m,    LR=1e-5]


[93mBest_loss is updated from 8.348 to 8.324 on epoch 26[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=27,      [92mLoss=8.2654[0m,    LR=1e-5]


[93mBest_loss is updated from 8.324 to 8.265 on epoch 27[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=28,      [92mLoss=8.2303[0m,    LR=1e-5]


[93mBest_loss is updated from 8.265 to  8.23 on epoch 28[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=29,      [92mLoss=8.2468[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=30,      [92mLoss=8.2126[0m,    LR=1e-5]


[93mBest_loss is updated from  8.23 to 8.213 on epoch 30[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=31,      [92mLoss=8.2092[0m,    LR=1e-5]


[93mBest_loss is updated from 8.213 to 8.209 on epoch 31[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=32,      [92mLoss=8.1775[0m,    LR=1e-5]


[93mBest_loss is updated from 8.209 to 8.177 on epoch 32[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=33,      [92mLoss=8.1856[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=34,      [92mLoss=8.1744[0m,    LR=1e-5]


[93mBest_loss is updated from 8.177 to 8.174 on epoch 34[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=35,      [92mLoss=8.1316[0m,    LR=1e-5]


[93mBest_loss is updated from 8.174 to 8.132 on epoch 35[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=36,      [92mLoss=8.0649[0m,    LR=1e-5]


[93mBest_loss is updated from 8.132 to 8.065 on epoch 36[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=37,      [92mLoss=8.1114[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=38,      [92mLoss=8.0633[0m,    LR=1e-5]


[93mBest_loss is updated from 8.065 to 8.063 on epoch 38[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=39,      [92mLoss=8.0942[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=40,      [92mLoss=8.0857[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=41,      [92mLoss=8.0232[0m,    LR=1e-5]


[93mBest_loss is updated from 8.063 to 8.023 on epoch 41[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=42,      [92mLoss=8.0505[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=43,      [92mLoss=8.0386[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=44,      [92mLoss=8.0829[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=45,      [92mLoss=8.0383[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=46,      [92mLoss=7.9928[0m,    LR=1e-5]


[93mBest_loss is updated from 8.023 to 7.993 on epoch 46[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=47,      [92mLoss=7.9987[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=48,      [92mLoss=7.9382[0m,    LR=1e-5]


[93mBest_loss is updated from 7.993 to 7.938 on epoch 48[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=49,      [92mLoss=8.0105[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=50,      [92mLoss=7.9555[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=51,      [92mLoss=7.9633[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=52,      [92mLoss=7.9720[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=53,      [92mLoss=7.9694[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=54,      [92mLoss=7.9304[0m,    LR=1e-5]


[93mBest_loss is updated from 7.938 to  7.93 on epoch 54[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=55,      [92mLoss=7.9425[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=56,      [92mLoss=7.9200[0m,    LR=1e-5]


[93mBest_loss is updated from  7.93 to  7.92 on epoch 56[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=57,      [92mLoss=7.9265[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=58,      [92mLoss=7.9439[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=59,      [92mLoss=7.9456[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=60,      [92mLoss=7.9157[0m,    LR=1e-5]


[93mBest_loss is updated from  7.92 to 7.916 on epoch 60[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=61,      [92mLoss=7.9129[0m,    LR=1e-5]


[93mBest_loss is updated from 7.916 to 7.913 on epoch 61[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=62,      [92mLoss=7.8871[0m,    LR=1e-5]


[93mBest_loss is updated from 7.913 to 7.887 on epoch 62[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=63,      [92mLoss=7.8575[0m,    LR=1e-5]


[93mBest_loss is updated from 7.887 to 7.857 on epoch 63[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=64,      [92mLoss=7.8564[0m,    LR=1e-5]


[93mBest_loss is updated from 7.857 to 7.856 on epoch 64[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=65,      [92mLoss=7.9109[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=66,      [92mLoss=7.8537[0m,    LR=1e-5]


[93mBest_loss is updated from 7.856 to 7.854 on epoch 66[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=67,      [92mLoss=7.8600[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=68,      [92mLoss=7.8321[0m,    LR=1e-5]


[93mBest_loss is updated from 7.854 to 7.832 on epoch 68[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=69,      [92mLoss=7.8307[0m,    LR=1e-5]


[93mBest_loss is updated from 7.832 to 7.831 on epoch 69[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=70,      [92mLoss=7.8546[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=71,      [92mLoss=7.8203[0m,    LR=1e-5]


[93mBest_loss is updated from 7.831 to  7.82 on epoch 71[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=72,      [92mLoss=7.8452[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=73,      [92mLoss=7.7866[0m,    LR=1e-5]


[93mBest_loss is updated from  7.82 to 7.787 on epoch 73[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=74,      [92mLoss=7.7827[0m,    LR=1e-5]


[93mBest_loss is updated from 7.787 to 7.783 on epoch 74[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=75,      [92mLoss=7.7835[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=76,      [92mLoss=7.7690[0m,    LR=1e-5]


[93mBest_loss is updated from 7.783 to 7.769 on epoch 76[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=77,      [92mLoss=7.7777[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=78,      [92mLoss=7.7706[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=79,      [92mLoss=7.7822[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=80,      [92mLoss=7.7687[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=81,      [92mLoss=7.7536[0m,    LR=1e-5]


[93mBest_loss is updated from 7.769 to 7.754 on epoch 81[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=82,      [92mLoss=7.7232[0m,    LR=1e-5]


[93mBest_loss is updated from 7.754 to 7.723 on epoch 82[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=83,      [92mLoss=7.7174[0m,    LR=1e-5]


[93mBest_loss is updated from 7.723 to 7.717 on epoch 83[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=84,      [92mLoss=7.7569[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=85,      [92mLoss=7.7289[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=86,      [92mLoss=7.7006[0m,    LR=1e-5]


[93mBest_loss is updated from 7.717 to 7.701 on epoch 86[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=87,      [92mLoss=7.7136[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=88,      [92mLoss=7.6902[0m,    LR=1e-5]


[93mBest_loss is updated from 7.701 to  7.69 on epoch 88[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=89,      [92mLoss=7.7058[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=90,      [92mLoss=7.6810[0m,    LR=1e-5]


[93mBest_loss is updated from  7.69 to 7.681 on epoch 90[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=91,      [92mLoss=7.6982[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=92,      [92mLoss=7.7076[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=93,      [92mLoss=7.6761[0m,    LR=1e-5]


[93mBest_loss is updated from 7.681 to 7.676 on epoch 93[0m


100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=94,      [92mLoss=7.6593[0m,    LR=1e-5]


[93mBest_loss is updated from 7.676 to 7.659 on epoch 94[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=95,      [92mLoss=7.6110[0m,    LR=1e-5]


[93mBest_loss is updated from 7.659 to 7.611 on epoch 95[0m


100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=96,      [92mLoss=7.6331[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=97,      [92mLoss=7.6344[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.84it/s,      Epoch=98,      [92mLoss=7.6257[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=99,      [92mLoss=7.6513[0m,    LR=1e-5]
100%|██████████| 106/106 [00:57<00:00,  1.83it/s,      Epoch=100,      [92mLoss=7.6222[0m,    LR=1e-5]

CPU times: user 1h 15min 58s, sys: 21min 23s, total: 1h 37min 22s
Wall time: 1h 37min 49s





In [34]:
model.save_pretrained('./bert_model')

## Filling Mask Test

In [35]:
my_model = BertForMaskedLM.from_pretrained('./bert_model')

Some weights of the model checkpoint at ./bert_model were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
tokenizer.tokenize('이순신은 [MASK] 중기의 무신이다.')

['이', '##순', '##신은', '[MASK]', '중', '##기의', '무신', '##이다', '.']

In [37]:
nlp_fill = pipeline('fill-mask', top_k=5, model=my_model, tokenizer=tokenizer)

In [38]:
nlp_fill('이순신은 [MASK] 중기의 무신이다.')

[{'score': 0.0300902146846056,
  'token': 1034,
  'token_str': '##의',
  'sequence': '[CLS] 이순신은의 중기의 무신이다. [SEP]'},
 {'score': 0.01777038536965847,
  'token': 17,
  'token_str': '.',
  'sequence': '[CLS] 이순신은. 중기의 무신이다. [SEP]'},
 {'score': 0.015896081924438477,
  'token': 1067,
  'token_str': '##는',
  'sequence': '[CLS] 이순신은는 중기의 무신이다. [SEP]'},
 {'score': 0.006974676623940468,
  'token': 705,
  'token_str': '이',
  'sequence': '[CLS] 이순신은 이 중기의 무신이다. [SEP]'},
 {'score': 0.00679297000169754,
  'token': 15,
  'token_str': ',',
  'sequence': '[CLS] 이순신은, 중기의 무신이다. [SEP]'}]

In [39]:
nlp_fill('[MASK]는 조선 중기의 무신이다.')

[{'score': 0.024894190952181816,
  'token': 705,
  'token_str': '이',
  'sequence': '[CLS] 이 는 조선 중기의 무신이다. [SEP]'},
 {'score': 0.014039567671716213,
  'token': 17,
  'token_str': '.',
  'sequence': '[CLS]. 는 조선 중기의 무신이다. [SEP]'},
 {'score': 0.006746664177626371,
  'token': 175,
  'token_str': '그',
  'sequence': '[CLS] 그 는 조선 중기의 무신이다. [SEP]'},
 {'score': 0.006324885878711939,
  'token': 1925,
  'token_str': '또한',
  'sequence': '[CLS] 또한 는 조선 중기의 무신이다. [SEP]'},
 {'score': 0.0050242007710039616,
  'token': 1034,
  'token_str': '##의',
  'sequence': '[CLS]의 는 조선 중기의 무신이다. [SEP]'}]