### Train BERT Tokenizer
* https://www.thepythoncode.com/article/pretraining-bert-huggingface-transformers-in-python
* https://huggingface.co/transformers/v3.2.0/training.html
* 토크나이저 옵션을 바꿔서 다시 사전을 생성하고 pre-train 시도(2022.09.20)
    * 한글의 경우 : do_lower_case=False, strip_accents=False
    * 이렇게 안하면 자소단위로 쪼개지는 걸로 알고 있어요
    * tokenizer = BertTokenizer(os.path.join(MODEL_PATH, "kopat-vocab.txt"), do_lower_case=False, strip_accents=False)

In [1]:
import os
import json
import re
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import *
from tokenizers import *
from datasets import *
from sklearn.model_selection import train_test_split
import nltk
from nltk.data import load

  from .autonotebook import tqdm as notebook_tqdm


### Tokenizer train data 생성
* 약어 이후에 등장하는 마침표를 사용해 문장이 분리되지 않도록 조치를 해야 한다.
* NLTK의 tokenizer를 사용해 문장 분리하기 위해 extra_abbreviations에 예외조건을 추가하여 준다.
    * https://cryptosalamander.tistory.com/140?category=1218889

In [2]:
sent_tokenizer = load("tokenizers/punkt/english.pickle")
extra_abbreviations = [
    'RE','re','pat', 'no', 'nos','vol','jan','feb','mar','apr','jun',
    'jul','aug','sep','oct','nov','dec','eng','ser','ind','ed','pp',
    'e.g','al','T.E.N.S', 'E.M.S','F.E','U.H.T.S.T','degree',
    '/gm','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
    'P','Q','R','S','T','U','V','W','X','Y','Z']
sent_tokenizer._params.abbrev_types.update(extra_abbreviations)

In [3]:
# NLTK의 tokenizer를 사용해 문장 분리(미사용)
# https://cryptosalamander.tistory.com/140?category=1218889
def sent_tokenize(input='./input.txt', output='./output.txt'):
    sent_tokenizer = load("tokenizers/punkt/english.pickle")
    extra_abbreviations = [
        'RE','re','pat', 'no', 'nos','vol','jan','feb','mar','apr','jun',
        'jul','aug','sep','oct','nov','dec','eng','ser','ind','ed','pp',
        'e.g','al','T.E.N.S', 'E.M.S','F.E','U.H.T.S.T','degree',
        '/gm','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
        'P','Q','R','S','T','U','V','W','X','Y','Z']
    sent_tokenizer._params.abbrev_types.update(extra_abbreviations)

    load_file=open(input,'r')
    save_file=open(output,'w')
    no_blank = False
    while True:
        line = load_file.readline()
        if line == "":
            break
        if line.strip() == "":
            if no_blank:
                continue
            save_file.write(f"{line}")
        else:
            print(line)
            result_ = tokenizer.tokenize(line)
            print(result_)
            result  = [ f"{cur_line}\n" for cur_line in result_ ]
            for save_line in result:
                save_file.write(save_line)

In [4]:
dataset = Dataset.from_text('data/c09k_170k_corpus.txt')



In [5]:
print(dataset['text'][0:2])
print(dataset['text'][-2:])

['사이알론 형광체, 그 제조방법 및 이를 이용한 발광소자 패키지', '본 발명의 일 실시예에 따른 사이알론 형광체 제조방법은, 규소 전구체 및 알루미늄 전구체를 혼합한 후 소결하여 제1 소결체를 형성한 후에, 상기 제1 소결체 및 활성물질의 전구체를 혼합한 후 열처리하여 제2 소결체를 형성한다.']
['[상기식(3) 중, l는 6~12의 정수를 나타낸다.]', '[상기식(4) 중, m는 1~6의 정수를 나타낸다.R2은 수소 원자, 탄소수 1~6의 직쇄상 혹은 분기상 알킬기, 또는 불소 원자를 나타낸다.]']


### Training the Tokenizer

In [7]:
special_tokens = [
  "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"
]
files = ["data/c09k_170k_corpus.txt"]
vocab_size = 8000
max_length = 512
truncate_longer_samples = False

In [None]:
# 토크나이저에서 옵션을 지정해야 해요
# 한글의 경우
# do_lower_case=False, strip_accents=False 
# 이렇게 안하면
# 자소단위로 쪼개지는 걸로 알고 있어요
# tokenizer = BertTokenizer(os.path.join(MODEL_PATH, "kopat-vocab.txt"), do_lower_case=False, strip_accents=False) 
# 이런식으로


In [11]:
?BertWordPieceTokenizer

In [12]:
tokenizer = BertWordPieceTokenizer(handle_chinese_chars=False, lowercase=False, strip_accents=False)
# tokenizer = BertWordPieceTokenizer(lowercase=False)  # 이 옵션을 줬더니 나중에 ENCODE 결과가 죄다 unk로 ㅠㅠ
# tokenizer = BertWordPieceTokenizer(handle_chinese_chars=False, lowercase=False)  # 이 옵션을 줬더니 나중에 ENCODE 결과가 죄다 unk로 ㅠㅠ
# train the tokenizer
tokenizer.train(files=files, vocab_size=vocab_size, special_tokens=special_tokens, show_progress=True)
# enable truncation up to the maximum 512 tokens
tokenizer.enable_truncation(max_length=max_length)






In [8]:
tokenizer_path = 'c09k_tokenizer_2'

In [14]:
# make the directory if not already there
if not os.path.isdir(tokenizer_path):
    os.mkdir(tokenizer_path)
# save the tokenizer  
tokenizer.save_model(tokenizer_path)
tokenizer.save(os.path.join(tokenizer_path, 'tokenizer.json'))

### Pre-train data 생성

In [12]:
# if you want to train the tokenizer from scratch (especially if you have custom
# dataset loaded as datasets object), then run this cell to save it as files
# but if you already have your custom data as text files, there is no point using this
def dataset_to_text(dataset, output_filename="data.txt"):
    """Utility function to save dataset text to disk,
    useful for using the texts to train the tokenizer 
    (as the tokenizer accepts files)"""
    with open(output_filename, "w") as f:
        for t in dataset["text"]:
            print(t, file=f)

In [23]:
# dataset_to_text(d["train"], "data/c09k_pre_train.txt")
# dataset_to_text(d["test"], "data/c09k_pre_test.txt")

In [13]:
# dumping some of the tokenizer config to config file, 
# including special tokens, whether to lower case and the maximum sequence length
with open(os.path.join(model_path, "config.json"), "w") as f:
    tokenizer_cfg = {"do_lower_case": True,
                     "unk_token": "[UNK]",
                     "sep_token": "[SEP]",
                     "pad_token": "[PAD]",
                     "cls_token": "[CLS]",
                     "mask_token": "[MASK]",
                     "model_max_length": max_length,
                     "max_len": max_length,
                    }
    json.dump(tokenizer_cfg, f)

In [14]:
# when the tokenizer is trained and configured, load it as BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained(model_path, vocab_size=8000, local_files_only=True)
# tokenizer = BertTokenizer.from_pretrained(model_path, )


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file c09k_pretrained_bert/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_token": "[MASK]",
  "max_len": 512,
  "max_position_embeddings": 512,
  "model_max_length": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token": "[PAD]",
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "sep_token": "[SEP]",
  "transformers_version": "4.22.0.dev0",
  "type_vocab_size": 2,
  "unk_token": "[UNK]",
  "use_cache": true,
  "vocab_size": 30522
}



vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


In [46]:
# tokenizer.vocab_size

8000

### Tokenizing the Dataset

In [17]:
# ?tokenizer

In [18]:
# Union은 공용체 형으로 Union[X, Y]는 X나 Y를 의미
# https://python.flowdas.com/library/typing.html#typing.Union
    
# Signature:     
# tokenizer(
#     text: Union[str, List[str], List[List[str]]] = None,
#     text_pair: Union[str, List[str], List[List[str]], NoneType] = None,
#     text_target: Union[str, List[str], List[List[str]]] = None,
#     text_pair_target: Union[str, List[str], List[List[str]], NoneType] = None,
#     add_special_tokens: bool = True,
#     padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = False,
#     truncation: Union[bool, str, transformers.tokenization_utils_base.TruncationStrategy] = False,
#     max_length: Union[int, NoneType] = None,
#     stride: int = 0,
#     is_split_into_words: bool = False,
#     pad_to_multiple_of: Union[int, NoneType] = None,
#     return_tensors: Union[str, transformers.utils.generic.TensorType, NoneType] = None,
#     return_token_type_ids: Union[bool, NoneType] = None,
#     return_attention_mask: Union[bool, NoneType] = None,
#     return_overflowing_tokens: bool = False,
#     return_special_tokens_mask: bool = False,
#     return_offsets_mapping: bool = False,
#     return_length: bool = False,
#     verbose: bool = True,
#     **kwargs,
# ) -> transformers.tokenization_utils_base.BatchEncoding

In [15]:
def encode_with_truncation(examples):
    """Mapping function to tokenize the sentences passed with truncation"""
    return tokenizer(examples["text"], truncation=True, padding="max_length",
                                     max_length=max_length, return_special_tokens_mask=True)

def encode_without_truncation(examples):
    """Mapping function to tokenize the sentences passed without truncation"""
    return tokenizer(examples["text"], return_special_tokens_mask=True)

In [16]:
# the encode function will depend on the truncate_longer_samples variable
encode = encode_with_truncation if truncate_longer_samples else encode_without_truncation

In [21]:
# # tokenizing the train dataset
# train_dataset = data_train1['text'].map(encode, batched=True)
# test_dataset = data_test1['text'].map(encode, batched=True)

In [22]:
# d["train"]
# train_dataset['attention_mask'][0]

In [17]:
# tokenizing the train dataset
train_dataset = d["train"].map(encode, batched=True)
# tokenizing the testing dataset
test_dataset = d["test"].map(encode, batched=True)

100%|██████████| 15/15 [00:00<00:00, 16.80ba/s]
100%|██████████| 2/2 [00:00<00:00, 21.41ba/s]


In [100]:
# test_dataset['text']

['보안 인쇄물의 위변조 확인 방법',
 '보안 인쇄물의 위변조 확인 방법',
 '  본 발명은 보안잉크가 인쇄된 보안 인쇄물에 관한 것으로서, 보다 상세하게는, 발광색상, 지속시간 또는 여기파장이 다른 형광체 또는 인광체를 포함하는 보안잉크가 인쇄된 보안 인쇄물에 관한 것이다. 이를 위해 보안 인쇄물은 제1영역 및 제2영역으로 구분되는 것으로서, 제1영역은 청색 형광체를 포함하는 보안잉크로 인쇄되고, 제2영역은 청색 형광체, 녹색 인광체 및 적색 인광체를 포함하는 보안잉크로 인쇄되는 것을 특징으로 한다.  ',
 '  본 발명은 보안잉크가 인쇄된 보안 인쇄물에 관한 것으로서, 보다 상세하게는, 발광색상, 지속시간 또는 여기파장이 다른 형광체 또는 인광체를 포함하는 보안잉크가 인쇄된 보안 인쇄물에 관한 것이다. 이를 위해 보안 인쇄물은 제1영역 및 제2영역으로 구분되는 것으로서, 제1영역은 청색 형광체를 포함하는 보안잉크로 인쇄되고, 제2영역은 청색 형광체, 녹색 인광체 및 적색 인광체를 포함하는 보안잉크로 인쇄되는 것을 특징으로 한다.  ',
 'UV 광원을 사용한 보안 인쇄물의 위변조 확인 방법에 있어서,피인쇄물을 준비하는 단계;UV 광원을 상기 제1영역 및 제2영역에 조사하여, 상기 제1영역은 제1색으로 발광하고, 제2영역은 제2색으로 발광하는 다색 발광 단계;UV 광원의 조사를 중지하는 단계; 및UV 조사가 중지된 이후, 상기 제1영역의 발광은 사라지고, 동시에 제2영역은 소정 시간 동안 제3색을 발광하고, 이후 제3색이 사라지거나, 제4색으로 일정 시간 동안 발광하는 색변환 단계;를 포함하고, 상기 피인쇄물은, 형광체을 포함하는 제1 보안 잉크를 사용하여 상기 피인쇄물의 표면에 제1영역이 인쇄되고, 형광체와 인광체을 포함하는 제2 보안 잉크를 사용하여 상기 피인쇄물의 표면에 제2영역이 인쇄된 것인, UV 광원을 사용한 보안 인쇄물의 위변조 확인 방법.',
 'UV 광원을 사용한 보안 인쇄물의 위변조 확인 방법에 있어서,제1영역과 제2영역으로 구분되는 

In [18]:
# print(train_dataset[:2])
len(train_dataset)  # 14769

14769

In [19]:
if truncate_longer_samples:
    # remove other columns and set input_ids and attention_mask as PyTorch tensors
    train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
    test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
else:
    # remove other columns, and remain them as Python lists
    test_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
    train_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])

In [20]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py

In [34]:
# # train_dataset.features.keys()
# # dict_keys(['text', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'])
# total_length = len(list(chain(train_dataset['text'])))  # total_length = 14769
# max_length  # 512
# total_length = (total_length // max_length) * max_length
# # total_length  # 14336, total_length // max_length = 28.8457..., 28 * 512 = 14336

In [21]:
max_length = 64

In [22]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

In [37]:
# !pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
# https://pytorch.org/get-started/locally/#windows-package-manager

In [23]:
# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
if not truncate_longer_samples:
    train_dataset1 = train_dataset.map(group_texts, batched=True,
                                                                        desc=f"Grouping texts in chunks of {max_length}")
    test_dataset1 = test_dataset.map(group_texts, batched=True,
                                                                    desc=f"Grouping texts in chunks of {max_length}")
    # convert them from lists to torch tensors
    train_dataset1.set_format("torch")
    test_dataset1.set_format("torch")

Grouping texts in chunks of 64: 100%|██████████| 15/15 [00:01<00:00, 13.49ba/s]
Grouping texts in chunks of 64: 100%|██████████| 2/2 [00:00<00:00, 14.70ba/s]


In [24]:
# len(train_dataset), len(test_dataset)  # (14769, 1642)
len(train_dataset1), len(test_dataset1)  # (2171, 225)

(19200, 2246)

In [25]:
for i in range(2):
    print(train_dataset1['input_ids'][i])
    print(tokenizer.decode(train_dataset1['input_ids'][i]))

tensor([ 907,  910,  453,  402, 2005, 5564, 6947,  591, 4790,  401,  605,  622,
         758, 3378, 2610, 5205, 1959,  443,   16,  523, 5725,  573,  790,  402,
         956, 5205,  719,   12,   23,   13,   53,   12, 1407,   13,   54,   12,
          23,   13,   12,  556,  476,   16, 1483, 2179,  805,  510,  414, 2179,
        5788,  805,  954,   16, 1233,  805, 4390, 1596,  401,   16, 1664,  566,
         510,  414, 1268,  572])
그리고 나타내지는 시리콘아르코키시도와 알콕시 시릴기를 가지는 포토크로미즘 화합물을 반응시켜 실리카 겔을 합성해, 그 다음에 해실리카 겔을 일반식 ( 3 ) m ( or ) n ( 3 ) ( 식 중, m는 알칼리 금속 원자 또는 알칼리 토류 금속 원자를, n는 금속 m의 원자가를, r는 수소 원자 또는 알킬기를 각각
tensor([ 729,   18,   13,  872, 1811, 1105, 1273, 2624,  605, 1110, 1122, 1804,
         656,  551, 1273,  583, 3253,  623, 2586,   16,  450, 1110, 1122, 1804,
         656,  551, 1110, 1122,  871, 4339, 4501, 2480,  450, 1110, 112

### Loading the Model

In [26]:
vocab_size

8000

In [24]:
# initialize the model with the config
model_config = BertConfig(vocab_size=vocab_size, max_position_embeddings=max_length)
model = BertForMaskedLM(config=model_config)

In [48]:
# model_config.vocab_size

### Training

In [25]:
# initialize the data collator, randomly masking 20% (default is 15%) of the tokens for the Masked Language
# Modeling (MLM) task
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [50]:
training_args = TrainingArguments(
    output_dir=model_path,          # output directory to where save model checkpoint
    evaluation_strategy="steps",    # 'steps': evaluate each `logging_steps`, 'epoch'  : each epoch
    overwrite_output_dir=True,      
    num_train_epochs=50.,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=32, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=8,  # evaluation batch size
    logging_steps=100,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=1000,
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    # save_total_limit=3,           # whether you don't have much space so you let only 3 model weights saved in the disk
)

using `logging_steps` to initialize `eval_steps` to 100
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [27]:
model_path

'c09k_pretrained_bert'

In [52]:
# initialize the trainer and pass everything to it
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset1,
    eval_dataset=test_dataset1,
)

In [53]:
# train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 19400
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 8
  Total optimization steps = 3750


Step,Training Loss,Validation Loss
100,7.2956,6.732242
200,6.6033,6.342851
300,6.3786,6.237882
400,6.275,6.045237
500,6.1023,5.956049
600,6.0245,5.918322
700,5.9946,5.867238
800,5.8837,5.784225
900,5.8255,5.757144
1000,5.8194,5.694105


The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2047
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2047
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2047
  Batch size = 8
The follow

***** Running Evaluation *****
  Num examples = 2047
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2047
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2047
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Runn

TrainOutput(global_step=3750, training_loss=5.451891813151041, metrics={'train_runtime': 6405.2907, 'train_samples_per_second': 151.437, 'train_steps_per_second': 0.585, 'total_flos': 3.18986161668096e+16, 'train_loss': 5.451891813151041, 'epoch': 49.99})

In [56]:
model.save_pretrained(os.path.join(model_path, 'checkpoint-4320'))

Configuration saved in c09k_pretrained_bert/checkpoint-4320/config.json
Model weights saved in c09k_pretrained_bert/checkpoint-4320/pytorch_model.bin


In [None]:
# Gradient Accumulation steps * Total optimization steps = 8 * 540 = 4320

In [None]:
# model.save_pretrained(model_path)

### Additional Training the Model

In [28]:
# load the model checkpoint
model1 = BertForMaskedLM.from_pretrained(os.path.join(model_path, "checkpoint-7500"))
# load the tokenizer
# tokenizer = BertTokenizerFast.from_pretrained(model_path)
tokenizer1 = BertTokenizerFast.from_pretrained(model_path, vocab_size=8000, local_files_only=True)

loading configuration file c09k_pretrained_bert/checkpoint-7500/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert/checkpoint-4320",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.22.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 8000
}

loading weights file c09k_pretrained_bert/checkpoint-7500/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at c09k_pretrained_

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json


In [29]:
fill_mask = pipeline("fill-mask", model=model1, tokenizer=tokenizer1)

In [30]:
# perform predictions
# 인광성 유기 금속 이리듐 착체, 발광 소자, 발광 장치, 전자 기기, 및 조명 장치
# 본 명세서는 화학식 1로 표시되는 화합물 및 이를 포함하는 유기 발광 소자에 관한 것이다
examples = [
    "인광성 유기 금속 이리듐 착체, [MASK] 소자, 발광 장치, 전자 기기, 및 조명 장치",
    "본 명세서는 화학식 1로 표시되는 [MASK] 및 이를 포함하는 유기 발광 소자에 관한 것이다",
    "인광성 유기 금속 이리듐 착체, 발광 소자, 발광 장치, [MASK] 기기, 및 조명 장치",
    "본 명세서는 화학식 1로 표시되는 화합물 및 이를 포함하는 [MASK] 발광 소자에 관한 것이다",
]
for example in examples:
    print(fill_mask(example))
    for prediction in fill_mask(example):
        print(f"{prediction['sequence']}, confidence: {prediction['score']}")
    print("="*50)

[{'score': 0.46306195855140686, 'token': 1193, 'token_str': '발광', 'sequence': '인광성 유기 금속 이리듐 착체, 발광 소자, 발광 장치, 전자 기기, 및 조명 장치'}, {'score': 0.13777363300323486, 'token': 737, 'token_str': '표시', 'sequence': '인광성 유기 금속 이리듐 착체, 표시 소자, 발광 장치, 전자 기기, 및 조명 장치'}, {'score': 0.05654741823673248, 'token': 1125, 'token_str': '일렉트로크로믹', 'sequence': '인광성 유기 금속 이리듐 착체, 일렉트로크로믹 소자, 발광 장치, 전자 기기, 및 조명 장치'}, {'score': 0.03776992857456207, 'token': 3316, 'token_str': '광기능', 'sequence': '인광성 유기 금속 이리듐 착체, 광기능 소자, 발광 장치, 전자 기기, 및 조명 장치'}, {'score': 0.03028818406164646, 'token': 663, 'token_str': '전기변색', 'sequence': '인광성 유기 금속 이리듐 착체, 전기변색 소자, 발광 장치, 전자 기기, 및 조명 장치'}]
인광성 유기 금속 이리듐 착체, 발광 소자, 발광 장치, 전자 기기, 및 조며

In [78]:
# Num examples = 19400
# Num Epochs = 50
# Total optimization steps = 3750 = 750*50 = 
# 로드한 모델 추가학습
training_args = TrainingArguments(
    output_dir=model_path,          # output directory to where save model checkpoint
    evaluation_strategy="steps",    # 'steps': evaluate each `logging_steps`, 'epoch'  : each epoch
    overwrite_output_dir=True,      
    num_train_epochs=100.,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=32, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=8,  # evaluation batch size
    logging_steps=1500,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=1500,
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=5,           # whether you don't have much space so you let only 3 model weights saved in the disk
)

trainer = Trainer(
    model=model1,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset1,
    eval_dataset=test_dataset1,
)

trainer.train()

using `logging_steps` to initialize `eval_steps` to 1500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 19400
  Num Epochs = 100
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 8
  Total optimization steps = 7500


Step,Training Loss,Validation Loss
1500,4.1431,3.477852
3000,3.0731,2.746345
4500,2.4814,2.393077
6000,2.1701,2.215955
7500,2.0183,2.143648


The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2047
  Batch size = 8
Saving model checkpoint to c09k_pretrained_bert/checkpoint-1500
Configuration saved in c09k_pretrained_bert/checkpoint-1500/config.json
Model weights saved in c09k_pretrained_bert/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [c09k_pretrained_bert/checkpoint-2000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2047
  Batch size = 8
Saving model checkpoint to c09k_

TrainOutput(global_step=7500, training_loss=2.7772110026041665, metrics={'train_runtime': 12492.1984, 'train_samples_per_second': 155.297, 'train_steps_per_second': 0.6, 'total_flos': 6.38038107242496e+16, 'train_loss': 2.7772110026041665, 'epoch': 99.99})

In [80]:
# load the model checkpoint
model1 = BertForMaskedLM.from_pretrained(os.path.join(model_path, "checkpoint-7500"))
# load the tokenizer
# tokenizer = BertTokenizerFast.from_pretrained(model_path)
tokenizer1 = BertTokenizerFast.from_pretrained(model_path, vocab_size=8000, local_files_only=True)
fill_mask = pipeline("fill-mask", model=model1, tokenizer=tokenizer1)
# perform predictions
# 인광성 유기 금속 이리듐 착체, 발광 소자, 발광 장치, 전자 기기, 및 조명 장치
# 본 명세서는 화학식 1로 표시되는 화합물 및 이를 포함하는 유기 발광 소자에 관한 것이다
examples = [
    "인광성 유기 금속 이리듐 착체, [MASK] 소자, 발광 장치, 전자 기기, 및 조명 장치",
    "본 명세서는 화학식 1로 표시되는 [MASK] 및 이를 포함하는 유기 발광 소자에 관한 것이다",
    "인광성 유기 금속 이리듐 착체, 발광 소자, 발광 장치, [MASK] 기기, 및 조명 장치",
    "본 명세서는 화학식 1로 표시되는 화합물 및 이를 포함하는 [MASK] 발광 소자에 관한 것이다",
]
for example in examples:
    print(fill_mask(example))
    for prediction in fill_mask(example):
        print(f"{prediction['sequence']}, confidence: {prediction['score']}")
    print("="*50)

loading configuration file c09k_pretrained_bert/checkpoint-7500/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert/checkpoint-4320",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.22.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 8000
}

loading weights file c09k_pretrained_bert/checkpoint-7500/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at c09k_pretrained_

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json
[{'score': 0.46306195855140686, 'token': 1193, 'token_str': '발광', 'sequence': '인광성 유기 금속 이리듐 착체, 발광 소자, 발광 장치, 전자 기기, 및 조명 장치'}, {'score': 0.13777363300323486, 'token': 737, 'token_str': '표시', 'sequence': '인광성 유기 금속 이리듐 착체, 표시 소자, 발광 장치, 전자 기기, 및 조명 장치'}, {'score': 0.05654741823673248, 'token': 1125, 'token_str': '일렉트로크로믹', 'sequence': '인광성 유기 금속 이리듐 착체, 일렉트로크로믹 소자, 발광 장치, 전자 기기, 및 조명 장치'}, {'score': 0.03776992857456207, 'token': 3316, 'token_str': '광기능', 'sequence': '인광성 유기 금속 이리듐 착체, 광기능 소자, 발광 장치, 전자 기기, 및 조명 장치'}, {'score': 0.03028818406164646, 'token': 663, 'token_str': '전기변색', 'sequence': '인과

## Prepare FineTuning

### model load

In [31]:
from transformers import AdamW

In [32]:
# ?BertPreTrainedModel

In [34]:
# load the model checkpoint
model1 = BertForSequenceClassification.from_pretrained(os.path.join(model_path, "checkpoint-7500"), return_dict=True, num_labels=18)
# load the tokenizer
tokenizer1 = BertTokenizerFast.from_pretrained(model_path, vocab_size=8000, local_files_only=True)
optimizer = AdamW(model1.parameters(), lr=1e-5)

loading configuration file c09k_pretrained_bert/checkpoint-7500/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert/checkpoint-4320",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LABEL_16": 16,


vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json




In [35]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model1.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model1.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

### finetuning data prepare

In [36]:
from torch.nn import functional as F

In [48]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 9881
})

In [37]:
encoding = tokenizer1(train_dataset['text'][:16], return_tensors='pt', padding=True, truncation=True, max_length=64)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [50]:
train_data_df = pd.read_csv('data/train_C09K11_220715.txt', sep='\t')  # text와 라벨 파일
test_dataset_df = pd.read_csv('data/test_C09K11_220715.txt', sep='\t')
train_dataset = Dataset.from_pandas(train_data_df)  # Dataset 객체 생성
test_dataset = Dataset.from_pandas(test_dataset_df)
finetune_dataset = DatasetDict()  # DatasetDict 객체 생성
finetune_dataset['train'] = train_dataset
finetune_dataset['test'] = test_dataset
finetune_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9881
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5203
    })
})

In [61]:
train_data_df[:3]

Unnamed: 0,text,label
0,광활성 형광체 프로브 및 이를 이용한 암세포 검출방법,1
1,"본 발명은 광활성 형광체 검출방법에 관한 것으로서, 화학식 1로 표시되는 화합물...",1
2,하기 [화학식 1]로 표시되는 OPA 또는 TPA 구조체를 포함하는 활용한 광활성 ...,1


In [51]:
# from datasets import load_dataset
# imdb = load_dataset("imdb")
# # imdb
# # DatasetDict({
# #     train: Dataset({
# #         features: ['text', 'label'],
# #         num_rows: 25000
# #     })
# #     test: Dataset({
# #         features: ['text', 'label'],
# #         num_rows: 25000
# #     })
# #     unsupervised: Dataset({
# #         features: ['text', 'label'],
# #         num_rows: 50000
# #     })
# # })

In [62]:
# # 학습 가능 여부 확인
# import torch
# labels = torch.tensor(train_dataset['label'][:16]).unsqueeze(0)
# outputs = model1(input_ids, attention_mask=attention_mask, labels=labels)
# loss = outputs.loss
# loss.backward()
# optimizer.step()

# # # loss 지정 계산
# # outputs = model1(input_ids, attention_mask=attention_mask)
# # loss = F.cross_entropy(labels, outputs.logits)
# # loss.backward()
# # optimizer.step()

In [53]:
num_warmup_steps = 2
n_epochs = 5
num_train_steps = n_epochs + 1
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_train_steps)

In [41]:
epoch_loss = 0
epoch_acc = 0
for epoch in range(n_epochs):
    scheduler.step()
    

In [144]:
# train_dataset['text'][:5]

In [112]:
# train_dataset1 = train_dataset.shuffle(100)
# test_dataset1 = test_dataset.shuffle(100)

In [114]:
# train_dataset1['text'][:5]

In [77]:
# 토크나이저 함수 객체 생성
def preprocess_function(examples):
    return tokenizer1(examples["text"], truncation=True, max_length=64, padding=True)
def preprocess_function1(examples):
    return examples['label']

In [78]:
tokenized_finetune_dataset = finetune_dataset.map(preprocess_function, batched=True)  
# DatasetDict의 'text'를 토크나이징, finetune_dataset에는 features: ['text', 'label']만 있었으나,
# tokenized_finetune_dataset에는 'input_ids', 'token_type_ids', 'attention_mask'가 추가됨

100%|██████████| 10/10 [00:00<00:00, 11.26ba/s]
100%|██████████| 6/6 [00:00<00:00, 14.96ba/s]


In [79]:
tokenized_finetune_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9881
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5203
    })
})

In [80]:
train_dataset1

Dataset({
    features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 19200
})

In [81]:
# token_train_dataset = {}
# finetune_dataset['sentences'] = list(map(preprocess_function, train_dataset1))
# token_train_dataset['labels'] = list(map(preprocess_function1, train_dataset1))
# # test_dataset = data_test1['text'].map(encode, batched=True)

In [82]:
# ?DataCollatorWithPadding

In [83]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer1)

In [68]:
# token_train_dataset['labels'][:5]

In [69]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [70]:
# training_args = TrainingArguments(output_dir="c09k_finetuned_bert")
training_args = TrainingArguments(
    output_dir="c09k_finetuned_bert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [72]:
trainer1 = Trainer(
    model=model1,
    args=training_args,
    train_dataset=tokenized_finetune_dataset['train'],
    eval_dataset=tokenized_finetune_dataset['test'],
#     compute_metrics=compute_metrics,
#     tokenizer=tokenizer1,
    data_collator=data_collator,
)

In [73]:
trainer1.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9881
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3090


Step,Training Loss
500,2.2628
1000,1.8151
1500,1.6225
2000,1.5009
2500,1.4295
3000,1.346


Saving model checkpoint to c09k_finetuned_bert/checkpoint-500
Configuration saved in c09k_finetuned_bert/checkpoint-500/config.json
Model weights saved in c09k_finetuned_bert/checkpoint-500/pytorch_model.bin
Saving model checkpoint to c09k_finetuned_bert/checkpoint-1000
Configuration saved in c09k_finetuned_bert/checkpoint-1000/config.json
Model weights saved in c09k_finetuned_bert/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to c09k_finetuned_bert/checkpoint-1500
Configuration saved in c09k_finetuned_bert/checkpoint-1500/config.json
Model weights saved in c09k_finetuned_bert/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to c09k_finetuned_bert/checkpoint-2000
Configuration saved in c09k_finetuned_bert/checkpoint-2000/config.json
Model weights saved in c09k_finetuned_bert/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to c09k_finetuned_bert/checkpoint-2500
Configuration saved in c09k_finetuned_bert/checkpoint-2500/config.json
Model weights saved in c0

TrainOutput(global_step=3090, training_loss=1.6539026920849451, metrics={'train_runtime': 405.526, 'train_samples_per_second': 121.829, 'train_steps_per_second': 7.62, 'total_flos': 1625108636839680.0, 'train_loss': 1.6539026920849451, 'epoch': 5.0})

In [111]:
# load the model checkpoint
model2 = BertForSequenceClassification.from_pretrained(os.path.join('c09k_finetuned_bert', "checkpoint-3000"),
                                                       return_dict=True, num_labels=18)
# load the tokenizer
tokenizer2 = BertTokenizerFast.from_pretrained(model_path, vocab_size=8000, local_files_only=True)
optimizer = AdamW(model1.parameters(), lr=1e-5)

loading configuration file c09k_finetuned_bert/checkpoint-3000/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert/checkpoint-7500",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LA

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json




In [112]:
# ?pipeline
text_classifier = pipeline('text-classification', model=model2, tokenizer=tokenizer2)

In [113]:
# test_dataset['text'][:5]
# test_dataset['label'][:5]

In [120]:
for i in range(10):
    print('pred: ', text_classifier(test_dataset['text'][i]), ', label: ', test_dataset['label'][i])

pred:  [{'label': 'LABEL_9', 'score': 0.1434182971715927}] , label:  0
pred:  [{'label': 'LABEL_9', 'score': 0.1434182971715927}] , label:  0


RuntimeError: The size of tensor a (106) must match the size of tensor b (64) at non-singleton dimension 1

0

In [94]:
test_outputs = model1(test_input_ids, attention_mask=test_attention_mask, labels=test_labels)

AttributeError: 'list' object has no attribute 'size'

In [6]:
train_dataset = Dataset.from_text('data/train_C09K11_220715.txt')
# test_dataset = Dataset.from_text('data/test_C09K11_220715.txt')



In [7]:
train_dataset.features

{'text': Value(dtype='string', id=None)}

In [14]:
# train_dataset1 = []
# for line in train_dataset['text'][1:]:
#     text, label = line.split('\t')
#     train_dataset1.append({'text': text, 'label': label})
# print(train_dataset1[:5])
# test_dataset1 = []
# for line in test_dataset['text'][1:]:
#     text, label = line.split('\t')
#     test_dataset1.append({'text': text, 'label': label})
# print(test_dataset1[:5])

In [34]:
token_train_dataset

{'sentences': [{'input_ids': [480, 1531, 3771, 1111, 1307, 412, 1272, 1117, 6280, 1086, 681, 2370, 1030], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
  {'input_ids': [672, 785, 480, 1531, 3771, 2370, 5279, 882, 2578, 16, 746, 2183, 1138, 758, 1323, 16, 1597, 1710, 524, 1653, 3055, 401, 1323, 2300, 792, 558, 4418, 792, 3952, 546, 453, 584, 809, 1327, 6167, 389, 1329, 6280, 1086, 681, 383, 2370, 5279, 882, 4478, 398, 1059, 18, 3890, 1214, 672, 1399, 2750, 16, 3574, 1379, 12, 45, 281, 2143, 305, 262, 13, 660, 3574, 543, 12, 3016, 2143, 305, 262, 13, 915, 2950, 1106, 828, 916, 688, 384, 3956, 1830, 1360, 2053, 932, 1147, 448, 480, 1531, 1658, 1710, 524, 1205, 3055, 401, 2361, 736, 6280, 398, 2318, 786, 1976, 405, 467, 2005, 683, 402, 582, 389, 1757, 423, 239, 440, 7117, 623, 2281, 6280, 4418, 383, 2318, 786, 485, 2572, 1306, 558, 12, 3880, 3513, 284, 3221, 3513, 5830, 13, 414, 2572, 6406, 558, 414, 4181, 558, 12, 30

In [31]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 9881
})

In [34]:
# model1.train

In [32]:
classifier01 = pipeline("text-classification", model=model1, tokenizer=tokenizer1)
# classifier02 = pipeline("zero-shot-classification", model=model1, tokenizer=tokenizer1)

The model 'BertForTokenClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GPT2ForSequenceClassification', 'GPTNeoForSequenceClassification', 'GPTJForSequenceClassification', 'IBertForSequenceClassification', 'LayoutLMForSequenceClassification', 'LayoutLMv2ForSequenceClassification', 'LayoutLMv3ForSequenceClassificati

In [37]:
classifier01(['인광성 유기 금속 이리듐 착체, 발광 소자, 발광 장치, 발광 기기, 및 조명 장치'])

Disabling tokenizer parallelism, we're using DataLoader multithreading already


KeyError: 10

### 그 외 참고용 자료

In [None]:
# 명선 책임의 문장 분리 코드(3글자 이후 마침표가 등장하는 경우에 간혹 분리되는 경우가 있다. 미사용)
# REG_SENT_KO=r'([ㄱ-ㅣ가-힣]+[.]|[\n]|[:;!?])'
# REG_SENT_EN=r'([a-zA-Z]+[.]\s|[\n]|[:;!?])'

# def split_sentence(doc, regex):
#     s = 0
#     for m in re.finditer(regex, doc):
#         sent = doc[s:m.end()].strip()
#         s = m.end()
#         if not sent:
#             continue
#         yield sent

#     if s < len(doc):
#         sent = doc[s:].strip()
#         if sent:
#             yield sent