### Train BERT Tokenizer
* https://www.thepythoncode.com/article/pretraining-bert-huggingface-transformers-in-python
* https://huggingface.co/transformers/v3.2.0/training.html

In [1]:
import os
import json
import re
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import *
from tokenizers import *
from datasets import *
from sklearn.model_selection import train_test_split
import nltk
from nltk.data import load

  from .autonotebook import tqdm as notebook_tqdm


### Tokenizer train data 생성
* 약어 뒤에 오는 마침표에서 문장이 잘못 분리되지 않도록 조치해야 한다.
* NLTK의 tokenizer를 사용해 문장 분리하기 위해 extra_abbreviations에 예외조건을 추가하여 준다.
    * https://cryptosalamander.tistory.com/140?category=1218889

In [2]:
# Load NLTK's pre-trained English Punkt sentence tokenizer.
sent_tokenizer = load("tokenizers/punkt/english.pickle")
# Domain-specific abbreviations: a period following one of these must not be
# treated as a sentence boundary.
extra_abbreviations = [
    'RE','re','pat', 'no', 'nos','vol','jan','feb','mar','apr','jun',
    'jul','aug','sep','oct','nov','dec','eng','ser','ind','ed','pp',
    'e.g','al','T.E.N.S', 'E.M.S','F.E','U.H.T.S.T','degree',
    '/gm','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
    'P','Q','R','S','T','U','V','W','X','Y','Z']
# Punkt lower-cases a token before looking it up in abbrev_types, so the
# entries are registered in lower case; upper-case entries would never match.
sent_tokenizer._params.abbrev_types.update(
    abbr.lower() for abbr in extra_abbreviations
)

In [3]:
# Split text into sentences with NLTK's Punkt tokenizer (currently unused).
# https://cryptosalamander.tistory.com/140?category=1218889
def sent_tokenize(input='./input.txt', output='./output.txt', *,
                  no_blank=False, verbose=True):
    """Split every line of a text file into sentences, one per output line.

    A fresh English Punkt tokenizer is loaded and extended with extra
    abbreviations so that periods following them do not end a sentence.

    Args:
        input: Path of the text file to read. (The name shadows the
            ``input`` builtin; it is kept for backward compatibility.)
        output: Path of the file to write; it is overwritten.
        no_blank: If true, blank input lines are dropped. If false (the
            default), they are copied to the output unchanged.
        verbose: If true (the default), print each non-blank input line and
            the list of sentences extracted from it.

    Returns:
        None. The result is written to ``output``.
    """
    sent_tokenizer = load("tokenizers/punkt/english.pickle")
    extra_abbreviations = [
        'RE','re','pat', 'no', 'nos','vol','jan','feb','mar','apr','jun',
        'jul','aug','sep','oct','nov','dec','eng','ser','ind','ed','pp',
        'e.g','al','T.E.N.S', 'E.M.S','F.E','U.H.T.S.T','degree',
        '/gm','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
        'P','Q','R','S','T','U','V','W','X','Y','Z']
    # Punkt lower-cases a token before looking it up in abbrev_types, so the
    # entries must be lower case to have any effect.
    sent_tokenizer._params.abbrev_types.update(
        abbr.lower() for abbr in extra_abbreviations
    )

    # Context managers guarantee both files are closed, even on error.
    with open(input, 'r') as load_file, open(output, 'w') as save_file:
        for line in load_file:
            if line.strip() == "":
                # Blank line: keep it as-is unless the caller asked to drop it.
                if not no_blank:
                    save_file.write(line)
                continue
            if verbose:
                print(line)
            # Use the locally configured sentence tokenizer (the original
            # code referenced an unrelated name, `tokenizer`).
            sentences = sent_tokenizer.tokenize(line)
            if verbose:
                print(sentences)
            save_file.writelines(f"{sentence}\n" for sentence in sentences)

In [7]:
# Load the tab-separated train and test splits. Every column is read as a
# string; header=0 together with `names` replaces each file's own header row
# with the column names 'text' and 'label'.
read_options = dict(delimiter='\t', dtype=str, header=0, names=['text', 'label'])
df_train = pd.read_csv("data/h04w4/train_H04W4_220511.txt", **read_options)
df_test = pd.read_csv("data/h04w4/test_H04W4_220511.txt", **read_options)
# Unused alternative: load the training file directly as a text dataset.
# dataset_train = Dataset.from_text('data/h04w4/train_H04W4_220511.txt')

In [17]:
# Stack the train and test splits so the tokenizer sees all available text.
df = pd.concat([df_train, df_test], axis=0)
# Write only the raw text, one sample per line. Series.to_csv with its default
# arguments would also emit the index column, a header row and CSV quoting,
# all of which would leak into the tokenizer's training corpus.
with open("data/h04w4/tok_train_H04W4.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(f"{text}\n" for text in df["text"].dropna())

In [6]:
# try:
#     dataset = Dataset.from_text('data/c09k_corpus.txt')
# except:
#     kiwee_files = ['data/c09k_0001-1000.xlsx', 'data/c09k_1001-2000.xlsx', 'data/c09k_2001-3000.xlsx', 'data/c09k_3001-3935.xlsx']
#     with open('data/c09k_corpus.txt', 'a') as f:
#         f.truncate(0)
#         for i, fn in enumerate(kiwee_files):
#             tmp = pd.read_excel(fn).fillna('')
#             # pandas는 비어있는 컬럼의 dtype을 float로 바꿔서 인식한다. 그로 인해 토크나이징 할 데이터가 없으면 오류가 발생되어 fillna를 사용해 모두 텍스트로 인식시키도록 한다
#             # https://stackoverflow.com/questions/53953286/pandas-read-excel-blanks-in-string-columns-convert-to-floats-converting-via-st
#             col_text = ['발행번호', '발명의명칭', '요약', '대표청구항', '과제', '해결방안']
#             tmp = tmp[col_text]
#             for index, row in tmp.iterrows():
#         #         print(index, '\n', row['발명의명칭'], row['요약'], row['대표청구항'], row['과제'], row['해결방안'], '\n')
#                 for col in col_text[1:]:
#         #             print('처리중인 데이터:', col, row[col], '\n')
#                     if row[col].strip() == "":
#                         pass
#                     else:
#         #                 print(row[col].strip())
#                         row[col] = unicodedata.normalize('NFKC', row[col])
#                         # row[col] = unicodedata.normalize('NFC', row[col])  # 자음과 모음이 깨질 때는 NFC로 변환
#                         # NFD(Normalization Form Decomposition) : 자음과 모음이 분리
#                         # row[col] = unicodedata.normalize('NFKD', row[col])
#                         #     https://blog.naver.com/PostView.nhn?blogId=duswl0319&logNo=221516880642&from=search&redirect=Log&widgetTypeCall=true&directAccess=false
#                         row[col] = row[col].replace('\n\t',' ')
#                         row[col] = row[col].replace('\n',' ')
#                         row[col] = row[col].replace('&lt;',' ')
#                         row[col] = row[col].replace('_x000d_',' ')
#                         row[col] = row[col].replace('\t\t',' ')
#                         row[col] = row[col].replace('@@',' ')
#                         row[col] = row[col].replace('.  .','.')
#                         row[col] = row[col].replace('. .','.')
#                         row[col] = row[col].replace('..','.')
#                         row[col] = row[col].replace('〜','~')
#                         row[col] = row[col].replace(' . ','.')
#                         row[col] = row[col].replace(' ． ','.')
#                         row[col] = row[col].replace('． ','.')
#                         row[col] = row[col].replace('. ','.')
#                         row[col] = row[col].replace('  ',' ')
#                         row[col] = row[col].replace('  ',' ')
#                         row[col] = row[col].replace('【과제】',' ')
#                         row[col] = row[col].replace('【요약】',' ')
#                         row[col] = row[col].replace('【해결 수단】',' ')
#                         str_tmp = sent_tokenizer.tokenize(row[col].strip())
#         #                 print('문장 분리: ', str_tmp, '\n'*3)
#         #                 result  = [f"{line}\n" for line in str_tmp]
#                         for line in str_tmp:
#                             f.write(f"{line}\n")
#     dataset = Dataset.from_text('data/c09k_corpus.txt')




### Training the Tokenizer

In [18]:
# Corpus file(s) the tokenizer is trained on. To train on several files,
# list them all here, e.g. ["train.txt", "test.txt"].
files = ["data/h04w4/tok_train_H04W4.txt"]

# Tokens that are added to the vocabulary as-is.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]

# Target vocabulary size. (BERT's default is 30,522; a much smaller value is
# used here.)
vocab_size = 3000

# Maximum sequence length; a lower value speeds up training.
max_length = 512

# Whether samples longer than max_length should be truncated.
truncate_longer_samples = False

In [19]:
# Create an untrained BERT WordPiece tokenizer with lower-casing, accent
# stripping and special handling of Chinese characters all switched off.
tokenizer = BertWordPieceTokenizer(
    strip_accents=False,
    lowercase=False,
    handle_chinese_chars=False,
)
# Learn the vocabulary from the corpus files configured above.
tokenizer.train(
    files=files,
    special_tokens=special_tokens,
    vocab_size=vocab_size,
    show_progress=True,
)
# Cap encoded sequences at max_length tokens.
tokenizer.enable_truncation(max_length=max_length)






In [20]:
# Directory in which the trained tokenizer is saved.
model_path = "h04w4_trained_model"

In [21]:
# Create the output directory if it does not exist yet. exist_ok=True makes
# this a no-op when the directory is already there (no check-then-create
# race), and makedirs also creates any missing parent directories.
os.makedirs(model_path, exist_ok=True)
# Save the tokenizer's model files into the directory ...
tokenizer.save_model(model_path)
# ... and the complete tokenizer as a single JSON file next to them.
tokenizer.save(os.path.join(model_path, 'tokenizer.json'))