In [1]:
import os
import json
import re
import pickle
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import *
from tokenizers import *
# pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org tokenizers

from datasets import *
from sklearn.model_selection import train_test_split
import nltk
from nltk.data import load

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sent_tokenizer = load("tokenizers/punkt/english.pickle")
extra_abbreviations = [
    'RE', 're', 'pat', 'no', 'nos', 'vol', 'jan', 'feb', 'mar', 'apr', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'eng', 'ser', 'ind', 'ed', 'pp',
    'e.g', 'al', 'T.E.N.S', 'E.M.S', 'F.E', 'U.H.T.S.T', 'degree',
    '/gm', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

In [3]:
sent_tokenizer._params.abbrev_types.update(extra_abbreviations)

In [5]:
data_path = 'data/h04w4/'
kiwee_data_list = os.listdir(data_path)
kiwee_data_list = [fn for fn in kiwee_data_list if fn[-4:]=='xlsx']
print(kiwee_data_list)

['h04w4_03.xlsx', 'h04w4_05.xlsx', 'h04w4_01.xlsx', 'h04w4_04.xlsx', 'h04w4_02.xlsx']


In [None]:
def read_kiwee_data():
    kiwee_files = kiwee_data_list
    with open('data/c09k_corpus.txt', 'a', encoding='utf-8') as f:
        f.truncate(0)
        for i, fn in enumerate(kiwee_files):
            print(fn)
            tmp = pd.read_excel(os.path.join(data_path, fn), engine="openpyxl").fillna('')
            # pandas는 비어있는 컬럼의 dtype을 float로 바꿔서 인식한다. 그로 인해 토크나이징 할 데이터가 없으면 오류가 발생되어 fillna를 사용해 모두 텍스트로 인식시키도록 한다
            # https://stackoverflow.com/questions/53953286/pandas-read-excel-blanks-in-string-columns-convert-to-floats-converting-via-st
            col_text = ['발행번호', '발명의명칭', '요약', '대표청구항', '과제', '해결방안']
            tmp = tmp[col_text]
            for index, row in tmp.iterrows():
                # print(index, '\n', row['발명의명칭'], row['요약'], row['대표청구항'], row['과제'], row['해결방안'], '\n')
                for col in col_text[1:]:
                    # print('처리중인 데이터:', col, row[col], '\n')
                    if row[col].strip() == "":
                        pass
                    else:
                        # print(row[col].strip())
                        row[col] = unicodedata.normalize('NFKC', row[col])
                        # row[col] = unicodedata.normalize('NFC', row[col])  # 자음과 모음이 깨질 때는 NFC로 변환
                        # NFD(Normalization Form Decomposition) : 자음과 모음이 분리
                        # row[col] = unicodedata.normalize('NFKD', row[col])
                        #     https://blog.naver.com/PostView.nhn?blogId=duswl0319&logNo=221516880642&from=search&redirect=Log&widgetTypeCall=true&directAccess=false
                        row[col] = row[col].replace('\n\t', ' ')
                        row[col] = row[col].replace('\n', ' ')
                        row[col] = row[col].replace('&lt;', ' ')
                        row[col] = row[col].replace('_x000d_', ' ')
                        row[col] = row[col].replace('\t\t', ' ')
                        row[col] = row[col].replace('@@', ' ')
                        row[col] = row[col].replace('.  .', '.')
                        row[col] = row[col].replace('. .', '.')
                        row[col] = row[col].replace('..', '.')
                        row[col] = row[col].replace('〜', '~')
                        row[col] = row[col].replace(' . ', '.')
                        row[col] = row[col].replace(' ． ', '.')
                        row[col] = row[col].replace('． ', '.')
                        row[col] = row[col].replace('. ', '.')
                        row[col] = row[col].replace('  ', ' ')
                        row[col] = row[col].replace('  ', ' ')
                        row[col] = row[col].replace('【과제】', ' ')
                        row[col] = row[col].replace('【요약】', ' ')
                        row[col] = row[col].replace('【해결 수단】', ' ')
                        str_tmp = sent_tokenizer.tokenize(row[col].strip())
                        # print('문장 분리: ', str_tmp, '\n'*3)
                        # result  = [f"{line}\n" for line in str_tmp]
                        for line in str_tmp:
                            f.write(f"{line}\n")

In [None]:
# read_kiwee_data()

dataset = Dataset.from_text('data/c09k_170k_corpus.txt')
# print(dataset['text'][0:5])
# print(dataset['text'][-5:])

# d = dataset.train_test_split(test_size=0.15)

try:
    with open('data/c09k_170k_dataset.pkl', 'rb') as f:
        d = pickle.load(f)
    print('dataset loading completed')
except:
    d = dataset.train_test_split(test_size=0.15)
    with open('data/c09k_170k_dataset.pkl', 'wb') as f:
        pickle.dump(d, f)
    print('dataset split/saving completed')

print(d)
print(d['train']['text'][0:5])
print(d['test']['text'][-5:])

# for t in d["train"]["text"][:3]:
#     print(t)
#     print("="*50)
#
# for t in d["test"]["text"][:3]:
#     print(t)
#     print("="*50)

special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]
files = ["data/c09k_corpus.txt"]
vocab_size = 8000
max_length = 512
truncate_longer_samples = False

#
# def dataset_to_text(dataset, output_filename="data.txt"):
#     """Utility function to save dataset text to disk, useful for using the texts to train the tokenizer
#      (as the tokenizer accepts files)"""
#     with open(output_filename, "w") as f:
#         for t in dataset["text"]:
#             print(t, file=f)
#
#
# model_path = 'c09k_pretrained_bert'
#
# dataset_to_text(d["train"], "data/c09k_pre_train.txt")
# dataset_to_text(d["test"], "data/c09k_pre_test.txt")