In [54]:
import os
import json
import re
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import *
import torch
from tokenizers import *
from datasets import *
from sklearn.model_selection import train_test_split
import nltk
from nltk.data import load
import nltk.translate.bleu_score as bleu

ModuleNotFoundError: No module named 'nltk.bleu_score'

In [10]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

### Data preparation

In [4]:
def encode_with_truncation(examples):
    """Tokenize ``examples["text"]`` with truncation and fixed-length padding.

    Intended as a mapping function over a dataset batch. The call is delegated
    to the module-level ``tokenizer`` with truncation enabled, padding set to
    ``"max_length"``, the module-level ``max_length`` as the length limit, and
    the special-tokens mask requested.

    NOTE(review): ``tokenizer`` and ``max_length`` are read from globals at call
    time; ``max_length`` is not assigned in the cells visible here - confirm it
    is defined before this function is used.

    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
        "return_special_tokens_mask": True,
    }
    return tokenizer(examples["text"], **tokenizer_options)

def encode_without_truncation(examples):
    """Tokenize ``examples["text"]`` without truncating or padding.

    Intended as a mapping function over a dataset batch. The texts are passed
    to the module-level ``tokenizer`` with only the special-tokens mask
    requested, and the tokenizer's result is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded

In [5]:
# Bind the tokenization callable used for dataset mapping to the name `encode`.
# The switch on a `truncate_longer_samples` flag is disabled (kept below for
# reference), so the truncating / padding variant is always selected.
# encode = encode_with_truncation if truncate_longer_samples else encode_without_truncation
encode = encode_with_truncation

In [34]:
# Load the pre-test text file as `dataset`. The alternative load of
# 'data/c09k_corpus.txt' (presumably the full corpus - confirm) is left
# commented out.
# dataset = Dataset.from_text('data/c09k_corpus.txt')
dataset = Dataset.from_text('data/c09k_pre_test.txt')



In [38]:
# dataset['text'][:100]

In [37]:
# Hand-picked Korean sentences used as fill-mask probes: one word of each is
# replaced by a mask token further down and every checkpoint predicts it.
examples = ['본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 높은 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다.',
'폴리아미드 섬유 또는 폴리우레탄 섬유로부터 되는 섬유(X)와 폴리오레핀(polyolefin) 수지와 포토 크로믹 재료를 함유 하는 섬유(Y)와 하지만 서로 꼼 합쳐진 것인 스타킹용 포토 크로믹 섬유',
'본 발명은 칼라표시장치용 광선택 흡광제, 이를 포함하는 코팅제 및 상기 코팅제로 제조된 필터를 제공한다.상기 광선택 흡광제는 테트라아자포피린 유도체의 이합체를 포함한다',
'상기한 목적을 달성하기 위하여 본 발명은 나노 세공의 다공성 육방정계의 실리카 분체내의 실리콘 일부가 전이금속으로 치환되고, 나노 세공내에 금속 산화물이 나노 크기로 담지된 복합분체를 제공한다']

In [39]:
def masking_ex(text, mask_token='[MASK]'):
    """Replace one randomly chosen word of ``text`` with ``mask_token``.

    The text is split on single spaces and re-joined with single spaces, so
    the original spacing is preserved exactly. The position is drawn with
    ``np.random.randint``, i.e. from NumPy's global random state.

    Args:
        text: Sentence to mask.
        mask_token: Placeholder written in place of the chosen word.

    Returns:
        The text with exactly one word replaced, or ``text`` unchanged when it
        contains no word at all (empty or whitespace-only input).
    """
    words = text.split(' ')
    # Leading, trailing or repeated spaces produce empty tokens. Picking one of
    # those would insert a mask without hiding any real word, so only
    # non-blank tokens are eligible.
    candidates = [index for index, word in enumerate(words) if word.strip()]
    if not candidates:
        return text
    chosen = candidates[np.random.randint(len(candidates))]
    words[chosen] = mask_token
    return ' '.join(words)

In [41]:
# masking_ex draws the mask position from NumPy's global random state. Seed it
# here so the same words are masked after every kernel restart and the
# checkpoint comparison is reproducible.
np.random.seed(42)
# Only the first 100 characters of each example are kept before masking.
masked_sample = [masking_ex(text[:100]) for text in examples]
for item in masked_sample:
    print(item)

본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 [MASK] 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다.
폴리아미드 섬유 또는 폴리우레탄 섬유로부터 되는 섬유(X)와 폴리오레핀(polyolefin) 수지와 포토 크로믹 재료를 함유 하는 [MASK] 하지만 서로 꼼 합쳐진 것인 스타킹용
본 [MASK] 칼라표시장치용 광선택 흡광제, 이를 포함하는 코팅제 및 상기 코팅제로 제조된 필터를 제공한다.상기 광선택 흡광제는 테트라아자포피린 유도체의 이합체를 포함한다
상기한 목적을 달성하기 위하여 본 발명은 나노 세공의 다공성 육방정계의 실리카 분체내의 실리콘 일부가 전이금속으로 [MASK] 나노 세공내에 금속 산화물이 나노 크기로 담지된 복합분


In [42]:
# One [run directory, checkpoint sub-directory] pair per checkpoint to compare.
model_path = [
    ['c09k_pretrained_bert', 'checkpoint-' + str(step)]
    for step in (1500, 2880, 4500, 6000, 7500)
]

In [46]:
def model_load(path):
    """Load a masked-LM checkpoint together with its tokenizer.

    Args:
        path: Two-item sequence ``[run_dir, checkpoint_name]``. The model
            weights are read from ``run_dir/checkpoint_name``; the tokenizer
            is read from ``run_dir`` itself.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    run_dir, checkpoint_name = path[0], path[1]
    checkpoint_dir = os.path.join(run_dir, checkpoint_name)
    return (
        BertForMaskedLM.from_pretrained(checkpoint_dir),
        BertTokenizerFast.from_pretrained(run_dir),
    )

In [47]:
# Inspect the first [run directory, checkpoint] entry.
model_path[0]

['c09k_pretrained_bert', 'checkpoint-1500']

In [51]:
# Run every masked sentence through a fill-mask pipeline built from each
# checkpoint, and record one [checkpoint, filled sentence, score] row per
# prediction the pipeline returns.
pred_result = []
for path in model_path:
    model, tokenizer = model_load(path)
    fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
    pred_result.extend(
        [path[1], prediction['sequence'], prediction['score']]
        for example in masked_sample
        for prediction in fill_mask(example)
    )

loading configuration file c09k_pretrained_bert/checkpoint-1500/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert/checkpoint-4320",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.22.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 8000
}

loading weights file c09k_pretrained_bert/checkpoint-1500/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at c09k_pretrained_

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 얻을 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.1457536369562149
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 제조할 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.050306208431720734
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를할 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.04066529870033264
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 제공할 수율로 제조

All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at c09k_pretrained_bert/checkpoint-2880.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaskedLM for predictions without further training.
loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file c09k_pretrained_bert/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_token": "[MASK]",
  "max_len": 512,
  "max_position_embeddings": 512,
  "model

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 얻을 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.24339978396892548
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 제조할 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.09570968151092529
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 제공할 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.06419500708580017
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를할 수율로 제조

All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at c09k_pretrained_bert/checkpoint-4500.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaskedLM for predictions without further training.
loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file c09k_pretrained_bert/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_token": "[MASK]",
  "max_len": 512,
  "max_position_embeddings": 512,
  "model

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 얻을 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.28728723526000977
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 제조할 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.1961677074432373
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 제공할 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.09082634747028351
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 형성할 수율ᄅ

All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at c09k_pretrained_bert/checkpoint-6000.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaskedLM for predictions without further training.
loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file c09k_pretrained_bert/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_token": "[MASK]",
  "max_len": 512,
  "max_position_embeddings": 512,
  "model

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 얻을 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.32060763239860535
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 제조할 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.19089171290397644
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 제공할 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.1143389567732811
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 형성할 수율ᄅ

All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at c09k_pretrained_bert/checkpoint-7500.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaskedLM for predictions without further training.
loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file c09k_pretrained_bert/config.json
Model config BertConfig {
  "_name_or_path": "c09k_pretrained_bert",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_token": "[MASK]",
  "max_len": 512,
  "max_position_embeddings": 512,
  "model

vocab_file vocab.txt
tokenizer_file tokenizer.json
added_tokens_file added_tokens.json
special_tokens_map_file special_tokens_map.json
tokenizer_config_file tokenizer_config.json
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 얻을 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.3447664976119995
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 제조할 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.19645027816295624
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 제공할 수율로 제조할 수 있는 폴리로탁산 모노머를 제공할 수 있다., confidence: 0.06571029126644135
본 발명에 따르면, 폴리로탁산 모노머의 우수한 기계 특성을 유지하면서, 생산성이 높고, 고품질의 재료를 형성할 수율ᄅ

In [65]:
# Build the frame straight from the list of rows. Going through np.array would
# coerce every cell, including the float scores, to strings.
pred_result = pd.DataFrame(pred_result, columns=['chkpoints', 'mask_pred', 'proba'])
# Series.str.normalize returns a new Series, so the result must be assigned
# back; otherwise the decomposed Hangul jamo in the predicted sentences is
# written to the CSV unchanged.
pred_result['mask_pred'] = pred_result['mask_pred'].str.normalize('NFKC')
pred_result.to_csv('data/masklm_pred_result.csv', encoding='utf-8')