# BERTをMaskedLMで事前学習する  
MaskedLMでBERTの学習を改善します。

# モジュールインポート  
requirements.txtを利用してください。

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

In [25]:
import torch
from transformers import BertJapaneseTokenizer, BertForMaskedLM, BertModel, TrainingArguments, Trainer

# データ読み込み  
太宰治「走れメロス」 by 青空文庫 https://www.aozora.gr.jp/

In [3]:
with open('text.txt') as f:
    list_lines = f.readlines()

## コーパス作成

In [4]:
list_corpus = ''.join(list_lines).replace('\n', '').split('。')

In [5]:
print(len(list_corpus))

459


In [6]:
list_corpus[0]

'メロスは激怒した'

## データ作成  
BERT論文：  
https://arxiv.org/abs/1810.04805

maskは、[CLS]（文頭）[SEP]（文区切）[PAD]（空）の特殊文字には行わない。  
maskは、各トークンに15%の確率で行う。
maskは80%に、10%で別のトークンに、10%で元に戻す操作をする。  

In [7]:
model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
model_save = './model_pretrained'

In [8]:
# tokenizerを定義
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)

In [9]:
# vocab辞書のランダム取得用
mask_list_rand = list(tokenizer.get_vocab().items())

In [10]:
max_length = 128 # 1文のトークン数最大を指定。設定上限は512です。

In [11]:
dataset = []
# 1行ごとに処理していく
for i, text in tqdm(enumerate(list_corpus)):
    # tokenize（1文ずつ、max_lengthまでトークンを取得する。何もなければNull(0)が入る。）
    inputs = tokenizer(text, max_length=max_length, padding='max_length', truncation=True)

    # label（予測する文を入力）
    inputs['labels'] = torch.tensor(inputs.input_ids).detach().clone()
    
    # マスク確率配列作成
    mask_rand = torch.rand(torch.tensor(inputs.input_ids).shape)
    
    # マスク有無配列作成
    mask_arr = (mask_rand < 0.15) \
                * (torch.tensor(inputs.input_ids) != 0) \
                * (torch.tensor(inputs.input_ids) != 2) \
                * (torch.tensor(inputs.input_ids) != 3)
    
    # マスクされる配列を設定
    mask_sel = torch.flatten(mask_arr.nonzero()).tolist()
    for no in mask_sel:
        # 80%をマスクし、10%は他トークンにランダムに変換, 10%は何もしない。
        mask_num_rand = random.random()
        if mask_num_rand >= 0.20: # 80%
            inputs.input_ids[no] = 4 # マスクID
        elif mask_num_rand >= 0.10: # 10%
            vocab, vocab_no = random.choice(mask_list_rand)
            inputs.input_ids[no] = vocab_no
        else: # 10%
            pass
    
    # to tensor
    inputs = {k: torch.tensor(v) for k, v in inputs.items()}
    
    # set
    dataset.append(inputs)

  inputs = {k: torch.tensor(v) for k, v in inputs.items()}
459it [00:00, 4302.85it/s]


In [12]:
dataset[:10]

[{'input_ids': tensor([    2, 10897, 28466,     9,     4,    15,     4,     3,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

## データ分割  
train6:valid2:test2

In [14]:
# シャッフル
random.shuffle(dataset)
# Nを決める
n = len(dataset)
n_train = int(0.6 * n)
n_valid = n_train + int(0.2 * n)

In [15]:
print(n, n_train, n_valid)

459 275 91


In [16]:
dataset_train = dataset[:n_train]
dataset_valid = dataset[n_train:n_valid]
dataset_test = dataset[n_valid:]

# モデル定義  
kerasと違い、torchはtrainモードに変更が必要。

In [17]:
model = BertForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

# 学習  
実利用の場合、ハイパーパラメータを調整する必要があります。

In [19]:
args = TrainingArguments(
    output_dir='out', per_device_train_batch_size=16, num_train_epochs=10
)

In [20]:
# set trainer
trainer = Trainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    args=args,
)

In [21]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=180, training_loss=0.22343281639946833, metrics={'train_runtime': 1638.3425, 'train_samples_per_second': 0.11, 'total_flos': 233694658560000.0, 'epoch': 10.0})

# 評価

In [22]:
trainer.evaluate(dataset_test)

{'eval_loss': 0.03651389107108116,
 'eval_runtime': 20.4931,
 'eval_samples_per_second': 4.538,
 'epoch': 10.0}

# 保存

In [23]:
trainer.save_model(model_save)

# モデル呼び出し  
他ファイルで呼び出す際に利用します。

In [26]:
model = BertModel.from_pretrained(model_save)

Some weights of BertModel were not initialized from the model checkpoint at ./model_pretrained and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# tokenizerは元のモデルから呼び出してください。
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)