### __BERTによる単語穴埋め__

単語の穴埋め(Masked Language Modeling)は教師あり学習(正確には、テキスト自体から正解ラベルを作る自己教師あり学習)だが、BERTの事前学習タスクそのものであるため、基本的にはファインチューニングは不要である

In [1]:
!pip install transformers[ja] | tail -n 1

Successfully installed fugashi-1.2.1 huggingface-hub-0.11.1 ipadic-1.0.0 plac-1.3.5 pyknp-0.6.1 sudachidict-core-20221021 sudachipy-0.6.6 tokenizers-0.13.2 transformers-4.25.1 unidic-1.1.0 unidic-lite-1.0.8


In [2]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, BertForMaskedLM

In [3]:
# Hugging Face Hub id of the pretrained Japanese BERT checkpoint (whole-word masking).
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"

# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

In [4]:
# Display the tokenizer's repr to inspect its special tokens.
# It shows mask_token='[MASK]', so the position to be filled in is written as [MASK].
tokenizer

PreTrainedTokenizer(name_or_path='cl-tohoku/bert-base-japanese-whole-word-masking', vocab_size=32000, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Model structure comparison (1/2): the generic encoder.
# AutoModel resolves to the general-purpose BertModel, so the final layer
# only applies a simple transformation; the pretraining heads (cls.*) stored
# in the checkpoint are not loaded, which is what the warning below reports.
model = AutoModel.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

# Show the module tree.
model

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [6]:
# Model structure comparison (2/2): the masked-language-modeling variant.
# BertForMaskedLM has a final layer specialised for Masked Language Modeling;
# unlike the AutoModel version, it outputs a 32000-dimensional vector
# (one score per vocabulary entry) for each token.
mlm_model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

# Show the module tree.
mlm_model

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [7]:
# Sentences to complete; each [MASK] marks the word the model should predict.
text = [
    "日本の首都は[MASK]です",
    "アメリカの首都は[MASK]です"
]

# Tokenize.
# padding=True pads every sentence to the longest one in the batch, so the batch
# can be stacked into a single tensor even when the sentences tokenize to
# different lengths (without it, only equal-length sentences work). The returned
# attention_mask tells the model to ignore the padded positions.
inputs = tokenizer(text, padding=True, return_tensors="pt")

# Inference; no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    logits = mlm_model(**inputs).logits

# batch size x number of tokens x vocabulary size
logits.shape

torch.Size([2, 8, 32000])

In [8]:
# Boolean mask of shape (batch, tokens): True where the input token is [MASK].
mask_index = inputs["input_ids"] == tokenizer.mask_token_id

# Indexing logits (batch, tokens, vocab) with the 2-D mask keeps the vocabulary
# axis and yields one row of scores per [MASK], ordered sentence by sentence.
# Taking the argmax of each row gives the best-scoring token id for every [MASK]
# and stays correct for any number of [MASK] tokens per sentence; flattening all
# of a sentence's masked scores into one row would produce ids >= vocabulary size
# as soon as a sentence contains more than one [MASK].
predicted_token_ids = logits[mask_index].argmax(dim=-1)

# Convert the token IDs back to words.
tokenizer.decode(predicted_token_ids)

'東京 ニューヨーク'