# 1 モデルの準備

In [None]:
!pip install transformers fugashi ipadic

In [2]:
import torch
from transformers import BertJapaneseTokenizer, BertForMaskedLM

In [None]:
# Name of the pretrained Japanese BERT checkpoint used throughout the notebook.
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
# Load the tokenizer and the masked-language-model head from the same
# checkpoint name so that both come from one pretrained release.
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
bert_model = BertForMaskedLM.from_pretrained(model_name)

In [4]:
# Show the model hyperparameters (hidden size, number of layers, vocab size, ...).
print(bert_model.config)

BertConfig {
  "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertJapaneseTokenizer",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}



In [5]:
# Print the nested module structure of the model (embeddings, encoder layers, ...).
print(bert_model)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

# 2 tokenizer

In [6]:
# Example sentence with one [MASK] placeholder that the model will fill in later.
text = "私はお腹が空いたので[MASK]を食べたい"
# Split the sentence into subword tokens; the printed list shows that "[MASK]"
# is kept as a single token and continuation pieces are prefixed with "##".
tokens = tokenizer.tokenize(text)
print(tokens)

['私', 'は', 'お', '##腹', 'が', '空い', 'た', 'ので', '[MASK]', 'を', '食べ', 'たい']


In [7]:
# Encode the sentence as model inputs: pad (or truncate) to exactly 20 tokens
# and return PyTorch tensors ("pt") so the result can be passed to the model.
encoding = tokenizer(text, max_length=20, padding="max_length", truncation=True, return_tensors="pt")

# 3 BERTと単語ベクトル

In [8]:
# Forward pass for inference only: no_grad avoids building the autograd graph,
# which saves memory and time when no backward pass is needed.
with torch.no_grad():
    output = bert_model(**encoding)
# output[0] holds the per-position vocabulary scores: (batch, sequence, vocab).
print(output[0].shape)
# Find the position of the [MASK] token in the (single) input sequence.
# Use the tokenizer's own mask id instead of a hard-coded 4 so the lookup
# stays correct for a checkpoint whose vocabulary is ordered differently.
mask_index = encoding["input_ids"][0].tolist().index(tokenizer.mask_token_id)

torch.Size([1, 20, 32000])


In [9]:
# Scores over the vocabulary at the masked position of the first sequence.
mask_scores = output[0][0, mask_index]
# Id of the highest-scoring vocabulary entry, then its token string.
max_word = mask_scores.argmax().item()
mask_word = tokenizer.convert_ids_to_tokens(max_word)
# Show the sentence with the placeholder replaced by the best prediction.
filled_text = text.replace("[MASK]", mask_word)
print(filled_text)

私はお腹が空いたのでご飯を食べたい


In [10]:
# Ids of the five highest-scoring vocabulary entries at the masked position.
top_words = output[0][0][mask_index].topk(5).indices
# Convert all ids to token strings in one call, then print one filled-in
# sentence per candidate, best first.
for candidate in tokenizer.convert_ids_to_tokens(top_words.tolist()):
  print(text.replace("[MASK]", candidate))

私はお腹が空いたのでご飯を食べたい
私はお腹が空いたので野菜を食べたい
私はお腹が空いたので肉を食べたい
私はお腹が空いたのでカレーを食べたい
私はお腹が空いたのでラーメンを食べたい
