In [None]:
# 各種パッケージのインストールとバージョン
!pip install transformers
!pip install tokenizers
!pip install sentencepiece
!pip list | grep torch
!pip list | grep transformers
!pip list | grep tokenizers
!pip list | grep sentencepiece

In [None]:
# Base directory: the corpus, the tokenizer model and the training outputs are
# all addressed relative to this path by the cells below.
# NOTE(review): this name shadows the Python builtin `dir()`.
dir = "./"



In [None]:
# Prepare the pre-training corpus.
# The corpus should be a text file with one sentence per line.
import pandas as pd  # required by pd.read_csv below; it is not imported anywhere else

df_header = pd.read_csv('XXX.csv')
print(df_header)



In [None]:
# Tokenization
import os

from sentencepiece import SentencePieceTrainer

# Make sure the output directory exists before the trainer writes the model files into it.
os.makedirs(dir + 'model', exist_ok=True)

# Train the SentencePiece model.
# The options are separated by whitespace. --input accepts a comma-separated
# list of files, so there must be no trailing comma after the corpus path.
spm_args = ' '.join([
    '--input=' + dir + 'corpus/corpus.txt',
    '--model_prefix=' + dir + 'model/sentencepiece',
    '--character_coverage=0.9995',
    '--vocab_size=100',
])
SentencePieceTrainer.Train(spm_args)

# SentencePiece parameters
# https://github.com/google/sentencepiece#train-sentencepiece-model
# training options
# https://github.com/google/sentencepiece/blob/master/doc/options.md




In [None]:
# Load the trained SentencePiece model through a transformers tokenizer.
#
# Tokenizers backed by SentencePiece are, at the time of writing, the following:
# >All transformers models in the library that use SentencePiece use it
# in combination with unigram. Examples of models using SentencePiece are ALBERT, XLNet, Marian, and T5.
# https://huggingface.co/transformers/tokenizer_summary.html

from transformers import AlbertTokenizer

# Build the ALBERT tokenizer from the SentencePiece model file
tokenizer = AlbertTokenizer.from_pretrained(
    dir + 'model/sentencepiece.model',
    keep_accents=True,
)

# Tokenize a sample sentence and show the resulting pieces
text = "吾輩は猫である。名前はまだ無い。"
print(tokenizer.tokenize(text))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['▁', '吾輩', 'は', '猫', 'である', '。', '名前', 'は', 'まだ無', 'い', '。']


In [None]:
# Configure the BERT model
from transformers import BertConfig
from transformers import BertForMaskedLM

# Define the BERT config.
# vocab_size is taken from the tokenizer (SentencePiece vocabulary plus the
# added special tokens) instead of being hard-coded, so the embedding matrix
# always matches the token ids the tokenizer can produce.
config = BertConfig(
    vocab_size=len(tokenizer),
    num_hidden_layers=12,
    intermediate_size=768,
    num_attention_heads=12,
)

# Create the BERT masked-language-model instance (randomly initialised)
model = BertForMaskedLM(config)

# Show the number of parameters
print('No of parameters: ', model.num_parameters())

No of parameters:  68158211


In [None]:
# Build the dataset used for pre-training
from transformers import DataCollatorForLanguageModeling, LineByLineTextDataset

# Read the corpus one line at a time and convert each line into tokens
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=dir + 'corpus/corpus.txt',
    block_size=256,  # max_length for the tokenizer
)

# Collator: takes a list of samples from the dataset and collates them into a
# batch as a dictionary of tensors, applying masking for the MLM objective
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)





In [None]:
# Run pre-training
from transformers import TrainingArguments
from transformers import Trainer

# Checkpoints and the final model are both written below the base directory `dir`.
output_dir = dir + 'outputBERT/'

# Define the pre-training parameters
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    save_steps=10000,
    save_total_limit=2,
    prediction_loss_only=True
)

# Create the trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

# Train
trainer.train()

# Save the trained model
trainer.save_model(output_dir)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 5
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 10


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/BERT-pretrained-transformers/outputBERT/
Configuration saved in /content/drive/MyDrive/BERT-pretrained-transformers/outputBERT/config.json
Model weights saved in /content/drive/MyDrive/BERT-pretrained-transformers/outputBERT/pytorch_model.bin


In [None]:
# Check the trained language model
from transformers import AlbertTokenizer, BertForMaskedLM, pipeline

# Reload the tokenizer and the trained model from the base directory `dir`
tokenizer = AlbertTokenizer.from_pretrained(dir + 'model/sentencepiece.model', keep_accents=True)
model = BertForMaskedLM.from_pretrained(dir + 'outputBERT')

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)

MASK_TOKEN = tokenizer.mask_token

# Solve a fill-in-the-blank task on a sentence that suits the corpus

text = "XXX{}XXX".format(MASK_TOKEN)
fill_mask(text)

loading file /content/drive/MyDrive/BERT-pretrained-transformers/model/sentencepiece.model
Adding [CLS] to the vocabulary
Adding [SEP] to the vocabulary
Adding <pad> to the vocabulary
Adding [MASK] to the vocabulary
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file /content/drive/MyDrive/BERT-pretrained-transformers/outputBERT/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 768,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.9.1",
  "type_vocab_siz

[{'score': 0.0019151709275320172,
  'sequence': 'は、 である。はい。',
  'token': 3,
  'token_str': '、'},
 {'score': 0.0012296646600589156,
  'sequence': 'はか である。はい。',
  'token': 9,
  'token_str': 'か'},
 {'score': 0.0007844513165764511,
  'sequence': 'は。 である。はい。',
  'token': 7,
  'token_str': '。'},
 {'score': 0.0006089677917771041,
  'sequence': 'はと である。はい。',
  'token': 6,
  'token_str': 'と'},
 {'score': 0.0005491935880854726,
  'sequence': 'はの である。はい。',
  'token': 8,
  'token_str': 'の'}]