In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [3]:
import torch
from transformers import BertForMaskedLM, BertTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from transformers import DataCollatorForLanguageModeling



In [None]:
# トークナイザーとモデルの読み込み
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
from transformers import BertTokenizer, BertModel, BertConfig
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = BertConfig(vocab_size=tokenizer.vocab_size)
model = BertForMaskedLM(config)



In [5]:
tokens = tokenizer.tokenize('Hello, my dog is cute')
print(tokens)

['hello', ',', 'my', 'dog', 'is', 'cute']


In [None]:
from transformers import pipeline
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)

MASK_TOKEN = tokenizer.mask_token
text = '''
it's {}. So that, I don't feel good.
'''.format(MASK_TOKEN)
fill_mask(text)

[{'score': 0.00034311236231587827,
  'token': 10139,
  'token_str': '##yi',
  'sequence': "it'syi. so that, i don't feel good."},
 {'score': 0.00027401692932471633,
  'token': 25678,
  'token_str': '##islaus',
  'sequence': "it'sislaus. so that, i don't feel good."},
 {'score': 0.00022951728897169232,
  'token': 11206,
  'token_str': 'reluctantly',
  'sequence': "it's reluctantly. so that, i don't feel good."},
 {'score': 0.00021664977248292416,
  'token': 19207,
  'token_str': 'automobiles',
  'sequence': "it's automobiles. so that, i don't feel good."},
 {'score': 0.0002067431341856718,
  'token': 14558,
  'token_str': 'slogan',
  'sequence': "it's slogan. so that, i don't feel good."}]

In [None]:
# データセットの読み込み
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

# データセットのトークナイズ
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# データローダーの作成
train_loader = DataLoader(tokenized_datasets['train'], batch_size=8, shuffle=True)
eval_loader = DataLoader(tokenized_datasets['validation'], batch_size=8)

# 損失関数とオプティマイザの定義
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# トレーニングループ
model.train()
loss_history = []

for epoch in range(3):  # エポック数
    loss_summation = 0.0
    for i, batch in enumerate(train_loader):
        inputs, labels = batch['input_ids'], batch['input_ids']
        attention_mask = batch['attention_mask']

        # 順伝播
        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss_summation += loss.item()

        # 逆伝播とオプティマイザのステップ
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 10 == 0:
          print(f'Epoch {epoch+1}, Loss: {loss.item()}')
    loss_history.append(loss_summation)

    # モデルの保存
    torch.save(model.state_dict(), 'bert_model.pth')
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

print("Training complete!")

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Epoch 1, Loss: 10.478010177612305
Epoch 1, Loss: 8.999910354614258
Epoch 1, Loss: 8.392775535583496
Epoch 1, Loss: 7.933531761169434
Epoch 1, Loss: 7.638495922088623
Epoch 1, Loss: 6.756958961486816
Epoch 1, Loss: 6.181826591491699
Epoch 1, Loss: 5.841009140014648
Epoch 1, Loss: 5.0888752937316895
Epoch 1, Loss: 4.380066871643066
Epoch 1, Loss: 4.121727466583252
Epoch 1, Loss: 3.772393226623535
Epoch 1, Loss: 3.299144983291626
Epoch 1, Loss: 2.9387850761413574
Epoch 1, Loss: 2.3905749320983887
Epoch 1, Loss: 1.917541742324829
Epoch 1, Loss: 1.9224401712417603
Epoch 1, Loss: 1.6786000728607178
Epoch 1, Loss: 1.019134759902954
Epoch 1, Loss: 1.1547377109527588
Epoch 1, Loss: 0.8197243213653564
Epoch 1, Loss: 1.1697890758514404
Epoch 1, Loss: 0.6962764263153076
Epoch 1, Loss: 0.7674043774604797
Epoch 1, Loss: 1.0915473699569702
Epoch 1, Loss: 0.49325698614120483
Epoch 1, Loss: 0.612546443939209
Epoch 1, Loss: 0.6440722346305847
Epoch 1, Loss: 0.8319011330604553
Epoch 1, Loss: 0.3246716260

In [None]:
import matplotlib.pyplot as plt

# 損失の推移をグラフ化
plt.plot(loss_history)
plt.xlabel('Batch')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.show()