In [3]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import matplotlib_fontja
import numpy as np
import pandas as pd
import sentencepiece as spm
from tokenizers import SentencePieceUnigramTokenizer
from transformers import PreTrainedTokenizerFast
import wandb

In [None]:
ds = load_dataset("globis-university/aozorabunko-clean", split="train")

In [3]:
ds

Dataset({
    features: ['text', 'footnote', 'meta'],
    num_rows: 16951
})

In [4]:
ds_train = ds["text"]

In [5]:
len(ds_train)

16951

物語生成モデル

In [None]:
import sentencepiece as spm


def ds_iter():
    """Yield the training texts for SentencePiece one row at a time.

    Rows are read lazily from the module-level ``ds`` dataset. Indexing
    ``ds["text"]`` first (as an earlier version did) can materialise the
    entire text column in memory before the first item is yielded
    (depending on the ``datasets`` version), which defeats the purpose of
    streaming through a generator.

    Yields:
        str: the ``text`` field of each dataset row.
    """
    # NOTE(review): every yielded item is a whole document. SentencePiece
    # skips inputs longer than its max_sentence_length setting - confirm that
    # long works are not being silently dropped from tokenizer training.
    for row in ds:
        yield row["text"]


spm.SentencePieceTrainer.Train(
    sentence_iterator=ds_iter(),
    model_prefix="trained/tokenizer/sp_jawiki",
    vocab_size=8000,
    model_type="unigram",
    # Fraction of characters the model must cover; rarer characters map to UNK.
    character_coverage=0.9995,
    train_extremely_large_corpus=True,
    # Fixed ids for the special tokens.
    unk_id=0,
    bos_id=1,
    eos_id=2,
    pad_id=3,
)

In [9]:
!pip install tokenizers

[0m

In [14]:
!conda install wget -y
!pip install protobuf
!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py

from tokenizers import SentencePieceUnigramTokenizer
tokenizer = SentencePieceUnigramTokenizer.from_spm("trained/tokenizer/sp_jawiki.model")

!rm sentencepiece_model_pb2.py

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.5.2
  latest version: 25.11.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=25.11.0



# All requested packages already installed.

[0m--2025-12-07 09:03:04--  https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py
raw.githubusercontent.com (raw.githubusercontent.com) をDNSに問いあわせています... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443 に接続しています... 接続しました。
HTTP による接続要求を送信しました、応答を待っています... 200 OK
長さ: 6257 (6.1K) [text/plain]
`sentencepiece_model_pb2.py' に保存中


2025-12-07 09:03:04 (48.5 MB/s) - `sentencepiece_model_pb2.py' へ保存完了 [6257/6257]



In [15]:
tokenizer

Tokenizer(vocabulary_size=8000, model=SentencePieceUnigram)

In [16]:
from tokenizers.processors import TemplateProcessing

# これをすると、なぜかメタスペースが入ってしまう。
# tokenizer.post_processor = TemplateProcessing(
#     single="<s> $A </s>", # BOS, EOSで囲む処理を指定
#     special_tokens=[
#         ("<s>", tokenizer.token_to_id("<s>")),
#         ("</s>", tokenizer.token_to_id("</s>")),
#     ],
# )
# tokenizer.save("trained/tokenizer/jawiki.json")

In [17]:
# !pip install transformers

In [18]:
from transformers import PreTrainedTokenizerFast

# Names of the special tokens to register on the wrapped tokenizer.
special_tokens = {
    "unk_token": "<unk>",
    "bos_token": "<s>",
    "eos_token": "</s>",
    "pad_token": "<pad>",
}

# Load the tokenizer JSON file into the transformers fast-tokenizer wrapper,
# then register the special tokens (trailing ';' hides the returned count).
tokenizer = PreTrainedTokenizerFast(tokenizer_file="trained/tokenizer/jawiki.json")
tokenizer.add_special_tokens(special_tokens);

In [19]:
text = "人工知能は人間の知能を超えるか？"
ids = tokenizer.encode(text)
ids

[7, 45, 1126, 384, 576, 8, 894, 384, 576, 9, 3477, 820, 15, 441]

In [20]:
tokenizer.decode([7, 45, 1126])
tokenizer.decode([15, 441])

'か?'

tokenizerができた。

tokenizerはpaddingまで自動でやってくれる。

In [21]:
from transformers.data.data_collator import DataCollatorForLanguageModeling

In [22]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [23]:
# Build the collate function.
# The DataLoader hands each mini-batch to collate_fn as a list; here that list
# is simply passed through the tokenizer.
def data_collator(batch: list):
    """Tokenize a list of raw texts into one batch of PyTorch tensors.

    Args:
        batch: list of samples as collected by the DataLoader.

    Returns:
        The tokenizer output, with padding and truncation enabled
        (max_length=1024) and tensors returned in "pt" format.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=1024,
    )
    return tokenizer(batch, **tokenizer_options)

In [None]:
train_dataloader = DataLoader(
    ds["text"], shuffle=True, batch_size=8, collate_fn=data_collator
)
next(iter(train_dataloader))

In [25]:
sample = next(iter(train_dataloader))

In [26]:
sample

{'input_ids': tensor([[2078,   99,  550,  ...,   55, 5775, 1821],
        [   7,  114,   88,  ..., 6825,  776,  705],
        [   7, 2155,  127,  ..., 2233, 2034,    5],
        ...,
        [   7, 3198, 6115,  ...,  239,  657, 2311],
        [2019,  147,  914,  ...,    3,    3,    3],
        [   7, 1326,   47,  ...,    3,    3,    3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

# transformer

In [27]:
import torch
from torch import nn

In [28]:
class CausalAttention(nn.Module):
    """Scaled dot-product attention with a causal (look-ahead) mask.

    Position ``i`` can only attend to positions ``<= i``; scores for later
    positions are set to ``-inf`` before the softmax.
    """

    def __init__(self, d: int):
        """
        Args:
            d: dimension used for scaling; scores are divided by ``sqrt(d)``.
        """
        super().__init__()
        self.scale = d ** (1 / 2)  # d is the only value supplied by the caller
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v):
        """
        Args:
            q: (..., seq_length, embedding_size)
            k: (..., seq_length, embedding_size)
            v: (..., seq_length, embedding_size)

        Returns:
            Tensor with the same leading dimensions and sequence length as
            ``q`` and the feature size of ``v``.
        """
        seq_len = q.shape[-2]
        score = (q @ k.mT) / self.scale
        # Causal part: True strictly above the diagonal marks "future" keys.
        # Build the mask on the same device as the scores so the module also
        # works after being moved to an accelerator.
        mask = torch.ones(seq_len, seq_len, device=score.device).triu(diagonal=1).bool()
        score = score.masked_fill(mask, -torch.inf)  # masked entries -> -inf

        return self.softmax(score) @ v

In [29]:
class MultiHeadAttention(nn.Module):
    """Multi-head causal attention built from per-head ``CausalAttention``.

    The projected q/k/v tensors are split into ``n_head`` chunks along the
    feature axis, attention is applied to each chunk, and the concatenated
    result goes through an output projection.
    """

    def __init__(self, n_head, d_model):
        """
        Args:
            n_head: number of attention heads. Taking the head count (rather
                than a hard-coded per-head size such as 64) avoids a magic
                number and lets the divisibility contract be checked here.
            d_model: model width; must be divisible by ``n_head``.
        """
        super().__init__()
        assert d_model % n_head == 0, "割り切れないよ"
        self.n_head = n_head
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        # Each head only sees d_model // n_head features, so the dot products
        # must be scaled by sqrt(d_head), not sqrt(d_model).
        self.attention = CausalAttention(d_model // n_head)

    def forward(self, q, k, v):
        """Apply causal multi-head attention.

        Args:
            q, k, v: tensors whose last dimension is ``d_model``.

        Returns:
            Tensor with last dimension ``d_model``.
        """
        qw = self.w_q(q)
        kw = self.w_k(k)
        vw = self.w_v(v)
        # chunk() splits the feature axis into one slice per head.
        head_outputs = [
            self.attention(qw_i, kw_i, vw_i)
            for qw_i, kw_i, vw_i in zip(
                qw.chunk(self.n_head, dim=-1),
                kw.chunk(self.n_head, dim=-1),
                vw.chunk(self.n_head, dim=-1),
            )
        ]
        return self.w_o(torch.cat(head_outputs, dim=-1))

In [30]:
class FFN(nn.Module):
    """Feed-forward sub-layer: Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model)."""

    def __init__(self, d_model, d_ff):
        """
        Args:
            d_model: input and output feature size.
            d_ff: hidden feature size between the two linear layers.
        """
        super().__init__()
        expand = nn.Linear(d_model, d_ff)
        activation = nn.ReLU()
        project = nn.Linear(d_ff, d_model)
        self.net = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the stack; the last dimension stays ``d_model``."""
        hidden = self.net(x)
        return hidden

In [68]:
class TransformerLayer(nn.Module):
    """One post-norm block: self-attention, then feed-forward.

    Each sub-layer is followed by a residual addition and a LayerNorm.
    """

    def __init__(self, n_head, d_model, d_ff):
        """
        Args:
            n_head: number of attention heads.
            d_model: feature size of the block's input and output.
            d_ff: hidden size of the feed-forward sub-layer.
        """
        super().__init__()
        self.mha = MultiHeadAttention(n_head, d_model)
        self.ffn = FFN(d_model, d_ff)
        # LayerNorm instances do not share parameters, so each sub-layer
        # needs its own instance.
        self.layer_norm_1 = nn.LayerNorm(d_model)
        self.layer_norm_2 = nn.LayerNorm(d_model)

    def forward(self, x):  # (batch, seq_len, d_model)
        """Apply attention and feed-forward with post-norm residual connections."""
        # Self-attention (q = k = v = x) + residual, then normalise (post norm).
        attended = self.layer_norm_1(self.mha(x, x, x) + x)
        # Feed-forward + residual, then normalise.
        return self.layer_norm_2(self.ffn(attended) + attended)

In [108]:
class Transformer(nn.Module):
    """Stack of ``TransformerLayer`` blocks with learned positional embeddings.

    ``forward`` returns unnormalised scores (logits) over the vocabulary for
    every position; the softmax is not applied here.
    """

    def __init__(
        self, vocab_size, n_head, d_model, d_ff, n_layers, context_window
    ):
        """
        Args:
            vocab_size: number of token ids; also the size of the output axis.
            n_head: attention heads per layer.
            d_model: embedding / hidden size.
            d_ff: hidden size of each feed-forward sub-layer.
            n_layers: number of stacked ``TransformerLayer`` blocks.
            context_window: number of positions that get a positional vector.
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        blocks = [
            TransformerLayer(n_head, d_model, d_ff) for _ in range(n_layers)
        ]
        self.transformer_layers = nn.Sequential(*blocks)
        # Projects hidden states to (batch_size, seq_len, vocab_size).
        self.linear = nn.Linear(d_model, vocab_size)
        # Not used by forward(); kept available for turning logits into
        # probabilities.
        self.softmax = nn.Softmax(dim=-1)
        # Learned positional vectors of width d_model, allocated up to the
        # maximum length and sliced to the actual length in forward().
        self.pe = nn.Parameter(torch.randn(context_window, d_model))

    def forward(self, x):  # x: (batch_size, seq_len) integer token ids
        """Return logits of shape (batch_size, seq_len, vocab_size)."""
        token_vectors = self.embed(x)  # (batch_size, seq_len, d_model)
        seq_len = token_vectors.size(-2)
        hidden = token_vectors + self.pe[:seq_len]  # (batch_size, seq_len, d_model)
        hidden = self.transformer_layers(hidden)  # (batch_size, seq_len, d_model)
        return self.linear(hidden)  # (batch_size, seq_len, vocab_size)

バッチ内の各文章の、各トークン位置ごとに、語彙全体に対するスコア（logits。softmaxを通すと確率分布になる）が出てくる。

In [109]:
vocab_size = tokenizer.vocab_size
n_head = 8
d_model = 512
d_ff = 2048
n_layers = 6
context_window = 1024

In [110]:
transformer = Transformer(
    vocab_size, n_head, d_model, d_ff, n_layers, context_window
)

# Count the trainable parameters of the model.
n_param = sum(p.numel() for p in transformer.parameters() if p.requires_grad)
print(f"{n_param=:,}")
# With the configuration used here this prints 27,638,592 (about 27.6M) parameters.
# Assuming float32 weights, each parameter takes 4 bytes.
model_size = n_param * 4
print(f"{model_size=:,}")

n_param=27,638,592
model_size=110,554,368


0.03Bモデルみたいな感じのができる。

In [111]:
transformer

Transformer(
  (embed): Embedding(8000, 512)
  (transformer_layers): Sequential(
    (0): TransformerLayer(
      (mha): MultiHeadAttention(
        (w_q): Linear(in_features=512, out_features=512, bias=True)
        (w_k): Linear(in_features=512, out_features=512, bias=True)
        (w_v): Linear(in_features=512, out_features=512, bias=True)
        (w_o): Linear(in_features=512, out_features=512, bias=True)
        (attention): CausalAttention(
          (softmax): Softmax(dim=-1)
        )
      )
      (ffn): FFN(
        (net): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
      )
      (layer_norm_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (layer_norm_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerLayer(
      (mha): MultiHeadAttention(
        (w_q): Linear(in_features=512, out_features=5

- input: tokenizeされたデータ
- output: softmaxの確率分布（モデル自体はsoftmax前のlogitsを返し、softmaxはcross entropy lossの内部で適用される）
- loss: cross entropy loss


In [112]:
sample_x = sample["input_ids"]
sample_x

tensor([[2078,   99,  550,  ...,   55, 5775, 1821],
        [   7,  114,   88,  ..., 6825,  776,  705],
        [   7, 2155,  127,  ..., 2233, 2034,    5],
        ...,
        [   7, 3198, 6115,  ...,  239,  657, 2311],
        [2019,  147,  914,  ...,    3,    3,    3],
        [   7, 1326,   47,  ...,    3,    3,    3]])

In [113]:
sample["input_ids"][0]

tensor([2078,   99,  550,  ...,   55, 5775, 1821])

$$
loss = -\sum \log p(x_{t}|x_{<t})
$$

transformer: 長さnのsequenceを入れると、長さnのsequenceを出す。

In [114]:
sample["input_ids"][0].size()

torch.Size([1024])

In [115]:
sample["input_ids"][0]

tensor([2078,   99,  550,  ...,   55, 5775, 1821])

$$
\begin{aligned}
x &= x_1, x_2, \ldots, x_T \\
y &= \text{transformer}(x) = y_1, y_2, \ldots, y_T \quad \text{（こいつらは確率分布）} \\
y_t &= \text{transformer}(x_{\le t}) \\
y_t &= p(x_{t+1} \mid x_{\le t})
\end{aligned}
$$

$x = x_1, x_2, \ldots, x_T$ に対する入力(input)と教師ラベル(label)のペア：

- input: $x_1$ / label: $x_2$
- input: $x_1, x_2$ / label: $x_3$
- input: $x_1, x_2, \ldots, x_t$ / label: $x_{t+1}$
- input: $x_1, x_2, \ldots, x_{T-1}$ / label: $x_T$

In [116]:
sample['input_ids'][0]

tensor([2078,   99,  550,  ...,   55, 5775, 1821])

In [117]:
sample_input = sample['input_ids'][0].clone()
sample_input = sample_input[:-1]

sample_super = sample['input_ids'][0].clone()
sample_super = sample_super[1:]

In [118]:
sample_input[:2]

tensor([2078,   99])

In [119]:
sample_super[3]

tensor(6817)

In [120]:
sample_input = sample_input.unsqueeze(0)

In [121]:
sample_input

tensor([[2078,   99,  550,  ...,  449,   55, 5775]])

以下が予測と教師のペア

In [122]:
transformer(sample_input).size()

torch.Size([1, 1023, 8000])

In [123]:
sample_super.size()

torch.Size([1023])

In [124]:
vocab_size = tokenizer.vocab_size
n_head = 8
d_model = 512
d_ff = 2048
n_layers = 6
context_window = 1024
model = Transformer(
    vocab_size, n_head, d_model, d_ff, n_layers, context_window
)

In [138]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.AdamW(model.parameters(), lr=0.001)

In [139]:
train_dataloader = DataLoader(
    ds["text"], shuffle=True, batch_size=8, collate_fn=data_collator
)

In [None]:
# outputs: (batch, seq_len, vocab) scores from the model
# label:   (batch, seq_len) token ids (no vocab axis)
from tqdm import tqdm

In [145]:
for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(tqdm(train_dataloader)):
        # data is the tokenizer output; data["input_ids"] is (batch, seq_len).
        # Shift along the SEQUENCE axis for next-token prediction: the model
        # sees tokens [0, T-1) and is trained to predict tokens [1, T).
        # (Slicing with [:-1] / [1:] alone would shift along the batch axis
        # and pair each sample with a different sample's tokens.)
        token_ids = data["input_ids"]
        inputs = token_ids[:, :-1]
        label = token_ids[:, 1:]

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)  # (batch, seq_len - 1, vocab_size)
        # CrossEntropyLoss expects the class axis second:
        # (batch, vocab_size, seq_len - 1) against labels (batch, seq_len - 1).
        loss = criterion(outputs.transpose(-2, -1), label)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

  0%|          | 0/2119 [00:00<?, ?it/s]

  4%|▍         | 91/2119 [01:39<36:50,  1.09s/it]


KeyboardInterrupt: 

学習させて、推論させてみる。