前回を受けてGPTをもう少し精度アップさせることを狙う

- トークナイザを文字化けしない対応品に変更
- GPTをRoPE、RMSノルム（RMSNorm）レイヤに変更

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import IterableDataset, DataLoader

# 1. Configuration
MODEL_NAME = "elyza/ELYZA-japanese-Llama-2-7b" # Reuse an existing tokenizer (use the earlier mock one if you built your own)
DATASET_NAME = "allenai/c4"
SUBSET = "ja"        # Japanese subset
SEQ_LEN = 12 * 4          # Sequence length in tokens
BATCH_SIZE = 8

# 2. Tokenizer setup
# NOTE: to use a tokenizer with a larger vocabulary (e.g. Llama 3), set
# MODEL_NAME to something like "meta-llama/Meta-Llama-3-8B".
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Fall back to the EOS token when the tokenizer defines no PAD token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# 3. Custom dataset class (streaming-friendly)
class GPTPretrainingDataset(IterableDataset):
    """Pack a stream of text examples into fixed-length token sequences.

    Every example's ``"text"`` field is tokenized without special tokens and
    appended to a running buffer, so consecutive documents are concatenated
    with no separator. Each time the buffer holds at least ``seq_len`` tokens,
    full chunks are emitted; tokens left over when the stream ends are dropped.

    Args:
        dataset: Iterable of mappings that each contain a ``"text"`` string.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            and returning a list of token ids.
        seq_len: Number of tokens per emitted sequence; must be positive.

    Raises:
        ValueError: If ``seq_len`` is not positive (the chunking loop could
            never make progress).
    """

    def __init__(self, dataset, tokenizer, seq_len):
        if seq_len <= 0:
            raise ValueError(f"seq_len must be positive, got {seq_len}")
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __iter__(self):
        """Yield ``{"input_ids", "labels"}`` dicts of ``torch.long`` tensors.

        ``labels`` is an independent copy of ``input_ids``; the one-token
        shift for next-token prediction is left to the training loop.
        """
        seq_len = self.seq_len
        buffer = []
        for example in self.dataset:
            # Tokenize without special tokens or padding and queue the ids.
            buffer.extend(
                self.tokenizer.encode(example['text'], add_special_tokens=False)
            )

            # Emit every complete chunk by index, then trim the buffer once.
            # Re-slicing the buffer per chunk would copy the remainder each
            # time, which is quadratic for long documents.
            consumed = len(buffer) - len(buffer) % seq_len
            for start in range(0, consumed, seq_len):
                input_tensor = torch.tensor(
                    buffer[start:start + seq_len], dtype=torch.long
                )
                yield {
                    "input_ids": input_tensor,
                    "labels": input_tensor.clone()
                }
            if consumed:
                del buffer[:consumed]

# 4. Load the data (streaming mode)
print("mC4の日本語データをストリーミング中...")
raw_dataset = load_dataset(
    DATASET_NAME,
    # Select the training shards of the configured language. Deriving the
    # pattern from SUBSET keeps the language defined in exactly one place.
    data_files={'train': f'multilingual/c4-{SUBSET}.*.json.gz'},
    split="train",
    streaming=True
)

# 5. Wrap the stream in the packing dataset and a DataLoader
gpt_dataset = GPTPretrainingDataset(raw_dataset, tokenizer, SEQ_LEN)
train_loader = DataLoader(gpt_dataset, batch_size=BATCH_SIZE)

# 6. Sanity check: pull a few batches and print a decoded preview
print("最初のバッチを取得中...")
for i, batch in enumerate(train_loader):
    print(f"\n--- Batch {i+1} ---")
    print(f"Shape: {batch['input_ids'].shape}") # [Batch, SEQ_LEN]

    # Decode the first 20 tokens of the first sample in the batch.
    sample_text = tokenizer.decode(batch['input_ids'][0][:20])
    print(f"Decoded Text Sample:\n{sample_text}...")

    if i == 2: # Stop after inspecting three batches
        break

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
import random

# ====================================================================
# Hyperparameters (small-model defaults)
# NOTE(review): the training cell builds the model with literal sizes
# (embed_dim=256, num_heads=8, num_layers=6, ffn_hidden_dim=1024) and a
# literal learning rate rather than these constants — confirm which set
# is meant to be authoritative.
# ====================================================================
VOCAB_SIZE = 50       # Vocabulary size

EMBED_DIM = 64        # Embedding dimension (d_model)
NUM_HEADS = 4         # Number of attention heads
NUM_LAYERS = 3        # Number of Transformer decoder blocks
FFN_HIDDEN_DIM = EMBED_DIM * 2 # Hidden dimension of the feed-forward network

# Mixture-of-Experts settings (optional)
USE_MOE = False       # Whether to use MoE
NUM_EXPERTS = 4       # Number of experts
TOP_K = 2             # Number of experts activated per token
MOE_LOSS_COEF = 0.01  # Coefficient for the load-balancing loss

# NOTE(review): this rebinds BATCH_SIZE, which the data-loading cell set to 8;
# train_loader was already constructed with the earlier value.
BATCH_SIZE = 4
LEARNING_RATE = 1e-4
EPOCHS = 100

# Special token IDs
PAD_TOKEN_ID = 0
CLS_TOKEN_ID = 1 # Normally unnecessary for GPT; included for illustration
MASK_TOKEN_ID = 2 # Normally unnecessary for GPT; intended for MLM data generation
BOS_TOKEN_ID = 3 # Begin Of Sequence (the one normally used)
EOS_TOKEN_ID = 4 # End Of Sequence

# Device selection: use CUDA when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
import random


# Special token IDs (same values as in the hyperparameter cell).
# PAD_TOKEN_ID is passed as padding_idx to the GPT token embedding.
PAD_TOKEN_ID = 0
CLS_TOKEN_ID = 1 # Normally unnecessary for GPT; included for illustration
MASK_TOKEN_ID = 2 # Normally unnecessary for GPT; intended for MLM data generation
BOS_TOKEN_ID = 3 # Begin Of Sequence (the one normally used)
EOS_TOKEN_ID = 4 # End Of Sequence


def rotate_half(x):
    """Map each adjacent pair ``(a, b)`` on the last axis to ``(-b, a)``.

    Args:
        x: Tensor of shape ``(..., dim)`` with an even ``dim``.

    Returns:
        Tensor of the same shape as ``x`` with every pair swapped and the
        first element of each output pair negated.
    """
    even_part = x[..., 0::2]
    odd_part = x[..., 1::2]
    # Re-interleave: stack the pair members on a new trailing axis, then
    # merge that axis back into the feature axis.
    paired = torch.stack([odd_part.neg(), even_part], dim=-1)
    return paired.flatten(start_dim=-2)

def apply_rope(q, k, seq_len, device):
    """Apply rotary position embeddings (RoPE) to query and key tensors.

    Every adjacent feature pair ``(x[2i], x[2i + 1])`` of the token at
    position ``p`` is rotated by the angle ``p * 10000 ** (-2i / head_dim)``:

        out[2i]     = x[2i] * cos - x[2i + 1] * sin
        out[2i + 1] = x[2i] * sin + x[2i + 1] * cos

    Both members of each pair are rotated. Rotating only one member and
    copying the other through is not a rotation: it does not preserve vector
    norms and the q·k score no longer depends only on the relative offset
    between positions.

    Args:
        q: Query tensor of shape ``(batch, heads, seq_len, head_dim)``.
        k: Key tensor of shape ``(batch, heads, seq_len, head_dim)``.
        seq_len: Number of positions; must equal the size of dim ``-2``.
        device: Device on which the angle table is built.

    Returns:
        Tuple ``(q_rotated, k_rotated)`` with the same shapes and dtypes as
        the inputs; the feature layout (interleaved pairs) is unchanged.

    Raises:
        AssertionError: If ``head_dim`` is odd.
    """
    head_dim = q.size(-1)
    assert head_dim % 2 == 0, "RoPE requires even head_dim"

    # Per-pair rotation frequencies, built in float32 so positions and
    # frequencies never mix integer and floating dtypes.
    pair_index = torch.arange(0, head_dim, 2, device=device, dtype=torch.float32)
    theta = 10000.0 ** (-pair_index / head_dim)
    positions = torch.arange(seq_len, device=device, dtype=torch.float32)

    freqs = positions[:, None] * theta[None, :]  # (seq_len, head_dim/2)
    sin = freqs.sin()[None, None, :, :]
    cos = freqs.cos()[None, None, :, :]

    def _rotate(x):
        # Rotate each (even, odd) pair as a 2-D vector, then re-interleave.
        x_even = x[..., ::2]
        x_odd = x[..., 1::2]
        rotated_even = x_even * cos - x_odd * sin
        rotated_odd = x_even * sin + x_odd * cos
        rotated = torch.stack((rotated_even, rotated_odd), dim=-1).flatten(-2)
        return rotated.type_as(x)

    return _rotate(q), _rotate(k)

class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with RoPE on q and k.

    Args:
        embed_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads (positive).

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_dim``. Without this check the mismatch only surfaces
            later as a shape error inside ``forward``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)
        self.out_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        """Attend over ``x`` and return a tensor of the same shape.

        Args:
            x: Input of shape ``(batch, seq_len, embed_dim)``.
            mask: Optional tensor broadcastable to the
                ``(batch, heads, seq_len, seq_len)`` score tensor; positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_dim)``.
        """
        batch_size, seq_len, _ = x.shape

        # Project and split into heads: (batch, heads, seq_len, head_dim).
        q = self.q_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Inject position information into queries and keys only.
        q, k = apply_rope(q, k, seq_len, x.device)

        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            # -inf scores become zero weight after the softmax.
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attention_weights = F.softmax(scores, dim=-1)
        context = torch.matmul(attention_weights, v)

        # Merge the heads back into a single embed_dim-wide feature axis.
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim)
        output = self.out_linear(context)
        return output

class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last axis with a learned gain.

    Args:
        dim: Size of the last axis; also the length of the gain vector.
        eps: Constant added to the mean square before the reciprocal root.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        """Scale ``x`` by ``1 / sqrt(mean(x**2) + eps)`` and by ``weight``."""
        mean_square = torch.mean(x.pow(2), dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        normalized = x * inv_rms
        return self.weight * normalized

class SwiGLU(nn.Module):
    """Gated feed-forward layer: ``w3(silu(w1(x)) * w2(x))``.

    Args:
        dim: Input and output feature size.
        hidden_dim: Width of the gated hidden representation.
    """

    def __init__(self, dim, hidden_dim):
        super().__init__()
        # All three projections are bias-free.
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(dim, hidden_dim, bias=False)
        self.w3 = nn.Linear(hidden_dim, dim, bias=False)

    def forward(self, x):
        """Return the gated projection of ``x`` with the input's shape."""
        gate = F.silu(self.w1(x))
        value = self.w2(x)
        return self.w3(gate * value)

class Expert(nn.Module):
    """Single MoE expert: a bias-free gated FFN ``w3(silu(w1(x)) * w2(x))``.

    Args:
        dim: Input and output feature size.
        hidden_dim: Width of the gated hidden representation.
    """

    def __init__(self, dim, hidden_dim):
        super().__init__()
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(dim, hidden_dim, bias=False)
        self.w3 = nn.Linear(hidden_dim, dim, bias=False)

    def forward(self, x):
        """Return the expert output for ``x``; the last axis stays ``dim``."""
        activated = F.silu(self.w1(x))
        gated = activated * self.w2(x)
        return self.w3(gated)


# 1.4. MoELayer (used when the MoE option is enabled)
class MoELayer(nn.Module):
    """Mixture-of-experts feed-forward layer with top-k token routing.

    Args:
        dim: Feature size of the tokens.
        num_experts: Number of ``Expert`` modules to create.
        top_k: Number of experts each token is routed to.
        expert_hidden_dim: Hidden width of every expert; ``dim * 2`` when
            omitted.
    """

    def __init__(self, dim, num_experts, top_k, expert_hidden_dim=None):
        super().__init__()
        self.num_experts = num_experts
        self.top_k = top_k
        if expert_hidden_dim is None:
            expert_hidden_dim = dim * 2

        self.experts = nn.ModuleList(
            [Expert(dim, expert_hidden_dim) for _ in range(num_experts)]
        )
        self.gate = nn.Linear(dim, num_experts)

    def forward(self, x):
        """Route every token through its top-k experts.

        Args:
            x: Tensor whose last axis has size ``dim``; leading axes are
                flattened into a token axis for routing.

        Returns:
            Tuple ``(output, load_balancing_loss)``: ``output`` has the shape
            of ``x``; the loss is a scalar tensor equal to the sum over
            experts of (mean router probability) * (top-k assignments to that
            expert divided by the number of tokens).
        """
        in_shape = x.shape
        tokens = x.view(-1, in_shape[-1])
        num_tokens = tokens.size(0)

        # Router distribution over experts for every token.
        router_probs = F.softmax(self.gate(tokens), dim=-1)

        # Keep the k most probable experts and renormalize their weights.
        chosen_probs, chosen_experts = torch.topk(router_probs, self.top_k, dim=-1)
        chosen_probs = chosen_probs / chosen_probs.sum(dim=-1, keepdim=True)

        # Load-balancing auxiliary loss.
        assignments = F.one_hot(chosen_experts, num_classes=self.num_experts).sum(dim=1).float()
        mean_router_prob = router_probs.sum(dim=0) / num_tokens
        routed_fraction = assignments.sum(dim=0) / num_tokens
        load_balancing_loss = (mean_router_prob * routed_fraction).sum()

        combined = torch.zeros_like(tokens)
        for slot in range(self.top_k):
            slot_expert = chosen_experts[:, slot]
            slot_weight = chosen_probs[:, slot]

            for expert_id, expert in enumerate(self.experts):
                routed = slot_expert == expert_id
                if routed.any():
                    # Run only the tokens assigned to this expert in this slot.
                    contribution = expert(tokens[routed]) * slot_weight[routed].unsqueeze(1)
                    combined[routed] += contribution

        return combined.view(in_shape), load_balancing_loss

class DecoderBlock(nn.Module):
    """Pre-norm Transformer decoder block: attention, then FFN or MoE.

    Args:
        embed_dim: Model width.
        num_heads: Number of attention heads.
        ffn_hidden_dim: Hidden width of the feed-forward (or expert) network.
        use_moe: Use ``MoELayer`` instead of ``SwiGLU`` when true.
        num_experts: Number of experts (only read when ``use_moe``).
        top_k: Experts per token (only read when ``use_moe``).
    """

    def __init__(self, embed_dim, num_heads, ffn_hidden_dim,
                 use_moe=False, num_experts=None, top_k=None):
        super().__init__()
        self.use_moe = use_moe

        self.norm1 = RMSNorm(embed_dim)
        self.attention = SelfAttention(embed_dim, num_heads)
        self.norm2 = RMSNorm(embed_dim)

        self.ffn_or_moe = (
            MoELayer(embed_dim, num_experts, top_k, ffn_hidden_dim)
            if use_moe
            else SwiGLU(embed_dim, ffn_hidden_dim)
        )

    def forward(self, x, mask):
        """Apply both residual sub-layers.

        Returns:
            Tuple ``(x, aux_loss)`` where ``aux_loss`` is the MoE
            load-balancing loss, or ``None`` when MoE is disabled.
        """
        # Residual attention sub-layer on the normalized input.
        x = x + self.attention(self.norm1(x), mask)

        normed = self.norm2(x)
        if not self.use_moe:
            return x + self.ffn_or_moe(normed), None

        ffn_out, moe_loss = self.ffn_or_moe(normed)
        return x + ffn_out, moe_loss


# ====================================================================
# 2. GPT model
# ====================================================================

class GPT(nn.Module):
    """Decoder-only language model built from ``DecoderBlock`` layers.

    Args:
        vocab_size: Number of token ids; size of the embedding table and of
            the output logits.
        seq_len: Maximum sequence length (size of the position table).
        embed_dim: Model width.
        num_heads: Attention heads per block.
        num_layers: Number of decoder blocks.
        ffn_hidden_dim: Hidden width of each block's feed-forward network.
        use_moe: Use mixture-of-experts feed-forward layers when true.
        num_experts: Experts per MoE layer (only read when ``use_moe``).
        top_k: Experts per token (only read when ``use_moe``).
    """

    def __init__(self, vocab_size, seq_len, embed_dim, num_heads, num_layers, ffn_hidden_dim,
                 use_moe=False, num_experts=None, top_k=None):
        super().__init__()
        self.vocab_size = vocab_size
        self.seq_len = seq_len
        self.embed_dim = embed_dim

        # Token embedding; the row at PAD_TOKEN_ID is the padding row.
        self.token_embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_TOKEN_ID)
        # Learned absolute position embedding.
        # NOTE(review): the attention layers already apply RoPE, so position
        # is encoded twice — confirm this table is still wanted. It is kept
        # here so the parameter layout does not change.
        self.position_embedding = nn.Embedding(seq_len, embed_dim)

        # Stack of Transformer decoder blocks.
        self.decoder_layers = nn.ModuleList([
            DecoderBlock(embed_dim, num_heads, ffn_hidden_dim, use_moe, num_experts, top_k)
            for _ in range(num_layers)
        ])

        # Final normalization before the output projection.
        # NOTE(review): this is LayerNorm while the blocks use RMSNorm —
        # confirm whether the final norm was meant to be switched as well.
        self.final_norm = nn.LayerNorm(embed_dim)

        # Output projection onto the vocabulary.
        self.lm_head = nn.Linear(embed_dim, vocab_size)

        self.use_moe = use_moe

    def forward(self, input_ids):
        """Compute next-token logits.

        Args:
            input_ids: Long tensor of shape ``(batch_size, seq_len)`` with
                ``seq_len`` no larger than the configured maximum.

        Returns:
            Tuple ``(logits, total_moe_aux_loss)``: ``logits`` has shape
            ``(batch_size, seq_len, vocab_size)``; the auxiliary loss is the
            sum of the per-layer MoE losses, or ``0.0`` when MoE is disabled.

        Raises:
            ValueError: If the input is longer than the position table.
        """
        _, seq_len = input_ids.shape
        if seq_len > self.seq_len:
            # Fail with a clear message instead of an embedding index error.
            raise ValueError(
                f"input length {seq_len} exceeds the model's maximum "
                f"sequence length {self.seq_len}"
            )

        token_embeds = self.token_embedding(input_ids)

        # Position ids 0..seq_len-1, shared by every sequence in the batch.
        position_ids = torch.arange(0, seq_len, dtype=torch.long, device=input_ids.device)
        position_embeds = self.position_embedding(position_ids)

        x = token_embeds + position_embeds

        # Lower-triangular causal mask so no token attends to later tokens.
        # Padding is not masked here.
        causal_mask = torch.tril(torch.ones((seq_len, seq_len), device=input_ids.device)).bool()

        total_moe_aux_loss = 0.0

        for layer in self.decoder_layers:
            x, moe_aux_loss = layer(x, causal_mask)
            if self.use_moe and moe_aux_loss is not None:
                total_moe_aux_loss += moe_aux_loss

        x = self.final_norm(x)

        logits = self.lm_head(x) # (batch_size, seq_len, vocab_size)

        return logits, total_moe_aux_loss


In [None]:
from transformers import get_scheduler
import os
import torch.optim as optim


save_dir = "gpt_checkpoints"
os.makedirs(save_dir, exist_ok=True)

# 1. Use the tokenizer's full vocabulary size. len(tokenizer) includes added
# special tokens; a smaller embedding table would raise an IndexError.
actual_vocab_size = len(tokenizer)

# 2. Build the model (embed_dim / ffn_hidden_dim can be tuned freely).
model = GPT(
    vocab_size=actual_vocab_size,
    seq_len=SEQ_LEN,           # Must match the dataset's SEQ_LEN exactly
    embed_dim=256,
    num_heads=8,
    num_layers=6,
    ffn_hidden_dim=1024
).to(device)

print(f"Corrected Vocab Size: {actual_vocab_size}")
print(f"Max Sequence Length: {SEQ_LEN}")

# 3. Optimizer and learning-rate schedule (warm-up followed by cosine decay).
optimizer = optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.01)

num_training_steps = 10000  # The data is streamed, so train by step count
lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=500,
    num_training_steps=num_training_steps,
)

# 4. Loss function, ignoring PAD targets.
# NOTE(review): the packed batches contain no padding, and when pad_token
# falls back to eos_token this also masks EOS targets — confirm intended.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# --- 5. Training loop ---
model.train()
step = 0
running_loss = 0.0
log_interval = 500

print(f"学習を開始します (Device: {device})")

for batch in tqdm(train_loader, total=num_training_steps):
    input_ids = batch["input_ids"].to(device)
    labels = batch["labels"].to(device)

    optimizer.zero_grad()

    # logits: [Batch, Seq, Vocab]
    logits, _ = model(input_ids)

    # Causal LM objective: positions 0..N-2 predict tokens 1..N-1.
    # Flatten with the logits' own vocabulary dimension. The model was built
    # with len(tokenizer), which can differ from tokenizer.vocab_size when
    # tokens have been added.
    shift_logits = logits[:, :-1, :].contiguous().view(-1, logits.size(-1))
    shift_labels = labels[:, 1:].contiguous().view(-1)

    loss = criterion(shift_logits, shift_labels)

    loss.backward()

    # Gradient clipping for training stability.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()
    lr_scheduler.step()

    running_loss += loss.item()
    step += 1

    # Periodically log the running average loss and save a checkpoint.
    if step % log_interval == 0:
        avg_loss = running_loss / log_interval
        print(f"Step {step}/{num_training_steps} - Loss: {avg_loss:.4f}")
        running_loss = 0.0
        torch.save(
            model.state_dict(),
            os.path.join(save_dir, f"gpt_step_{step}.pt"),
        )

    # The stream does not end on its own and tqdm's `total` is display-only,
    # so stop explicitly once the scheduled number of steps has run.
    if step >= num_training_steps:
        break

