### GPT：传统的Decoder模型

### 导包

In [15]:
import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# Seed PyTorch's default random number generator with a fixed value.
torch.manual_seed(1024)

<torch._C.Generator at 0x20ceac56850>

### 定义GPT参数

@dataclass 装饰器会自动为类添加以下方法：

__init__() - 初始化对象的方法

__repr__() - 返回对象的字符串表示形式

__eq__() - 比较两个对象是否相等

In [16]:
@dataclass
class GPTConfig:
    """Hyper-parameters of the GPT model.

    ``hidden_dim`` and ``head_size`` are derived from ``n_embd`` and
    ``n_head`` in ``__post_init__`` unless they are passed explicitly.
    Deriving them per instance (instead of once, at class-definition time)
    keeps them consistent when ``n_embd`` or ``n_head`` is overridden, e.g.
    ``GPTConfig(n_embd=384, n_head=6)``.
    """

    block_size: int = 512   # maximum sequence length (max_seq)
    batch_size: int = 12
    n_layer: int = 12
    n_head: int = 12
    # Embedding width. hidden_dim is kept equal to it so the token embedding
    # and the output layer can share one weight matrix (tied embeddings).
    n_embd: int = 768
    # None means "use n_embd".
    hidden_dim: Optional[int] = None
    dropout: float = 0.1
    # None means "use n_embd // n_head".
    head_size: Optional[int] = None
    # Vocabulary size of the official GPT-2 tokenizer.
    vocab_size: int = 50257

    def __post_init__(self):
        """Fill in the derived fields that were not given explicitly."""
        if self.hidden_dim is None:
            self.hidden_dim = self.n_embd
        if self.head_size is None:
            self.head_size = self.n_embd // self.n_head

### 定义GPT的结构

In [17]:
# 1. single head attention
class SingleHeadAttention(nn.Module):
    """A single causal self-attention head.

    Projects the input to query/key/value vectors of width
    ``config.head_size`` and lets every position attend only to itself and
    to earlier positions.
    """

    def __init__(self, config):
        super().__init__()
        self.head_size = config.head_size
        in_dim, out_dim = config.hidden_dim, config.head_size
        self.query = nn.Linear(in_dim, out_dim)
        self.key = nn.Linear(in_dim, out_dim)
        self.value = nn.Linear(in_dim, out_dim)

        # Lower-triangular (block_size x block_size) mask, registered as a
        # buffer so it is stored with the module but is not a parameter.
        # forward() slices the top-left [:seq_len, :seq_len] corner.
        causal_mask = torch.ones(config.block_size, config.block_size).tril()
        self.register_buffer("attention_mask", causal_mask)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return the attention output for a 3-D input ``x``.

        The last dimension of the result has size ``head_size``.
        """
        _, seq_len, _ = x.size()
        q, k, v = self.query(x), self.key(x), self.value(x)

        scores = torch.matmul(q, k.transpose(-2, -1))
        # Positions above the diagonal are "future" tokens: set them to -inf
        # so they receive zero weight after the softmax.
        is_future = self.attention_mask[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float("-inf"))
        # Scale by sqrt(d_k) before normalising.
        attn = F.softmax(scores / math.sqrt(self.head_size), dim=-1)
        attn = self.dropout(attn)

        return torch.matmul(attn, v)

In [18]:
# multi head attention
class MultiHeadAttention(nn.Module):
    """Runs ``config.n_head`` attention heads and merges their outputs.

    The per-head outputs are concatenated along the last dimension, passed
    through a linear projection and then through dropout.
    """

    def __init__(self, config):
        super().__init__()
        self.heads = nn.ModuleList(
            SingleHeadAttention(config) for _ in range(config.n_head)
        )
        self.proj = nn.Linear(config.hidden_dim, config.hidden_dim)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply every head to ``x`` and return the projected concatenation."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(merged))

In [19]:
# feed forward (MLP)
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Linear -> Dropout."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_dim
        # The inner layer is 4x wider than the model width (the original
        # author notes that a SwiGLU variant would only expand to 8/3x).
        inner_width = 4 * width
        layers = [
            nn.Linear(width, inner_width),
            nn.GELU(),
            nn.Linear(inner_width, width),
            nn.Dropout(config.dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``; the last dimension keeps its size."""
        return self.net(x)

In [20]:
# block
class Block(nn.Module):
    """One pre-norm transformer block.

    Each sub-layer (attention, then feed-forward) is applied to a
    layer-normalised copy of its input and added back as a residual.
    """

    def __init__(self, config):
        super().__init__()
        self.att = MultiHeadAttention(config)
        self.ffn = FeedForward(config)
        self.att_ln = nn.LayerNorm(config.hidden_dim, eps=1e-6)
        self.ffn_ln = nn.LayerNorm(config.hidden_dim, eps=1e-6)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual steps."""
        attended = self.att(self.att_ln(x))
        x = x + attended
        transformed = self.ffn(self.ffn_ln(x))
        return x + transformed

In [21]:
# GPT
class GPT(nn.Module):
    """Decoder-only GPT language model with learned position embeddings.

    Reads from ``config``: ``vocab_size``, ``n_embd``, ``hidden_dim``,
    ``block_size`` and ``n_layer`` (plus whatever ``Block`` needs).
    The token embedding and the output projection share one weight matrix.
    """

    def __init__(self, config):
        super().__init__()
        # generate() crops its context to the last block_size tokens, so the
        # value has to be kept on the module.
        self.block_size = config.block_size

        # Building blocks: (embedding, position, norm, mlp, block).
        # Author's notes on what newer models replace them with:
        #   position embedding 0, 1, ... -> RoPE
        #   LayerNorm                    -> RMSNorm
        #   MLP                          -> SwiGLU
        #   MHA                          -> GQA
        self.token_embedding_table = nn.Embedding(config.vocab_size, config.n_embd)
        self.position_embedding_table = nn.Embedding(config.block_size, config.n_embd)
        self.blocks = nn.Sequential(
            *[Block(config) for _ in range(config.n_layer)]
        )
        self.last_ln = nn.LayerNorm(config.hidden_dim)
        self.lm_head = nn.Linear(config.hidden_dim, config.vocab_size, bias=False)
        # Weight tying (common in small LMs to save parameters). nn.Linear
        # stores its weight as (out_features, in_features), i.e.
        # (vocab_size, hidden_dim), which matches the embedding table's
        # (vocab_size, n_embd) when hidden_dim == n_embd.
        self.token_embedding_table.weight = self.lm_head.weight

        # apply() visits every sub-module, so one call initialises them all.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: token ids of shape (batch, seq_len).
            targets: optional target token ids with the same shape as ``idx``.

        Returns:
            ``(logits, loss)``. Without ``targets``, ``logits`` has shape
            (batch, seq_len, vocab_size) and ``loss`` is ``None``. With
            ``targets``, ``logits`` is flattened to
            (batch * seq_len, vocab_size) and ``loss`` is the cross-entropy.

        Raises:
            ValueError: if ``seq_len`` exceeds ``block_size``.
        """
        batch, seq_len = idx.size()
        if seq_len > self.block_size:
            # The position table only has block_size rows; fail with a clear
            # message instead of an out-of-range embedding lookup.
            raise ValueError(
                f"sequence length {seq_len} exceeds block_size {self.block_size}"
            )

        token_emb = self.token_embedding_table(idx)  # (batch, seq_len, n_embd)
        pos_emb = self.position_embedding_table(
            # Positions must live on the same device as idx.
            torch.arange(seq_len, device=idx.device)
        )  # (seq_len, n_embd)

        x = token_emb + pos_emb  # broadcast over the batch dimension
        x = self.blocks(x)
        x = self.last_ln(x)
        logits = self.lm_head(x)  # (batch, seq_len, vocab_size)

        if targets is None:
            loss = None
        else:
            batch, seq_len, vocab_size = logits.size()
            logits = logits.view(batch * seq_len, vocab_size)
            # reshape() also accepts non-contiguous targets.
            targets = targets.reshape(batch * seq_len)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively.

        Args:
            idx: (batch, seq_len) tensor of token ids used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            Tensor of shape (batch, seq_len + max_new_tokens).

        Runs without gradient tracking. The train/eval mode is left as the
        caller set it, so call ``model.eval()`` first to disable dropout.
        """
        for _ in range(max_new_tokens):
            # Keep only the last block_size tokens when the context is too long.
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            # Only the prediction at the last time step is needed.
            logits = logits[:, -1, :]  # (batch, vocab_size)
            probs = F.softmax(logits, dim=-1)
            # Sample from the distribution rather than taking the argmax.
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (batch, seq_len + 1)
        return idx
        

### 构建输入的Dataset

非常重要！先弄清楚模型的输入长什么样：每个样本是一对 (x, y)，其中 y 是 x 向右移动一位后的 token 序列。

In [22]:
# 写一个dataset,为Dataloader准备
class MyDataset(Dataset):
    """Next-token-prediction dataset built from a JSONL file.

    Each line of the file is expected to be a JSON object with a ``'text'``
    field. All texts are tokenised with the ``gpt2`` tiktoken encoding,
    joined into one long token stream with an ``<|endoftext|>`` token after
    every text, and cut into windows of ``block_size + 1`` tokens.

    Args:
        path: path of the JSONL file.
        block_size: number of tokens in each input sequence.
        max_lines: only the first ``max_lines`` lines of the file are read;
            ``None`` reads the whole file.
    """

    def __init__(self, path, block_size=512, max_lines=100):
        import tiktoken

        self.enc = tiktoken.get_encoding('gpt2')
        self.block_size = block_size
        self.max_lines = max_lines

        # Separator between training texts; also used as the padding token.
        self.eos_token = self.enc.encode(
            "<|endoftext|>",
            allowed_special={"<|endoftext|>"}
        )[0]

        # Put every text on one token stream, separated by eos_token.
        full_encoded = []
        for text in self._read_texts(path, max_lines):
            full_encoded.extend(self.enc.encode(text))
            full_encoded.append(self.eos_token)

        self.encoded_data = self._chunk_tokens(
            full_encoded, self.block_size, self.eos_token
        )

    @staticmethod
    def _read_texts(path, max_lines=None):
        """Return the ``'text'`` strings of the first ``max_lines`` JSONL lines."""
        import json

        texts = []
        with open(path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if max_lines is not None and i >= max_lines:
                    break
                try:
                    text = json.loads(line.strip())['text']
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Malformed line, missing 'text' key, or a JSON value
                    # that is not an object: skip the record. Any other
                    # error is a real problem and is allowed to propagate.
                    continue
                if isinstance(text, str):
                    texts.append(text)
        return texts

    @staticmethod
    def _chunk_tokens(tokens, block_size, pad_token):
        """Cut ``tokens`` into windows of ``block_size + 1`` tokens.

        Windows start every ``block_size`` tokens, so consecutive windows
        overlap by one token: the extra token is the target of the last
        input position. A short final window is padded with ``pad_token``.
        """
        width = block_size + 1
        chunks = []
        for start in range(0, len(tokens), block_size):
            chunk = tokens[start:start + width]
            if len(chunk) < width:
                chunk = chunk + [pad_token] * (width - len(chunk))
            chunks.append(chunk)
        return chunks

    def __len__(self):
        return len(self.encoded_data)

    def __getitem__(self, idx):
        """Return ``(x, y)`` where ``y`` is ``x`` shifted left by one token."""
        chunk = self.encoded_data[idx]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y

    def encode(self, text):
        """Tokenise ``text`` with the dataset's encoding."""
        return self.enc.encode(text)

    def decode(self, ids):
        """Turn token ids back into text."""
        return self.enc.decode(ids)


### 运行相关的函数

In [23]:
# Use the GPU when one is available, then build the model and move it there.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = GPT(GPTConfig()).to(device)

# Report the total number of parameters, in millions.
total_params = sum(param.numel() for param in model.parameters())
print(f"Total Parameters: {total_params / 1e6} M")

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
# Cosine-annealing learning-rate schedule.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000)

Total Parameters: 124.046592 M


In [24]:
# Echo the selected device string as the cell output.
device

'cuda'

In [25]:
# train data
train_dataset = MyDataset(r'E://llm/data/mobvoi_seq_monkey_general_open_corpus.jsonl')

# split traindataset to train and val
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [0.9, 0.1])

train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=12, shuffle=False)

In [26]:
# Sanity check: take one batch and print the shapes and token ids of x and y.
for x, y in train_loader:
    print(x.shape, y.shape)
    print(x)
    print(y)
    break  # only the first batch is inspected

torch.Size([12, 512]) torch.Size([12, 512])
tensor([[  163,   100,   239,  ...,   114, 37955, 37772],
        [  238, 47987, 10310,  ...,   245, 26193,   234],
        [17739,   225, 16764,  ...,   103, 16764,   163],
        ...,
        [46237,   115, 16764,  ...,   163,   251,    96],
        [17312,   235, 21410,  ...,   171,   120,   253],
        [20998,   116, 37955,  ..., 50256, 50256, 50256]])
tensor([[  100,   239,   161,  ..., 37955, 37772,   121],
        [47987, 10310,   229,  ..., 26193,   234,   165],
        [  225, 16764, 31660,  ..., 16764,   163,   225],
        ...,
        [  115, 16764,   198,  ...,   251,    96, 34650],
        [  235, 21410,   164,  ...,   120,   253,   198],
        [  116, 37955, 12859,  ..., 50256, 50256, 50256]])


In [28]:
import os
def train(model, optimizer, scheduler, train_loader, val_loader, device, epoch):
    """Run one training epoch and return the summed per-batch loss.

    The learning-rate scheduler is stepped after every batch, and the batch
    loss is printed every 10 batches. ``val_loader`` is accepted but not
    used here; ``epoch`` is only used in the log line.
    """
    model.train()
    total_loss = 0
    for batch_idx, (inputs, labels) in enumerate(train_loader):
        # Move the batch to the target device.
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass; the logits are not needed for training.
        _, loss = model(inputs, targets=labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Advance the learning-rate schedule once per batch.
        scheduler.step()

        total_loss += loss.item()

        if batch_idx % 10 != 0:
            continue
        print(f"Epoch: {epoch}, Batch: {batch_idx}, loss:{loss.item():.4f}")

    return total_loss

def eval(model, val_loader, device):
    """Return the summed per-batch validation loss.

    Puts the model in eval mode and runs without gradient tracking.
    NOTE(review): this name shadows the builtin ``eval``; it is kept
    because the training loop calls it by this name.
    """
    model.eval()
    with torch.no_grad():
        # Each model call returns (logits, loss); only the loss is summed.
        return sum(
            model(x.to(device), targets=y.to(device))[1].item()
            for x, y in val_loader
        )

for epoch in range(2):
    # One pass over the training data, then one over the validation data.
    train_loss = train(model, optimizer, scheduler, train_loader, val_loader, device, epoch)
    val_loss = eval(model, val_loader, device)
    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch: {epoch}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')

    # Make sure the checkpoint directory exists (no error if it already does).
    os.makedirs('./checkpoints', exist_ok=True)

    # Save a checkpoint for every epoch: model, optimizer and scheduler
    # state plus the average validation loss.
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'val_loss': avg_val_loss,
    }
    torch.save(checkpoint, f'./checkpoints/model_epoch_{epoch}.pt')


Epoch: 0, Batch: 0, loss:4.8134
Epoch: 0, Batch: 10, loss:4.8144
Epoch: 0, Batch: 20, loss:4.8126
Epoch: 0, Batch: 30, loss:4.7184
Epoch: 0, Train Loss: 5.2238, Val Loss: 4.7299
Epoch: 1, Batch: 0, loss:4.7453
Epoch: 1, Batch: 10, loss:4.7734
Epoch: 1, Batch: 20, loss:4.7141
Epoch: 1, Batch: 30, loss:4.7174
Epoch: 1, Train Loss: 4.7314, Val Loss: 4.7160
