<img src="https://github.com/jingyaogong/minimind/raw/master/images/LLM-structure.png" width="60%">

<img src="https://github.com/jingyaogong/minimind/raw/master/images/LLM-structure-moe.png" width="60%">

In [1]:
# 构建Trnasformer Layer K
import torch.nn as nn

class MiniMindBlock(nn.Module):
    def __init__(self, layer_id: int, config):
        super().__init__()
        # 基础配置
        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_dim = config.hidden_size // config.num_attention_heads

        # 自注意力模块，内部实现RoPE
        self.self_attn = Attention(config)

        # 当前Block的层编号（用于层内权重共享、分层控制等）
        self.layer_id = layer_id

        # Attention前的RMSNorm
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Feed Forward前的RMSNorm
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # 前馈网络，可配置是否用专家混合MoE
        self.mlp = FeedForward(config) if not config.use_moe else MOEFeedForward(config)
    def forward(
            self,
            hidden_states, # 输入隐藏状态 [batch_size, seq_len, hidden_dim]
            position_embeddings, # RoPE位置编码 [seq_len, head_dim]
            past_key_value = None, # KV缓存，加速推理
            use_cache = False, # 是否缓存当前层的KV
            attention_mask = None, # attention掩码
    ):
        # Self-attention层
        residual = hidden_states # 保存残差连接

        # 对输入做RMSNorm,再送入自注意力层
        hidden_states, present_key_value = self.self_attn(
            self.input_layernorm(hidden_states), # LayerNorm后输入attention
            position_embeddings, # Rotary PE传入Attention
            past_key_value, # 过往KV缓存(推理阶段用)
            use_cache, # 是否缓存当前层KV（推理阶段用）
            attention_mask, # 注意力掩码（padding token不计算注意力矩阵）
        )

        # 残差连接：原始输入+attention输出
        hidden_states += residual

        # MLP层
        # MLP前再做一次RMSNorm
        normed_hidden = self.post_attention_layernorm(hidden_states)

        # 残差连接
        hidden_states += self.mlp(normed_hidden)

        # 返回新的hidden_states和当前层的KV缓存
        return hidden_states, present_key_value

  import pynvml  # type: ignore[import]


In [None]:
from typing import Optional, List, Tuple
import torch

class MiniMindModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.vocab_size, self.num_hidden_layers = config.vocab_size, config.num_hidden_layers

        # [vocab_size, hidden_size] -> 把token_id映射为向量
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)

        self.dropout = nn.Dropout(config.dropout)

        # 构建多个Transformer Block层
        self.layers = nn.ModuleList([
            MiniMindBlock(l, config) for l in range(self.num_hidden_layers)
        ])

        # 输出前的LayerNorm层
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # 预计算RoPE所需的位置频率向量
        freqs_cos, freqs_sin = precompute_freqs_cis(
            dim = config.hidden_size // config.num_attention_heads,
            end = config.max_position_embeddings,
            omiga = config.rope_theta
        )

        # 注册为buffer
        self.register_buffer("freq_cos", freqs_cos, persistent=False) # 存到模型里，跟着to(device),cuda,eval一起走，但不是可训练参数，不会被优化器更新。不会保存到state_dict
        self.register_buffer("freq_sin", freqs_sin, persistent=False)
    def forward(self,
                input_ids: Optional[torch.Tensor] = None, 
                attention_mask: Optional[torch.Tensor] = None,
                past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
                use_cache: bool = False,
                **kwargs):
        # B:batch_size, T: 训练时候就是seq_len, 推理时就是当前已经生成的token数量
        batch_size, seq_length = input_ids.shape

        # 如果没传入缓存，初始化为空（推理时才用KV缓存）
        past_key_values = past_key_values or [None] * len(self.layers)

        # 获取历史缓存长度
        start_pos = past_key_values[0][0].shape[1] if past_key_values[0] is not None else 0

        # 输入ids -> [B, T] -> [B, T, hidden_size]
        hidden_states = self.dropout(self.embed_tokens(inputs))

        # 截取当前位置使用的旋转位置编码
        position_embeddings = (
            self.freqs_cos[start_pos: start_pos + seq_length],
            self.freqs_sin[start_pos: start_pos + seq_length]
        )

        presents = [] # 存储KV cache

        # 遍历每层Transformer Block
        for layer_idx, (layer, past_key_value) in enumerate(zip(self.layers, past_key_values)):
            hidden_states, present = layer(
                hidden_states,
                position_embeddings,
                past_key_value=past_key_value,
                use_cache=use_cache,
                attention_mask=attention_mask,
            )
            presents.append(present)
        # 最后输出RMSNorm
        hidden_states = self.norm(hidden_states)

        # 如果使用了MOE,则合并辅助损失
        aux_loss = sum(
            layer.mlp.aux_loss
            for layer in self.layers
            if isinstance(layer.mlp, MOEFeedForward)
        )
        return hidden_states, presents, aux_loss

In [3]:
# MiniMindForCausalLM类是为了将模型应用于因果语言建模任务，在MiniMindModel主干模型的基础上加输出层和统一的输出结构，用于token-level的预测

# 模型的基类，给模型添加生成函数，模型的配置文件类
from transformers import PreTrainedModel, GenerationMixin, PretrainedConfig
# 自回归模型的标准输出格式
from transformers.modeling_outputs import CausalLMOutputWithPast
from typing import Union

class MiniMindForCausalLM(PreTrainedModel, GenerationMixin):
    def __init__(self, config: MiniMindConfig = None):
        self.config = config or MiniMindConfig()
        super().__init__(self.config)

        # 模型主干：MiniMindModel,输出hidden_states
        self.model = MiniMindModel(self.config)

        # 输出层：将hidden size映射为vocab_size
        self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)

        # 权重绑定：embedding权重与lm_head权重共享
        self.model.embed_tokens.weight = self.lm_head.weight

        # 输出容器
        self.OUT = CausalLMOutputWithPast()
    def forward(self,
                input_ids: Optional[torch.Tensor]=None,
                attention_mask: Optional[torch.Tensor]=None,
                past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
                use_cache: bool = True,
                logits_to_keep: Union[int, torch.Tensor] = 0,
                **args):
        # 调用主干模型，输出hidden_states, presents(KV缓存), aux_loss
        h, past_kvs, aux_loss = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            **args
        )

        # logits_to_keep决定保留输出的哪些位置
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep

        # 从h中保留最后logits_to_keep个位置，送入lm_head做分类
        logits = self.lm_head(h[:, slice_indices, :])

        # 构建结构化输出字典
        # a.__setitem__(b, c) : a[b] = c
        self.OUT.__setitem__('last_hidden_state', h)
        self.OUT.__setitem__('logits', logits)
        self.OUT.__setitem__('aux_loss', aux_loss)
        self.OUT.__setitem__('past_key_values', past_kvs)

        return self.OUT

NameError: name 'MiniMindConfig' is not defined