# 1.GPT模型的架构

In [1]:
'''124M GPT-2的参数设置'''
GPT_CONFIG_124M = {
    "vocab_size": 50527, #vocabulary size
    "context_length": 1024, #context length
    "emb_dim": 768, #embedding dimension
    "n_heads": 12, #number of attention heads
    "n_layers": 12, #number of layers(transformer blocks)
    "drop_rate": 0.1, #dropout rate
    "qkv_bias": False #query-key-value bias
}

In [10]:
'''一个虚拟的GPT模型的架构'''
import torch
import torch.nn as nn

class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        #Transformer block
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        #Layer Normalization
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        
        #Output
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias = False
        )


    def forward(self, in_idx):
        batch_size, seq_length = in_idx.shape

        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_length, device = in_idx.device))
        x = tok_embeds + pos_embeds
       
        x = self.drop_emb(x)

        x= self.trf_blocks(x)

        x = self.final_norm(x)

        logits = self.out_head(x)

        return logits


class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # Implementation of transformer blocks

    def forward(self, x):
        
        return x

class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps = 1e-5):
        super().__init__()
        #The implementation of layer normalization

    def forward(self, x):

        return x


In [None]:
'''测试虚拟的GPT模型的输出'''

import tiktoken

# 1.构造数据
tokenizer  = tiktoken.get_encoding("gpt2")

batch = []

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))

batch = torch.stack(batch, dim = 0)

print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [11]:
torch.manual_seed(123)

model =  DummyGPTModel(GPT_CONFIG_124M)

logits = model(batch)

print(logits.shape)
print(logits)

torch.Size([2, 4, 50527])
tensor([[[-0.5076,  0.5526, -1.3087,  ..., -1.2430, -0.1799,  0.2840],
         [ 0.4162,  0.5258, -0.2680,  ..., -0.6662, -0.0434, -1.7204],
         [ 0.2720,  0.5737,  0.2196,  ..., -0.4875, -2.2169, -0.8551],
         [ 1.7329, -0.0533,  0.9222,  ..., -0.8311, -0.3060,  0.1644]],

        [[-0.4748,  1.0586, -0.6434,  ..., -0.9168,  0.2064, -0.3204],
         [-0.4063, -0.6273,  0.9828,  ..., -0.6344,  0.0545, -0.0326],
         [-0.2660, -1.7609,  0.6074,  ..., -1.1383, -0.7710, -0.9709],
         [ 0.5479, -1.3437,  1.2603,  ...,  0.6285, -0.2088, -2.5608]]],
       grad_fn=<UnsafeViewBackward0>)


# 2.Layer Normalization

In [None]:
'''得到神经网络某一层的输出激活值'''
torch.manual_seed(123)

# create 2 training examples with 5 dimensions (features) each
batch_example = torch.randn(2, 5) 

layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(batch_example)
print(out)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [None]:
#查看输出激活值的均值和方差
mean = out.mean(dim = -1, keepdim = True) #keepdim参数的作用是使输出的维度和原始的维度保持一致
var = out.var(dim = -1, keepdim = True)

print(mean)
print(var)

tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [None]:
'''使得均值为0 方差为1 的方法：减去均值并且除以方差的平方根'''
out_norm = (out - mean) / torch.sqrt(var)
print(out_norm)

mean = out_norm.mean(dim=-1, keepdim=True)
var = out_norm.var(dim=-1, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
Mean:
 tensor([[9.9341e-09],
        [5.9605e-08]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [16]:
# 注意到上面进行normalization之后，均值并不严格为0，而是接近0的一个小数。
# 可以通过下面的方式实现：禁用科学计算
torch.set_printoptions(sci_mode = False)
print(mean)
print(var)

tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [30]:
'''Layer Normalization的实现'''
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5 #用于避免出现除以0的情况
        # 下面这两个参数是可学习的参数，用于在训练过程中提升模型的性能
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim = -1, keepdim = True)
        var = x.var(dim = -1, keepdim = True, unbiased = False)
        
        norm_x = (x - mean) / torch.sqrt(var + self.eps)

        return self.scale * norm_x + self.shift

In [32]:
# 测试layer normalization
ln = LayerNorm(emb_dim = 5)

torch.manual_seed(123)

# create 2 training examples with 5 dimensions (features) each
batch_example = torch.randn(2, 5) 
out_ln = ln(batch_example)

mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)

print("Mean:\n", mean)
print("Variance:\n", var)


Mean:
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
