# Chapter 3 - 深入看看 Transformer LLM (Looking Inside Transformer LLMs)

## 3.1  加载 model 和 tokenizer

In [None]:
# %%capture
# Optional dependency install. The requirement specifiers are quoted because an
# unquoted `>` would be interpreted by the shell as output redirection.
# %pip install "transformers>=4.41.2" "accelerate>=0.31.0"

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal-LM weights.
# `trust_remote_code=True` is intentionally not passed: the loaded model prints
# as transformers' own Qwen2ForCausalLM / Qwen2SdpaAttention classes, so no
# repository-supplied Python code is required, and omitting the flag avoids
# opting in to executing code shipped with a Hub repository.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",   # put the model on the GPU
    torch_dtype="auto",  # let transformers choose the dtype from the checkpoint
)

# Text-generation pipeline around the loaded model:
#   - return_full_text=False: only the newly generated continuation is returned
#   - max_new_tokens=50:      cap on the number of generated tokens
#   - do_sample=False:        greedy decoding, no sampling
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=50,
    do_sample=False,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Feed the raw prompt (no chat formatting) straight into the pipeline.
prompt = "春风又绿江南岸 是谁写的？"
output = generator(prompt)

# The pipeline returns a list of results; show the text of the first one.
first_result = output[0]
print(first_result["generated_text"])

（ ） A. 李白 B. 白居易 C. 苏轼 D. 王安石
李白

“春眠不觉晓，处处闻啼鸟。夜来风雨声，花落知多少


### 备注 
text-generation 只是不断预测下一个 token；如果 prompt 没有按 chat 模板（chat template）构造成对话格式，Instruct 模型的回答效果会差一些。

In [8]:
# Print the module tree of the loaded model (embedding, decoder layers,
# final norm and lm_head) to inspect the architecture.
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Linear(in_featur

## 3.2 面试要点？
- RMSNorm 和 layernorm 的区别？
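> RMSNorm 只有可学习的缩放参数 $\gamma$（没有 bias $\beta$），可学习参数少于 LayerNorm，计算量也更小。
> RMSNorm 只做缩放（re-scale），不减均值，即没有 re-center 操作：$\mathrm{RMSNorm}(x) = \dfrac{x}{\sqrt{\frac{1}{d}\sum_{i=1}^{d} x_i^2 + \epsilon}} \odot \gamma$。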

## 3.3 查看一个 token 的概率分布（采样和解码）

In [13]:
prompt = "The capital of France is"

# Tokenize the prompt and move the ids to whatever device the model is on,
# instead of hard-coding "cuda".
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)

# Inference only: disable autograd so no graph is recorded for these forward passes.
with torch.no_grad():
    # Get the output of the model before the lm_head
    model_output = model.model(input_ids)

    # Get the output of the lm_head, applied to the first element of the model output
    lm_head_output = model.lm_head(model_output[0])

In [14]:
# Take the lm_head scores at the last position of the first sequence,
# pick the highest-scoring vocabulary id, and decode it back to text.
last_position_scores = lm_head_output[0, -1]
token_id = last_position_scores.argmax(dim=-1)
tokenizer.decode(token_id)

' Paris'

In [24]:
# Display the token ids produced by the tokenizer for the prompt.
input_ids

tensor([[ 785, 6722,  315, 9625,  374]], device='cuda:0')

In [22]:
# 896 is the model's hidden size (config.hidden_size); 151936 is the vocabulary
# size, matching the Embedding(151936, 896) shown in the printed model.
model_output[0].shape, lm_head_output.shape

(torch.Size([1, 5, 896]), torch.Size([1, 5, 151936]))

### 备注 
这里有两个 API， model.model() 和 model.lm_head()
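> model.model 返回每一个 token 的 hidden state（形状 [1, 5, 896]）；lm_head 是一个线性层，把 hidden state 映射成词表大小（151936）的 logits。lm_head 本身不包含 softmax：对最后一个位置的 logits 取 argmax 得到下一个 token，做 softmax 才得到概率分布。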

## 3.4 使用 KV cache 加速

In [27]:
prompt = "Write a very long email apologizing to Sarah for the tragic gardening mishap. Explain how it happened."

# Tokenize the input prompt and move the ids to the device the model is on
# (rather than hard-coding "cuda"), so the cell follows the model's placement.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)

In [30]:
%%timeit -n 1
# Generate the text
generation_output = model.generate(
  input_ids=input_ids,
  max_new_tokens=1000,
  use_cache=True
)

14.8 s ± 4.58 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [31]:
%%timeit -n 1
# Generate the text
generation_output = model.generate(
  input_ids=input_ids,
  max_new_tokens=1000,
  use_cache=False
)

16.7 s ± 5.17 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
