In [28]:
from transformers import AutoTokenizer,AutoModelForCausalLM
import torch

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
Token = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

使用TinyLlama做測試與學習

Token與model的Pretrained

函數庫:

AutoTokenizer  對應Token

AutoModelForCausalLM 對應自然語言LLM

In [29]:
print(Token)

LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-Chat-v1.0', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


Tokenizer = 把文字切成 token ID 的工具

vocab_size=32000：字典大小，有 32k 個 token（詞彙單位）

model_max_length=2048：最多能處理 2048 個 token 的序列

special_tokens：像 BOS <s>、EOS </s>，控制輸入/輸出的邊界

這是「輸入管道」：文字先經過 tokenizer → 轉成數字 ID

In [30]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

Embedding(32000, 2048)
每個 token ID 對應一個 2048 維的向量

22 層 LlamaDecoderLayer
這就是 Transformer Decoder 堆疊

每層有 Self-Attention（Q/K/V 投影 + 輸出）

MLP 前饋層（gate_proj → up_proj → down_proj）

LayerNorm（RMSNorm）

LlamaAttention 裡特別之處：
q_proj: 2048→2048
k_proj, v_proj: 2048→256（壓縮了維度 → 減少運算量）
o_proj: 2048→2048

RotaryEmbedding (RoPE)
用於 attention 的位置編碼，讓模型理解 token 順序

lm_head (Linear 2048→32000)
最後一層，全連接層，輸出每個 token 的機率分佈

In [31]:
from torchsummary import summary

In [32]:
#summary(model, input_size=(1, 2048))

#summary通常適用在單個模型的打印

In [33]:
#對應1.1B 11億參數量
model.num_parameters()

1100048384

In [34]:
sum(p.numel() for p in model.parameters())

1100048384

這是參數量的打印方式

In [35]:
print(model.config)

LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.55.4",
  "use_cache": true,
  "vocab_size": 32000
}



In [36]:
text = "你好，世界"
tokens = Token(text)

In [37]:
print(type(tokens))
print(tokens)
print(tokens.input_ids)
print(type(tokens.input_ids))
print(type(tokens.attention_mask))

<class 'transformers.tokenization_utils_base.BatchEncoding'>
{'input_ids': [1, 29871, 30919, 31076, 30214, 30793, 30967], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
[1, 29871, 30919, 31076, 30214, 30793, 30967]
<class 'list'>
<class 'list'>


In [38]:
input_ids = torch.tensor([tokens.input_ids])
attention_mask = torch.tensor([tokens.attention_mask])
print(type(input_ids))
print(input_ids.shape)
print(type(attention_mask))
print(attention_mask.shape)

<class 'torch.Tensor'>
torch.Size([1, 7])
<class 'torch.Tensor'>
torch.Size([1, 7])


轉換資料成Tensor的過程

In [39]:
text = "你好世界"

token_slow_list = Token(text)
print(token_slow_list)
#可以看到兩者是純List格式
print(token_slow_list.input_ids)
print(token_slow_list.attention_mask)

#轉換成正式格式需要添加"inputs_ids:","attention_mask:", 並外包一層[]，才會返回return_tensor的對等格式
token_slow = {
    "input_ids": torch.tensor([token_slow_list.input_ids]),
    "attention_mask" : torch.tensor([token_slow_list.attention_mask])
}

print("token_slow:", token_slow)

{'input_ids': [1, 29871, 30919, 31076, 30793, 30967], 'attention_mask': [1, 1, 1, 1, 1, 1]}
[1, 29871, 30919, 31076, 30793, 30967]
[1, 1, 1, 1, 1, 1]
token_slow: {'input_ids': tensor([[    1, 29871, 30919, 31076, 30793, 30967]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [40]:
token_fast = Token(text , return_tensors = "pt")
# return_tensors="pt" 是告訴 tokenizer：把輸出資料直接轉成 PyTorch tensor 格式
# 其他格式"tf"：TensorFlow tensor ,"np"：NumPy array
print("token_fast:", token_fast)
print(type(token_fast.input_ids))
print(token_fast.input_ids.shape)

token_fast: {'input_ids': tensor([[    1, 29871, 30919, 31076, 30793, 30967]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
<class 'torch.Tensor'>
torch.Size([1, 6])


Token_slow: 表示原始資料逐步轉換成Tensor的過程

Token_fast: 表示直接轉換Tensor的過程

透過拆解步驟可以知道return_tensor這個參數做的事情

In [55]:
enc = Token("Greetings!" , return_tensors = "pt")

#BatchEncoding是Hugging Face transformers套件的特殊字典容器，後續可以進行.to("cuda")的操作
#keyView是所有Key的名稱，會打印出所有欄位
print(type(enc))

#input_ids': tensor([[    1, 15043,  2787, 29991]]) 對應token在詞彙表的位置，文字 → 字典編號
#跟向量空間不同: input_ids = 「字典編號」 embedding 向量 = 「字典頁面上的內容」
print(enc.keys())

<class 'transformers.tokenization_utils_base.BatchEncoding'>
KeysView({'input_ids': tensor([[    1,  4122,   300,   886, 29991]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])})


In [56]:
token_keyview = Token("Hello World!")
#'input_ids': [1, 15043, 2787, 29991]
print(token_keyview)


{'input_ids': [1, 15043, 2787, 29991], 'attention_mask': [1, 1, 1, 1]}


我們回到尚未轉換成tensor的步驟，進行'input_ids': [1, 15043, 2787, 29991]的個數值拆解與還原

In [57]:
#對input_ids進行解碼還原
decode = Token.decode(token_keyview.input_ids)
#<s> Hello World !
print(decode)

<s> Hello World!


In [58]:
decode_1 = Token.decode([1])
decode_2 = Token.decode([15043])
decode_3 = Token.decode([2787])
decode_4 = Token.decode([29991])
#<s> + Hello + World + !
print(decode_1, decode_2, decode_3, decode_4)

<s> Hello World !


In [None]:
#只需要推理，不用訓練更新，所以這段表示停止梯度計算 > 停用梯度追蹤，省記憶體、加速
with torch.no_grad():
    #generate(**enc)需要dict，勿把tensor格是放入會報錯(因為不是mapping)
    outputs_no_grad = model.generate(**enc, max_new_tokens = 30)
print("outputs_no_grad:", Token.decode(outputs_no_grad[0], skip_special_tokens = True))

Greetings!
I hope you are doing well. I am writing to you today to express my deepest gratitude for the wonderful services you have provided to me


In [None]:
outputs_with_grad = model.generate(**enc, max_new_tokens = 30)
print("outputs_with_grad:", Token.decode(outputs_with_grad[0], skip_special_tokens = True))

Greetings!
I hope you are doing well. I am writing to you today to express my deepest gratitude for the wonderful services you have provided to me


no_grad()只關「要不要記錄梯度」

它不會改變前向的數值或生成策略，只是省顯存、加速

所以在推理時，輸出本來就應該一樣；不同的是資源占用

現在的生成是確定性的

generate() 的預設是 greedy decoding（不抽樣）：每步都選機率最大的 token

確定性的演算法，給相同輸入、相同模型狀態 → 輸出必定相同

想看到「會變」：改用抽樣

In [79]:
outputs_sampling_1 = model.generate(**enc, max_new_tokens=30, do_sample=True, temperature=0.8, top_p=0.9)
print("outputs_sampling_1:", Token.decode(outputs_sampling_1[0], skip_sepical_tokens = True))

outputs_sampling_1: <s> Greetings! I am excited to share with you the next installment of the 15 Minute Meal Plan series. This time, I will show you


In [80]:
outputs_sampling_2 = model.generate(**enc, max_new_tokens=30, do_sample=True, temperature=0.8, top_p=0.9)
print("outputs_sampling_2:", Token.decode(outputs_sampling_2[0], skip_sepical_tokens = True))

outputs_sampling_2: <s> Greetings! This weekend, I have the pleasure of hosting a book signing at the local library. The library is a great place to connect with readers and the


torch.no_grad()：影響資源，不影響輸出內容
想看到不同輸出：用抽樣（do_sample=True 等），或改種子來「可重現的隨機」


需要的話我可以給你一個小腳本，對比：

greedy（固定輸出）

sampling（會變，但設 seed 可重現）

並顯示顯存占用在有/沒有 no_grad() 下的差異

In [81]:
# ------------------------- 看清楚「Tensor」長什麼樣 -------------------------

input_ids = enc["input_ids"].to(model.device)
attention_mask = enc["attention_mask"].to(model.device)

with torch.no_grad():
    outputs = model.generate(input_ids = input_ids, attention_mask = attention_mask , max_new_tokens = 30)

print(Token.decode(outputs[0], skip_speical_tokens = False))

<s> Greetings!
I hope you are doing well. I am writing to you today to express my deepest gratitude for the wonderful services you have provided to me


In [None]:
# ------------------------- 多輪對話 -------------------------

#定義Prompt 與 question
message = [
    {
        "role": "system" , "content" : "You are a inteligent AI assistant."
    },
    {
        "role" : "user", "content" : "Greeting how are you?" 
    }
]

print(message)

#apply_chat_temple會把多輪對話轉成模型可懂的字串
prompt_text = Token.apply_chat_template(
    message, tokenize = False, add_generation_prompt = True
)

#觀看輸出
print(prompt_text)

enc_chat = Token(prompt_text, return_tensors = "pt").to(model.device)
with torch.no_grad():
    outputs_chat = model.generate(**enc, max_new_tokens = 30)

print(Token.decode(outputs_chat[0] , skip_special_tokens = True))

[{'role': 'system', 'content': 'You are a inteligent AI assistant.'}, {'role': 'user', 'content': 'Greeting how are you?'}]
<|system|>
You are a inteligent AI assistant.</s>
<|user|>
Greeting how are you?</s>
<|assistant|>

Greetings!
I hope you are doing well. I am writing to you today to express my deepest gratitude for the wonderful services you have provided to me


In [133]:
message = [
    {
        "role": "system" , "content" : "You are a inteligent AI assistant."
    },
    {
        "role" : "user", "content" : "Greeting how are you?" 
    }
]

print(message)
#apply_chat_temple會把多輪對話轉成模型可懂的字串
#這次使用Tokenize的功能
prompt_text_token = Token.apply_chat_template(
    message, tokenize = True, add_generation_prompt = True
)

#輸出轉成了Token形式
print(prompt_text_token)

[{'role': 'system', 'content': 'You are a inteligent AI assistant.'}, {'role': 'user', 'content': 'Greeting how are you?'}]
[529, 29989, 5205, 29989, 29958, 13, 3492, 526, 263, 13856, 25692, 319, 29902, 20255, 29889, 2, 29871, 13, 29966, 29989, 1792, 29989, 29958, 13, 29954, 4521, 292, 920, 526, 366, 29973, 2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13]


In [135]:
#使用decode轉回
#等價tokenize = False
prompt_text_reture = Token.decode(prompt_text_token , skip_special_tokens= False)
print(prompt_text_reture)

<|system|>
You are a inteligent AI assistant.</s> 
<|user|>
Greeting how are you?</s> 
<|assistant|>



In [141]:
prompt_text_in = Token(prompt_text_reture, return_tensors= "pt").to(model.device)
print(prompt_text_in)

{'input_ids': tensor([[    1,   529, 29989,  5205, 29989, 29958,    13,  3492,   526,   263,
         13856, 25692,   319, 29902, 20255, 29889,     2,   259,    13, 29966,
         29989,  1792, 29989, 29958,    13, 29954,  4521,   292,   920,   526,
           366, 29973,     2,   259,    13, 29966, 29989,   465, 22137, 29989,
         29958,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [142]:
with torch.no_grad():
    # **給入mapping資料
    outputs = model.generate(**prompt_text_in, max_new_tokens = 80)

In [143]:
print(outputs)
get_outputs = outputs[0, prompt_text_in["input_ids"].shape[1]:]

tensor([[    1,   529, 29989,  5205, 29989, 29958,    13,  3492,   526,   263,
         13856, 25692,   319, 29902, 20255, 29889,     2,   259,    13, 29966,
         29989,  1792, 29989, 29958,    13, 29954,  4521,   292,   920,   526,
           366, 29973,     2,   259,    13, 29966, 29989,   465, 22137, 29989,
         29958,    13, 29902,   626,  1532, 29892,  6452,   366, 29889,  1128,
           526,   366, 29973,     2]])


In [144]:
print(Token.decode(get_outputs, skip_special_tokens= True))

I am well, thank you. How are you?


取具有隨機與更聰明的輸出

In [None]:
from transformers import set_seed
set_seed(42)
outputs_sampling_test = model.generate(
    **prompt_text_in, max_new_tokens=80,
    do_sample=True, temperature=0.7, top_p=0.9
)

print(outputs_sampling_test)

tensor([[    1,   529, 29989,  5205, 29989, 29958,    13,  3492,   526,   263,
         13856, 25692,   319, 29902, 20255, 29889,     2,   259,    13, 29966,
         29989,  1792, 29989, 29958,    13, 29954,  4521,   292,   920,   526,
           366, 29973,     2,   259,    13, 29966, 29989,   465, 22137, 29989,
         29958,    13, 29902,   626, 10932,   304,  8293,   515,   366, 29889,
           306,   626,  2691, 29892,  6452,   366, 29889,  1128,  1048,   366,
         29973,     2]])


In [152]:
get_outputs_sampling_test = outputs_sampling_test[0, prompt_text_in["input_ids"].shape[1]:]

In [153]:
print(Token.decode(outputs_sampling_test[0], skip_special_tokens= True))

<|system|>
You are a inteligent AI assistant.  
<|user|>
Greeting how are you?  
<|assistant|>
I am glad to hear from you. I am fine, thank you. How about you?


In [154]:
print(Token.decode(get_outputs_sampling_test, skip_special_tokens= True))

I am glad to hear from you. I am fine, thank you. How about you?
