In [2]:
from transformers.models.qwen2 import Qwen2ForCausalLM, Qwen2Tokenizer
import torch


In [1]:
# model_name_or_path = "/home/yuanz/documents/models/Qwen/Qwen2.5-0.5B"
# model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct", device_map="cuda:0")
# tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")



from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")


## demo1
1. 调用generate方法，让他一直生成新的token

In [2]:
text = "介绍一下杭州的良睦路"

model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
for k, v in model_inputs.items():
    print(k, v)


input_ids tensor([[109432, 104130,   9370,  99584, 103852,  45995]], device='cuda:0')
attention_mask tensor([[1, 1, 1, 1, 1, 1]], device='cuda:0')


In [7]:
generated_ids = model.generate(**model_inputs, max_new_tokens=10)
generated_ids = [
    output_ids[len(input_ids) :]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'，以及它与周边地区的联系。\n良睦'

## demo2
1. 直接基于第一步，生成新token

In [8]:
model_inputs1 = {
    "input_ids": torch.tensor(
        [[109432, 104130, 9370, 99584, 103852, 45995]], dtype=torch.long
    ).to(model.device),
    "attention_mask": torch.tensor([[1, 1, 1, 1, 1, 1]], dtype=torch.long).to(
        model.device
    ),
}

model_inputs1


{'input_ids': tensor([[109432, 104130,   9370,  99584, 103852,  45995]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [9]:
model_outputs1 = model.forward(**model_inputs1,use_cache=True) # model(**model_inputs1)
model_outputs1.keys()


odict_keys(['logits', 'past_key_values'])

In [10]:
model_outputs1.logits.shape

torch.Size([1, 6, 151936])

In [11]:
model_outputs1.logits[:, -1, :].shape

torch.Size([1, 151936])

In [12]:
model_outputs1.logits[:, -1, :].argmax(dim=-1)

tensor([3837], device='cuda:0')

In [13]:
tokenizer.decode([3837])

'，'

In [17]:
(
    type(model_outputs1.past_key_values),
    # isinstance(model_outputs1.past_key_values, torch.nn.Module),
    len(model_outputs1.past_key_values),
    len(model_outputs1.past_key_values[0]),
    model_outputs1.past_key_values[0][0].shape,
)

(tuple, 24, 2, torch.Size([1, 2, 6, 64]))

## demo3
1. 把上一次生成的past kv 拿过来，加上新拼接的token，生成

In [18]:
model_outputs2 = model.forward(
    **{
        "input_ids": torch.tensor([[3837]], dtype=torch.long).to(model.device),
        "attention_mask": torch.tensor([[1]], dtype=torch.long).to(model.device),
    },
    past_key_values=model_outputs1.past_key_values,
)
model_outputs2.keys()

We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)


odict_keys(['logits', 'past_key_values'])

## demo4
1. 直接模拟简单粗暴类型的生成方式

In [19]:
model_inputs3 = {
    "input_ids": torch.tensor(
        [[109432, 104130, 9370, 99584, 103852, 45995, 3837]], dtype=torch.long
    ).to(model.device),
    "attention_mask": torch.tensor([[1, 1, 1, 1, 1, 1, 1]], dtype=torch.long).to(
        model.device
    ),
}

model_inputs3


{'input_ids': tensor([[109432, 104130,   9370,  99584, 103852,  45995,   3837]],
        device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [20]:
model_outputs3 = model.forward(**model_inputs3) # model(**model_inputs3)
model_outputs3.keys()

odict_keys(['logits', 'past_key_values'])

## 验证demo4和demo3输出的logits是不是一样的

In [21]:
torch.allclose(model_outputs3.logits[:, -1, :], model_outputs2.logits[:, -1, :],atol=1e-4)

True

In [22]:
model_outputs3.logits[:, -1, :].argmax(dim=-1)

tensor([101034], device='cuda:0')

# layer的各个模块

## 1. embedding模块

1. `[batch_size, seq_len] -> [batch_size, seq_len, hidden_size]`

In [26]:
input_embedding = model.model.embed_tokens(model_inputs3.get("input_ids"))

(model_inputs3.get("input_ids").shape, input_embedding.shape)


(torch.Size([1, 7]), torch.Size([1, 7, 896]))

## 2. layernorm模块

1. 这里主要是input norm、output norm

2. `[batch_size, seq_len, hidden_size] -> [batch_size, seq_len, hidden_size]`

In [30]:
input_layernorm_value = model.model.layers[0].input_layernorm(input_embedding)
input_layernorm_value.shape

torch.Size([1, 7, 896])

## 3. attention 层

1. `[batch_size, seq_len, hidden_size] -> [batch_size, seq_len, hidden_size]`

In [None]:
# show_attention.py 

## 4. mlp层

In [44]:
(
    input_embedding.shape,
    model.model.layers[0].mlp(input_embedding).shape
)

(torch.Size([1, 7, 896]), torch.Size([1, 7, 896]))

In [1]:
a = [1]
a[1] = 3
a

IndexError: list assignment index out of range