## 基座模型推理

In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained("weights/glm-4-9b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("weights/glm-4-9b-chat-hf").eval().to(device)
inputs = tokenizer.encode("我是ChatGLM，是", return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.35it/s]


[gMASK]<sop>我是ChatGLM，是人工智能助手。我是ChatGLM，是人工智能助手。我是ChatGLM，是人工智能助手


## Chat模型推理

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

device = "cuda"
tokenizer = AutoTokenizer.from_pretrained("weights/glm-4-9b-chat-hf")
model = (
    AutoModelForCausalLM.from_pretrained("weights/glm-4-9b-chat-hf").eval().to(device)
)
batch_test_message = [
    [
        {"role": "user", "content": "你好，告诉我你的名字。"},
        {
            "role": "assistant",
            "content": "你好，我是一个人工智能助手，你可以叫我 ChatGLM。",
        },
        {"role": "user", "content": "告诉我苹果的英文？"},
    ],
    [{"role": "user", "content": "告诉我1+2等于多少？"}],
]
batch_inputs_text = tokenizer.apply_chat_template(
    batch_test_message, return_tensors="pt", return_dict=True, padding=True, padding_side="left",add_generation_prompt=True
).to(device)

# print(batch_inputs_text)
outputs = model.generate(**batch_inputs_text)
response_batch = tokenizer.batch_decode(outputs, skip_special_tokens=False)
print(response_batch)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.33it/s]


['[gMASK]<sop><|user|>\n你好，告诉我你的名字。<|assistant|>\n你好，我是一个人工智能助手，你可以叫我 ChatGLM。<|user|>\n告诉我苹果的英文？<|assistant|>\n苹果的英文是 "apple"。<|user|>', '<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>[gMASK]<sop><|user|>\n告诉我1+2等于多少？<|assistant|>\n1+2等于3。<|user|><|endoftext|>']


## Chat模型（pipeline模式）

In [19]:
from transformers import pipeline

messages = [
    {"role": "user", "content": "你是谁"},
]
pipe = pipeline("text-generation", model="weights/glm-4-9b-chat-hf")
print(pipe(messages))

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.35it/s]
Device set to use cuda:0


[{'generated_text': [{'role': 'user', 'content': '你是谁'}, {'role': 'assistant', 'content': '\n我是一个人工智能助手，名为 ChatGLM。我是基于清华大学 KEG 实验室和'}]}]


## 打印模型结构

In [20]:
print(model)

GlmForCausalLM(
  (model): GlmModel(
    (embed_tokens): Embedding(151552, 4096, padding_idx=151329)
    (layers): ModuleList(
      (0-39): 40 x GlmDecoderLayer(
        (self_attn): GlmAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=True)
          (k_proj): Linear(in_features=4096, out_features=256, bias=True)
          (v_proj): Linear(in_features=4096, out_features=256, bias=True)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): GlmMLP(
          (gate_up_proj): Linear(in_features=4096, out_features=27392, bias=False)
          (down_proj): Linear(in_features=13696, out_features=4096, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): GlmRMSNorm((4096,), eps=1.5625e-07)
        (post_attention_layernorm): GlmRMSNorm((4096,), eps=1.5625e-07)
      )
    )
    (norm): GlmRMSNorm((4096,), eps=1.5625e-07)
    (rotary_emb): GlmRotaryEmbedding()
  )
  (lm_head): Linear(in_