# 模型推理 - 使用 QLoRA 微调后的 ChatGLM3-6B

In [1]:
import torch 
from transformers import AutoModel,AutoTokenizer,BitsAndBytesConfig
from peft import PeftModel,PeftConfig
model_name_or_path = "THUDM/chatglm3-6b"
peft_model_path = f"models/saved/{model_name_or_path}"

  from .autonotebook import tqdm as notebook_tqdm
2024-03-26 21:26:38.995556: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-26 21:26:38.997170: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-26 21:26:39.019873: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-26 21:26:39.019897: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-26 21:26:39.020498: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515

In [2]:
config = PeftConfig.from_pretrained(peft_model_path)

q_config = BitsAndBytesConfig(
    load_in_4bits = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype = torch.float32
)

base_model = AutoModel.from_pretrained(
    config.base_model_name_or_path,
    quantization_config = q_config,
    trust_remote_code = True,
    device_map = "auto"
)

base_model.requires_grad_(False)
base_model.eval()

Loading checkpoint shards: 100%|█████████████████████████████| 7/7 [00:04<00:00,  1.73it/s]


ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_layer): Linear(in_

## ChatGLM3-6B 

In [3]:
input_text = '类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领'
print(f"输入：\n ${input_text}")
tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path,
    trust_remote_code = True
)
response, history = base_model.chat(tokenizer = tokenizer ,query = input_text)
print(f'ChatGLM3-6B 微调前：\n{response}')

输入：
 $类型#裙*版型#显瘦*风格#文艺*风格#简约*图案#印花*图案#撞色*裙下摆#压褶*裙长#连衣裙*裙领型#圆领


Setting eos_token is not supported, use the default one.
Setting pad_token is not supported, use the default one.
Setting unk_token is not supported, use the default one.


ChatGLM3-6B 微调前：
这款连衣裙采用了显瘦的裙下摆设计，搭配上印花图案和撞色元素，给人一种简约而又富有文艺气息的感觉。圆领的设计让整个连衣裙看起来更加优雅，而裙长则刚好覆盖住鞋跟，显得腿部线条更加修长。同时，裙子的压褶设计增加了整款连衣裙的层次感和立体感，让整个款式显得更加丰富。


In [4]:
model_peft = PeftModel.from_pretrained(base_model,peft_model_path)
response, history = model_peft.chat(tokenizer=tokenizer, query=input_text)
print(f'ChatGLM3-6B 微调后: \n{response}')

ChatGLM3-6B 微调后: 
连衣裙采用了简约大气的圆领设计，展现出女性柔美的颈部曲线。撞色的印花点缀，使整体造型层次鲜明，彰显出女性独特的时尚魅力。腰间采用松紧压褶处理，修身显瘦，使整体造型更加立体。
