加载基础模型和 LoRA 权重，并使用 Gradio 进行推理。值得注意的是，基础模型和微调后得到的 LoRA 权重可以不合并成一个微调模型；这样基础模型就可以灵活地与不同的 LoRA 权重搭配进行推理，以处理不同的任务。

In [1]:
import os
import sys

import fire
import gradio as gr
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

from utils.callbacks import Iteratorize, Stream
from utils.prompter import Prompter

# Pick the inference device: CUDA when available, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# An available MPS backend takes precedence over the choice above.
try:
    if torch.backends.mps.is_available():
        device = "mps"
except AttributeError:
    # This torch build exposes no `torch.backends.mps`; keep the device
    # chosen above. Only this case is ignored, so unrelated errors
    # (e.g. KeyboardInterrupt) are no longer silently swallowed.
    pass

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialise the run parameters
load_8bit: bool = False  # Passed as `load_in_8bit` when loading on CUDA; when False the model is cast to half precision
base_model: str = "models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16" # Base model matching the one used by the fine-tuning code; downloaded manually
lora_weights: str = "alpaca-lora-7b" # Fine-tuned LoRA weights; these can also be downloaded manually
prompt_template: str = ""  # The prompt template to use, will default to alpaca.
server_name: str = "0.0.0.0"  # Allows to listen on all interfaces by providing '0.0.0.0'.
share_gradio: bool = False  # Passed as `share=` to the Gradio `launch()` call

In [3]:
# Resolve the base model path, then load the prompt template and tokenizer
# Fall back to the BASE_MODEL environment variable when no path was configured.
base_model = base_model or os.environ.get("BASE_MODEL", "")
if not base_model:
    # Raise explicitly instead of using `assert`, so the check is not
    # stripped when Python runs with -O.
    raise ValueError(
        "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
    )

prompter = Prompter(prompt_template)  # Load the prompt template
tokenizer = LlamaTokenizer.from_pretrained(base_model)  # Load the LlamaTokenizer tokenizer

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


In [4]:
# Collect the per-device keyword arguments first, then load the base model
# and wrap it with the LoRA weights in one place.
if device == "cuda":
    base_load_kwargs = {
        "load_in_8bit": load_8bit,
        "torch_dtype": torch.float16,
        "device_map": "auto",
    }
    lora_load_kwargs = {"torch_dtype": torch.float16}
elif device == "mps":
    base_load_kwargs = {
        "device_map": {"": device},
        "torch_dtype": torch.float16,
    }
    lora_load_kwargs = {
        "device_map": {"": device},
        "torch_dtype": torch.float16,
    }
else:
    base_load_kwargs = {
        "device_map": {"": device},
        "low_cpu_mem_usage": True,
    }
    lora_load_kwargs = {"device_map": {"": device}}

# Load the base model.
model = LlamaForCausalLM.from_pretrained(base_model, **base_load_kwargs)

# Load the LoRA weights on top of the base model.
# Note: keep an eye on the installed peft / transformers versions and on the
# version of the LoRA config that ships with the weights, see
# https://huggingface.co/docs/peft/tutorial/peft_model_config?config=LoraConfig
model = PeftModel.from_pretrained(model, lora_weights, **lora_load_kwargs)

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.98s/it]


In [5]:
# Configure the model
# Set the special-token ids: the pad id goes on both the model config and the
# tokenizer, the BOS/EOS ids on the model config only.
model.config.pad_token_id = 0
tokenizer.pad_token_id = 0
model.config.bos_token_id, model.config.eos_token_id = 1, 2

# Without 8-bit loading, cast the model to half precision (float16).
if not load_8bit:
    model.half()

# Switch the model to evaluation mode.
model.eval()

# Compile the model with torch.compile on torch 2+ everywhere except Windows.
if sys.platform != "win32" and torch.__version__ >= "2":
    model = torch.compile(model)


In [6]:
# Define the evaluation (inference) function
def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    stream_output=False,
    **kwargs,
):
    """Generate a response for an instruction and yield it as text.

    This is a generator. With ``stream_output`` disabled it yields exactly
    once, after generation has finished. With it enabled it yields the
    response decoded so far for every item produced by the streaming
    iterator, and stops once the last token is the tokenizer's EOS id.

    Args:
        instruction: The instruction text.
        input: Optional additional input passed to the prompt template.
        temperature: Forwarded to ``GenerationConfig``.
        top_p: Forwarded to ``GenerationConfig``.
        top_k: Forwarded to ``GenerationConfig``.
        num_beams: Forwarded to ``GenerationConfig``.
        max_new_tokens: Passed to ``model.generate``.
        stream_output: Whether to yield partial responses while generating.
        **kwargs: Extra keyword arguments forwarded to ``GenerationConfig``.

    Yields:
        The response extracted from the decoded output by
        ``prompter.get_response``.
    """
    # Build the full prompt from the instruction and the optional input,
    # then tokenize it into tensors on the target device.
    full_prompt = prompter.generate_prompt(instruction, input)
    encoded = tokenizer(full_prompt, return_tensors="pt")
    prompt_ids = encoded["input_ids"].to(device)

    gen_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )

    # Arguments shared by the streaming and the non-streaming path.
    generate_params = dict(
        input_ids=prompt_ids,
        generation_config=gen_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=max_new_tokens,
    )

    if not stream_output:
        # One-shot generation; gradients are not needed for inference.
        with torch.no_grad():
            result = model.generate(**generate_params)
        yield prompter.get_response(tokenizer.decode(result.sequences[0]))
        return

    def run_generation(callback=None, **gen_kwargs):
        # Attach a Stream stopping criterion that is given the callback,
        # creating the criteria list when the caller did not provide one.
        criteria = gen_kwargs.setdefault(
            "stopping_criteria", transformers.StoppingCriteriaList()
        )
        criteria.append(Stream(callback_func=callback))
        with torch.no_grad():
            model.generate(**gen_kwargs)

    # Iteratorize exposes the callback-driven generation as an iterator.
    with Iteratorize(
        run_generation, dict(generate_params), callback=None
    ) as token_stream:
        for partial_ids in token_stream:
            partial_text = tokenizer.decode(partial_ids)
            # Stop as soon as the last token is the end-of-sequence token.
            if partial_ids[-1] in [tokenizer.eos_token_id]:
                break
            yield prompter.get_response(partial_text)


In [8]:
# Build the Gradio interface and launch it.
# The input components are passed to `evaluate` positionally, in this order:
# instruction, input, temperature, top_p, top_k, num_beams, max_new_tokens,
# stream_output.
gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(
            lines=2,
            label="Instruction",
            placeholder="Tell me about alpacas.",
        ),
        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
        gr.components.Slider(
            minimum=0, maximum=1, value=0.1, label="Temperature"
        ),
        gr.components.Slider(
            minimum=0, maximum=1, value=0.75, label="Top p"
        ),
        gr.components.Slider(
            minimum=0, maximum=100, step=1, value=40, label="Top k"
        ),
        gr.components.Slider(
            minimum=1, maximum=4, step=1, value=4, label="Beams"
        ),
        gr.components.Slider(
            minimum=1, maximum=2000, step=1, value=128, label="Max tokens"
        ),
        gr.components.Checkbox(label="Stream output"),
    ],
    outputs=[
        gr.components.Textbox(
            lines=5,
            label="Output",
        )
    ],
    title="🦙🌲 Alpaca-LoRA",
    description="Alpaca-LoRA is a 7B-parameter LLaMA model finetuned to follow instructions. It is trained on the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset and makes use of the Huggingface LLaMA implementation. For more information, please visit [the project's website](https://github.com/tloen/alpaca-lora).",
    # Use the configured `server_name` instead of a hard-coded address, so
    # the setting defined with the other parameters actually takes effect.
).queue().launch(server_name=server_name, share=share_gradio)


Running on local URL:  http://0.0.0.0:7860

To create a public link, set `share=True` in `launch()`.




