In [None]:
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

# Process-wide cache so the base model and LoRA adapter are loaded from disk
# only once instead of on every Gradio request.
_MODEL_CACHE = {}


def _load_model_and_tokenizer(model_path, lora_path):
    """Return the cached ``(tokenizer, model)`` pair, loading it on first use."""
    cache_key = (model_path, lora_path)
    if cache_key not in _MODEL_CACHE:
        # Load the tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        # Load the base model; device_map='auto' already places the weights
        # on the available device(s).
        base_model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map='auto',
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        ).eval()

        # Attach the LoRA weights on top of the base model.
        model = PeftModel.from_pretrained(base_model, model_id=lora_path)

        _MODEL_CACHE[cache_key] = (tokenizer, model)
    return _MODEL_CACHE[cache_key]


def modelscope_quickstart(prompt):
    """Generate a reply to ``prompt`` with the LoRA-tuned Llama 3.1 model.

    The tokenizer and model are loaded on the first call and reused afterwards.

    Args:
        prompt: The user message, sent together with a fixed system prompt.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens and
        special tokens are stripped), at most 512 new tokens long.
    """
    model_path = 'Meta-Llama-3.1-8B-Instruct'
    lora_path = './output/llama3_1_instruct_lora/checkpoint-18'  # change this to your own LoRA output checkpoint path

    tokenizer, model = _load_model_and_tokenizer(model_path, lora_path)

    messages = [
        {"role": "system", "content": "假设你是一位政治哲学评论家"},
        {"role": "user", "content": prompt}
    ]

    # Render the conversation to a prompt string (not token ids yet).
    chat_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Put the inputs on the device the model was dispatched to. The model itself
    # must not be moved with .to(): device_map='auto' has already placed it.
    device = next(model.parameters()).device
    # The chat template already contains the special tokens it needs, so do not
    # let the tokenizer prepend a second BOS token.
    model_inputs = tokenizer([chat_text], return_tensors='pt', add_special_tokens=False).to(device)

    # Pass the whole encoding so the attention mask reaches generate().
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)

    # Keep only the tokens produced after the prompt.
    prompt_length = model_inputs.input_ids.shape[1]
    new_token_ids = generated_ids[:, prompt_length:]
    response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]
    return response

# Build the Gradio interface: one text box in, one text box out, backed by
# modelscope_quickstart, then start the web server.
demo = gr.Interface(
    fn=modelscope_quickstart,
    inputs="text",
    outputs="text",
)
demo.launch()

### 说明
1. 确保您已安装 `gradio`、`transformers`、`torch` 和 `peft` 库。
2. 确保代码中的 `model_path`（基础模型路径）和 `lora_path`（LoRA checkpoint 路径）正确。
3. 运行此 Notebook 后，Gradio 接口将启动，您可以通过浏览器访问并输入文本进行测试。