Ray Serve 主要有三类实例，它们都是基于 Ray Core 创建的 Actor：

* Controller：负责 Serve 集群的控制循环，包括扩缩容、容灾、集群信息收集广播等等。
* HTTPProxy：负责处理 HTTP 请求。
* Replica：负责执行 Deployment 定义的逻辑。

一般来说，用户只需要定义 Deployment 的逻辑即可。如下是一个用 Gradio 做前端、运行 transformers 模型的例子：

In [None]:
import asyncio
from queue import Empty
import threading
from ray import serve
from ray.serve.gradio_integrations import GradioIngress
from transformers import TextIteratorStreamer, LlamaTokenizer, GenerationConfig, LlamaForCausalLM

In [None]:
# Path handed to `from_pretrained` for both the tokenizer and the model in
# GradioBot.__init__. NOTE(review): presumably a local checkpoint directory
# that must exist on the node running the replica -- confirm for your cluster.
LLAMA_MODEL_PATH = '/tmp-ms/model/alpaca-13B'

@serve.deployment(ray_actor_options={'resources': {'NPU': 1}})
class GradioBot(GradioIngress):
    """Ray Serve deployment exposing a Llama chat model through a Gradio UI.

    Each replica requests one ``NPU`` custom resource, loads the tokenizer and
    model from ``LLAMA_MODEL_PATH`` once in ``__init__``, and streams generated
    text back to the Gradio chat interface token by token.
    """

    def __init__(self):
        """Load tokenizer and model onto the NPU, then start the Gradio ingress."""
        import torch
        # Imported for its side effect only; presumably registers the
        # `torch.npu` backend used below -- the name itself is not referenced.
        import torch_npu  # noqa: F401

        torch.npu.set_device(0)
        self.tokenizer = LlamaTokenizer.from_pretrained(LLAMA_MODEL_PATH, legacy=True)
        self.model = LlamaForCausalLM.from_pretrained(
            LLAMA_MODEL_PATH,
            torch_dtype=torch.float16,
            device_map=torch.device('npu'),
        )
        super().__init__(self.get_gradio_app)

    def get_gradio_app(self):
        """Build the Gradio chat application served by this deployment.

        Returns:
            A queued ``gr.ChatInterface`` whose handler is :meth:`query`.
        """
        # Local import, matching the other method-scope imports in this class;
        # the module-level imports do not bring `gr` into scope.
        import gradio as gr

        return gr.ChatInterface(
            self.query,
            title='Llama Chat Bot',
        ).queue()

    async def query(self, query, history):
        """Stream the model's answer to ``query`` as a growing string.

        Args:
            query: The user's message from the chat box.
            history: Previous chat turns supplied by Gradio. Currently unused;
                every request is answered without conversation context.

        Yields:
            The accumulated answer so far: first an empty string, then one
            longer string per decoded chunk.
        """
        prompted_query = self.add_prompt(query)
        # timeout=0 makes the streamer raise `queue.Empty` instead of blocking,
        # so this coroutine can hand control back to the event loop while the
        # worker thread is still generating.
        streamer = TextIteratorStreamer(
            self.tokenizer, timeout=0, skip_prompt=True, skip_special_tokens=True
        )
        generation_thread = threading.Thread(
            target=self.generate,
            kwargs=dict(prompted_query=prompted_query, streamer=streamer),
        )
        generation_thread.start()

        output_text = ''
        yield output_text

        while True:
            # Sample liveness BEFORE draining: if the worker was already dead
            # and the queue is still empty, nothing more can ever arrive.
            worker_finished = not generation_thread.is_alive()
            try:
                for token in streamer:
                    output_text += token
                    yield output_text
                break  # streamer signalled end-of-stream
            except Empty:
                if worker_finished:
                    # The worker exited (e.g. generate() raised) without
                    # signalling end-of-stream; stop instead of polling forever.
                    break
                await asyncio.sleep(0.01)

    def add_prompt(self, instruction):
        """Wrap an instruction (or each item of a list) in the chat template.

        Args:
            instruction: A single instruction string, or a (possibly nested)
                list of them.

        Returns:
            The templated prompt string, or a list of the same shape as the input.
        """
        if isinstance(instruction, list):
            return [self.add_prompt(item) for item in instruction]
        # The system section is closed with `<</SYS>>`; the opening tag must not
        # be repeated as the closer.
        return f"[INST] <<SYS>>\n You are a helpful assistant. 你是一个乐于助人的助手。\n<</SYS>>\n\n{instruction} [/INST]"

    def generate(self, prompted_query, streamer: TextIteratorStreamer):
        """Run sampling-based generation and push decoded text into ``streamer``.

        Called from the worker thread started in :meth:`query`.

        Args:
            prompted_query: The fully templated prompt string.
            streamer: Receives the generated text; consumed by :meth:`query`.
        """
        import torch
        import torch_npu

        # This runs in a separate thread from __init__, so the device is
        # selected again here. NOTE(review): presumably the NPU device binding
        # is per-thread -- confirm against the torch_npu documentation.
        torch.npu.set_device(0)
        torch_npu.npu.set_compile_mode(jit_compile=False)

        # `return_tensors='pt'` makes the tokenizer return PyTorch tensors,
        # which is what `.to('npu')` and `model.generate` require.
        inputs = self.tokenizer([prompted_query], return_tensors='pt').to('npu')
        generation_config = GenerationConfig(
            temperature=0.7,
            top_k=40,
            top_p=0.9,
            do_sample=True,
            num_beams=1,
            repetition_penalty=1.1,
            max_new_tokens=400,
        )
        self.model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            generation_config=generation_config,
            streamer=streamer,
        )

In [None]:
# Bind the deployment (GradioBot.__init__ takes no arguments) into an
# application object that Ray Serve can run.
gradioBot = GradioBot.bind()
# Deploy the application; host and port are forwarded to `serve.run` as given.
# NOTE(review): whether `serve.run` accepts `host`/`port` depends on the
# installed Ray version -- confirm against the Ray Serve API reference.
serve.run(gradioBot, host='0.0.0.0', port=8001)