# LangChain - QwenOutputParser

Qwen-2.5의 chat_template을 보면 tool이 지정됐을 경우에도 Output이 다음과 같은 형태의 string으로 나오는 것을 볼 수 있다.
```
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call>
```
이를 parsing하여 Dictionary 형태로 함수명과 필요한 arguments를 반환하는 Parser를 만들어서 사용해본다.

In [1]:
import json
from langchain_community.llms import VLLMOpenAI, VLLM
from QwenOutputParser import QwenOutputParser

In [2]:
from langchain_core.prompts.chat import ChatPromptTemplate

In [3]:
# Load the tokenizer config inside a context manager so the file handle is
# closed right away, and read it as UTF-8 regardless of the locale default.
with open('{local_path_to_tokenizer_config.json}', encoding='utf-8') as config_file:
    tokenizer_config = json.load(config_file)
template = tokenizer_config['chat_template']

# The Qwen-2.5 chat template is written in jinja2, so from_template must be
# called with template_format='jinja2'.
prompt = ChatPromptTemplate.from_template(
    template,
    template_format='jinja2',
    partial_variables={"tools": False, "add_generation_prompt": True},
)

In [4]:
print(template)

{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba C

In [5]:
# Engine options handed to the wrapper through its `vllm_kwargs` argument.
engine_options = {
    "gpu_memory_utilization": 0.5,
    "max_model_len": 2048,
    "served_model_name": "qwen",
    "swap_space": 1,
}

# Instantiate the LangChain VLLM wrapper for Qwen2.5-0.5B-Instruct in bfloat16.
llm = VLLM(
    model='Qwen/Qwen2.5-0.5B-Instruct',
    vllm_kwargs=engine_options,
    dtype='bfloat16',
)

  from .autonotebook import tqdm as notebook_tqdm
2025-01-02 10:17:30,816	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 01-02 10:17:40 config.py:510] This model supports multiple tasks: {'score', 'classify', 'reward', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 01-02 10:17:40 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=qwen, num_

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  4.74it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  4.71it/s]



INFO 01-02 10:18:17 model_runner.py:1099] Loading model weights took 0.9276 GB
INFO 01-02 10:18:20 worker.py:241] Memory profiling takes 3.08 seconds
INFO 01-02 10:18:20 worker.py:241] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.50) = 39.58GiB
INFO 01-02 10:18:20 worker.py:241] model weights take 0.93GiB; non_torch_memory takes 0.15GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 37.11GiB.
INFO 01-02 10:18:20 gpu_executor.py:76] # GPU blocks: 202652, # CPU blocks: 5461
INFO 01-02 10:18:20 gpu_executor.py:80] Maximum concurrency for 2048 tokens per request: 1583.22x
INFO 01-02 10:18:27 model_runner.py:1415] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_util

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:32<00:00,  1.08it/s]


INFO 01-02 10:19:00 model_runner.py:1535] Graph capturing finished in 33 secs, took 0.68 GiB
INFO 01-02 10:19:00 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 43.13 seconds


In [6]:
# Tools are declared for Qwen in the following format.
# NOTE(review): "float" is not a JSON Schema type name ("number" is); it is
# kept as-is because the chat template only serialises this dict into the
# prompt text — confirm before changing.
_add_numbers_params = {
    "type": "object",
    "properties": {
        "a": {"type": "float", "description": "first num given by user"},
        "b": {"type": "float", "description": "second num given by user"},
    },
    "required": ["a", "b"],
}

tools = [
    {
        "type": "function",
        "function": {
            "name": "add_numbers",
            "description": "add numbers given by user with custom adder",
            "parameters": _add_numbers_params,
        },
    },
]

In [7]:
def add_numbers(a, b):
    """Custom adder used as the tool implementation.

    Prints both operands and their sum, then returns the sum multiplied
    by -10 (not the plain sum).
    """
    # NOTE(review): the -10 scaling presumably makes a real tool call
    # distinguishable from the model doing the arithmetic itself — confirm.
    total = a + b
    print(a, b, total)
    return -10 * total

In [8]:
# Pipe the rendered prompt into the model and the model output into QwenOutputParser.
chain = prompt | llm | QwenOutputParser()

In [9]:
# Conversation for the tool-calling example: a system instruction followed by
# an arithmetic question from the user.
messages = [
    {"role": "system", "content": "You are a professional chatbot.\nRespond to user's query.\nIf some tool needed call tool."},
    {"role": "user", "content": 'What is 100+20?'}
]

In [10]:
# Template inputs: the chat history and the tool declarations.
# NOTE(review): `tools` is also preset to False via partial_variables when the
# prompt is built; presumably the value passed here takes precedence — confirm.
input_data = {"messages": messages, "tools": tools}

In [11]:
chain.invoke(input_data)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.54it/s, est. speed input: 330.91 toks/s, output: 51.03 toks/s]


{'name': 'add_numbers', 'arguments': {'a': 100.0, 'b': 20.0}}

In [14]:
# Second example: a question that needs no tool. The system prompt here also
# tells the model to answer directly when no tool is needed.
messages = [
    {"role": "system", "content": "You are a professional chatbot.\nRespond to user's query.\nIf some tool needed, call tool. If not just answer to user's query"},
    {"role": "user", "content": 'What is the name of the capital city of South Korea?'}
]
# The same tool declarations are still supplied alongside the new messages.
input_data = {"messages": messages, "tools": tools}
chain.invoke(input_data)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 17.34it/s, est. speed input: 3934.73 toks/s, output: 174.02 toks/s]


'The capital city of South Korea is Seoul.'

```