## Launch A Server

Launch the server with a reasoning model and reasoning parser.

In [1]:


from sglang import separate_reasoning, assistant_begin, assistant_end
from sglang import assistant, function, gen, system, user
from sglang import image
from sglang import RuntimeEndpoint, set_default_backend
from sglang.srt.utils import load_image
from sglang.test.test_utils import is_in_ci
from sglang.utils import print_highlight, terminate_process, wait_for_server


if is_in_ci():
    from patch import launch_server_cmd
else:
    from sglang.utils import launch_server_cmd


server_process, port = launch_server_cmd(
    "python3 -m sglang.launch_server --model-path Qwen/Qwen3-4B --reasoning-parser qwen3 --host 0.0.0.0"
)

wait_for_server(f"http://localhost:{port}")
print(f"Server started on http://localhost:{port}")

  from .autonotebook import tqdm as notebook_tqdm


[2025-05-04 18:37:42] server_args=ServerArgs(model_path='Qwen/Qwen3-4B', tokenizer_path='Qwen/Qwen3-4B', tokenizer_mode='auto', skip_tokenizer_init=False, enable_tokenizer_batch_encode=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Qwen/Qwen3-4B', chat_template=None, completion_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=38453, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=106689967, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info', log_level_http=None,

Set the default backend. Note: you can set chat_template_name in RontimeEndpoint. 

In [2]:
set_default_backend(RuntimeEndpoint(f"http://localhost:{port}", chat_template_name="qwen"))

[2025-05-04 18:38:07] INFO:     127.0.0.1:35064 - "GET /get_model_info HTTP/1.1" 200 OK


In [3]:
@function
def basic_qa(s, question):
    s += system(f"You are a helpful assistant than can answer questions.")
    s += user(question)
    s += assistant_begin()
    s += gen("answer", max_tokens=512)
    s += assistant_end()

state = basic_qa("List 3 countries and their capitals.")
print_highlight(state["answer"])

[2025-05-04 18:38:08] Prefill batch. #new-seq: 1, #new-token: 31, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-04 18:38:08] Decode batch. #running-req: 1, #token: 64, token usage: 0.00, gen throughput (token/s): 3.63, #queue-req: 0
[2025-05-04 18:38:09] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 82.06, #queue-req: 0
[2025-05-04 18:38:09] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 81.59, #queue-req: 0
[2025-05-04 18:38:10] Decode batch. #running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 81.13, #queue-req: 0
[2025-05-04 18:38:10] Decode batch. #running-req: 1, #token: 224, token usage: 0.00, gen throughput (token/s): 80.87, #queue-req: 0
[2025-05-04 18:38:11] Decode batch. #running-req: 1, #token: 264, token usage: 0.00, gen throughput (token/s): 80.54, #queue-req: 0
[2025-05-04 18:38:11] INFO:     127.0.0.1:35066 - "POST /generate HTTP/1.1" 

In [4]:

# print as raw text



<think>
Okay, the user is asking for three countries and their capitals. Let me think about which countries to pick. I should choose well-known ones so that the information is accurate and commonly recognized.

First, France. Paris is its capital, and that's a major city, so that's a good start. Next, maybe Japan. Tokyo is the capital, and it's a big country with a large population. Then, Brazil. Brasília is the capital, but I should make sure I get the correct spelling. Wait, yes, Brasília is the capital. Alternatively, maybe Canada? Ottawa is the capital. But I think France, Japan, and Brazil are all good examples. Alternatively, the United States with Washington D.C. But maybe the user wants a variety of regions. Let me confirm the capitals again. France: Paris. Japan: Tokyo. Brazil: Brasília. Yes, that's correct. I should present them clearly.
</think>

Here are three countries and their capitals:  
1. **France** – **Paris**  
2. **Japan** – **Tokyo**  
3. **Brazil** – **Brasília**

In [11]:

@function
def basic_qa_separate_reasoning(s, question):
    s += system(f"You are a helpful assistant than can answer questions.")
    s += user(question)
    s += assistant_begin()
    s += separate_reasoning(
        gen("answer", max_tokens=512),
        model_type="qwen3"
    )
    s += assistant_end()

reasoning_state = basic_qa_separate_reasoning("List 3 countries and their capitals.")
print_highlight(reasoning_state.stream_executor.variable_event.keys())
print_highlight(f"\nSeparated Reasoning Content:\n{reasoning_state["answer_reasoning_content"]}")
print_highlight(f"\n\nContent:\n{reasoning_state["answer"]}")
print_highlight(f"\n\nMessages:\n{reasoning_state.messages()[-1]}")

dict_keys(['answer', 'answer_reasoning_content'])
[2025-05-04 18:40:56] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 30, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-04 18:40:56] Decode batch. #running-req: 1, #token: 42, token usage: 0.00, gen throughput (token/s): 1.01, #queue-req: 0
[2025-05-04 18:40:57] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 82.29, #queue-req: 0
[2025-05-04 18:40:57] Decode batch. #running-req: 1, #token: 122, token usage: 0.00, gen throughput (token/s): 81.90, #queue-req: 0
[2025-05-04 18:40:58] Decode batch. #running-req: 1, #token: 162, token usage: 0.00, gen throughput (token/s): 81.36, #queue-req: 0
[2025-05-04 18:40:58] Decode batch. #running-req: 1, #token: 202, token usage: 0.00, gen throughput (token/s): 81.05, #queue-req: 0
[2025-05-04 18:40:59] INFO:     127.0.0.1:54664 - "POST /generate HTTP/1.1" 200 OK

Separated Reasoning Content:
Okay, the user asked for three countries and t

In [6]:
repr(reasoning_state['answer'])

'"Here are three countries and their capitals:  \\n1. **United States** – Washington, D.C.  \\n2. **France** – Paris  \\n3. **Germany** – Berlin  \\n\\nLet me know if you\'d like more examples!"'

In [10]:
@function
def multi_turn_qa(s):
    s += system(f"You are a helpful assistant than can answer questions.")
    s += user("Please give me a list of 3 countries and their capitals.")
    s += assistant(separate_reasoning(gen("first_answer", max_tokens=512), model_type="qwen3"))
    s += user("Please give me another list of 3 countries and their capitals.")
    s += assistant(separate_reasoning(gen("second_answer", max_tokens=512), model_type="qwen3"))
    return s


reasoning_state = multi_turn_qa()
print_highlight(f"\n\nfirst_answer:\n{reasoning_state['first_answer']}")
print_highlight(f"\n\nfirst_answer_reasoning_content:\n{reasoning_state['first_answer_reasoning_content']}")
print_highlight(f"\n\nsecond_answer:\n{reasoning_state['second_answer']}")
print_highlight(f"\n\nsecond_answer_reasoning_content:\n{reasoning_state['second_answer_reasoning_content']}")

[2025-05-04 18:40:08] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 35, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-05-04 18:40:08] Decode batch. #running-req: 1, #token: 60, token usage: 0.00, gen throughput (token/s): 3.67, #queue-req: 0
[2025-05-04 18:40:09] Decode batch. #running-req: 1, #token: 100, token usage: 0.00, gen throughput (token/s): 82.13, #queue-req: 0
[2025-05-04 18:40:09] Decode batch. #running-req: 1, #token: 140, token usage: 0.00, gen throughput (token/s): 81.68, #queue-req: 0
[2025-05-04 18:40:10] Decode batch. #running-req: 1, #token: 180, token usage: 0.00, gen throughput (token/s): 81.19, #queue-req: 0
[2025-05-04 18:40:10] Decode batch. #running-req: 1, #token: 220, token usage: 0.00, gen throughput (token/s): 80.95, #queue-req: 0
[2025-05-04 18:40:11] Decode batch. #running-req: 1, #token: 260, token usage: 0.00, gen throughput (token/s): 80.64, #queue-req: 0
[2025-05-04 18:40:11] Decode batch. #running-req: 1, #token: 300, token usag

## Using No thinking as Qwen 3's advanced feature 

sglang separate_reasoning is particularly useful when combined with Qwen 3's advanced feature.

[Qwen 3's advanced usages](https://qwenlm.github.io/blog/qwen3/#advanced-usages)


In [8]:
reasoning_state = basic_qa("List 3 countries and their capitals. /no_think")
print("Reasoning Content:\n", reasoning_state["answer_reasoning_content"])
print("Content:\n", reasoning_state["answer"])

[2025-05-04 18:35:04] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 26, token usage: 0.00, #running-req: 0, #queue-req: 0


KeyError: 'answer_reasoning_content'

[2025-05-04 18:35:04] INFO:     127.0.0.1:36614 - "POST /generate HTTP/1.1" 200 OK


In [10]:
@function
def regular_expression_gen(s):
    s += user("What is the IP address of the Google DNS servers? just provide the answer")
    s += separate_reasoning(
        assistant(
            gen(
                "answer",
                temperature=0,
                regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
                max_tokens=512,
            )
        ), model_type="qwen3"
    )


reasoning_state = regular_expression_gen()

In [None]:
print(reasoning_state.messages()[-1]['content'])

In [12]:
user_prompt = r'''
My name is Max.I am trying to talk to someone or something. Specifically, I want to "hey fish". I am probably using or involving this object in the interaction: Fresh-Caught Namazu Fish (MageMart is proud to work with aquariums all over the world to sustainably reduce the number of earthquakes across the globe.)
   
   Did I mention talking to as specific person, or am I talking to an object? Which is the more likely?
   Do NOT give the final answer yet, but consider all the options and explain your logic in a single concise, brief, insightful sentence. Limit output to 40 words. Do not output line breaks.
'''
system_prompt = '''
You are a helpful assistant. Respond with either Person or Object. Do not include any extra details or explanations.
'''

In [13]:
@function
def my_chat(s):
    global system_prompt, user_prompt
    s += system(system_prompt)
    s += user(user_prompt)
    s += separate_reasoning(
        assistant(
            gen(
                "answer",
                temperature=0.3,
                regex=r'(Yes|No)',
                max_tokens=512,
            )
        ), model_type="qwen3"
    )


reasoning_state = my_chat()

In [None]:
from pprint import pprint
pprint(reasoning_state.messages()[-1]['reasoning_content'])
pprint(reasoning_state.messages()[-1]['content'])