From 662f2b0698b8241d285e9d653e7d2bccc851b6ad Mon Sep 17 00:00:00 2001
From: ophilia-lee <leebanban@126.com>
Date: Mon, 27 Oct 2025 14:55:07 +0800
Subject: [PATCH 1/3] =?UTF-8?q?benchmark=E5=B7=A5=E5=85=B7=E9=80=82?=
 =?UTF-8?q?=E9=85=8DSGLang=E6=A1=86=E6=9E=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

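---
Reviewer notes (free text below the three-dash line is not part of the commit):
  * `enable_thinking: false` is now set under both `metadata` and
    `chat_template_kwargs`. Confirm that the target backend reads both keys;
    if only one is consumed, the other is dead configuration.
  * `skip_special_tokens` is removed and the file is renamed from
    vLLM_default.yaml to request.yaml. Nothing in this series updates callers,
    so any script or document that still names the old file needs the same
    rename.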
 .../{vLLM_default.yaml => request.yaml}            | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
 rename benchmarks/yaml/request_yaml/{vLLM_default.yaml => request.yaml} (53%)

diff --git a/benchmarks/yaml/request_yaml/vLLM_default.yaml b/benchmarks/yaml/request_yaml/request.yaml
similarity index 53%
rename from benchmarks/yaml/request_yaml/vLLM_default.yaml
rename to benchmarks/yaml/request_yaml/request.yaml
index a6385823b5f..9fc603354b6 100644
--- a/benchmarks/yaml/request_yaml/vLLM_default.yaml
+++ b/benchmarks/yaml/request_yaml/request.yaml
@@ -1,11 +1,11 @@
-top_p: 1.0
-temperature: 1.0
-metadata:
-  min_tokens: 1
-max_tokens: 30721
+top_p: 0.8
+temperature: 0.8
+max_tokens: 12288
 repetition_penalty: 1.0
 frequency_penalty: 0
 presence_penalty: 0
-skip_special_tokens: false
+metadata:
+  enable_thinking: false
+  min_tokens: 1
 chat_template_kwargs:
-  enable_thinking: true
+  enable_thinking: false

From 5ac266e28873aecc2ee9b6f02d278eb59888a451 Mon Sep 17 00:00:00 2001
From: ophilia-lee <leebanban@126.com>
Date: Mon, 27 Oct 2025 14:58:02 +0800
Subject: [PATCH 2/3] =?UTF-8?q?benchmark=E5=B7=A5=E5=85=B7=E9=80=82?=
 =?UTF-8?q?=E9=85=8DSGLang=E6=A1=86=E6=9E=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

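---
Reviewer notes (free text below the three-dash line is not part of the commit):
  * backend_request_func.py: the new guard still indexes `data["usage"]`
    directly, so a streamed chunk that omits the "usage" key raises KeyError
    before `output.prompt_len = 0` can be reached (whether that exception is
    caught is outside the visible hunk). The `elif` a few lines further down
    already uses `data.get("usage", {})`. Suggested form:
        if (usage := data.get("usage")) and usage.get("prompt_tokens_details"):
            output.prompt_len = usage["prompt_tokens_details"].get("cached_tokens", 0)
        else:
            output.prompt_len = 0
  * benchmark_serving.py: `input_requests=input_requests` is commented out
    rather than deleted. If `calculate_metrics` no longer takes that
    parameter, remove the line; if it still does, this call changes behaviour
    and the reason belongs in the commit message.
  * The hunk at -157,6 adds a line that contains only spaces, and the `#print`
    edit only removes a space inside a commented-out debug line. Both can be
    dropped from the patch.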
 benchmarks/backend_request_func.py | 14 +++++++++-----
 benchmarks/benchmark_serving.py    |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 837d9df91e9..545e9870e72 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -128,14 +128,14 @@ async def async_request_eb_openai_chat_completions(
 
                         chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
                         if chunk != "[DONE]":
-                            # print("####chunk:", chunk, type(chunk))
+                            #print("####chunk:", chunk, type(chunk))
                             timestamp = time.perf_counter()
                             data = json.loads(chunk)
 
                             if request_id == "None" and "id" in data:
                                 request_id = data["id"]
 
-                            if choices := data.get("choices"):
+                            if (choices := data.get("choices")) and choices[0]["delta"].get("content"):
                                 content = choices[0]["delta"].get("content")
                                 reason_content = choices[0]["delta"].get("reasoning_content")
                                 # First token
@@ -143,9 +143,12 @@ async def async_request_eb_openai_chat_completions(
                                     ttft = timestamp - st
                                     output.ttft = ttft
                                     # cached_tokens
-                                    output.prompt_len = (
-                                        data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
-                                    )
+                                    if data["usage"] and data["usage"].get("prompt_tokens_details", {}):
+                                        output.prompt_len = (
+                                            data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
+                                        )
+                                    else:
+                                        output.prompt_len = 0
 
                                 # Decoding phase
                                 else:
@@ -157,6 +160,7 @@ async def async_request_eb_openai_chat_completions(
                             elif usage := data.get("usage", {}):
                                 output.output_tokens = usage.get("completion_tokens", 0)
                                 output.prompt_tokens = usage.get("prompt_tokens", 0)
+                            
 
                             most_recent_timestamp = timestamp
 
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index ce072555fa7..fb13301c4e4 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -635,7 +635,7 @@ def benchmark_metrics(
     goodput_config_dict = check_goodput_args(args)
 
     metrics, actual_output_lens = calculate_metrics(
-        input_requests=input_requests,
+        # input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
         selected_percentiles=selected_percentiles,

From 14fd833b251472ba3a76e20663e6adc30b90c3b1 Mon Sep 17 00:00:00 2001
From: ophilia-lee <leebanban@126.com>
Date: Mon, 27 Oct 2025 15:09:44 +0800
Subject: [PATCH 3/3] =?UTF-8?q?benchmark=E5=B7=A5=E5=85=B7=E9=80=82?=
 =?UTF-8?q?=E9=85=8DSGLang=E6=A1=86=E6=9E=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

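---
Reviewer notes (free text below the three-dash line is not part of the commit):
  * This restores the `if choices := data.get("choices"):` condition that
    patch 2/3 replaced, so across the whole series the only net change on
    these lines is that an empty line becomes a whitespace-only line.
    Consider dropping the condition change from patch 2/3 and discarding this
    patch, or squashing the two.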
 benchmarks/backend_request_func.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 545e9870e72..596804331f2 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -134,8 +134,8 @@ async def async_request_eb_openai_chat_completions(
 
                             if request_id == "None" and "id" in data:
                                 request_id = data["id"]
-
-                            if (choices := data.get("choices")) and choices[0]["delta"].get("content"):
+                            
+                            if choices := data.get("choices"):
                                 content = choices[0]["delta"].get("content")
                                 reason_content = choices[0]["delta"].get("reasoning_content")
                                 # First token