diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 837d9df91e9..596804331f2 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -128,13 +128,13 @@ async def async_request_eb_openai_chat_completions(
                     chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
 
                     if chunk != "[DONE]":
-                        # print("####chunk:", chunk, type(chunk))
+                        #print("####chunk:", chunk, type(chunk))
                         timestamp = time.perf_counter()
                         data = json.loads(chunk)
 
                         if request_id == "None" and "id" in data:
                             request_id = data["id"]
-
+
                         if choices := data.get("choices"):
                             content = choices[0]["delta"].get("content")
                             reason_content = choices[0]["delta"].get("reasoning_content")
@@ -143,9 +143,12 @@ async def async_request_eb_openai_chat_completions(
                                 ttft = timestamp - st
                                 output.ttft = ttft
                                 # cached_tokens
-                                output.prompt_len = (
-                                    data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
-                                )
+                                if data["usage"] and data["usage"].get("prompt_tokens_details", {}):
+                                    output.prompt_len = (
+                                        data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
+                                    )
+                                else:
+                                    output.prompt_len = 0
 
                             # Decoding phase
                             else:
@@ -157,6 +160,7 @@ async def async_request_eb_openai_chat_completions(
                         elif usage := data.get("usage", {}):
                             output.output_tokens = usage.get("completion_tokens", 0)
                             output.prompt_tokens = usage.get("prompt_tokens", 0)
+                            most_recent_timestamp = timestamp
 
 
 
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index ce072555fa7..fb13301c4e4 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -635,7 +635,7 @@ def benchmark_metrics(
     goodput_config_dict = check_goodput_args(args)
 
     metrics, actual_output_lens = calculate_metrics(
-        input_requests=input_requests,
+        # input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
         selected_percentiles=selected_percentiles,
diff --git a/benchmarks/yaml/request_yaml/vLLM_default.yaml b/benchmarks/yaml/request_yaml/request.yaml
similarity index 53%
rename from benchmarks/yaml/request_yaml/vLLM_default.yaml
rename to benchmarks/yaml/request_yaml/request.yaml
index a6385823b5f..9fc603354b6 100644
--- a/benchmarks/yaml/request_yaml/vLLM_default.yaml
+++ b/benchmarks/yaml/request_yaml/request.yaml
@@ -1,11 +1,11 @@
-top_p: 1.0
-temperature: 1.0
-metadata:
-  min_tokens: 1
-max_tokens: 30721
+top_p: 0.8
+temperature: 0.8
+max_tokens: 12288
 repetition_penalty: 1.0
 frequency_penalty: 0
 presence_penalty: 0
-skip_special_tokens: false
+metadata:
+  enable_thinking: false
+  min_tokens: 1
 chat_template_kwargs:
-  enable_thinking: true
+  enable_thinking: false