# Benchmark

1. It should evaluate only the LLM engine speed, not unrelated pre-/post-processing.
2. Metric of choice: tokens per second (TPS), counting both input and output tokens
3. Varied parameters: `batch_size`, `tensor_parallel_size`, `enforce_eager`


In [None]:
import pickle
import time

from text2graph.prompt import PromptHandlerV3
from text2graph.askxdd import get_weaviate_client


import vllm
import os

os.chdir("/root")

from chtc.db import Triplets

In [None]:
# Instantiate the vLLM engine under test. It is created once at notebook level
# and passed to benchmark() for every run. tensor_parallel_size and
# enforce_eager are two of the parameters this benchmark varies, so edit them
# here and re-run the cell to test another configuration.
llm = vllm.LLM(
    model="TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
    dtype="float16",
    tensor_parallel_size=1,
    enforce_eager=True,
)

In [None]:
def get_batch(batch_size: int) -> list[str]:
    """Get a batch of prompts for benchmarking.

    Loads the pickled collection of paragraph ids, takes the first
    ``batch_size`` of them, fetches each paragraph from Weaviate and renders
    it into a Mixtral-instruct prompt.

    Args:
        batch_size: Number of prompts to build. If the id file holds fewer
            ids than requested, the returned batch is correspondingly shorter.

    Returns:
        One formatted prompt string per selected paragraph id, in the order
        the ids appear in the pickle file.

    Raises:
        ValueError: If ``batch_size`` is negative. A negative value would
            otherwise slice "all but the last N" ids and silently produce a
            batch of the wrong size.
    """
    if batch_size < 0:
        raise ValueError(f"batch_size must be non-negative, got {batch_size}")

    weaviate_client = get_weaviate_client()
    prompt_handler = PromptHandlerV3()
    MIXTRAL_TEMPLATE = "[INST] {system} {user} [/INST]"

    # Get ids
    # NOTE: pickle.load can execute arbitrary code; only point this at a
    # trusted, locally generated id file.
    all_ids_pickle = "/root/geoarchive_paragraph_ids.pkl"
    with open(all_ids_pickle, "rb") as f:
        all_ids = pickle.load(f)
    batch_ids = all_ids[:batch_size]

    batch = []
    for paragraph_id in batch_ids:
        paragraph = weaviate_client.data_object.get_by_id(
            paragraph_id, class_name="Paragraph"
        )
        text = paragraph["properties"]["text_content"]
        messages = prompt_handler.get_gpt_messages(text)
        # Assumes messages[0] is the system message and messages[1] the user
        # message -- TODO confirm against PromptHandlerV3.get_gpt_messages.
        prompt = MIXTRAL_TEMPLATE.format(
            system=messages[0]["content"], user=messages[1]["content"]
        )
        batch.append(prompt)
    return batch

In [None]:
def benchmark(batch_size: int, llm: vllm.LLM) -> dict:
    """Time a single ``llm.generate`` call over one batch and report throughput.

    Prompt construction happens before the timer starts, so only the
    ``llm.generate`` call is measured.

    Args:
        batch_size: Number of prompts requested from ``get_batch``.
        llm: The vLLM engine to benchmark.

    Returns:
        A dict with the keys ``total_time`` (seconds spent in
        ``llm.generate``), ``input_tokens`` (sum of prompt token counts),
        ``output_tokens`` (sum of token counts of the first completion of
        each request), ``token_per_second`` (input plus output tokens divided
        by ``total_time``) and ``outputs`` (the raw result of
        ``llm.generate``).
    """
    sampling_params = vllm.SamplingParams(
        temperature=0, max_tokens=2048, stop=["[/INST]", "[INST]"]
    )
    prompts = get_batch(batch_size)

    # Timed section: nothing but the generate call sits between the two clock reads.
    started = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    elapsed = time.perf_counter() - started

    # Throughput counts prompt tokens as well as generated tokens.
    input_tokens = sum(len(request.prompt_token_ids) for request in outputs)
    output_tokens = sum(len(request.outputs[0].token_ids) for request in outputs)

    return {
        "total_time": elapsed,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "token_per_second": (input_tokens + output_tokens) / elapsed,
        "outputs": outputs,
    }

In [None]:
# Run the benchmark once per batch size in the list below and store each
# result dict under its batch size. Edit the list to sweep other batch sizes.
results = {}
for batch_size in [3]:
    results[batch_size] = benchmark(batch_size, llm)

In [None]:
# Print the metrics of every run, leaving out the "outputs" entry (the raw
# generation results) so that only the summary numbers are shown.
for batch_size, r in results.items():
    data = {k: v for k, v in r.items() if k != "outputs"}
    print(f"Batch size: {batch_size}, {data}")

tensor_parallel_size: 2; enforce_eager: False (OOM at batch size 50)

- Batch size: 1, {'total_time': 4.926733806729317, 'input_tokens': 369, 'output_tokens': 232, 'token_per_second': 121.98751212803651}
- Batch size: 3, {'total_time': 15.417909558862448, 'input_tokens': 1122, 'output_tokens': 1098, 'token_per_second': 143.9883916509233}
- Batch size: 5, {'total_time': 22.074966586660594, 'input_tokens': 1897, 'output_tokens': 2134, 'token_per_second': 182.60503290799284}
- Batch size: 10, {'total_time': 26.61369479680434, 'input_tokens': 3807, 'output_tokens': 3494, 'token_per_second': 274.33244634926353}
- Batch size: 20, {'total_time': 43.88761671539396, 'input_tokens': 7865, 'output_tokens': 7980, 'token_per_second': 361.03578152245}

tensor_parallel_size: 1; enforce_eager: True

- Batch size: 1, {'total_time': 12.907626039814204, 'input_tokens': 369, 'output_tokens': 211, 'token_per_second': 44.93467646265562}
- Batch size: 3, {'total_time': 52.06450440501794, 'input_tokens': 1122, 'output_tokens': 1161, 'token_per_second': 43.849452253308414}
- Batch size: 5, {'total_time': 42.428753749933094, 'input_tokens': 1897, 'output_tokens': 2186, 'token_per_second': 96.23190971067442}
- Batch size: 10, {'total_time': 49.54290740704164, 'input_tokens': 3807, 'output_tokens': 3833, 'token_per_second': 154.20976280681722}
- Batch size: 20, {'total_time': 74.78418159391731, 'input_tokens': 7865, 'output_tokens': 6638, 'token_per_second': 193.9313861686978}
- Batch size: 50, {'total_time': 172.57586133107543, 'input_tokens': 20854, 'output_tokens': 21304, 'token_per_second': 244.2867714802979}
- Batch size: 100, {'total_time': 205.62514622602612, 'input_tokens': 40934, 'output_tokens': 40364, 'token_per_second': 395.36993160669203}
- Batch size: 200, {'total_time': 293.219068787992, 'input_tokens': 82229, 'output_tokens': 74914, 'token_per_second': 535.923535428796}

Summary:

- Larger batch sizes increase throughput, as long as they do not cause out-of-memory (OOM) errors.
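- Contrary to expectations, eager mode does not reduce processing speed per GPU: at batch size 20, one GPU in eager mode reaches about 194 TPS, while two GPUs without eager mode reach about 361 TPS in total (roughly 181 TPS per GPU).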
- I will adopt a batch size of 200 with a single GPU in eager mode to optimize queue management, checkpointing, and runtime.