In [1]:
import time

import httpx
from faker import Faker

In [14]:
# Seed Faker so every run benchmarks the same texts; without a seed the generated
# texts (and therefore the timings) change on each kernel restart.
Faker.seed(0)
fake = Faker()
# 100 random texts, each capped at roughly 1000 characters.
inputs = [fake.text(max_nb_chars=1000) for _ in range(100)]

### Benchmark in sequential mode

In [None]:
def benchmark_ollama(
    httpx_client: httpx.Client,
    inputs: list[str],
    model: str = "snowflake-arctic-embed2:latest",
) -> list[list[float]]:
    """Embed each text with Ollama, one request per text, and print the mean latency.

    Args:
        httpx_client: Client used to POST to the relative route ``api/embed``;
            it is expected to carry the Ollama server's base URL.
        inputs: Texts to embed. Each text is sent in its own request.
        model: Model name placed in the ``"model"`` field of the request payload.

    Returns:
        One embedding per input, in input order: the first entry of the
        ``"embeddings"`` field of each JSON response. Empty if ``inputs`` is empty.

    Raises:
        httpx.HTTPStatusError: If a response reports an error status.
    """
    route = "api/embed"
    durations = []
    outputs = []
    for text in inputs:  # named `text` so the builtin `input` is not shadowed
        start = time.perf_counter()
        response = httpx_client.post(route, json={"model": model, "input": text})
        response.raise_for_status()
        end = time.perf_counter()
        durations.append(end - start)
        # JSON decoding happens after `end`, so it is excluded from the timing.
        outputs.append(response.json()["embeddings"][0])

    n = len(durations)
    if n == 0:
        # No request was sent; skip the average to avoid dividing by zero.
        print("No inputs given; no requests were sent")
        return outputs
    print(f"Average response time for {n} requests: {sum(durations) / n:.2f} seconds")
    return outputs


# Use the client as a context manager so its connections are released once the
# sequential Ollama benchmark has finished.
with httpx.Client(base_url="http://olvi-1:11434") as client:
    ollama_embeddings = benchmark_ollama(client, inputs=inputs)

Average response time for 100 requests: 0.18 seconds


In [None]:
def benchmark_tei(
    httpx_client: httpx.Client,
    inputs: list[str],
) -> list[list[float]]:
    """Embed each text with TEI, one request per text, and print the mean latency.

    Args:
        httpx_client: Client used to POST to the relative route ``embed``;
            it is expected to carry the TEI server's base URL.
        inputs: Texts to embed. Each text is sent in its own request.

    Returns:
        One embedding per input, in input order: the first element of each
        JSON response. Empty if ``inputs`` is empty.

    Raises:
        httpx.HTTPStatusError: If a response reports an error status.
    """
    route = "embed"
    durations = []
    outputs = []
    for text in inputs:  # named `text` so the builtin `input` is not shadowed
        start = time.perf_counter()
        response = httpx_client.post(route, json={"inputs": text})
        response.raise_for_status()
        end = time.perf_counter()
        durations.append(end - start)
        # JSON decoding happens after `end`, so it is excluded from the timing.
        outputs.append(response.json()[0])

    n = len(durations)
    if n == 0:
        # No request was sent; skip the average to avoid dividing by zero.
        print("No inputs given; no requests were sent")
        return outputs
    print(f"Average response time for {n} requests: {sum(durations) / n:.2f} seconds")
    return outputs


# Use the client as a context manager so its connections are released once the
# sequential TEI benchmark has finished.
with httpx.Client(base_url="http://olvi-1:8000") as client:
    tei_embeddings = benchmark_tei(client, inputs=inputs)

Average response time for 100 requests: 0.01 seconds


### Benchmark in batch mode

In [None]:
# Send all texts to Ollama in a single request and time the round trip.
start = time.perf_counter()
response = httpx.post(
    "http://olvi-1:11434/api/embed",
    json={"model": "snowflake-arctic-embed2:latest", "input": inputs},
)
end = time.perf_counter()
# Fail loudly instead of reporting the latency of an error response.
response.raise_for_status()
print(f"Time taken for 100 requests in batch mode: {end - start:.2f} seconds")

Time taken for 100 requests in batch mode: 2.36 seconds


In [42]:
# Send all texts to TEI in a single request and time the round trip.
start = time.perf_counter()
response = httpx.post("http://olvi-1:8000/embed", json={"inputs": inputs})
end = time.perf_counter()
# Fail loudly instead of reporting the latency of an error response.
response.raise_for_status()
print(f"Time taken for 100 requests in batch mode: {end - start:.2f} seconds")

Time taken for 100 requests in batch mode: 1.33 seconds


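TL;DR: just use TEI. It is a free speed-up of roughly 10x over Ollama for sequential requests (0.01 s vs. 0.18 s on average) and is still about 1.8x faster in batch mode (1.33 s vs. 2.36 s).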

### Infinity

In [None]:
# Send all texts to Infinity in a single request and time the round trip.
start = time.perf_counter()
response = httpx.post(
    "http://olvi-1:7997/embeddings",
    json={"model": "intfloat/multilingual-e5-large-instruct", "input": inputs},
)
end = time.perf_counter()
# Fail loudly instead of reporting the latency of an error response.
response.raise_for_status()
print(f"Time taken for 100 requests in batch mode: {end - start:.2f} seconds")


Time taken for 100 requests in batch mode: 1.63 seconds


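Infinity's batch speed is in the same ballpark as TEI's (1.63 s vs. 1.33 s), but TEI is more popular. Note that this run uses `intfloat/multilingual-e5-large-instruct`, whereas the Ollama runs use `snowflake-arctic-embed2`, so the comparison is not strictly like-for-like.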