In [74]:
import json

FILE_PATH = "/content/latency_only_question.jsonl"

# Parse the JSON Lines file: one JSON object per non-empty line.
# Blank lines (for example a trailing newline at the end of the file) are
# skipped instead of being handed to json.loads, which would raise on them.
# The encoding is pinned so the result does not depend on the platform default.
records = []
with open(FILE_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        records.append(json.loads(line))

print(f"Total records: {len(records)}")

# Preview the first few records so the schema is visible in the notebook.
print("\nFirst 5 records:\n")
for i, record in enumerate(records[:5], start=1):
    print(f"Record {i}:")
    print(json.dumps(record, indent=2))
    print("-" * 50)


Total records: 226

First 5 records:

Record 1:
{
  "role": "employee",
  "question": "How much monthly food allowance do junior employees receive?"
}
--------------------------------------------------
Record 2:
{
  "role": "employee",
  "question": "Is meal card allowance tax exempt?"
}
--------------------------------------------------
Record 3:
{
  "role": "employee",
  "question": "What is the internet reimbursement limit for work from home?"
}
--------------------------------------------------
Record 4:
{
  "role": "employee",
  "question": "Does health insurance cover parents?"
}
--------------------------------------------------
Record 5:
{
  "role": "employee",
  "question": "What expenses are NOT covered under LTA?"
}
--------------------------------------------------


In [65]:
import requests
import json
import time
import numpy as np

# Endpoints of the service under test.
LOGIN_URL = "https://9ad932942d16.ngrok-free.app/login"
ASK_URL   = "https://9ad932942d16.ngrok-free.app/ask"

# Login payloads, keyed by role; posted as JSON to LOGIN_URL.
# NOTE(review): credentials are hard-coded in the notebook. Move them to
# environment variables or a secrets store before sharing or committing.
ACCOUNTS = {
    "employee": {"email": "user986@company.com", "password": "user986pass"},
    "manager":  {"email": "user978@company.com", "password": "user978pass"},
    "hr":       {"email": "user975@company.com", "password": "user975pass"},
}

# Number of consecutive questions sent under each role before the benchmark
# logs in as the next role in ROLE_ORDER (139 + 55 + 32 = 226 questions).
ROLE_LIMITS = {
    "employee": 139,
    "manager": 55,
    "hr": 32
}
ROLE_ORDER = ["employee", "manager", "hr"]

# The benchmark pauses COHERE_SLEEP_SECONDS after every COHERE_CALL_LIMIT
# successful /ask calls.
# NOTE(review): presumably this respects a Cohere rate limit - confirm.
COHERE_CALL_LIMIT = 10
COHERE_SLEEP_SECONDS = 60
cohere_call_count = 0

# Unit prices used to turn the usage counters returned by /ask into a cost:
# per 1,000 tokens for the embedding and LLM entries, per call for the reranker.
PRICING = {
    "embedding_per_1k": 0.00002,
    "llm_input_per_1k": 0.0005,
    "llm_output_per_1k": 0.0015,
    "reranker_per_call": 0.001
}

EVAL_FILE = "/content/latency_only_question.jsonl"

# Load the evaluation questions (one JSON object per non-empty line).
# The context manager closes the file handle, and blank lines are skipped so a
# trailing newline does not crash json.loads.
with open(EVAL_FILE, encoding="utf-8") as eval_file:
    rows = [json.loads(line) for line in eval_file if line.strip()]

def login(role, timeout=60):
    """Log in with the account configured for ``role`` and return its token.

    Args:
        role: Key into ``ACCOUNTS`` selecting the credentials that are posted
            as JSON to ``LOGIN_URL``.
        timeout: Seconds to wait for the login endpoint. Without a timeout,
            ``requests`` waits indefinitely, so a stalled tunnel would hang
            the whole notebook.

    Returns:
        The value of ``"access_token"`` in the JSON response.

    Raises:
        KeyError: If ``role`` is not in ``ACCOUNTS`` or the response carries
            no ``"access_token"`` field.
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer within ``timeout``.
    """
    r = requests.post(LOGIN_URL, json=ACCOUNTS[role], timeout=timeout)
    r.raise_for_status()
    return r.json()["access_token"]

# Latency samples in seconds, one entry per successful /ask response.
lat_total = []
lat_embed = []
lat_ret = []
lat_rerank = []
lat_llm = []

# Cost samples, one entry per successful /ask response.
cost_total, cost_embed, cost_llm_in, cost_llm_out, cost_rerank = (
    [], [], [], [], []
)

# Role rotation state: start with the first role and count how many
# questions have been asked under it so far.
role_counter = 0
current_role_idx = 0
current_role = ROLE_ORDER[current_role_idx]

# Authenticate once up front; every /ask request reuses this bearer header
# until the role changes.
token = login(current_role)
headers = {"Authorization": "Bearer {}".format(token)}

def report(arr, ndigits=2):
    """Return (P50, P95, P99, mean) of ``arr``, rounded to ``ndigits`` places.

    An empty ``arr`` yields four NaNs instead of a NumPy error or warning, so
    the final report can still be printed when no sample was recorded.
    """
    if len(arr) == 0:
        nan = float("nan")
        return (nan, nan, nan, nan)
    p50, p95, p99 = np.percentile(arr, [50, 95, 99])
    return tuple(
        round(float(value), ndigits)
        for value in (p50, p95, p99, np.mean(arr))
    )


def _print_stats(title, arr, ndigits=2):
    """Print one titled P50/P95/P99/AVG section with ``ndigits`` decimals."""
    p50, p95, p99, avg = report(arr, ndigits)
    print(f"\n{title}")
    print(f"P50: {p50:.{ndigits}f}")
    print(f"P95: {p95:.{ndigits}f}")
    print(f"P99: {p99:.{ndigits}f}")
    print(f"AVG: {avg:.{ndigits}f}")


def _mean_or_nan(arr):
    """Mean of ``arr``, or NaN when ``arr`` is empty."""
    return float(np.mean(arr)) if len(arr) else float("nan")


# Reason the run stopped early; stays None when the loop completes normally.
run_error = None

try:
    for row in rows:

        # Pause after every COHERE_CALL_LIMIT successful calls.
        if cohere_call_count > 0 and cohere_call_count % COHERE_CALL_LIMIT == 0:
            time.sleep(COHERE_SLEEP_SECONDS)

        r = requests.post(
            ASK_URL,
            json={"question": row["question"]},
            headers=headers,
            timeout=60
        )

        if r.status_code != 200:
            # Stop at the first failed request, but record why instead of
            # breaking out silently.
            run_error = (
                f"/ask returned HTTP {r.status_code} for question "
                f"{row['question']!r}: {r.text[:200]}"
            )
            break

        data = r.json()
        lat = data["latency"]
        usage = data["usage"]
        print(f"ROLE={current_role} | Total={lat['total']}s")
        lat_total.append(lat["total"])
        lat_embed.append(lat["embedding"])
        lat_ret.append(lat["retrieval"])
        lat_rerank.append(lat["reranker"])
        lat_llm.append(lat["llm"])

        # Convert the usage counters into a cost using the PRICING table.
        embed_cost = usage["embedding_tokens"] / 1000 * PRICING["embedding_per_1k"]
        llm_in_cost = usage["llm_input_tokens"] / 1000 * PRICING["llm_input_per_1k"]
        llm_out_cost = usage["llm_output_tokens"] / 1000 * PRICING["llm_output_per_1k"]
        rerank_cost = usage["reranker_calls"] * PRICING["reranker_per_call"]

        cost_total.append(embed_cost + llm_in_cost + llm_out_cost + rerank_cost)
        cost_embed.append(embed_cost)
        cost_llm_in.append(llm_in_cost)
        cost_llm_out.append(llm_out_cost)
        cost_rerank.append(rerank_cost)

        cohere_call_count += 1
        role_counter += 1

        # Once the current role's quota is used up, log in as the next role,
        # or stop when every role has been exercised.
        if role_counter == ROLE_LIMITS[current_role]:
            role_counter = 0
            current_role_idx += 1
            if current_role_idx >= len(ROLE_ORDER):
                break
            current_role = ROLE_ORDER[current_role_idx]
            token = login(current_role)
            headers = {"Authorization": f"Bearer {token}"}

except Exception as exc:
    # Best effort: keep the partial results and still print the report, but
    # surface the failure instead of discarding it.
    run_error = f"{type(exc).__name__}: {exc}"

finally:
    print("=" * 60)
    print("ðŸš€ FINAL METRICS (LATENCY + COST)")
    print("=" * 60)

    for section_title, samples in (
        ("TOTAL LATENCY (s)", lat_total),
        ("EMBEDDING LATENCY (s)", lat_embed),
        ("RETRIEVAL LATENCY (s)", lat_ret),
        ("RERANKER LATENCY (s)", lat_rerank),
        ("LLM LATENCY (s)", lat_llm),
    ):
        _print_stats(section_title, samples)

    print("\n" + "-" * 60)

    # Per-query costs are fractions of a cent; two decimals would print 0.0.
    _print_stats("COST PER QUERY (USD)", cost_total, ndigits=5)

    print("\nCOST BREAKDOWN (AVG PER QUERY)")
    print(f"Embedding:   ${_mean_or_nan(cost_embed):.5f}")
    print(f"LLM Input:   ${_mean_or_nan(cost_llm_in):.5f}")
    print(f"LLM Output:  ${_mean_or_nan(cost_llm_out):.5f}")
    print(f"Reranker:    ${_mean_or_nan(cost_rerank):.5f}")

    print("\n" + "=" * 60)
    if run_error is None:
        print("âœ… Benchmark finished safely")
    else:
        print(
            f"Benchmark stopped early after {len(lat_total)} successful "
            f"queries: {run_error}"
        )


ROLE=employee | Total=2.886s
ROLE=employee | Total=2.168s
ROLE=employee | Total=1.658s
ROLE=employee | Total=1.605s
ROLE=employee | Total=4.045s
ROLE=employee | Total=1.893s
ROLE=employee | Total=1.015s
ROLE=employee | Total=1.77s
ROLE=employee | Total=1.615s
ROLE=employee | Total=1.234s
ROLE=employee | Total=1.542s
ROLE=employee | Total=1.414s
ROLE=employee | Total=1.325s
ROLE=employee | Total=1.719s
ROLE=employee | Total=1.681s
ROLE=employee | Total=2.199s
ROLE=employee | Total=1.281s
ROLE=employee | Total=2.03s
ROLE=employee | Total=1.104s
ROLE=employee | Total=1.11s
ROLE=employee | Total=1.574s
ROLE=employee | Total=1.359s
ROLE=employee | Total=1.24s
ROLE=employee | Total=1.364s
ROLE=employee | Total=1.068s
ROLE=employee | Total=1.148s
ROLE=employee | Total=1.224s
ROLE=employee | Total=1.053s
ROLE=employee | Total=1.706s
ROLE=employee | Total=1.828s
ROLE=employee | Total=2.132s
ROLE=employee | Total=1.523s
ROLE=employee | Total=11.428s
ROLE=employee | Total=1.868s
ROLE=employee | T

In [71]:
import numpy as np

def latency_stats(arr):
    """Return ``(P50, P95, P99, mean)`` for a sequence of latency samples.

    Args:
        arr: Sequence of numeric samples.

    Returns:
        A 4-tuple with the 50th, 95th and 99th percentiles and the mean.
        When ``arr`` is empty all four values are NaN, so callers can still
        format and print a report if no request succeeded.
    """
    if len(arr) == 0:
        nan = float("nan")
        return (nan, nan, nan, nan)
    return (
        np.percentile(arr, 50),
        np.percentile(arr, 95),
        np.percentile(arr, 99),
        np.mean(arr)
    )

def cost_stats(arr):
    """Return ``(P50, P95, P99, mean)`` for a sequence of per-query costs.

    Args:
        arr: Sequence of numeric samples.

    Returns:
        A 4-tuple with the 50th, 95th and 99th percentiles and the mean.
        When ``arr`` is empty all four values are NaN, so callers can still
        format and print a report if no request succeeded.
    """
    if len(arr) == 0:
        nan = float("nan")
        return (nan, nan, nan, nan)
    return (
        np.percentile(arr, 50),
        np.percentile(arr, 95),
        np.percentile(arr, 99),
        np.mean(arr)
    )

def print_latency(title, arr):
    """Print ``title`` followed by the P50/P95/P99/AVG of ``arr``.

    Values come from ``latency_stats`` and are shown with two decimals; the
    AVG line is followed by a blank line.
    """
    stats = latency_stats(arr)
    print(title)
    for label, value in zip(("P50", "P95", "P99"), stats):
        print(f"{label}: {value:.2f}")
    print(f"AVG: {stats[3]:.2f}\n")

def print_cost(title, arr):
    """Print ``title`` followed by the P50/P95/P99/AVG of ``arr``.

    Values come from ``cost_stats`` and are shown with five decimals; the
    AVG line is followed by a blank line.
    """
    stats = cost_stats(arr)
    print(title)
    for label, value in zip(("P50", "P95", "P99"), stats):
        print(f"{label}: {value:.5f}")
    print(f"AVG: {stats[3]:.5f}\n")

# Banner for the final report.
banner_rule = "=" * 60
print(banner_rule)
print("ðŸš€ FINAL METRICS (LATENCY + COST)")
print(banner_rule)

# One latency section per pipeline stage, in the order they are reported.
for section_title, samples in (
    ("TOTAL LATENCY (s)", lat_total),
    ("EMBEDDING LATENCY (s)", lat_embed),
    ("RETRIEVAL LATENCY (s)", lat_ret),
    ("RERANKER LATENCY (s)", lat_rerank),
    ("LLM LATENCY (s)", lat_llm),
):
    print_latency(section_title, samples)

print("-" * 60)

print_cost("COST PER QUERY (USD)", cost_total)

# Average cost contributed by each component of a query.
print("COST BREAKDOWN (AVG PER QUERY)")
for label, samples in (
    ("Embedding:   ", cost_embed),
    ("LLM Input:   ", cost_llm_in),
    ("LLM Output:  ", cost_llm_out),
    ("Reranker:    ", cost_rerank),
):
    print(f"{label}${np.mean(samples):.5f}")


ðŸš€ FINAL METRICS (LATENCY + COST)
TOTAL LATENCY (s)
P50: 1.52
P95: 2.52
P99: 3.96
AVG: 1.69

EMBEDDING LATENCY (s)
P50: 0.23
P95: 0.62
P99: 2.03
AVG: 0.32

RETRIEVAL LATENCY (s)
P50: 0.14
P95: 0.46
P99: 0.50
AVG: 0.18

RERANKER LATENCY (s)
P50: 0.09
P95: 0.31
P99: 1.04
AVG: 0.13

LLM LATENCY (s)
P50: 0.74
P95: 1.31
P99: 1.65
AVG: 0.88

------------------------------------------------------------
COST PER QUERY (USD)
P50: 0.00147
P95: 0.00169
P99: 0.00174
AVG: 0.00146

COST BREAKDOWN (AVG PER QUERY)
Embedding:   $0.00000
LLM Input:   $0.00039
LLM Output:  $0.00007
Reranker:    $0.00100
