In [1]:
import requests
import json

In [2]:
# LOCAL TEST
# Smoke-test the API on a locally running server (something must be listening on 127.0.0.1:8000).
LOCAL_URL = "http://127.0.0.1:8000"

# requests applies no timeout unless asked, so bound the wait to keep this cell from hanging
# when the server accepts the connection but never answers.
print(requests.get(f"{LOCAL_URL}/api/v1/healthcheck", timeout=10).json())

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=8000): Max retries exceeded with url: /api/v1/healthcheck (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11d662030>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [4]:
# Evaluate
# Request body: the new user message plus the prior conversation turns.
payload = dict(
    user_message="What is the capital of France?",
    chat_history=[
        dict(role="user", content="Hi"),
        dict(role="assistant", content="The capital of Canada is Ottawa"),
    ],
)
# POST to the evaluate endpoint under LOCAL_URL, waiting at most 60 seconds.
resp = requests.post(LOCAL_URL + "/api/v1/evaluate", json=payload, timeout=60)


ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=8000): Max retries exceeded with url: /api/v1/evaluate (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x109a16780>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [5]:
# Parse the body of `resp` as JSON and display its 'reasoning' entry.
evaluation = json.loads(resp.text)
evaluation['reasoning']

NameError: name 'resp' is not defined

In [7]:
# APP TEST
from databricks.sdk.core import Config
import requests

# Build a Databricks SDK config from the "DEFAULT" profile and obtain an OAuth access token from it.
config = Config(profile="DEFAULT")
token = config.oauth_token().access_token

# Shared session: every request sent through it carries the bearer token and JSON headers.
session = requests.Session()
session.headers.update({
    "Authorization": f"Bearer {token}",
    "Accept": "application/json",
    "Content-Type": "application/json",
})

BASE_URL = "https://fastapi-shm-984752964297111.11.azure.databricksapps.com"

# Redirects are not followed, so a redirect response is printed as-is instead of its target
# (NOTE(review): presumably to make login/auth redirects visible - confirm).
# requests applies no timeout unless asked; the explicit one keeps this cell from blocking
# forever if the app never answers.
response = session.get(
    f"{BASE_URL}/api/v1/healthcheck",
    allow_redirects=False,
    timeout=30,
)

print(response.text)

{"status":"OK","timestamp":"2025-08-14T15:23:57.368745+00:00"}


In [4]:
# Query
# Call the evaluate endpoint under BASE_URL through `session` (60 second timeout).
payload = dict(
    user_message="What is the capital of France?",
    chat_history=[dict(role="user", content="Hi")],
)
resp = session.post(BASE_URL + "/api/v1/evaluate", json=payload, timeout=60)
# Last expression of the cell: shows the response object.
resp

<Response [200]>

In [9]:
# High-concurrency load test
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# Target URL and HTTP session read by the load-test helpers in this notebook.
# The target is hard-wired to the remote app; nothing is selected automatically.
BASE, sess = BASE_URL, session

# setdefault only fills in Content-Type when the session does not already define it.
sess.headers.setdefault("Content-Type", "application/json")


def build_payload(request_index: int) -> dict:
    """Return the request body for one evaluate call.

    The user message embeds ``request_index``; the chat history is a single
    fixed user turn. A fresh dict (and fresh history list) is built per call.
    """
    opening_turn = {"role": "user", "content": "Hi"}
    message = f"Ping {request_index}: what is 2 + 2?"
    return dict(user_message=message, chat_history=[opening_turn])


def post_once(session_obj: requests.Session, base_url: str, request_index: int, timeout_seconds: int = 60) -> dict:
    """POST one generated payload to the evaluate endpoint and describe the outcome.

    Args:
        session_obj: Session whose ``post`` method sends the request.
        base_url: Root URL; ``/api/v1/evaluate`` is appended to it.
        request_index: Forwarded to ``build_payload`` to build the body.
        timeout_seconds: Timeout handed to ``session_obj.post``.

    Returns:
        Dict with ``ok`` (True only for HTTP 200), ``status`` (the status code,
        or None when the call raised), ``elapsed`` (seconds measured with
        ``time.perf_counter``) and ``error`` (None on 200, the response text
        for any other status, or the stringified exception).
    """
    body = build_payload(request_index)
    t0 = time.perf_counter()
    try:
        reply = session_obj.post(f"{base_url}/api/v1/evaluate", json=body, timeout=timeout_seconds)
        took = time.perf_counter() - t0
        code = reply.status_code
        if code == 200:
            return {"ok": True, "status": code, "elapsed": took, "error": None}
        return {"ok": False, "status": code, "elapsed": took, "error": reply.text}
    except Exception as exc:
        # Deliberately broad: any failure is reported as a failed result rather than raised,
        # so one bad request does not abort a whole test run.
        return {"ok": False, "status": None, "elapsed": time.perf_counter() - t0, "error": str(exc)}


def percentile(values, p):
    """Pick the ``p``-th percentile of ``values`` by rounding to the nearest rank.

    A sorted copy of ``values`` is indexed at ``round(p / 100 * (n - 1))``,
    with the index clamped into ``[0, n - 1]`` so out-of-range ``p`` maps to
    the smallest or largest element. Returns None when ``values`` is empty.
    """
    if not values:
        return None
    ordered = sorted(values)
    last = len(ordered) - 1
    rank = int(round((p / 100.0) * last))
    if rank < 0:
        rank = 0
    elif rank > last:
        rank = last
    return ordered[rank]


def run_concurrent_requests(total_requests: int = 100, max_workers: int = 20, timeout_seconds: int = 60):
    """Issue ``total_requests`` evaluate calls through a thread pool and print a summary.

    Every call goes through ``post_once`` using the module-level ``sess`` and
    ``BASE``. The summary reports throughput, success/failure counts, latency
    percentiles of the successful calls, and a preview of the first failure.

    NOTE(review): the same ``sess`` object is handed to every worker thread;
    confirm that sharing one session across threads is acceptable here.

    Args:
        total_requests: Number of requests to submit.
        max_workers: Thread-pool size, i.e. the maximum number of in-flight requests.
        timeout_seconds: Per-request timeout forwarded to ``post_once``.

    Returns:
        Dict with ``results`` (per-request dicts in completion order),
        ``duration`` (wall-clock seconds for the whole run) and ``rps``
        (results per second).
    """
    start = time.perf_counter()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(post_once, sess, BASE, i, timeout_seconds) for i in range(total_requests)]
        # as_completed yields futures as they finish, so results are in completion order.
        results = [fut.result() for fut in as_completed(futures)]
    duration = time.perf_counter() - start

    successes = [r for r in results if r["ok"]]
    failures = [r for r in results if not r["ok"]]
    # Latency statistics only consider successful requests.
    latencies = [r["elapsed"] for r in successes]

    rps = len(results) / duration if duration > 0 else float("inf")
    # With no results (total_requests <= 0) the ratio is undefined; report 0.0 instead of
    # dividing by zero.
    success_rate = len(successes) / len(results) * 100 if results else 0.0

    print(f"Target: {BASE}")
    print(f"Total: {len(results)} in {duration:.2f}s  -> {rps:.2f} req/s")
    print(f"Success: {len(successes)}  Failures: {len(failures)}  Success rate: {success_rate:.1f}%")
    if latencies:
        print(
            f"Latency p50: {percentile(latencies,50):.3f}s  p90: {percentile(latencies,90):.3f}s  p95: {percentile(latencies,95):.3f}s  p99: {percentile(latencies,99):.3f}s  max: {max(latencies):.3f}s"
        )
    if failures:
        sample = failures[0]
        # Error text can be a long response body; only show the first 200 characters.
        preview = (sample["error"] or "")[:200]
        print(f"Sample failure -> status={sample['status']} error={preview}")

    return {"results": results, "duration": duration, "rps": rps}

# Load-test knobs: raise TOTAL_REQUESTS for a heavier run; MAX_WORKERS is the thread-pool size.
TOTAL_REQUESTS, MAX_WORKERS = 20, 15
# The returned dict is discarded; the function prints its own summary.
_ = run_concurrent_requests(TOTAL_REQUESTS, MAX_WORKERS, timeout_seconds=60)




Target: https://fastapi-shm-984752964297111.11.azure.databricksapps.com
Total: 20 in 6.76s  -> 2.96 req/s
Success: 17  Failures: 3  Success rate: 85.0%
Latency p50: 3.874s  p90: 4.837s  p95: 5.507s  p99: 6.646s  max: 6.646s
Sample failure -> status=None error=('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


In [6]:
# Scheduled request: one query every 30 seconds for 10 minutes
import time


def run_scheduled_requests(interval_seconds: int = 30, total_duration_seconds: int = 600, timeout_seconds: int = 60):
    """Send evaluate requests one at a time on a fixed cadence, printing each outcome.

    The number of requests is ``total_duration_seconds // interval_seconds``
    (at least one). Each request goes through ``post_once`` with the
    module-level ``sess`` and ``BASE``; after each one the loop sleeps until
    the next scheduled slot. A latency summary of the successful requests is
    printed at the end.

    Args:
        interval_seconds: Spacing between the scheduled start times.
        total_duration_seconds: Window used to derive the request count.
        timeout_seconds: Per-request timeout forwarded to ``post_once``.

    Returns:
        List of the per-request result dicts, in send order.
    """
    total = max(1, int(total_duration_seconds // interval_seconds))
    deadline = time.perf_counter()
    outcomes = []
    for idx in range(total):
        # Deadlines advance by a fixed step from the start time, so request latency
        # does not accumulate as schedule drift.
        deadline += interval_seconds
        outcome = post_once(sess, BASE, idx, timeout_seconds)
        outcomes.append(outcome)
        shown_status = "ERR" if outcome["status"] is None else outcome["status"]
        print(f"[{idx+1}/{total}] status={shown_status} ok={outcome['ok']} latency={outcome['elapsed']:.3f}s")
        # If the request overran its slot there is nothing left to wait for.
        remaining = deadline - time.perf_counter()
        time.sleep(remaining if remaining > 0.0 else 0.0)

    ok_latencies = [o["elapsed"] for o in outcomes if o["ok"]]
    if ok_latencies:
        print(
            f"Latency p50: {percentile(ok_latencies,50):.3f}s  p95: {percentile(ok_latencies,95):.3f}s  max: {max(ok_latencies):.3f}s"
        )
    print(f"Done. Sent {len(outcomes)} requests over ~{total_duration_seconds}s to {BASE}")
    return outcomes

# Cadence for the scheduled run: one request every 30 seconds across a 10-minute window.
INTERVAL_SECONDS, TOTAL_DURATION_SECONDS = 30, 600
# Keep the per-request result dicts for later inspection.
scheduled_results = run_scheduled_requests(INTERVAL_SECONDS, TOTAL_DURATION_SECONDS, timeout_seconds=60)


[1/20] status=200 ok=True latency=1.817s
[2/20] status=200 ok=True latency=1.460s
[3/20] status=200 ok=True latency=2.968s
[4/20] status=200 ok=True latency=1.622s
[5/20] status=200 ok=True latency=2.587s
[6/20] status=200 ok=True latency=1.938s
[7/20] status=200 ok=True latency=1.572s
[8/20] status=200 ok=True latency=1.352s
[9/20] status=200 ok=True latency=1.557s
[10/20] status=200 ok=True latency=2.233s
[11/20] status=200 ok=True latency=1.852s
[12/20] status=200 ok=True latency=1.555s
[13/20] status=200 ok=True latency=2.165s
[14/20] status=200 ok=True latency=1.573s
[15/20] status=200 ok=True latency=1.623s
[16/20] status=200 ok=True latency=1.437s
[17/20] status=200 ok=True latency=28.983s
[18/20] status=200 ok=True latency=1.675s
[19/20] status=200 ok=True latency=28.238s
[20/20] status=200 ok=True latency=6.548s
Latency p50: 1.817s  p95: 28.238s  max: 28.983s
Done. Sent 20 requests over ~600s to https://fastapi-shm-984752964297111.11.azure.databricksapps.com
