# Sentence Transformer Benchmark

This notebook benchmarks a deployed Sentence Transformer service by sending concurrent requests and measuring latency statistics.

## Prerequisites

- A Sentence Transformer service must be deployed and accessible
- A valid PAT (Personal Access Token) for authentication

## Running the Notebook

1. **Update the service configuration**:
   - Set `pat_token` to your Snowflake PAT token
   - Set `SERVICE_ENDPOINT` to the hostname of your deployed service (e.g., `abc123.snowflakecomputing.app`); the request `URL` is built from it
2. **Modify the benchmark configs** if needed (clients, requests per client, duration)
3. **Run the cells in order** to execute the benchmarks

## Benchmark Configurations

Test configs are specified as a list of dictionaries with the following fields:
- `clients`: Number of concurrent clients
- `requests`: Number of sentences per request
- `duration_seconds`: How long to keep repeating the test, in seconds (optional; when omitted, the configuration is run once)

Example:
```python
configs = [
  {"clients": 10, "requests": 1, "duration_seconds": 20},
  {"clients": 50, "requests": 100, "duration_seconds": 20},
]
```

Each configuration runs for the specified duration and reports latency statistics (min, median, max, p90, p95, p99).


In [None]:
# Install required packages
%pip install requests pandas aiohttp nest-asyncio huggingface-hub


In [None]:
import pandas as pd

# Preview the benchmark dataset: read the philosopher-quotes CSV into a
# DataFrame and show its first 10 rows.
# NOTE(review): the hf:// path is presumably resolved through the
# huggingface-hub package installed in the previous cell — confirm.
# dataset_path is reused later by load_sentence_data().
dataset_path = "hf://datasets/datastax/philosopher-quotes/philosopher-quotes.csv"

df = pd.read_csv(dataset_path)
# Last expression of the cell, so the notebook renders it as the cell output.
df.head(10)


In [51]:
# Imports
import pandas as pd
import requests
import os
from datetime import datetime
import json
import nest_asyncio
nest_asyncio.apply()
import numpy as np
import asyncio
import time
import aiohttp
import statistics

print("Imports loaded")


In [52]:
# Load sentence data from the dataset CSV
def load_sentence_data():
    """Return the "quote" column of the dataset as a plain Python list.

    The CSV is read from the module-level ``dataset_path``.
    """
    return pd.read_csv(dataset_path)["quote"].tolist()

# Read the sentences once; the request builders index into this list
sentences = load_sentence_data()
print(f"Loaded {len(sentences)} sentences from CSV")
print(f"Sample sentences: {sentences[:3]}")


In [None]:
# Sentence Transformer service configuration
pat_token = ""  # TODO: Set your Snowflake PAT here
SERVICE_ENDPOINT = ""  # TODO: Set your service endpoint (e.g., abc123.snowflakecomputing.app)

# All benchmark requests are POSTed to the /encode route of the configured host
URL = f"https://{SERVICE_ENDPOINT}/encode"

# Headers shared by every request: PAT-based authorization plus JSON content type
headers = {
    "Authorization": f'Snowflake Token="{pat_token}"',
    "Content-Type": "application/json",
}

print("Sentence Transformer Service Configuration:")
print(f"  URL: {URL}")
print("  Headers: Authorization with Snowflake Token")


In [None]:
# Sentence Transformer invoke_endpoint function
async def invoke_endpoint(session, requests):
    """Send one POST request to the service and record its outcome.

    Args:
        session: Open ``aiohttp.ClientSession`` used to issue the request.
        requests: Number of sentences to put into this single request.
            The name shadows the ``requests`` module inside this function;
            it is kept unchanged for compatibility with existing callers.

    Returns:
        The response body as text, or the exception message when the request
        failed before a response body could be read.

    Side effects:
        Appends the round-trip latency in milliseconds to the global
        ``latencies`` list, but only for HTTP 200 responses whose JSON body
        holds a list under ``"data"``. Increments the global ``http_status``
        counter for the received status code; failures raised while sending
        or reading (timeouts, connection errors, ...) are counted under 500.
    """
    # Payload rows are [index, sentence]; wrap around the dataset when more
    # rows are requested than there are sentences.
    data_array = [[i, sentences[i % len(sentences)]] for i in range(requests)]
    client_data = {"data": data_array}

    # aiohttp expects a ClientTimeout object; a bare number is deprecated.
    request_timeout = aiohttp.ClientTimeout(total=300)

    # Start timing after data preparation
    start_time = time.monotonic()
    try:
        async with session.post(URL, headers=headers, json=client_data, timeout=request_timeout) as response:
            # Read response content
            resp = await response.text()

            # Only measure successful requests with valid responses
            if response.status == 200:
                try:
                    resp_data = json.loads(resp)
                except json.JSONDecodeError:
                    print(f"Invalid JSON response: {resp[:200]}")
                else:
                    if "data" in resp_data and isinstance(resp_data["data"], list):
                        # Calculate latency only for successful, valid responses
                        latency = (time.monotonic() - start_time) * 1000
                        latencies.append(latency)
                    else:
                        print(f"Invalid response structure: {resp[:200]}")
            else:
                print(f"Error response (HTTP {response.status}): {resp[:200]}")

            http_status[response.status] = http_status.get(response.status, 0) + 1
            return resp
    except Exception as e:
        # Best-effort benchmark: a failed request must not abort the whole
        # gather(), so it is logged and tallied as a 500 instead.
        print(f"Request failed: {str(e)}")
        http_status[500] = http_status.get(500, 0) + 1
        return str(e)

print("Sentence Transformer invoke_endpoint function loaded with accurate latency measurement")


In [None]:
# Main benchmark function
async def main(num_clients, num_requests, print_results=False, show_responses=False):
    """Fire one wave of concurrent requests and optionally print a summary.

    Args:
        num_clients: Number of requests issued concurrently.
        num_requests: Number of sentences carried by each request.
        print_results: When True, print latency statistics, the HTTP status
            summary and the throughput.
        show_responses: When True (and ``print_results`` is also True), print
            every raw response.

    Side effects:
        ``invoke_endpoint`` fills the global ``latencies`` and ``http_status``
        containers; ``latencies`` is sorted in place here.
    """
    async with aiohttp.ClientSession() as session:
        tasks = [invoke_endpoint(session, num_requests) for _ in range(num_clients)]
        responses = await asyncio.gather(*tasks)

    if show_responses and print_results:
        print(f'Responses from {num_clients} clients:')
        for i, response in enumerate(responses):
            print(f'Response {i + 1}: {response}')

    # Callers read the global list afterwards, so keep the in-place sort even
    # when nothing is printed.
    latencies.sort()
    if not print_results:
        return

    # Only successful requests record a latency, so every index below is based
    # on the number of samples actually collected, not on num_clients.
    sample_count = len(latencies)

    def percentile(fraction):
        # For small sample counts this resolves to the last (largest) sample.
        return latencies[min(int(fraction * sample_count), sample_count - 1)]

    print(f'\nSentence Transformer Benchmark Results:')
    print(f'  Clients: {num_clients}')
    print(f'  Sentences per client: {num_requests}')
    print(f'  Total embeddings: {num_clients * num_requests}')
    print(f'\nLatency Statistics:')
    if sample_count:
        print(f'  Min: {latencies[0]:.2f} ms')
        print(f'  Max: {latencies[-1]:.2f} ms')
        print(f'  Median: {latencies[sample_count // 2]:.2f} ms')
        print(f'  P90: {percentile(0.90):.2f} ms')
        print(f'  P95: {percentile(0.95):.2f} ms')
        print(f'  P99: {percentile(0.99):.2f} ms')
    else:
        print('  No successful requests; latency statistics unavailable')

    print(f'\nHTTP Status Summary:')
    for status, count in http_status.items():
        if count > 0:
            print(f'  HTTP {status}: {count}')

    # Calculate throughput: sentences embedded by successful requests divided
    # by the slowest request's latency (all requests run concurrently).
    if sample_count:
        total_time = latencies[-1] / 1000  # Convert to seconds
        if total_time > 0:
            throughput = (sample_count * num_requests) / total_time
            print(f'\nThroughput: {throughput:.2f} requests/second')

print("Main benchmark function loaded")


In [None]:
# Reset function
def reset_test_state():
    """Reset the global latency list and HTTP status counters for the next run.

    Both containers are emptied in place, so references held elsewhere stay
    valid. ``http_status`` is then re-seeded with zero counts for the four
    default codes (200, 400, 500, 429).
    """
    latencies.clear()
    # Drop every recorded status, not only the defaults: invoke_endpoint adds a
    # key for whatever code the service returns (e.g. 401, 503), and a stale
    # count would otherwise be carried into, and re-counted by, later runs.
    http_status.clear()
    http_status.update({200: 0, 400: 0, 500: 0, 429: 0})

print("Reset function loaded")


In [None]:
# Simple single request test - list of lists format
def test_single_request():
    """Send one synchronous request to the service as a smoke test.

    Posts the first dataset sentence as ``{"data": [[0, sentence]]}`` to
    ``URL`` and prints the request details and the response.

    Returns:
        True when the service answered with HTTP 200, otherwise False
        (including when the request raised an exception).
    """
    # Use the correct format: [index, sentence]
    sample_sentence = sentences[0]  # Get first sentence
    sample_row = [0, sample_sentence]  # [index, sentence_text]
    # Sentence Transformer service expects: {"data": [[0, "sentence1"], [1, "sentence2"], ...]}
    test_payload = {"data": [sample_row]}

    print("Testing single Sentence Transformer request (list of lists format):")
    print(f"  URL: {URL}")
    print("  Format: [index, sentence_text]")
    print(f"  Sample row: [0, \"{sample_sentence[:50]}...\"]")
    print(f"  Total elements: {len(sample_row)} (1 index + 1 sentence)")

    try:
        response = requests.post(URL, headers=headers, json=test_payload, timeout=300)
    except Exception as e:
        # Smoke test only: report the failure instead of raising.
        print(f"Request failed: {e}")
        return False

    # A real newline here; the previous "\\n" printed a literal backslash-n.
    print("\nResponse:")
    print(f"  Status: {response.status_code}")
    print(f"  Response: {response.text}")
    return response.status_code == 200

print("Single request test function loaded (list of lists format)")


In [None]:
# Test single request first: verify connectivity and authentication before
# launching the concurrent benchmark.
print("Testing single request first...")
single_test_success = test_single_request()

# Real newlines below; the previous "\\n" printed a literal backslash-n.
if single_test_success:
    print("\nSingle request successful! Proceeding with full benchmark...")
else:
    print("\nSingle request failed. Fix the issue before running full benchmark.")


In [None]:
# Benchmark helpers
# Shared mutable state: invoke_endpoint() appends to / increments these, and
# reset_test_state() resets them before each run.
latencies = []  # latencies in milliseconds of successful, valid responses
http_status = {200: 0, 400: 0, 500: 0, 429: 0}  # HTTP status code -> number of responses

def _compute_run_stats(lat_list):
    if not lat_list:
        return {"min_ms": 0.0, "median_ms": 0.0, "max_ms": 0.0, "p90_ms": 0.0, "p95_ms": 0.0, "p99_ms": 0.0, "count": 0}
    l = sorted(lat_list); n = len(l)
    p = lambda q: l[min(int(q * (n - 1)), n - 1)]
    return {
        "min_ms": l[0],
        "median_ms": statistics.median(l),
        "max_ms": l[-1],
        "p90_ms": p(0.90), "p95_ms": p(0.95), "p99_ms": p(0.99),
        "count": n,
    }

async def run_once_async(num_clients, num_requests, print_results=False, show_responses=False):
    """Run a single benchmark wave and return its statistics.

    Resets the shared state, runs ``main`` once, then returns the latency
    summary of that wave together with a snapshot of the HTTP status counters
    under the ``"http"`` key.
    """
    reset_test_state()
    await main(
        num_clients,
        num_requests,
        print_results=print_results,
        show_responses=show_responses,
    )
    # Copy the counters so the next reset does not alter the returned result.
    return {**_compute_run_stats(latencies), "http": dict(http_status)}

async def run_for_duration_async(clients: int, requests_per_client: int, duration_seconds: int,
                                  show_progress: bool = True, print_results: bool = False, show_responses: bool = False):
    """Repeat a benchmark wave until the time budget is used up, then aggregate.

    Args:
        clients: Number of concurrent requests per wave.
        requests_per_client: Number of sentences per request.
        duration_seconds: Time budget in seconds (at least 1 second is used).
            A wave that has already started is always allowed to finish.
        show_progress: When True, print one summary line per wave.
        print_results: Forwarded to ``run_once_async``.
        show_responses: Forwarded to ``run_once_async``.

    Returns:
        A dict with ``"runs"`` (number of waves), ``"aggregated"`` (statistics
        across waves plus summed HTTP status counts) and ``"runs_detail"``
        (the per-wave results).
    """
    # Monotonic clock: unaffected by wall-clock adjustments during the run,
    # and consistent with the latency timing in invoke_endpoint.
    deadline = time.monotonic() + max(1, int(duration_seconds))
    runs, pure_time = [], 0.0
    while time.monotonic() < deadline:
        t0 = time.monotonic()
        res = await run_once_async(clients, requests_per_client, print_results=print_results, show_responses=show_responses)
        pure_time += time.monotonic() - t0
        runs.append(res)
        if show_progress:
            print(f"  run {len(runs)}: med={res['median_ms']:.1f}ms min={res['min_ms']:.1f}ms max={res['max_ms']:.1f}ms http={res['http']}")
    if not runs:
        return {"runs": 0, "aggregated": {}, "runs_detail": []}

    # A wave without any successful request reports 0.0 placeholders; leave
    # those out so they do not drag the minimum/median/percentiles towards
    # zero. If no wave succeeded at all, fall back to the placeholders.
    measured = [r for r in runs if r.get("count", 0) > 0] or runs

    meds = [r["median_ms"] for r in measured]
    mins = [r["min_ms"] for r in measured]
    maxs = [r["max_ms"] for r in measured]
    p90s = [r["p90_ms"] for r in measured]
    p95s = [r["p95_ms"] for r in measured]
    p99s = [r["p99_ms"] for r in measured]

    # HTTP status counts are summed over every wave, failed ones included.
    http_agg = {}
    for r in runs:
        for code, cnt in r.get("http", {}).items():
            http_agg[code] = http_agg.get(code, 0) + cnt

    aggregated = {
        "runs": len(runs), "duration_sec": round(pure_time, 3),
        "median_of_medians_ms": statistics.median(meds), "avg_median_ms": statistics.fmean(meds),
        "min_of_mins_ms": min(mins), "max_of_maxs_ms": max(maxs),
        "avg_p90_ms": statistics.fmean(p90s), "avg_p95_ms": statistics.fmean(p95s), "avg_p99_ms": statistics.fmean(p99s),
        "http": http_agg,
    }
    return {"runs": len(runs), "aggregated": aggregated, "runs_detail": runs}

async def run_config_queue_async(configs, print_results=False, show_responses=False):
    """Run each benchmark configuration in order and collect the results.

    Args:
        configs: Iterable of dicts with the optional keys ``"clients"``
            (default 10), ``"requests"`` (default 1) and
            ``"duration_seconds"``. With a duration the configuration is
            repeated for that long; without one it is run exactly once.
        print_results: Forwarded to the underlying run functions.
        show_responses: Forwarded to the underlying run functions.

    Returns:
        A list with one dict per configuration holding ``"config"``,
        ``"runs"``, ``"aggregated"`` and ``"runs_detail"``.
    """
    results = []
    for cfg in configs:
        clients = cfg.get("clients", 10)
        reqs = cfg.get("requests", 1)
        duration = cfg.get("duration_seconds")
        # Real newline as separator; the previous "\\n" printed a literal backslash-n.
        print(f"\nConfig: {clients} clients x {reqs} requests" + (f" for {duration}s" if duration else ""))
        if duration:
            summary = await run_for_duration_async(clients, reqs, duration, show_progress=False,
                                                   print_results=print_results, show_responses=show_responses)
            agg = summary["aggregated"]
            print(f"  -> runs={agg['runs']} dur={agg['duration_sec']}s med_of_meds={agg['median_of_medians_ms']:.1f}ms avg_med={agg['avg_median_ms']:.1f}ms")
            results.append({"config": {"clients": clients, "requests": reqs, "duration_seconds": duration}, **summary})
        else:
            res = await run_once_async(clients, reqs, print_results=print_results, show_responses=show_responses)
            # Round numeric statistics for display; the HTTP counters are left out of this line.
            print("  ->", {k: (round(v, 2) if isinstance(v, (int, float)) else v) for k, v in res.items() if k != "http"})
            results.append({"config": {"clients": clients, "requests": reqs}, "aggregated": res, "runs": 1, "runs_detail": [res]})
    return results

# Loop-aware sync wrapper for scripts
def run_config_queue(configs, print_results=False, show_responses=False):
    """Synchronous entry point for ``run_config_queue_async``.

    When called while an event loop is already running in this thread (as in
    a notebook), nothing is executed: a hint to ``await`` the async variant
    is printed and None is returned. Otherwise the benchmark queue is run
    with ``asyncio.run`` and its results are returned.
    """
    try:
        # get_running_loop() only ever returns a loop that is running, so no
        # additional is_running() check is needed.
        asyncio.get_running_loop()
    except RuntimeError:
        loop_running = False
    else:
        loop_running = True

    if loop_running:
        # Real newline between the two lines of the hint; the previous "\\n"
        # printed a literal backslash-n.
        print("Notebook loop is running. Use:\n  await run_config_queue_async(configs)")
        return None
    return asyncio.run(run_config_queue_async(configs, print_results=print_results, show_responses=show_responses))

print("Benchmark helpers ready")


In [None]:
# Single sentence benchmark
configs = [
  {"clients": 1, "requests": 1, "duration_seconds": 20},
  {"clients": 5, "requests": 1, "duration_seconds": 20},
  {"clients": 10, "requests": 1, "duration_seconds": 20},
  {"clients": 20, "requests": 1, "duration_seconds": 20},
  {"clients": 30, "requests": 1, "duration_seconds": 20},
  {"clients": 35, "requests": 1, "duration_seconds": 20},
  {"clients": 40, "requests": 1, "duration_seconds": 20},
  {"clients": 45, "requests": 1, "duration_seconds": 20},
  {"clients": 50, "requests": 1, "duration_seconds": 20},
  {"clients": 100, "requests": 1, "duration_seconds": 20},
  {"clients": 200, "requests": 1, "duration_seconds": 20}
]
results = await run_config_queue_async(configs, print_results=False, show_responses=False)


In [None]:
# Batch benchmark (larger request sizes)
configs_large_batches = [
  {"clients": 1, "requests": 100, "duration_seconds": 20},
  {"clients": 5, "requests": 100, "duration_seconds": 20},
  {"clients": 10, "requests": 100, "duration_seconds": 20},
  {"clients": 20, "requests": 100, "duration_seconds": 20},
  {"clients": 30, "requests": 100, "duration_seconds": 20},
  {"clients": 35, "requests": 100, "duration_seconds": 20},
  {"clients": 40, "requests": 100, "duration_seconds": 20},
  {"clients": 45, "requests": 100, "duration_seconds": 20},
  {"clients": 50, "requests": 100, "duration_seconds": 20},
  {"clients": 100, "requests": 100, "duration_seconds": 20},
  {"clients": 200, "requests": 100, "duration_seconds": 20}
]
results_large = await run_config_queue_async(configs_large_batches, print_results=False, show_responses=False)
