In [None]:
#### Wrapping the ONNX GPT-2 model in FastAPI
import numpy as np
import onnxruntime as ort
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import GPT2Tokenizer

class TextRequest(BaseModel):
    """JSON request body accepted by the /predict endpoint."""
    text: str  # prompt that the endpoint tokenizes and feeds to the model

# Application object; the /predict route is registered on it via @app.post.
app = FastAPI()

# Created once at import time so every request reuses the same tokenizer and
# ONNX Runtime session. "gpt2.onnx" is a relative path, so it is resolved
# against the working directory the server is started from. Inference is
# pinned to the CPU execution provider.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
session = ort.InferenceSession("gpt2.onnx", providers=["CPUExecutionProvider"])

@app.post("/predict")
def predict(request: TextRequest):
    """Return the single most likely next token for the submitted text.

    Declared with plain ``def`` rather than ``async def``: tokenization and
    ONNX inference are blocking CPU work with nothing to ``await``, so FastAPI
    should run the handler in its worker thread pool instead of on the event
    loop, where it would stall every other in-flight request.

    Args:
        request: Request body carrying the prompt in ``request.text``.

    Returns:
        ``{"response": <decoded token>}`` where the token is the greedy
        (argmax) prediction for the position following the prompt.

    Raises:
        HTTPException: 422 when ``text`` is empty, since an empty prompt
            yields no tokens and therefore no "last position" to predict from.
    """
    if not request.text:
        raise HTTPException(status_code=422, detail="text must not be empty")

    input_ids = tokenizer.encode(request.text, return_tensors="np").astype(np.int64)
    # First model output is taken to be the logits.
    # NOTE(review): assumes shape (batch, sequence, vocab) - confirm against the export.
    logits = session.run(None, {"input_ids": input_ids})[0]
    # Greedy decoding: highest-scoring vocabulary entry at the last position.
    # int() converts the NumPy scalar into a plain Python int for the tokenizer.
    predicted_token_id = int(logits[0, -1].argmax())
    response = tokenizer.decode([predicted_token_id])
    return {"response": response}


In [None]:
# Launch the API server in the background so that later cells can call it.
# This is the notebook-safe equivalent of running
#     uvicorn app:app --host 0.0.0.0 --port 8000
# in a terminal: a bare shell command is not valid Python in a code cell, and
# `!uvicorn ...` would block the kernel until the server exits.
# Requires the FastAPI cell above to be saved as app.py in the current
# working directory.
import subprocess
import sys

server_process = subprocess.Popen(
    [sys.executable, "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
)
# Wait until the server log reports that startup is complete before sending
# requests. Stop the server with server_process.terminate() when finished.


In [None]:
###### Benchmark the FastAPI endpoint (single client, sequential requests)
import requests
import time
import numpy as np

FASTAPI_URL = "http://localhost:8000/predict"
payload = {"text": "What are the symptoms of COVID-19?"}
num_requests = 100
inference_times = []

# Single-client benchmark: each request is sent only after the previous one
# has returned, so the measured latency contains no queuing behind other clients.
for _ in range(num_requests):
    # perf_counter is monotonic, so intervals are unaffected by wall-clock adjustments.
    start = time.perf_counter()
    try:
        # A timeout keeps one stuck request from hanging the whole benchmark.
        response = requests.post(FASTAPI_URL, json=payload, timeout=30)
    except requests.RequestException as exc:
        print("Error:", exc)
        continue
    end = time.perf_counter()

    if response.status_code == 200:
        inference_times.append(end - start)
    else:
        print("Error:", response.text)

inference_times = np.array(inference_times)
if inference_times.size == 0:
    # Percentiles of an empty sample are undefined; report the failure instead.
    print(f"No successful requests; is the server running at {FASTAPI_URL}?")
else:
    print(f"Median latency: {np.median(inference_times)*1000:.2f} ms")
    print(f"95th percentile: {np.percentile(inference_times, 95)*1000:.2f} ms")
    print(f"99th percentile: {np.percentile(inference_times, 99)*1000:.2f} ms")
    # Count only the requests whose time is in the denominator; dividing
    # num_requests by the time of the successful subset would inflate the
    # figure whenever some requests failed.
    print(f"Throughput: {inference_times.size / np.sum(inference_times):.2f} req/s")


In [None]:
###### Concurrency tests
import concurrent.futures

def send_request(payload, timeout=30.0):
    """POST ``payload`` to the prediction endpoint and time the round trip.

    Args:
        payload: JSON-serializable request body.
        timeout: Seconds to wait for the server before giving up, so that a
            stuck request cannot hang its worker thread indefinitely.

    Returns:
        Elapsed time in seconds (float) when the server answers with HTTP 200;
        ``None`` for any other status code or for a transport-level failure
        (connection refused, timeout, ...).
    """
    # perf_counter is monotonic, so intervals are unaffected by wall-clock adjustments.
    start = time.perf_counter()
    try:
        r = requests.post(FASTAPI_URL, json=payload, timeout=timeout)
    except requests.RequestException:
        # Report the failure the same way as a non-200 response instead of
        # letting the exception surface from future.result() and abort the run.
        return None
    elapsed = time.perf_counter() - start
    return elapsed if r.status_code == 200 else None

def run_concurrent_tests(num_requests, max_workers):
    """Send ``num_requests`` requests using ``max_workers`` client threads.

    Every request posts the module-level ``payload`` through ``send_request``.

    Args:
        num_requests: Total number of requests to submit.
        max_workers: Size of the thread pool, i.e. how many requests may be
            in flight at the same time.

    Returns:
        List of per-request latencies in seconds, in completion order.
        Failed requests (``send_request`` returned ``None``) are omitted.
    """
    inference_times = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(send_request, payload) for _ in range(num_requests)]
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            # Compare against None explicitly: a plain truthiness test would
            # also discard a legitimate measurement of 0.0 seconds.
            if result is not None:
                inference_times.append(result)
    return inference_times

# Time the whole run: with requests in flight concurrently, throughput is
# completed requests per second of wall-clock time. Summing the per-request
# latencies would count overlapping time once per worker and understate
# throughput by roughly the concurrency level.
wall_start = time.perf_counter()
times = run_concurrent_tests(1000, max_workers=16)
wall_elapsed = time.perf_counter() - wall_start

if len(times) == 0:
    print("No successful requests; is the server running?")
else:
    print(f"Median latency: {np.median(times)*1000:.2f} ms")
    print(f"Throughput: {len(times) / wall_elapsed:.2f} req/s")


In [None]:
Optimizations Lab Manual

Major Phase	Subtasks	Description
1. FastAPI Endpoint	a) Wrap model in FastAPI
b) Launch locally	Turn your PyTorch/ONNX model into a FastAPI server endpoint for inference
2. Benchmarking FastAPI	a) Single User
b) Measure Median, 95th, 99th Latency, Throughput	Send 100 sequential requests to FastAPI and collect metrics
3. Concurrent Clients Stress Test	a) Run with 16 concurrent threads
b) Measure new latencies and throughput	Simulate high load (queuing delay grows)
4. Triton Inference Server Setup	a) Serve model via Triton
b) Docker Compose setup	Deploy a production-grade Triton server (for GPU servers like Chameleon)
5. Triton Dynamic Batching	a) Enable dynamic batching in config.pbtxt
b) Re-benchmark performance	Allow Triton to batch requests automatically and improve efficiency
6. Scaling on Multiple GPUs	a) Configure multiple model instances across GPUs	Serve models across 2× GPUs (e.g., P100s)
7. Serving ONNX Optimized Models	a) Migrate Triton to ONNX Backend
b) Improve serving latency	Deploy optimized ONNX model instead of PyTorch model in Triton
8. Benchmark All Scenarios	a) Compare FastAPI vs Triton
b) Compare PyTorch backend vs ONNX backend in Triton	Full system performance comparison at different concurrency levels
9. Update Flask App	(only if using Flask frontend)	Update your frontend app to correctly interface with Triton’s ONNX service

In [None]:
###### Do the Triton part on a GPU node

4️⃣ Triton Inference Server Setup (Once on GPU)
a) Folder structure (Triton model repository)
models/
└── gpt2_model
    ├── 1
    │   └── model.onnx
    └── config.pbtxt
b) Example config.pbtxt (for GPT-2 ONNX model)

name: "gpt2_model"
backend: "onnxruntime"
# With max_batch_size > 0 Triton adds the batch dimension itself, so the dims
# below describe ONE request and must not include the batch axis.
# NOTE(review): assumes the ONNX model takes input_ids as [batch, sequence]
# and returns logits as [batch, sequence, vocab] - confirm against the export.
max_batch_size: 16
input [
  {
    name: "input_ids"
    data_type: TYPE_INT64
    dims: [ -1 ]  # variable sequence length (batch axis is implicit)
  }
]
output [
  {
    name: "logits"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]  # sequence length x vocabulary size (batch axis is implicit)
  }
]
# One model instance, placed on GPU 0.
instance_group [
  {
    count: 1
    kind: KIND_GPU
    gpus: [ 0 ]
  }
]

c) Launch Triton server (on GPU)

# Start Triton with the local ./models directory mounted as its model repository.
# Published ports: 8000 = HTTP, 8001 = gRPC, 8002 = metrics.
# The volume argument is quoted so a working directory containing spaces still mounts.
docker run --gpus all --rm -p8000:8000 -p8001:8001 -p8002:8002 \
-v "$(pwd)/models:/models" nvcr.io/nvidia/tritonserver:23.01-py3 \
tritonserver --model-repository=/models
✅ Triton server will start and serve the GPT-2 ONNX model.


5️⃣ Triton Client Benchmark (Performance Analyzer)
Inside Triton container or another container:

# Benchmark the model over Triton's HTTP endpoint (port 8000, as published by docker run).
#   -u  server address          -m  model name (matches `name` in config.pbtxt)
#   -b  batch size per request  --input-data  JSON file supplying the request tensors
# NOTE(review): perf_analyzer ships in the Triton `-py3-sdk` image rather than the
# server image - confirm which container this is run from.
perf_analyzer -u localhost:8000 -m gpt2_model -b 1 --input-data input.json
✅ Reports:

Queuing delay

Compute infer latency

Throughput

6️⃣ Enable Dynamic Batching in Triton
Edit config.pbtxt and add the following block:
# Let Triton's scheduler combine individual requests into one batch before inference.
dynamic_batching {
  # Batch sizes the scheduler tries to form; 16 equals max_batch_size in this config.
  preferred_batch_size: [4, 8, 16]
  # Longest time a request may be held in the queue while a batch fills up.
  max_queue_delay_microseconds: 100
}
✅ Then restart Triton Server and benchmark again.

7️⃣ Scale Across GPUs in Triton
Edit config.pbtxt again, replacing the instance_group block with:
# Run the model on both GPUs.
# `count` applies to EACH GPU listed in `gpus`, so this creates 2 instances on
# GPU 0 and 2 on GPU 1 (4 in total). Use `count: 1` for one instance per GPU.
instance_group [
  {
    count: 2
    kind: KIND_GPU
    gpus: [ 0, 1 ]
  }
]
✅ Restart Triton and run performance analyzer again!

