In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from dataclasses import field, asdict, dataclass
from typing import List
from benchmarks.benchmark_utils import RequestFuncOutput
from benchmarks.benchmark_workload_gen import *

In [2]:
def retrive_request_outputs(path):
    """Load benchmark results from a JSON file.

    The file at ``path`` is parsed as JSON and every element of the parsed
    value is unpacked as keyword arguments into ``RequestFuncOutput``.

    Args:
        path: Location of the JSON results file.

    Returns:
        A list with one ``RequestFuncOutput`` per element of the JSON data.
    """
    with open(path, 'r') as fh:
        records = json.load(fh)
    # The file is only needed for parsing; build the objects once it is closed.
    return [RequestFuncOutput(**record) for record in records]

In [3]:
def _p50_p90_p99(values):
    """Return the (p50, p90, p99) percentiles of ``values``.

    An empty sequence yields three NaNs rather than handing empty input to
    ``np.percentile``, so a report can still be printed for it.
    """
    if len(values) == 0:
        nan = float('nan')
        return nan, nan, nan
    p50, p90, p99 = np.percentile(values, [50, 90, 99])
    return p50, p90, p99


def lat_tpot_ttft(outputs: List["RequestFuncOutput"], match=None, plot=False):
    """Print latency, TPOT and TTFT percentiles for a set of request outputs.

    Args:
        outputs: Request results; each item is read for ``request_latency``,
            ``tpot``, ``ttft`` and ``success``.
        match: Optional predicate; when given, only outputs for which it
            returns a truthy value are included. ``None`` keeps everything.
        plot: When true, also draw cumulative KDE plots of the three metrics
            in a 1x3 figure.

    Returns:
        None. The statistics are printed to stdout. A metric with no samples
        is reported as ``nan``.
    """
    selected = [o for o in outputs if match(o)] if match else list(outputs)

    # Falsy metric values (None or 0) are treated as "not recorded" and skipped.
    lats = [o.request_latency for o in selected if o.request_latency]
    tpots = [o.tpot for o in selected if o.tpot]
    ttfts = [o.ttft for o in selected if o.ttft]

    lat_p50, lat_p90, lat_p99 = _p50_p90_p99(lats)
    tpot_p50, tpot_p90, tpot_p99 = _p50_p90_p99(tpots)
    ttft_p50, ttft_p90, ttft_p99 = _p50_p90_p99(ttfts)

    print(f"Num finished: {sum(1 for o in selected if o.success)}")
    print(f'Latency: p50={lat_p50:.2f}, p90={lat_p90:.2f}, p99={lat_p99:.2f}')
    print(f'TPOT: p50={tpot_p50:.2f}, p90={tpot_p90:.2f}, p99={tpot_p99:.2f}')
    print(f'TTFT: p50={ttft_p50:.2f}, p90={ttft_p90:.2f}, p99={ttft_p99:.2f}')

    if plot:
        _, axs = plt.subplots(1, 3, figsize=(16, 4))
        series = ((lats, 'Latency'), (tpots, 'TPOT'), (ttfts, 'TTFT'))
        for ax, (values, title) in zip(axs, series):
            # Nothing to estimate a density from when the series is empty;
            # leave the titled axes blank in that case.
            if values:
                sns.kdeplot(x=values, ax=ax, cumulative=True)
            ax.set_title(title)
    
def ttft_slo(outputs, slo):
    """Return the fraction of requests whose TTFT is within ``slo``.

    Only outputs with a truthy ``ttft`` are considered; outputs whose
    ``ttft`` is ``None`` or 0 are left out of both numerator and denominator.

    Args:
        outputs: Request results; each item is read for its ``ttft``.
        slo: TTFT threshold; a request meets the SLO when ``ttft <= slo``.

    Returns:
        The attainment ratio in [0, 1], or ``nan`` when no output has a TTFT
        (the ratio is undefined then, and dividing would raise
        ``ZeroDivisionError``).
    """
    ttfts = [o.ttft for o in outputs if o.ttft]
    if not ttfts:
        return float('nan')
    within_slo = sum(1 for ttft in ttfts if ttft <= slo)
    return within_slo / len(ttfts)

def _window_stats(values):
    """Return ``(p50, p90, p99, mean)`` of ``values``; all NaN when empty."""
    if len(values) == 0:
        nan = float('nan')
        return nan, nan, nan, nan
    p50, p90, p99 = np.percentile(values, [50, 90, 99])
    return p50, p90, p99, np.mean(values)


def windowed_metric(start, end, outputs: List["RequestFuncOutput"], exp_time, match=None):
    """Print latency/TPOT/TTFT statistics for requests sent within a window.

    A request belongs to the window when ``start <= send_out_time <= end``
    and it passes ``match``.

    Args:
        start: Inclusive lower bound on ``send_out_time``.
        end: Inclusive upper bound on ``send_out_time``.
        outputs: Request results; each item is read for ``send_out_time``,
            ``success``, ``request_latency``, ``tpot`` and ``ttft``.
        exp_time: Value used to charge unfinished requests: a request that
            did not succeed gets latency ``exp_time - send_out_time``, and a
            request without a TTFT gets that same value as its TTFT.
        match: Optional predicate restricting which outputs are considered.
            ``None`` keeps everything.

    Returns:
        None. The statistics are printed to stdout. A metric with no samples
        (empty window, or no request in it with a TPOT) is reported as ``nan``.
    """
    if not match:
        match = lambda o: True
    within_window = [o for o in outputs if start <= o.send_out_time <= end and match(o)]

    # Unfinished requests are not dropped: they are charged the time from
    # send-out up to exp_time so they still weigh on latency and TTFT.
    lats = [o.request_latency if o.success else exp_time - o.send_out_time for o in within_window]
    tpots = [o.tpot for o in within_window if o.tpot]
    ttfts = [o.ttft if o.ttft else exp_time - o.send_out_time for o in within_window]

    lat_p50, lat_p90, lat_p99, avg_lat = _window_stats(lats)
    tpot_p50, tpot_p90, tpot_p99, avg_tpot = _window_stats(tpots)
    ttft_p50, ttft_p90, ttft_p99, avg_ttft = _window_stats(ttfts)

    print('-'*20)
    print(f"Requests within window: {len(within_window)}")
    print(f"Num finished: {sum(1 for o in within_window if o.success)}")
    print(f'Latency: p50={lat_p50:.2f}, p90={lat_p90:.2f}, p99={lat_p99:.2f}')
    print(f'TPOT: p50={tpot_p50:.2f}, p90={tpot_p90:.2f}, p99={tpot_p99:.2f}')
    print(f'TTFT: p50={ttft_p50:.2f}, p90={ttft_p90:.2f}, p99={ttft_p99:.2f}')
    print(f'Avg Latency: {avg_lat:.2f}, Avg TPOT: {avg_tpot:.2f}, Avg TTFT: {avg_ttft:.2f}')
    print('-'*20)

In [4]:
def is_cold(o):
    """Return True when ``WorkloadPrefixDataLoader.is_hot`` does not flag ``o``."""
    return not WorkloadPrefixDataLoader.is_hot(o)

In [16]:
# Results file for the RANDOM runtime selection policy (per the file name).
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this file exists.
sim_random = retrive_request_outputs(
    '/mnt/ssd1/alm-os/sglang_multi_model/hc_logs_run_to_complete/sim_react_8k_100_0.4_2400_4/mistralai-Mistral-7B-v0.1_100_0.3_2400_4_DataParallelRuntimeSelectionPolicy.RANDOM-None:4r_inf.json'
)
# Report on every request, then on the cold and the hot subsets.
lat_tpot_ttft(sim_random, match=None)
lat_tpot_ttft(sim_random, match=is_cold)
lat_tpot_ttft(sim_random, match=WorkloadPrefixDataLoader.is_hot)

Num finished: 2400
Latency: p50=28.44, p90=120.47, p99=462.60
TPOT: p50=0.25, p90=0.41, p99=0.45
TTFT: p50=7.94, p90=108.38, p99=451.20
Num finished: 720
Latency: p50=70.49, p90=337.08, p99=508.83
TPOT: p50=0.21, p90=0.38, p99=0.44
TTFT: p50=55.84, p90=327.08, p99=489.76
Num finished: 1680
Latency: p50=24.23, p90=35.62, p99=41.04
TPOT: p50=0.28, p90=0.41, p99=0.46
TTFT: p50=6.87, p90=11.59, p99=22.84


In [17]:
# Results file for the CUSTOM / ORACLE policy (per the file name).
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this file exists.
sim_oracle = retrive_request_outputs(
    '/mnt/ssd1/alm-os/sglang_multi_model/hc_logs_run_to_complete/sim_react_8k_100_0.4_2400_4/mistralai-Mistral-7B-v0.1_100_0.3_2400_4_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE:4r_inf.json'
)
# Report on every request, then on the cold and the hot subsets.
lat_tpot_ttft(sim_oracle, match=None)
lat_tpot_ttft(sim_oracle, match=is_cold)
lat_tpot_ttft(sim_oracle, match=WorkloadPrefixDataLoader.is_hot)

Num finished: 2400
Latency: p50=26.74, p90=38.01, p99=118.82
TPOT: p50=0.32, p90=0.43, p99=0.46
TTFT: p50=5.39, p90=15.38, p99=100.06
Num finished: 720
Latency: p50=31.06, p90=67.06, p99=160.14
TPOT: p50=0.31, p90=0.42, p99=0.45
TTFT: p50=9.03, p90=48.73, p99=143.93
Num finished: 1680
Latency: p50=25.59, p90=33.40, p99=38.02
TPOT: p50=0.32, p90=0.43, p99=0.46
TTFT: p50=4.90, p90=8.37, p99=13.87


In [15]:
# Results file for the CUSTOM / ORACLE_HOT_COLD policy, "4r_2h_2c" variant
# (per the file name; presumably 2 hot / 2 cold runtimes -- TODO confirm).
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this file exists.
sim_2h2c = retrive_request_outputs(
    '/mnt/ssd1/alm-os/sglang_multi_model/hc_logs_run_to_complete/sim_react_8k_100_0.4_2400_4/mistralai-Mistral-7B-v0.1_100_0.3_2400_4_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE_HOT_COLD:4r_2h_2c_inf.json'
)
# Report on every request, then on the cold and the hot subsets.
lat_tpot_ttft(sim_2h2c, match=None)
lat_tpot_ttft(sim_2h2c, match=is_cold)
lat_tpot_ttft(sim_2h2c, match=WorkloadPrefixDataLoader.is_hot)

Num finished: 2400
Latency: p50=26.81, p90=75.94, p99=245.51
TPOT: p50=0.27, p90=0.39, p99=0.46
TTFT: p50=7.78, p90=60.82, p99=219.86
Num finished: 720
Latency: p50=10.18, p90=21.49, p99=31.11
TPOT: p50=0.12, p90=0.29, p99=0.46
TTFT: p50=1.95, p90=4.23, p99=6.72
Num finished: 1680
Latency: p50=33.82, p90=110.29, p99=252.30
TPOT: p50=0.31, p90=0.40, p99=0.45
TTFT: p50=14.10, p90=100.29, p99=225.57


In [18]:
# Results file for the CUSTOM / ORACLE_HOT_COLD policy, "4r_3h_1c" variant
# (per the file name; presumably 3 hot / 1 cold runtimes -- TODO confirm).
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this file exists.
sim_3h1c = retrive_request_outputs(
    '/mnt/ssd1/alm-os/sglang_multi_model/hc_logs_run_to_complete/sim_react_8k_100_0.4_2400_4/mistralai-Mistral-7B-v0.1_100_0.3_2400_4_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE_HOT_COLD:4r_3h_1c_inf.json'
)
# Report on every request, then on the cold and the hot subsets.
lat_tpot_ttft(sim_3h1c, match=None)
lat_tpot_ttft(sim_3h1c, match=is_cold)
lat_tpot_ttft(sim_3h1c, match=WorkloadPrefixDataLoader.is_hot)

Num finished: 2400
Latency: p50=34.49, p90=202.44, p99=584.27
TPOT: p50=0.30, p90=0.40, p99=0.45
TTFT: p50=14.55, p90=181.41, p99=567.67
Num finished: 720
Latency: p50=36.43, p90=518.76, p99=619.93
TPOT: p50=0.25, p90=0.44, p99=0.44
TTFT: p50=15.96, p90=510.02, p99=601.98
Num finished: 1680
Latency: p50=33.83, p90=110.33, p99=252.32
TPOT: p50=0.31, p90=0.40, p99=0.45
TTFT: p50=14.10, p90=100.30, p99=225.59
