In [2]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from dataclasses import field, asdict, dataclass
from typing import List
from benchmarks.benchmark_utils import RequestFuncOutput
from benchmarks.benchmark_workload_gen import *

In [3]:
def retrive_request_outputs(path):
    """Load benchmark request results from a JSON log file.

    The file at ``path`` is parsed with ``json.load``; every element of the
    parsed value is expanded as keyword arguments into ``RequestFuncOutput``,
    so the file is expected to contain a JSON array of objects.

    Args:
        path: Location of the JSON log to read.

    Returns:
        A list of ``RequestFuncOutput`` instances in file order.
    """
    with open(path, 'r') as log_file:
        records = json.load(log_file)
    # The records are plain dicts at this point; turn each into a result object.
    return [RequestFuncOutput(**record) for record in records]

In [24]:
def lat_tpot_ttft(outputs: List[RequestFuncOutput], match = None, plot=False):
    """Print p50/p90/p99 of request latency, TPOT and TTFT for a set of requests.

    Args:
        outputs: Request results to summarise. Each item must expose
            ``request_latency``, ``tpot``, ``ttft`` and ``success``.
        match: Optional predicate ``match(o)``; only outputs for which it is
            truthy are analysed. A falsy ``match`` keeps every output.
        plot: When true, also draw the cumulative KDE of each metric in a
            1x3 matplotlib figure (series without any sample are left blank).

    Returns:
        None. The summary is written to stdout. If no output passes the
        filter, ``'No outputs to analyze'`` is printed instead.
    """
    if not match:
        match = lambda o: True
    outputs = [o for o in outputs if match(o)]
    if not outputs:
        print('No outputs to analyze')
        return

    # Falsy measurements (None / 0) are excluded from each series; presumably
    # these mark requests that never produced the metric -- TODO confirm.
    lats = [o.request_latency for o in outputs if o.request_latency]
    tpots = [o.tpot for o in outputs if o.tpot]
    ttfts = [o.ttft for o in outputs if o.ttft]

    def p50_p90_p99(values):
        # A series can end up empty (e.g. every request failed). np.percentile
        # cannot summarise an empty sample, so report NaN for it instead.
        if not values:
            return (float('nan'),) * 3
        return np.percentile(values, [50, 90, 99], method='nearest')

    lat_p50, lat_p90, lat_p99 = p50_p90_p99(lats)
    tpot_p50, tpot_p90, tpot_p99 = p50_p90_p99(tpots)
    ttft_p50, ttft_p90, ttft_p99 = p50_p90_p99(ttfts)

    print(f"Num finished: {len([o for o in outputs if o.success])}")
    print(f'Latency: p50={lat_p50:.2f}, p90={lat_p90:.2f}, p99={lat_p99:.2f}')
    print(f'TPOT: p50={tpot_p50:.2f}, p90={tpot_p90:.2f}, p99={tpot_p99:.2f}')
    print(f'TTFT: p50={ttft_p50:.2f}, p90={ttft_p90:.2f}, p99={ttft_p99:.2f}')

    if plot:
        fig, axs = plt.subplots(1,3, figsize=(16, 4))
        series = (('Latency', lats), ('TPOT', tpots), ('TTFT', ttfts))
        for ax, (title, values) in zip(axs, series):
            # Only estimate a density when there is data to estimate it from.
            if values:
                sns.kdeplot(x=values, ax=ax, cumulative=True)
            ax.set_title(title)
    
def ttft_slo(outputs, slo):
    """Return the fraction of measured requests whose TTFT is within ``slo``.

    Only outputs with a truthy ``ttft`` are considered, both in the numerator
    and in the denominator.

    Args:
        outputs: Iterable of request results exposing a ``ttft`` attribute.
        slo: Upper bound (inclusive) that a TTFT must not exceed.

    Returns:
        A float in ``[0, 1]``. When no output carries a TTFT measurement the
        attainment is reported as ``0.0`` rather than raising
        ``ZeroDivisionError``.
    """
    ttfts = [o.ttft for o in outputs if o.ttft]
    if not ttfts:
        # Nothing was measured, so nothing met the objective.
        return 0.0
    # Each True counts as 1 in the sum.
    return sum(ttft <= slo for ttft in ttfts) / len(ttfts)

def windowed_metric(start, end, outputs: List[RequestFuncOutput], exp_time, match = None):
    """Print latency/TPOT/TTFT statistics for requests sent within a time window.

    Args:
        start: Lower bound (inclusive) compared against ``o.send_out_time``.
        end: Upper bound (inclusive) compared against ``o.send_out_time``.
        outputs: Request results to summarise.
        exp_time: Reference time used for requests without a measurement:
            an unsuccessful request is charged a latency of
            ``exp_time - o.send_out_time``, and a request with a falsy
            ``ttft`` is charged the same amount as its TTFT.
        match: Optional extra predicate ``match(o)``; a falsy value keeps
            every output inside the window.

    Returns:
        None. The report is written to stdout. An empty window prints the
        request count followed by ``'No outputs to analyze'``; an empty TPOT
        series is reported as NaN.
    """
    if not match:
        match = lambda o: True
    within_window = [o for o in outputs if start <= o.send_out_time <= end and match(o)]

    print('-'*20)
    print(f"Requests within window: {len(within_window)}")
    if not within_window:
        # np.percentile / np.mean cannot summarise an empty sample.
        print('No outputs to analyze')
        print('-'*20)
        return

    lats = [o.request_latency if o.success else exp_time - o.send_out_time for o in within_window]
    # TPOT has no fallback value, so this series may be empty even when the
    # window is not (e.g. every request in it failed).
    tpots = [o.tpot for o in within_window if o.tpot]
    ttfts = [o.ttft if o.ttft else exp_time - o.send_out_time for o in within_window]

    def summarize(values):
        # Returns (p50, p90, p99, mean); all NaN for an empty series.
        if not values:
            return (float('nan'),) * 4
        p50, p90, p99 = np.percentile(values, [50, 90, 99], method='nearest')
        return p50, p90, p99, np.mean(values)

    lat_p50, lat_p90, lat_p99, avg_lat = summarize(lats)
    tpot_p50, tpot_p90, tpot_p99, avg_tpot = summarize(tpots)
    ttft_p50, ttft_p90, ttft_p99, avg_ttft = summarize(ttfts)

    print(f"Num finished: {len([o for o in within_window if o.success])}")
    print(f'Latency: p50={lat_p50:.2f}, p90={lat_p90:.2f}, p99={lat_p99:.2f}')
    print(f'TPOT: p50={tpot_p50:.2f}, p90={tpot_p90:.2f}, p99={tpot_p99:.2f}')
    print(f'TTFT: p50={ttft_p50:.2f}, p90={ttft_p90:.2f}, p99={ttft_p99:.2f}')
    print(f'Avg Latency: {avg_lat:.2f}, Avg TPOT: {avg_tpot:.2f}, Avg TTFT: {avg_ttft:.2f}')
    print('-'*20)

def runtime_selections(outputs: List[RequestFuncOutput], match = None, plot=False):
    """Print how many requests of each kind every runtime (GPU) was assigned.

    Requests are grouped by ``o.runtime_selected`` and counted in four
    buckets: with/without a prefix index (as returned by
    ``WorkloadPrefixDataLoader.get_prefix_index``; labelled "hot"/"cold" in
    the report) crossed with ``prompt_len`` below / not below 7000 (labelled
    "3K"/"7K"). Each row also shows the total count and the bucket counts
    weighted by 3 and 7 respectively.

    Args:
        outputs: Request results to analyse.
        match: Optional predicate ``match(o)``; a falsy value keeps everything.
        plot: Currently unused.

    Returns:
        None. The table is written to stdout, one row per runtime, ordered by
        runtime id when the ids are mutually comparable (first-seen order
        otherwise).
    """
    # Local import: the notebook previously relied on a counting-dict type
    # leaking in through a wildcard import.
    from collections import Counter

    if not match:
        match = lambda o: True
    outputs = [o for o in outputs if match(o)]
    if not outputs:
        print('No outputs to analyze')
        return

    long_prompt_threshold = 7000   # prompt_len at or above this lands in the "7K" bucket
    short_weight, long_weight = 3, 7  # context weights used in the hot*ctx / cold*ctx columns

    runtime_load = {}      # runtime id -> [hot short, hot long, cold short, cold long]
    runtime_prefix = {}    # runtime id -> set of prefix indices seen on that runtime
    prefix_cnt = Counter() # prefix index -> number of requests (debug output below)
    for o in outputs:
        load = runtime_load.setdefault(o.runtime_selected, [0, 0, 0, 0])
        prefixes = runtime_prefix.setdefault(o.runtime_selected, set())
        prefix_index = WorkloadPrefixDataLoader.get_prefix_index(o)
        is_long = not (o.prompt_len < long_prompt_threshold)
        if prefix_index is None:
            load[3 if is_long else 2] += 1
        else:
            load[1 if is_long else 0] += 1
            prefixes.add(prefix_index)
            prefix_cnt[prefix_index] += 1

    # Keys are unique, so tuple comparison never reaches the list values.
    try:
        rows = sorted(runtime_load.items())
    except TypeError:
        # Ids of mixed, non-comparable types: keep first-seen order.
        rows = list(runtime_load.items())

    print('gpu: [hot 3K, hot 7K, cold 3K, cold 7K], sum(load), sum(hot*ctx), sum(cold*ctx)')
    for idx, load in rows:
        hot_ctx = load[0]*short_weight + load[1]*long_weight
        cold_ctx = load[2]*short_weight + load[3]*long_weight
        print(f'GPU {idx}: {load}, {sum(load)}, {hot_ctx}, {cold_ctx}')
    # Optional debugging output, kept disabled:
    # for i in range(len(runtime_prefix)):
    #     others = set.union(*[runtime_prefix[j] for j in range(len(runtime_prefix)) if j != i])
    #     print(f'is prefix in gpu {i} unique: {len(runtime_prefix[i].intersection(others)) == 0}')
    # print(runtime_prefix)
    # print(sorted(prefix_cnt.items()))

In [25]:
def is_cold(o):
    """Predicate: true when ``WorkloadPrefixDataLoader.is_hot(o)`` is falsy."""
    return not WorkloadPrefixDataLoader.is_hot(o)
def is_on_gpu(ks):
    """Build a predicate selecting outputs served by one of the runtimes in ``ks``.

    Args:
        ks: Container of runtime ids; membership is tested with ``in``.

    Returns:
        A one-argument callable that is true when ``o.runtime_selected`` is
        contained in ``ks``.
    """
    def served_by_selected(o: RequestFuncOutput):
        return o.runtime_selected in ks
    return served_by_selected
def is_workload(i):
    """Build a predicate selecting outputs whose prefix index equals ``i``.

    The prefix index is obtained from
    ``WorkloadPrefixDataLoader.get_prefix_index``; outputs for which it is
    ``None`` never match.
    """
    def has_prefix_index(o: RequestFuncOutput):
        found = WorkloadPrefixDataLoader.get_prefix_index(o)
        if found is None:
            return False
        return found == i
    return has_prefix_index

In [31]:
# Load the per-request results of the run whose log file name carries the
# ORACLE custom policy, then print the overall latency/TPOT/TTFT percentiles
# and the per-runtime load table.
sim_oracle_fcfs = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/perf_logs/rebalancer_hc/4r_react_40_0.285_2400_6_example_16_only_fcfsmpq_minload_2_highload_1.15_steal_even_exist/mistralai-Mistral-7B-v0.1_40_0.285_2400_6_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE:_inf.json')
lat_tpot_ttft(sim_oracle_fcfs)
# Disabled: same summary restricted to requests WorkloadPrefixDataLoader.is_hot accepts.
# lat_tpot_ttft(sim_oracle_fcfs, WorkloadPrefixDataLoader.is_hot)
runtime_selections(sim_oracle_fcfs)

Num finished: 2399
Latency: p50=7.74, p90=26.71, p99=41.16
TPOT: p50=0.11, p90=0.32, p99=0.44
TTFT: p50=1.07, p90=8.11, p99=17.14
gpu: [hot 3K, hot 7K, cold 3K, cold 7K], sum(load), sum(hot*ctx), sum(cold*ctx)
GPU 2: [0, 429, 0, 171], 600, 3003, 1197
GPU 0: [0, 429, 0, 170], 599, 3003, 1190
GPU 1: [0, 429, 0, 171], 600, 3003, 1197
GPU 3: [0, 429, 0, 171], 600, 3003, 1197


In [32]:
# Results of the GlobalSchedulerWithoutRebalancing policy (per the log file
# name) from the "fcfsmpq ... highload_1.15" log directory.
sim_oracle = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/perf_logs/rebalancer_hc/4r_react_40_0.285_2400_6_example_16_only_fcfsmpq_minload_2_highload_1.15_steal_even_exist/mistralai-Mistral-7B-v0.1_40_0.285_2400_6_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.GlobalSchedulerWithoutRebalancing:without_rebalancing_inf.json')
# Aggregate summary and load table first ...
lat_tpot_ttft(sim_oracle)
runtime_selections(sim_oracle)
# ... then the same summary for each runtime id 0..3 in turn.
for gpu_id in range(4):
    lat_tpot_ttft(sim_oracle, is_on_gpu([gpu_id]))
# lat_tpot_ttft(sim_oracle, WorkloadPrefixDataLoader.is_hot)

Num finished: 2399
Latency: p50=17.25, p90=28.86, p99=40.70
TPOT: p50=0.21, p90=0.36, p99=0.45
TTFT: p50=1.78, p90=11.97, p99=18.66
gpu: [hot 3K, hot 7K, cold 3K, cold 7K], sum(load), sum(hot*ctx), sum(cold*ctx)
GPU 0: [0, 602, 0, 152], 754, 4214, 1064
GPU 1: [0, 556, 0, 155], 711, 3892, 1085
GPU 2: [0, 257, 0, 201], 458, 1799, 1407
GPU 3: [0, 301, 0, 175], 476, 2107, 1225
Num finished: 754
Latency: p50=19.31, p90=27.62, p99=36.72
TPOT: p50=0.25, p90=0.36, p99=0.44
TTFT: p50=1.73, p90=10.89, p99=17.90
Num finished: 711
Latency: p50=18.88, p90=30.10, p99=40.86
TPOT: p50=0.24, p90=0.38, p99=0.47
TTFT: p50=1.98, p90=12.26, p99=20.04
Num finished: 458
Latency: p50=14.98, p90=30.37, p99=41.94
TPOT: p50=0.16, p90=0.39, p99=0.45
TTFT: p50=2.20, p90=14.54, p99=20.69
Num finished: 476
Latency: p50=9.86, p90=26.30, p99=41.55
TPOT: p50=0.10, p90=0.29, p99=0.41
TTFT: p50=1.35, p90=9.34, p99=17.21


In [36]:
# Results of the GlobalSchedulerWithoutRebalancing policy (per the log file
# name) from the "fcfs ... highload_1.3" log directory.
sim_oracle = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/perf_logs/rebalancer_hc/4r_react_40_0.285_2400_6_example_16_only_fcfs_minload_2_highload_1.3_steal_even_exist/mistralai-Mistral-7B-v0.1_40_0.285_2400_6_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.GlobalSchedulerWithoutRebalancing:without_rebalancing_inf.json')
# Aggregate summary and load table first ...
lat_tpot_ttft(sim_oracle)
runtime_selections(sim_oracle)
# ... then the same summary for each runtime id 0..3 in turn.
for gpu_id in range(4):
    lat_tpot_ttft(sim_oracle, is_on_gpu([gpu_id]))
# lat_tpot_ttft(sim_oracle, WorkloadPrefixDataLoader.is_hot)

Num finished: 2399
Latency: p50=15.87, p90=55.21, p99=81.78
TPOT: p50=0.12, p90=0.38, p99=0.47
TTFT: p50=4.23, p90=40.44, p99=64.95
gpu: [hot 3K, hot 7K, cold 3K, cold 7K], sum(load), sum(hot*ctx), sum(cold*ctx)
GPU 0: [0, 602, 0, 15], 617, 4214, 105
GPU 1: [0, 556, 0, 40], 596, 3892, 280
GPU 2: [0, 257, 0, 336], 593, 1799, 2352
GPU 3: [0, 301, 0, 292], 593, 2107, 2044
Num finished: 617
Latency: p50=2.93, p90=19.48, p99=35.78
TPOT: p50=0.04, p90=0.18, p99=0.40
TTFT: p50=0.21, p90=4.84, p99=18.06
Num finished: 596
Latency: p50=3.06, p90=21.44, p99=40.65
TPOT: p50=0.04, p90=0.22, p99=0.38
TTFT: p50=0.25, p90=6.75, p99=18.25
Num finished: 593
Latency: p50=41.94, p90=75.70, p99=87.46
TPOT: p50=0.25, p90=0.39, p99=0.50
TTFT: p50=23.71, p90=56.87, p99=67.99
Num finished: 593
Latency: p50=29.58, p90=49.19, p99=61.79
TPOT: p50=0.25, p90=0.42, p99=0.54
TTFT: p50=12.13, p90=31.43, p99=39.48


In [35]:
# NOTE(review): this cell reads exactly the same log file as the preceding
# cell (GlobalSchedulerWithoutRebalancing, "fcfs ... highload_1.3") and its
# recorded output is identical -- one of the two is probably redundant.
sim_oracle = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/perf_logs/rebalancer_hc/4r_react_40_0.285_2400_6_example_16_only_fcfs_minload_2_highload_1.3_steal_even_exist/mistralai-Mistral-7B-v0.1_40_0.285_2400_6_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.GlobalSchedulerWithoutRebalancing:without_rebalancing_inf.json')
# Aggregate summary and load table first ...
lat_tpot_ttft(sim_oracle)
runtime_selections(sim_oracle)
# ... then the same summary for each runtime id 0..3 in turn.
for gpu_id in range(4):
    lat_tpot_ttft(sim_oracle, is_on_gpu([gpu_id]))
# lat_tpot_ttft(sim_oracle, WorkloadPrefixDataLoader.is_hot)

Num finished: 2399
Latency: p50=15.87, p90=55.21, p99=81.78
TPOT: p50=0.12, p90=0.38, p99=0.47
TTFT: p50=4.23, p90=40.44, p99=64.95
gpu: [hot 3K, hot 7K, cold 3K, cold 7K], sum(load), sum(hot*ctx), sum(cold*ctx)
GPU 0: [0, 602, 0, 15], 617, 4214, 105
GPU 1: [0, 556, 0, 40], 596, 3892, 280
GPU 2: [0, 257, 0, 336], 593, 1799, 2352
GPU 3: [0, 301, 0, 292], 593, 2107, 2044
Num finished: 617
Latency: p50=2.93, p90=19.48, p99=35.78
TPOT: p50=0.04, p90=0.18, p99=0.40
TTFT: p50=0.21, p90=4.84, p99=18.06
Num finished: 596
Latency: p50=3.06, p90=21.44, p99=40.65
TPOT: p50=0.04, p90=0.22, p99=0.38
TTFT: p50=0.25, p90=6.75, p99=18.25
Num finished: 593
Latency: p50=41.94, p90=75.70, p99=87.46
TPOT: p50=0.25, p90=0.39, p99=0.50
TTFT: p50=23.71, p90=56.87, p99=67.99
Num finished: 593
Latency: p50=29.58, p90=49.19, p99=61.79
TPOT: p50=0.25, p90=0.42, p99=0.54
TTFT: p50=12.13, p90=31.43, p99=39.48


In [30]:
# Results of the GlobalScheduler policy ("add_hot_cold" per the log file
# name) from the "fcfs ... highload_1.15" log directory.
sim_oracle = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/perf_logs/rebalancer_hc/4r_react_40_0.285_2400_6_example_16_only_fcfs_minload_2_highload_1.15_steal_even_exist/mistralai-Mistral-7B-v0.1_40_0.285_2400_6_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.GlobalScheduler:add_hot_cold_inf.json')
# Aggregate summary and load table first ...
lat_tpot_ttft(sim_oracle)
runtime_selections(sim_oracle)
# ... then the same summary for each runtime id 0..3 in turn.
for gpu_id in range(4):
    lat_tpot_ttft(sim_oracle, is_on_gpu([gpu_id]))
# lat_tpot_ttft(sim_oracle, WorkloadPrefixDataLoader.is_hot)

Num finished: 2399
Latency: p50=13.60, p90=32.45, p99=46.72
TPOT: p50=0.16, p90=0.36, p99=0.48
TTFT: p50=1.34, p90=14.13, p99=29.11
gpu: [hot 3K, hot 7K, cold 3K, cold 7K], sum(load), sum(hot*ctx), sum(cold*ctx)
GPU 0: [0, 429, 0, 174], 603, 3003, 1218
GPU 1: [0, 424, 0, 166], 590, 2968, 1162
GPU 2: [0, 418, 0, 175], 593, 2926, 1225
GPU 3: [0, 445, 0, 168], 613, 3115, 1176
Num finished: 603
Latency: p50=14.61, p90=34.64, p99=46.50
TPOT: p50=0.17, p90=0.38, p99=0.56
TTFT: p50=1.97, p90=14.91, p99=29.11
Num finished: 590
Latency: p50=15.93, p90=38.34, p99=55.40
TPOT: p50=0.19, p90=0.38, p99=0.50
TTFT: p50=1.46, p90=23.92, p99=32.82
Num finished: 593
Latency: p50=9.98, p90=22.75, p99=40.90
TPOT: p50=0.13, p90=0.29, p99=0.44
TTFT: p50=1.13, p90=7.88, p99=15.77
Num finished: 613
Latency: p50=15.25, p90=28.30, p99=40.13
TPOT: p50=0.18, p90=0.36, p99=0.45
TTFT: p50=1.27, p90=9.39, p99=16.12
