In [7]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from dataclasses import field, asdict, dataclass
from typing import List
from benchmarks.benchmark_utils import RequestFuncOutput
from benchmarks.benchmark_workload_gen import *

In [8]:
def retrive_request_outputs(path):
    """Load a JSON results file and rebuild one record object per entry.

    Args:
        path: Path of a JSON file whose top-level value is iterable; every
            element is unpacked as keyword arguments into
            ``RequestFuncOutput``.

    Returns:
        A list of ``RequestFuncOutput`` objects in the order found in the file.
    """
    with open(path, 'r') as handle:
        records = json.load(handle)
        parsed = []
        for record in records:
            parsed.append(RequestFuncOutput(**record))
    return parsed

In [9]:
def lat_tpot_ttft(outputs: List[RequestFuncOutput], match = None, plot=False):
    """Print p50/p90/p99 of request latency, TPOT and TTFT.

    Args:
        outputs: Request records to summarise.
        match: Optional predicate; only records for which it returns a truthy
            value are analysed. A falsy ``match`` keeps every record.
        plot: When true, additionally draw a cumulative KDE of each metric on
            a 1x3 matplotlib figure.

    Returns:
        None. All results are printed. If no record passes ``match`` a notice
        is printed instead. A metric with no usable samples is reported as
        ``<label>: no data`` rather than being passed to ``np.percentile``.
    """
    if not match:
        match = lambda o: True
    outputs = [o for o in outputs if match(o)]
    if not outputs:
        print('No outputs to analyze')
        return

    # Falsy values (None, 0) are dropped from each metric, so any of these
    # lists can end up empty even though `outputs` is not.
    metrics = [
        ('Latency', [o.request_latency for o in outputs if o.request_latency]),
        ('TPOT', [o.tpot for o in outputs if o.tpot]),
        ('TTFT', [o.ttft for o in outputs if o.ttft]),
    ]

    def describe(label, values):
        """Format one metric line, tolerating an empty sample list."""
        if not values:
            return f'{label}: no data'
        p50, p90, p99 = np.percentile(values, [50, 90, 99], method='nearest')
        return f'{label}: p50={p50:.2f}, p90={p90:.2f}, p99={p99:.2f}'

    print(f"Num finished: {len([o for o in outputs if o.success])}")
    for label, values in metrics:
        print(describe(label, values))

    if plot:
        fig, axs = plt.subplots(1, 3, figsize=(16, 4))
        for ax, (label, values) in zip(axs, metrics):
            # Skip the density estimate when there is nothing to estimate from,
            # but still title the panel so the figure layout stays readable.
            if values:
                sns.kdeplot(x=values, ax=ax, cumulative=True)
            ax.set_title(label)
    
def ttft_slo(outputs, slo):
    """Return the fraction of measured TTFTs that are at most ``slo``.

    Args:
        outputs: Request records; each must expose a ``ttft`` attribute.
            Records whose ``ttft`` is falsy (None, 0) are excluded from both
            the numerator and the denominator.
        slo: TTFT threshold, compared with ``<=``.

    Returns:
        A float in [0, 1]. When no record has a usable TTFT the result is
        0.0 (nothing met the SLO) instead of raising ``ZeroDivisionError``.
    """
    ttfts = [o.ttft for o in outputs if o.ttft]
    if not ttfts:
        return 0.0
    within_slo = sum(1 for ttft in ttfts if ttft <= slo)
    return within_slo / len(ttfts)

def windowed_metric(start, end, outputs: List[RequestFuncOutput], exp_time, match = None):
    """Print latency/TPOT/TTFT statistics for requests sent within a window.

    Args:
        start: Lower bound (inclusive) on ``send_out_time``.
        end: Upper bound (inclusive) on ``send_out_time``.
        outputs: Request records to consider.
        exp_time: Reference time used for requests lacking a measurement:
            an unsuccessful request's latency, and a request with a falsy
            ``ttft``, are both counted as ``exp_time - send_out_time``.
        match: Optional extra predicate a record must satisfy. A falsy
            ``match`` accepts every record.

    Returns:
        None. Results are printed between two dashed lines. A metric with no
        samples (empty window, or no request reported a TPOT) prints
        ``no data`` for its percentiles and ``nan`` for its average instead
        of being passed to ``np.percentile``.
    """
    if not match:
        match = lambda o: True
    within_window = [o for o in outputs if start <= o.send_out_time <= end and match(o)]
    lats = [o.request_latency if o.success else exp_time - o.send_out_time for o in within_window]
    # TPOT has no fallback value: requests without one are simply left out.
    tpots = [o.tpot for o in within_window if o.tpot]
    ttfts = [o.ttft if o.ttft else exp_time - o.send_out_time for o in within_window]

    def summarize(values):
        """Return (percentile text, mean) for one metric; safe on empty input."""
        if not values:
            return 'no data', float('nan')
        p50, p90, p99 = np.percentile(values, [50, 90, 99], method='nearest')
        return f'p50={p50:.2f}, p90={p90:.2f}, p99={p99:.2f}', np.mean(values)

    lat_pcts, avg_lat = summarize(lats)
    tpot_pcts, avg_tpot = summarize(tpots)
    ttft_pcts, avg_ttft = summarize(ttfts)

    print('-'*20)
    print(f"Requests within window: {len(within_window)}")
    print(f"Num finished: {len([o for o in within_window if o.success])}")
    print(f'Latency: {lat_pcts}')
    print(f'TPOT: {tpot_pcts}')
    print(f'TTFT: {ttft_pcts}')
    print(f'Avg Latency: {avg_lat:.2f}, Avg TPOT: {avg_tpot:.2f}, Avg TTFT: {avg_ttft:.2f}')
    print('-'*20)

def runtime_selections(outputs: List[RequestFuncOutput], match = None, plot=False):
    """Print how requests were spread over runtimes and whether prefixes overlap.

    For every distinct ``runtime_selected`` value a four-slot counter is kept:
    requests with a prefix index and ``prompt_len < 7000``, with a prefix
    index and ``prompt_len >= 7000``, without a prefix index and
    ``prompt_len < 7000``, and without a prefix index and
    ``prompt_len >= 7000`` (labelled hot 3K / hot 7K / cold 3K / cold 7K in
    the printed header). Afterwards, for each runtime, it reports whether its
    set of prefix indices is disjoint from those of all other runtimes.

    Args:
        outputs: Request records to analyse.
        match: Optional predicate; only matching records are analysed. A
            falsy ``match`` keeps every record.
        plot: Accepted for signature compatibility; not used.

    Returns:
        None. Results are printed.
    """
    # Imported locally so the function does not rely on this name leaking in
    # through a wildcard import elsewhere in the notebook.
    from collections import defaultdict

    if not match:
        match = lambda o: True
    outputs = [o for o in outputs if match(o)]
    if not outputs:
        print('No outputs to analyze')
        return
    runtime_load = {}
    runtime_prefix = {}
    prefix_cnt = defaultdict(int)  # only consumed by the debug print at the bottom
    for o in outputs:
        load = runtime_load.setdefault(o.runtime_selected, [0, 0, 0, 0])
        prefixes = runtime_prefix.setdefault(o.runtime_selected, set())
        prefix_index = WorkloadPrefixDataLoader.get_prefix_index(o)
        is_long = o.prompt_len >= 7000
        if prefix_index is None:
            load[3 if is_long else 2] += 1
        else:
            load[1 if is_long else 0] += 1
            prefixes.add(prefix_index)
            prefix_cnt[prefix_index] += 1
    print('gpu: [hot 3K, hot 7K, cold 3K, cold 7K], sum(load)')
    for idx, load in runtime_load.items():
        print(f'GPU {idx}: {load}, {sum(load)}')
    # Walk the runtime ids that were actually seen rather than assuming they
    # are exactly 0..n-1; a filtered view (e.g. only GPU 3) has gaps.
    try:
        runtime_ids = sorted(runtime_prefix)
    except TypeError:
        # Ids that cannot be ordered against each other: keep first-seen order.
        runtime_ids = list(runtime_prefix)
    for i in runtime_ids:
        # set().union(...) is well defined even when there are no other runtimes.
        others = set().union(*(runtime_prefix[j] for j in runtime_ids if j != i))
        print(f'is prefix in gpu {i} unique: {runtime_prefix[i].isdisjoint(others)}')
    # print(runtime_prefix)
    # print(sorted(prefix_cnt.items()))

In [10]:
def is_cold(o):
    """Predicate: true when ``WorkloadPrefixDataLoader.is_hot(o)`` is falsy."""
    return not WorkloadPrefixDataLoader.is_hot(o)
def is_on_gpu(ks):
    """Build a predicate that tests a record's ``runtime_selected`` against ``ks``.

    Args:
        ks: Container of runtime identifiers; membership is tested with ``in``.

    Returns:
        A one-argument callable returning whether ``o.runtime_selected in ks``.
    """
    return lambda o: o.runtime_selected in ks
def is_workload(i):
    """Build a predicate selecting records whose prefix index equals ``i``.

    Args:
        i: Prefix index to compare against the value returned by
            ``WorkloadPrefixDataLoader.get_prefix_index``.

    Returns:
        A one-argument callable. It yields False when the record has no
        prefix index (``None``), otherwise the result of ``index == i``.
    """
    def has_prefix_index(o):
        found = WorkloadPrefixDataLoader.get_prefix_index(o)
        if found is None:
            return False
        return found == i
    return has_prefix_index

In [11]:
# Results file written for the ORACLE custom policy (per its filename):
# print the overall summary, then the per-runtime summaries for GPUs 0 and 3.
sim_oracle_fcfs = retrive_request_outputs(
    '/mnt/ssd1/alm-os/sglang_multi_model/hc_integration/4r_react_20_0.285_1350_9_[8,16]/mistralai-Mistral-7B-v0.1_40_0.285_2700_9_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE:_inf.json'
)
lat_tpot_ttft(sim_oracle_fcfs)
for gpu_id in (0, 3):
    lat_tpot_ttft(sim_oracle_fcfs, is_on_gpu([gpu_id]))
# lat_tpot_ttft(sim_oracle_fcfs, WorkloadPrefixDataLoader.is_hot)
# runtime_selections(sim_oracle_fcfs)

Num finished: 2698
Latency: p50=10.53, p90=30.21, p99=44.76
TPOT: p50=0.15, p90=0.37, p99=0.47
TTFT: p50=1.08, p90=11.01, p99=20.62
Num finished: 676
Latency: p50=12.11, p90=34.39, p99=46.88
TPOT: p50=0.16, p90=0.39, p99=0.55
TTFT: p50=1.16, p90=13.37, p99=21.24
Num finished: 674
Latency: p50=9.90, p90=25.69, p99=43.36
TPOT: p50=0.14, p90=0.30, p99=0.42
TTFT: p50=0.88, p90=8.72, p99=19.57


In [13]:
# NOTE: despite the variable name, this file's name refers to the
# HiostgramBasedRecompLoadWithEvictionV2 policy, not ORACLE.
# Print the overall summary, the per-runtime load split, then one summary per GPU 0-3.
sim_oracle = retrive_request_outputs(
    '/mnt/ssd1/alm-os/sglang_multi_model/logs/debug/mistralai-Mistral-7B-v0.1_40_0.285_2700_9_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.HiostgramBasedRecompLoadWithEvictionV2:_inf.json'
)
lat_tpot_ttft(sim_oracle)
runtime_selections(sim_oracle)
for gpu_id in range(4):
    lat_tpot_ttft(sim_oracle, is_on_gpu([gpu_id]))
# lat_tpot_ttft(sim_oracle, WorkloadPrefixDataLoader.is_hot)

Num finished: 2698
Latency: p50=16.32, p90=30.44, p99=45.28
TPOT: p50=0.19, p90=0.35, p99=0.43
TTFT: p50=2.31, p90=12.78, p99=23.84
gpu: [hot 3K, hot 7K, cold 3K, cold 7K], sum(load)
GPU 0: [338, 193, 92, 94], 717
GPU 1: [241, 241, 89, 92], 663
GPU 2: [146, 192, 89, 97], 524
GPU 3: [240, 339, 114, 101], 794
is prefix in gpu 0 unique: True
is prefix in gpu 1 unique: True
is prefix in gpu 2 unique: True
is prefix in gpu 3 unique: True
Num finished: 717
Latency: p50=13.27, p90=26.05, p99=43.70
TPOT: p50=0.17, p90=0.33, p99=0.40
TTFT: p50=1.72, p90=11.37, p99=21.04
Num finished: 663
Latency: p50=13.47, p90=25.75, p99=42.09
TPOT: p50=0.14, p90=0.32, p99=0.40
TTFT: p50=2.02, p90=9.14, p99=20.80
Num finished: 524
Latency: p50=12.10, p90=28.68, p99=45.35
TPOT: p50=0.13, p90=0.33, p99=0.40
TTFT: p50=2.27, p90=12.37, p99=21.91
Num finished: 794
Latency: p50=24.00, p90=35.18, p99=52.11
TPOT: p50=0.29, p90=0.40, p99=0.44
TTFT: p50=3.43, p90=15.30, p99=33.26


In [14]:
# Same report as the previous cell, for the results file under the
# `..._[8,16]_cp_512` directory. `sim_oracle` is rebound here, so anything
# run afterwards sees this dataset.
sim_oracle = retrive_request_outputs(
    '/mnt/ssd1/alm-os/sglang_multi_model/hc_integration/4r_react_20_0.285_1350_9_[8,16]_cp_512/mistralai-Mistral-7B-v0.1_40_0.285_2700_9_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.HiostgramBasedRecompLoadWithEvictionV2:_inf.json'
)
lat_tpot_ttft(sim_oracle)
runtime_selections(sim_oracle)
for gpu_id in range(4):
    lat_tpot_ttft(sim_oracle, is_on_gpu([gpu_id]))
# lat_tpot_ttft(sim_oracle, WorkloadPrefixDataLoader.is_hot)

Num finished: 2698
Latency: p50=14.42, p90=24.93, p99=37.78
TPOT: p50=0.10, p90=0.13, p99=0.26
TTFT: p50=7.36, p90=17.49, p99=29.14
gpu: [hot 3K, hot 7K, cold 3K, cold 7K], sum(load)
GPU 0: [242, 240, 91, 110], 683
GPU 1: [337, 241, 103, 93], 774
GPU 2: [194, 193, 90, 84], 561
GPU 3: [192, 291, 100, 97], 680
is prefix in gpu 0 unique: True
is prefix in gpu 1 unique: True
is prefix in gpu 2 unique: True
is prefix in gpu 3 unique: True
Num finished: 683
Latency: p50=16.26, p90=30.72, p99=43.37
TPOT: p50=0.10, p90=0.17, p99=0.26
TTFT: p50=8.80, p90=22.26, p99=33.92
Num finished: 774
Latency: p50=15.40, p90=25.33, p99=35.52
TPOT: p50=0.10, p90=0.16, p99=0.19
TTFT: p50=7.80, p90=17.57, p99=25.07
Num finished: 561
Latency: p50=10.46, p90=20.19, p99=22.41
TPOT: p50=0.09, p90=0.12, p99=0.18
TTFT: p50=4.59, p90=13.73, p99=15.56
Num finished: 680
Latency: p50=14.56, p90=24.98, p99=34.76
TPOT: p50=0.10, p90=0.12, p99=0.27
TTFT: p50=7.45, p90=17.95, p99=27.77
