In [2]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from dataclasses import field, asdict, dataclass
from typing import List
from benchmarks.benchmark_utils import RequestFuncOutput
from benchmarks.benchmark_workload_gen import *

In [3]:
def retrive_request_outputs(path):
    """Load a JSON file of request records and rebuild them as objects.

    The file at ``path`` must contain a JSON array of objects; each object is
    expanded as keyword arguments into ``RequestFuncOutput``.

    Returns:
        A list with one ``RequestFuncOutput`` per array element, in file order.
    """
    with open(path, 'r') as fh:
        records = json.load(fh)
    # The file is no longer needed once parsed, so build the objects afterwards.
    return [RequestFuncOutput(**record) for record in records]

In [4]:
def lat_tpot_ttft(outputs: List[RequestFuncOutput], match = None, plot=False):
    """Print p50/p90/p99 of latency, TPOT and TTFT for the selected requests.

    Args:
        outputs: Request records exposing ``request_latency``, ``tpot``,
            ``ttft`` and ``success`` attributes.
        match: Optional predicate over a record; only records for which it
            returns a truthy value are analyzed. A falsy ``match`` (the
            default) keeps every record.
        plot: When true, additionally draw one cumulative KDE plot per metric.

    Returns:
        None. Everything is reported via ``print``. When no record passes the
        filter, 'No outputs to analyze' is printed and nothing else happens.
    """
    if not match:
        match = lambda o: True
    outputs = [o for o in outputs if match(o)]
    if not outputs:
        print('No outputs to analyze')
        return

    # Missing or falsy values (e.g. None) are dropped metric by metric.
    lats = [o.request_latency for o in outputs if o.request_latency]
    tpots = [o.tpot for o in outputs if o.tpot]
    ttfts = [o.ttft for o in outputs if o.ttft]

    def p50_p90_p99(values):
        # A metric can be empty even though requests matched (every value
        # missing or falsy); report NaN rather than handing np.percentile an
        # empty sample.
        if not values:
            return (float('nan'),) * 3
        return np.percentile(values, [50, 90, 99], method='nearest')

    lat_p50, lat_p90, lat_p99 = p50_p90_p99(lats)
    tpot_p50, tpot_p90, tpot_p99 = p50_p90_p99(tpots)
    ttft_p50, ttft_p90, ttft_p99 = p50_p90_p99(ttfts)

    print(f"Num finished: {len([o for o in outputs if o.success])}")
    print(f'Latency: p50={lat_p50:.2f}, p90={lat_p90:.2f}, p99={lat_p99:.2f}')
    print(f'TPOT: p50={tpot_p50:.2f}, p90={tpot_p90:.2f}, p99={tpot_p99:.2f}')
    print(f'TTFT: p50={ttft_p50:.2f}, p90={ttft_p90:.2f}, p99={ttft_p99:.2f}')

    if plot:
        fig, axs = plt.subplots(1,3, figsize=(16, 4))
        panels = (('Latency', lats), ('TPOT', tpots), ('TTFT', ttfts))
        for ax, (title, values) in zip(axs, panels):
            # Nothing to estimate a density from when the metric is empty;
            # leave the axes blank but still titled.
            if values:
                sns.kdeplot(x=values, ax=ax, cumulative=True)
            ax.set_title(title)
    
def ttft_slo(outputs, slo):
    """Return the fraction of requests whose TTFT is within ``slo``.

    Only requests with a truthy ``ttft`` are counted; records whose ``ttft``
    is missing (``None``) or zero are ignored in both numerator and
    denominator.

    Args:
        outputs: Iterable of records exposing a ``ttft`` attribute.
        slo: Upper bound (inclusive) a TTFT must satisfy to count as a hit.

    Returns:
        A float in [0, 1]. When no record has a usable TTFT the result is
        0.0 instead of raising ``ZeroDivisionError``.
    """
    ttfts = [o.ttft for o in outputs if o.ttft]
    if not ttfts:
        # No measurable request means no request demonstrably met the SLO.
        return 0.0
    within_slo = sum(1 for ttft in ttfts if ttft <= slo)
    return within_slo / len(ttfts)

def windowed_metric(start, end, outputs: List[RequestFuncOutput], exp_time, match = None):
    """Print latency/TPOT/TTFT statistics for requests sent within a window.

    A request belongs to the window when ``start <= send_out_time <= end`` and
    ``match`` accepts it. For requests that did not succeed, latency is taken
    as ``exp_time - send_out_time``; the same substitute is used for TTFT when
    the request has no truthy ``ttft``. TPOT only uses requests with a truthy
    ``tpot``.

    Args:
        start: Inclusive lower bound on ``send_out_time``.
        end: Inclusive upper bound on ``send_out_time``.
        outputs: Request records to scan.
        exp_time: Reference time used to charge unfinished requests.
        match: Optional predicate over a record; falsy (default) keeps all.

    Returns:
        None. Results are printed. Metrics with no samples (for example an
        empty window) are reported as NaN instead of raising.
    """
    if not match:
        match = lambda o: True
    within_window = [o for o in outputs if start <= o.send_out_time <= end and match(o)]
    lats = [o.request_latency if o.success else exp_time - o.send_out_time for o in within_window]
    tpots = [o.tpot for o in within_window if o.tpot]
    ttfts = [o.ttft if o.ttft else exp_time - o.send_out_time for o in within_window]

    def summarize(values):
        # Returns (p50, p90, p99, mean). With no samples there is nothing to
        # rank or average, so every statistic is NaN.
        if not values:
            return (float('nan'),) * 4
        p50, p90, p99 = np.percentile(values, [50, 90, 99], method='nearest')
        return p50, p90, p99, np.mean(values)

    lat_p50, lat_p90, lat_p99, avg_lat = summarize(lats)
    tpot_p50, tpot_p90, tpot_p99, avg_tpot = summarize(tpots)
    ttft_p50, ttft_p90, ttft_p99, avg_ttft = summarize(ttfts)

    print('-'*20)
    print(f"Requests within window: {len(within_window)}")
    print(f"Num finished: {len([o for o in within_window if o.success])}")
    print(f'Latency: p50={lat_p50:.2f}, p90={lat_p90:.2f}, p99={lat_p99:.2f}')
    print(f'TPOT: p50={tpot_p50:.2f}, p90={tpot_p90:.2f}, p99={tpot_p99:.2f}')
    print(f'TTFT: p50={ttft_p50:.2f}, p90={ttft_p90:.2f}, p99={ttft_p99:.2f}')
    print(f'Avg Latency: {avg_lat:.2f}, Avg TPOT: {avg_tpot:.2f}, Avg TTFT: {avg_ttft:.2f}')
    print('-'*20)

def runtime_selection_consistency(outputs: List[RequestFuncOutput], match = None, plot=False):
    """Print how requests and their prefix indices were spread across runtimes.

    Three values are printed, in this order:
      1. ``{runtime: [with_prefix, without_prefix]}`` request counts, where a
         request is "without prefix" when
         ``WorkloadPrefixDataLoader.get_prefix_index`` returns ``None``;
      2. ``{runtime: set_of_prefix_indices}`` seen on each runtime;
      3. a sorted list of ``(prefix_index, request_count)`` pairs.

    Args:
        outputs: Request records exposing ``runtime_selected``.
        match: Optional predicate over a record; falsy (default) keeps all.
        plot: Accepted for signature compatibility; not used by this function.

    Returns:
        None. Prints 'No outputs to analyze' when no record passes the filter.
    """
    if not match:
        match = lambda o: True
    outputs = [o for o in outputs if match(o)]
    if not outputs:
        print('No outputs to analyze')
        return
    runtime_load = {}
    runtime_prefix = {}
    # Plain dict on purpose: avoids relying on `defaultdict` being leaked into
    # the namespace by a star import.
    prefix_cnt = {}
    for o in outputs:
        # Both per-runtime entries are created on first sight of the runtime,
        # even when the request carries no prefix.
        load = runtime_load.setdefault(o.runtime_selected, [0, 0])
        prefixes = runtime_prefix.setdefault(o.runtime_selected, set())
        prefix_index = WorkloadPrefixDataLoader.get_prefix_index(o)
        if prefix_index is None:
            load[1] += 1
        else:
            load[0] += 1
            prefixes.add(prefix_index)
            prefix_cnt[prefix_index] = prefix_cnt.get(prefix_index, 0) + 1
    print(runtime_load)
    print(runtime_prefix)
    print(sorted(prefix_cnt.items()))

In [5]:
def is_cold(o):
    """Return True when ``WorkloadPrefixDataLoader.is_hot(o)`` is falsy."""
    return not WorkloadPrefixDataLoader.is_hot(o)
def is_on_gpu(ks):
    """Build a predicate that accepts requests served by one of ``ks``.

    Args:
        ks: Container of runtime identifiers; membership is tested with ``in``.

    Returns:
        A one-argument callable returning whether ``o.runtime_selected`` is
        contained in ``ks``.
    """
    def on_selected_runtime(o: RequestFuncOutput):
        return o.runtime_selected in ks
    return on_selected_runtime
def is_workload(i):
    """Build a predicate that accepts requests whose prefix index equals ``i``.

    The prefix index comes from ``WorkloadPrefixDataLoader.get_prefix_index``;
    a request for which it returns ``None`` never matches, whatever ``i`` is.

    Returns:
        A one-argument callable over a request record.
    """
    def has_prefix_index(o: RequestFuncOutput):
        found = WorkloadPrefixDataLoader.get_prefix_index(o)
        if found is None:
            return False
        return found == i
    return has_prefix_index

In [6]:
sim_oracle_fcfs = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/cp_debug/4r_react_20_0.384_1950_6.5_fcfs/mistralai-Mistral-7B-v0.1_20_0.384_1950_6.5_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE:_inf.json')
# Same report three times: every request, then cold requests, then hot requests.
for request_filter in (None, is_cold, WorkloadPrefixDataLoader.is_hot):
    lat_tpot_ttft(sim_oracle_fcfs, request_filter)
# runtime_selection_consistency(sim_oracle_fcfs)

Num finished: 1949
Latency: p50=20.76, p90=39.57, p99=50.59
TPOT: p50=0.24, p90=0.43, p99=0.60
TTFT: p50=4.25, p90=17.69, p99=25.10
Num finished: 748
Latency: p50=23.13, p90=43.14, p99=51.26
TPOT: p50=0.24, p90=0.40, p99=0.45
TTFT: p50=8.42, p90=20.91, p99=26.05
Num finished: 1201
Latency: p50=19.69, p90=35.94, p99=47.35
TPOT: p50=0.24, p90=0.47, p99=0.60
TTFT: p50=2.82, p90=8.58, p99=20.09


In [7]:
sim_oracle = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/cp_debug/4r_react_20_0.384_1950_6.5_fcfs_cp_1024/mistralai-Mistral-7B-v0.1_20_0.384_1950_6.5_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE:_inf.json')
# Same report three times: every request, then cold requests, then hot requests.
for request_filter in (None, is_cold, WorkloadPrefixDataLoader.is_hot):
    lat_tpot_ttft(sim_oracle, request_filter)
# runtime_selection_consistency(sim_oracle)

Num finished: 1949
Latency: p50=21.45, p90=31.36, p99=37.07
TPOT: p50=0.20, p90=0.21, p99=0.32
TTFT: p50=8.14, p90=18.82, p99=23.85
Num finished: 748
Latency: p50=21.57, p90=31.91, p99=37.27
TPOT: p50=0.20, p90=0.21, p99=0.32
TTFT: p50=8.54, p90=19.13, p99=24.32
Num finished: 1201
Latency: p50=21.34, p90=31.28, p99=36.75
TPOT: p50=0.20, p90=0.21, p99=0.32
TTFT: p50=7.90, p90=18.66, p99=23.63


In [7]:
sim_oracle = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/cp_debug/mpq_with_cp_512_check_hit_small_group/mistralai-Mistral-7B-v0.1_20_0.2_1200_4_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE:_inf.json')
# Same report three times: every request, then cold requests, then hot requests.
for request_filter in (None, is_cold, WorkloadPrefixDataLoader.is_hot):
    lat_tpot_ttft(sim_oracle, request_filter)
# runtime_selection_consistency(sim_oracle)

Num finished: 1200
Latency: p50=162.51, p90=242.22, p99=261.42
TPOT: p50=0.23, p90=0.95, p99=0.98
TTFT: p50=122.26, p90=230.95, p99=248.74
Num finished: 240
Latency: p50=161.87, p90=224.22, p99=232.34
TPOT: p50=0.13, p90=0.14, p99=0.21
TTFT: p50=153.44, p90=216.35, p99=221.72
Num finished: 960
Latency: p50=162.78, p90=246.69, p99=262.11
TPOT: p50=0.52, p90=0.96, p99=0.99
TTFT: p50=104.62, p90=234.46, p99=248.76


In [10]:
sim_oracle = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/cp_debug/fcfs_with_cp_1024/mistralai-Mistral-7B-v0.1_20_0.2_1200_4_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE:_inf.json')
# Same report three times: every request, then cold requests, then hot requests.
for request_filter in (None, is_cold, WorkloadPrefixDataLoader.is_hot):
    lat_tpot_ttft(sim_oracle, request_filter)
# runtime_selection_consistency(sim_oracle)

Num finished: 1200
Latency: p50=248.90, p90=375.46, p99=389.50
TPOT: p50=0.23, p90=0.26, p99=0.27
TTFT: p50=233.34, p90=360.21, p99=379.07
Num finished: 240
Latency: p50=225.96, p90=369.92, p99=389.55
TPOT: p50=0.23, p90=0.26, p99=0.27
TTFT: p50=209.75, p90=355.49, p99=377.91
Num finished: 960
Latency: p50=254.90, p90=376.15, p99=389.47
TPOT: p50=0.23, p90=0.26, p99=0.27
TTFT: p50=239.22, p90=361.21, p99=379.12
