In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from dataclasses import field, asdict, dataclass
from typing import List
from benchmarks.benchmark_utils import RequestFuncOutput
from benchmarks.benchmark_workload_gen import *

In [2]:
def retrive_request_outputs(path):
    """Load benchmark request results from a JSON file.

    Args:
        path: Location of a JSON file whose top-level value is a sequence of
            objects. Each object is expanded as keyword arguments into
            ``RequestFuncOutput``.

    Returns:
        A list of ``RequestFuncOutput`` instances, in file order.
    """
    with open(path, 'r') as handle:
        records = json.load(handle)

    loaded = []
    for record in records:
        loaded.append(RequestFuncOutput(**record))
    return loaded

In [3]:
def _percentile_summary(label, samples):
    """Format the p50/p90/p99 of ``samples`` as a single report line.

    An empty ``samples`` yields ``'<label>: n/a (no samples)'`` instead of
    calling ``np.percentile``, whose result on empty input is not meaningful
    (NaN or an error, depending on the NumPy version).
    """
    if not samples:
        return f'{label}: n/a (no samples)'
    p50, p90, p99 = np.percentile(samples, [50, 90, 99])
    return f'{label}: p50={p50:.2f}, p90={p90:.2f}, p99={p99:.2f}'


def lat_tpot_ttft(outputs: List[RequestFuncOutput], match = None, plot=False):
    """Print p50/p90/p99 of request latency, TPOT and TTFT.

    Args:
        outputs: Request results; each must expose ``request_latency``,
            ``tpot``, ``ttft`` and ``success``.
        match: Optional predicate ``match(output) -> bool``. Only outputs for
            which it is truthy are analysed. Defaults to accepting everything.
        plot: When true, additionally draw a cumulative KDE of each metric on
            a 1x3 grid of subplots.

    Returns:
        None. Results are printed. If no output passes ``match``, a notice is
        printed and nothing else happens.
    """
    if not match:
        match = lambda o: True
    outputs = [o for o in outputs if match(o)]
    if not outputs:
        print('No outputs to analyze')
        return

    # Truthiness filters: a metric that is None or 0 is treated as missing.
    lats = [o.request_latency for o in outputs if o.request_latency]
    tpots = [o.tpot for o in outputs if o.tpot]
    ttfts = [o.ttft for o in outputs if o.ttft]
    series = [('Latency', lats), ('TPOT', tpots), ('TTFT', ttfts)]

    print(f"Num finished: {sum(1 for o in outputs if o.success)}")
    for label, samples in series:
        print(_percentile_summary(label, samples))

    if plot:
        fig, axs = plt.subplots(1,3, figsize=(16, 4))
        for ax, (label, samples) in zip(axs, series):
            # A density cannot be estimated from zero samples; leave the
            # panel empty but still titled.
            if samples:
                sns.kdeplot(x=samples, ax=ax, cumulative=True)
            ax.set_title(label)
    
def ttft_slo(outputs, slo):
    """Return the fraction of requests whose TTFT is at most ``slo``.

    Only outputs with a truthy ``ttft`` are counted; a ``ttft`` of None or 0
    is treated as missing and excluded from both numerator and denominator.

    Args:
        outputs: Iterable of objects exposing a ``ttft`` attribute.
        slo: TTFT threshold, in the same unit as ``ttft``.

    Returns:
        A float in [0, 1], or NaN when no output carries a TTFT (the ratio is
        undefined; previously this raised ZeroDivisionError).
    """
    ttfts = [o.ttft for o in outputs if o.ttft]
    if not ttfts:
        return float('nan')
    within_slo = sum(1 for ttft in ttfts if ttft <= slo)
    return within_slo / len(ttfts)

def windowed_metric(start, end, outputs: List[RequestFuncOutput], exp_time, match = None):
    """Print latency/TPOT/TTFT statistics for requests sent within a window.

    A request belongs to the window when ``start <= send_out_time <= end`` and
    ``match`` accepts it.

    Args:
        start: Inclusive lower bound on ``send_out_time``.
        end: Inclusive upper bound on ``send_out_time``.
        outputs: Request results exposing ``send_out_time``, ``success``,
            ``request_latency``, ``tpot`` and ``ttft``.
        exp_time: Reference time used for requests lacking a measurement:
            their latency / TTFT is taken as ``exp_time - send_out_time``
            (presumably the experiment end time; confirm with the caller).
        match: Optional predicate ``match(output) -> bool``; defaults to
            accepting every output.

    Returns:
        None. The report is printed. Metrics with no samples are printed as
        ``n/a`` rather than passing empty lists to NumPy.
    """
    if not match:
        match = lambda o: True

    def percentile_line(label, samples):
        # np.percentile on an empty list is not meaningful; report n/a.
        if not samples:
            return f'{label}: n/a (no samples)'
        p50, p90, p99 = np.percentile(samples, [50, 90, 99])
        return f'{label}: p50={p50:.2f}, p90={p90:.2f}, p99={p99:.2f}'

    def mean_text(samples):
        # np.mean of an empty list is NaN with a RuntimeWarning; report n/a.
        return f'{np.mean(samples):.2f}' if samples else 'n/a'

    within_window = [o for o in outputs if start <= o.send_out_time <= end and match(o)]
    # Unsuccessful requests are charged the time elapsed until exp_time.
    lats = [o.request_latency if o.success else exp_time - o.send_out_time for o in within_window]
    # TPOT has no fallback: requests without one are simply left out.
    tpots = [o.tpot for o in within_window if o.tpot]
    # Requests with no (or zero) TTFT are charged the time elapsed until exp_time.
    ttfts = [o.ttft if o.ttft else exp_time - o.send_out_time for o in within_window]

    print('-'*20)
    print(f"Requests within window: {len(within_window)}")
    print(f"Num finished: {sum(1 for o in within_window if o.success)}")
    print(percentile_line('Latency', lats))
    print(percentile_line('TPOT', tpots))
    print(percentile_line('TTFT', ttfts))
    print(f'Avg Latency: {mean_text(lats)}, Avg TPOT: {mean_text(tpots)}, Avg TTFT: {mean_text(ttfts)}')
    print('-'*20)

In [4]:
def is_cold(o):
    """Return True when ``WorkloadPrefixDataLoader.is_hot`` does not classify ``o`` as hot."""
    return not WorkloadPrefixDataLoader.is_hot(o)
def is_on_gpu(ks):
    """Build a predicate selecting outputs by their ``runtime_selected`` value.

    Args:
        ks: Container of accepted ``runtime_selected`` values (anything that
            supports ``in``). Presumably runtime/GPU identifiers; confirm
            against the benchmark that produced the outputs.

    Returns:
        A one-argument callable returning ``o.runtime_selected in ks``.
    """
    def selected_runtime_matches(o: RequestFuncOutput):
        return o.runtime_selected in ks

    return selected_runtime_matches

In [7]:
# Load the per-request results of the BASIC_MEM_SCHEDULERV2 custom-policy run
# (file name from the path below) and print overall latency/TPOT/TTFT percentiles.
# NOTE(review): the path is absolute and machine-specific; this cell only
# reproduces where that directory is mounted.
sim_oracle_fcfs = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/logs/from_vik/mistralai-Mistral-7B-v0.1_24_0.0_383_0.5_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.BASIC_MEM_SCHEDULERV2:mem_basic_v2_766.json')
lat_tpot_ttft(sim_oracle_fcfs)
# Disabled: the same report restricted to cold / hot requests.
# lat_tpot_ttft(sim_oracle_fcfs, is_cold)
# lat_tpot_ttft(sim_oracle_fcfs, WorkloadPrefixDataLoader.is_hot)

Num finished: 383
Latency: p50=6.21, p90=14.90, p99=19.77
TPOT: p50=0.05, p90=0.13, p99=0.32
TTFT: p50=3.57, p90=10.14, p99=16.05


In [8]:
# Load the per-request results of the MemSchedulerWithGlobalEviction run
# (file name from the path below) and print overall latency/TPOT/TTFT percentiles.
# NOTE(review): the variable is called `sim_oracle` although the file is not an
# ORACLE-policy log, and the same name is rebound by later cells, so its value
# depends on which cell ran last.
sim_oracle = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/logs/from_vik/mistralai-Mistral-7B-v0.1_24_0.0_383_0.5_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.MemSchedulerWithGlobalEviction:global_evict_766.json')
lat_tpot_ttft(sim_oracle)
# Disabled: the same report restricted to cold / hot requests.
# lat_tpot_ttft(sim_oracle, is_cold)
# lat_tpot_ttft(sim_oracle, WorkloadPrefixDataLoader.is_hot)

Num finished: 383
Latency: p50=13.21, p90=33.76, p99=71.70
TPOT: p50=0.11, p90=0.41, p99=0.79
TTFT: p50=4.53, p90=22.60, p99=66.76


In [23]:

# Load the ORACLE-policy results stored under the `4r_sim_80_0.2_2700_9_fcfs_escape`
# directory, then print three reports: all requests, cold requests only, and
# hot requests only.
# NOTE(review): rebinds `sim_oracle`, replacing whatever an earlier cell loaded.
sim_oracle = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/workload_prefix/4r_sim_80_0.2_2700_9_fcfs_escape/mistralai-Mistral-7B-v0.1_80_0.2_2700_9_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE:50_inf.json')
lat_tpot_ttft(sim_oracle)
lat_tpot_ttft(sim_oracle, is_cold)
lat_tpot_ttft(sim_oracle, WorkloadPrefixDataLoader.is_hot)

Num finished: 2700
Latency: p50=34.38, p90=67.96, p99=85.10
TPOT: p50=0.28, p90=0.38, p99=0.49
TTFT: p50=16.30, p90=49.10, p99=63.00
Num finished: 540
Latency: p50=58.50, p90=76.02, p99=88.94
TPOT: p50=0.28, p90=0.38, p99=0.49
TTFT: p50=41.76, p90=55.86, p99=66.23
Num finished: 2160
Latency: p50=28.91, p90=62.57, p99=80.15
TPOT: p50=0.28, p90=0.38, p99=0.49
TTFT: p50=9.33, p90=43.60, p99=60.82


In [24]:
# Load the ORACLE-policy results stored under the `4r_sim_80_0.2_2700_9_baseline_cp_1024`
# directory, then print three reports: all requests, cold requests only, and
# hot requests only.
# NOTE(review): rebinds `sim_oracle`, replacing whatever an earlier cell loaded.
sim_oracle = retrive_request_outputs('/mnt/ssd1/alm-os/sglang_multi_model/workload_prefix/4r_sim_80_0.2_2700_9_baseline_cp_1024/mistralai-Mistral-7B-v0.1_80_0.2_2700_9_DataParallelRuntimeSelectionPolicy.CUSTOM-CustomPolicyType.ORACLE:10_inf.json')
lat_tpot_ttft(sim_oracle)
lat_tpot_ttft(sim_oracle, is_cold)
lat_tpot_ttft(sim_oracle, WorkloadPrefixDataLoader.is_hot)

Num finished: 2700
Latency: p50=46.66, p90=81.98, p99=91.89
TPOT: p50=0.22, p90=0.23, p99=0.25
TTFT: p50=33.76, p90=68.73, p99=78.50
Num finished: 540
Latency: p50=47.77, p90=82.60, p99=92.50
TPOT: p50=0.21, p90=0.23, p99=0.24
TTFT: p50=34.72, p90=69.67, p99=79.16
Num finished: 2160
Latency: p50=46.22, p90=81.83, p99=91.69
TPOT: p50=0.22, p90=0.23, p99=0.25
TTFT: p50=33.43, p90=68.44, p99=78.34
