In [None]:
%load_ext autoreload
%autoreload 2
import glob

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib_inline
import numpy as np
import orjson
import pandas as pd
import utils

# Render inline figures as SVG and set the default figure size for the notebook.
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
plt.rcParams['figure.figsize'] = [14, 9]

# Opt in to pandas' 'future.no_silent_downcasting' behaviour.
pd.set_option('future.no_silent_downcasting', True)

In [None]:
# Load the 1gbps result files matching the active glob through
# utils.load_antler_df. Only the '*_codel_streams' pattern is enabled; the
# broader pattern is kept commented out as an alternative selection.
logs1g = utils.load_antler_df(
    # '../tests/results/1gbps/*/*_streams.json*',
    '../tests/results/1gbps/*/*_codel_streams.json*',
)

In [None]:
# Load the 10gbps result files matching the active glob through
# utils.load_antler_df. Exactly one pattern is enabled at a time; the
# commented-out patterns select other experiment directories / algorithms.
logs10g = utils.load_antler_df(
    # '../tests/results/10gbps/*-2/*_streams.json*',
    # '../tests/results/10gbps/*-3/*_streams.json*',
    # '../tests/results/10gbps/dead/*_streams.json*',
    # '../tests/results/10gbps/newpie1/*pie_streams.json*',
    # '../tests/results/10gbps/newpie3/*pie_streams.json*',
    '../tests/results/10gbps/4h-10g-*/*_codel_streams.json*',
    # '../tests/results/10gbps/4h-10g-*/*fifo*_streams.json*',
    # '../tests/results/10gbps/4h-10g-*/*cake_streams.json*',
)

In [None]:
# Canonical category order for the 'algorithm' column.
# The list is organised as consecutive pairs: each 'age-*' variant is
# immediately followed by its base algorithm. Later cells depend on this
# pairing (bar containers are paired as [0::2]/[1::2], the gain column divides
# each row by the next one, names are assigned two entries at a time and the
# LaTeX table walks the index in batches of two), and on the position of each
# entry to pick its 'tab20' colour. Keep pairs adjacent when editing.
algo_order = [
    'age-pfifofast',
    'pfifo',
    'age-cake',
    'cake',
    'age-prio-fqcodel',
    'fqcodel',
    'age-prio-codel',
    'codel',
    'age-prio-fqpie',
    'fqpie',
    'age-prio-pie',
    'pie',
]

In [None]:
# Impose the canonical algo_order on the 1gbps 'algorithm' categorical.
# ordered=True matches how the 10gbps frame and the procstat frames declare the
# same column, so both datasets sort and compare algorithms identically.
logs1g['algorithm'] = logs1g['algorithm'].cat.set_categories(algo_order, ordered=True)
# logs1g['date'] = (
#     logs1g['date']
#     .cat.rename_categories(
#         {
#             # '6h-pareto1.2-heavy2-fixed': 'threshold 86MiB',
#             '6h-pareto1.2-heavy3': 'seed=325 thr=6MiB (phfit+toolkit)',
#             '6h-pareto1.2-heavy4': 'seed=325 thr=13MiB (kneed)',
#             '6h-pareto1.2-heavy5': 'seed=326 thr=6MiB (phfit+toolkit)',
#         }
#     )
#     .cat.reorder_categories(
#         [
#             'seed=325 thr=6MiB (phfit+toolkit)',
#             'seed=325 thr=13MiB (kneed)',
#             'seed=326 thr=6MiB (phfit+toolkit)',
#         ]
#     )
# )

In [None]:
# Impose the canonical algo_order (as an ordered categorical) on the 10gbps
# 'algorithm' column; values not listed in algo_order become NaN.
logs10g['algorithm'] = logs10g['algorithm'].cat.set_categories(algo_order, ordered=True)
# logs10g['date'] = (
#     logs10g['date']
#     .cat.rename_categories(
#         {
#             # '2025-04-19-162850Z': '2h, medium load',
#             # '2025-04-22-071430Z': '1h, heavy load',
#             # '4h-10g-1': '4h',
#             '4h-10g-2': 'thr=19MiB (phfit+toolkit)',
#             '4h': 'thr=19MiB (phfit+toolkit)',
#         }
#     )
#     # .cat.reorder_categories(
#     #     [
#     #         '4h',
#     #     ]
#     # )
# )

In [None]:
import itertools

# Colour palette shared by the plots below: matplotlib's 'tab20' sequence.
# `color` currently uses the sequence as-is; the commented list is an
# alternative hand-picked ordering.
cs =  mpl.color_sequences['tab20']
color = cs # [cs[0], cs[2], cs[4], cs[6], cs[1], cs[5], cs[7], cs[3]]

def time_bar_plot(data: pd.DataFrame, **kwargs):
    """Bar-plot the mean of 'time' per group with 95% confidence error bars.

    Parameters
    ----------
    data
        A grouped frame (the notebook passes ``df.groupby([...])``) exposing a
        'time' column; it is aggregated into ``mean_time`` and ``sem_time``.
    **kwargs
        Forwarded to ``DataFrame.plot``. ``rot`` defaults to 0.

    Returns
    -------
    The matplotlib Axes returned by ``DataFrame.plot``.

    Notes
    -----
    Bar containers are taken in drawing order and paired (0, 1), (2, 3), ...
    The first container of each pair is annotated with its relative difference
    to the second one (``first / second - 1``, shown as a percentage); the
    second container gets empty labels. A trailing unpaired container and
    non-finite ratios (missing data, zero baseline) are left unlabelled.
    """
    stats = data.agg(mean_time=('time', 'mean'), sem_time=('time', 'sem'))
    stats['interval'] = stats['sem_time'] * 1.96  # 95% confidence interval
    ax = stats.unstack().plot(
        kind='bar',
        y='mean_time',
        yerr='interval',
        rot=kwargs.pop('rot', 0),
        ylabel='time (s)',
        color=color,
        **kwargs,
    )

    # ax.containers also holds the error-bar containers; only bars get labels.
    bars = [c for c in ax.containers if isinstance(c, mpl.container.BarContainer)]
    for idx, container in enumerate(bars):
        labels = ax.bar_label(container, label_type='edge', fontsize=8, padding=3)

        gains = None
        if idx % 2 == 0 and idx + 1 < len(bars):
            current = np.asarray(container.datavalues, dtype=float)
            baseline = np.asarray(bars[idx + 1].datavalues, dtype=float)
            # A zero or missing baseline yields inf/NaN, filtered out below.
            with np.errstate(divide='ignore', invalid='ignore'):
                gains = current / baseline - 1

        for j, label in enumerate(labels):
            # Test finiteness, not truthiness: a gain of exactly 0 must be shown.
            if gains is not None and np.isfinite(gains[j]):
                label.set_text(f"{gains[j]:.1%}")
            else:
                label.set_text('')
    return ax

def cumulative_traffic_plot(data: pd.DataFrame, **kwargs):
    """Line-plot the running total of bytes over increasing 'size'.

    Rows are grouped by 'size', the sizes in each group are summed, and the
    cumulative sum of those totals is plotted. Extra keyword arguments are
    forwarded to ``DataFrame.plot``; ``rot`` defaults to 0.
    """
    rotation = kwargs.pop('rot', 0)
    bytes_per_size = data.groupby(['size']).agg(total_data=('size', 'sum'))
    running_total = bytes_per_size.cumsum()
    running_total.plot(
        kind='line',
        y='total_data',
        ylabel='total data (bytes)',
        rot=rotation,
        **kwargs,
    )

In [None]:
# Two panels sharing the y axis; only the right one (10gbps) is populated
# while the 1gbps code below stays commented out.
fig, [ax0, ax1] = plt.subplots(1, 2, figsize=(15, 6), sharey=True, width_ratios=(2,1))

# df = logs1g.groupby(['date', 'algorithm'], observed=False).agg(total_data=('size', 'sum'), n_exps=('date', 'nunique'))
# df['total_data'] = df['total_data'] / (1_000_000_000 / 8 * 6 * 3600)
# df.unstack().plot(kind='bar', y='total_data', rot=0, ax=ax0, color=color)
# ax0.set_title('1gbps, 6 hours')
# ax0.axhline(1, linestyle='-')
# ax0.grid(axis='y')
# ax0.set_axisbelow(True)
# ax0.legend(loc='lower center', ncol=3)


# Total bytes transferred per (date, algorithm), divided by the number of
# bytes a 10 Gbit/s link can carry in 4 hours (bits/s / 8 * seconds).
df = logs10g.groupby(['date', 'algorithm'], observed=False).agg(total_data=('size', 'sum'), n_exps=('date', 'nunique'))
df['total_data'] = df['total_data'] / (10_000_000_000 / 8 * 4 * 3600)
# / (10_000_000_000 / 8 * 3600)
df.unstack().plot(kind='bar', y='total_data', rot=0, ax=ax1, color=color)
ax1.set_title('10gbps, 4 hours')
ax1.axhline(1, linestyle='-')  # reference line: goodput equal to link capacity
ax1.grid(axis='y')
ax1.set_axisbelow(True)
# Single legend call: a second call replaces the first, so only the final
# placement is kept.
ax1.legend(loc='lower center', ncol=3)

fig.suptitle('Goodput divided by the link capacity')
fig.tight_layout()

In [None]:
# Two panels; only the right one (10gbps) is drawn while the 1gbps code below
# stays commented out.
fig, [ax0, ax1] = plt.subplots(1, 2, figsize=(15, 6), width_ratios=(0.5,1))

# df = logs1g[(logs1g['start_at'] > 3600) & (logs1g['start_at'] + logs1g['time'] < 6*3600 - 60)]
# df = df[~df['algorithm'].str.contains('pie') | df['date'].str.contains('newpie')]
# time_bar_plot(df.groupby(['setup', 'algorithm'], observed=False), ax=ax0)
# ax0.set_title('1gbps, 6 hours')
# ax0.legend(loc='lower center', ncol=3)
# ax0.grid(axis='y')
# ax0.set_axisbelow(True)

# Keep flows that start after the first 1800 s and finish before the end of
# the 4 h run minus a small margin.
# NOTE(review): the margin here is 6 s, while the other 10gbps cells in this
# notebook use `4*3600 - 60` -- confirm whether `- 6` is intentional.
df = logs10g[(logs10g['start_at'] > 1800) & (logs10g['start_at'] + logs10g['time'] < 4*3600 - 6)]
# The CAKE variants are excluded from this figure.
df = df[~df['algorithm'].isin(['cake', 'age-cake'])]
time_bar_plot(df.groupby(['date', 'algorithm'], observed=False), ax=ax1)
ax1.set_title('10gbps, 4 hours')
ax1.legend(loc='lower center', ncol=3)
ax1.grid(axis='y')
ax1.set_axisbelow(True)

fig.suptitle('Mean completion time per connection')
fig.tight_layout()

In [None]:
# Per-algorithm mean completion time and standard error, exported as
# tab-separated text. `data` is read again by the LaTeX table cell below.
# df = logs1g[(logs1g['start_at'] > 3600) & (logs1g['start_at'] + logs1g['time'] < 6*3600 - 60)]
df = logs10g[(logs10g['start_at'] > 1800) & (logs10g['start_at'] + logs10g['time'] < 4*3600 - 60)]
df = df[~df['date'].str.contains('newpie1')]
data = df.groupby(['algorithm'], observed=False).agg(mean_time=('time', 'mean'), sem_time=('time', 'sem'))
# data['interval'] = data['sem_time'] * 1.96 # 95% confidence interval
# Colour of each algorithm: its position in algo_order indexes the 'tab20'
# colormap, written as a 0xRRGGBB string.
data['color'] = data.index.map(lambda x: mpl.colors.to_hex(mpl.colormaps['tab20'].colors[algo_order.index(x)]).replace('#', '0x'))
# Ratio of each row's mean to the NEXT row's mean, minus 1. With rows in
# algo_order this compares an 'age-*' variant with its base algorithm; the
# value on the base rows (and NaN on the last row) is not a meaningful gain.
data['gain'] = data['mean_time'] / data['mean_time'].shift(-1) - 1
print(data.to_csv(columns=['mean_time', 'sem_time', 'color', 'gain'], header=False, sep='\t'))

In [None]:
from itertools import chain

# Display names, one per (age variant, base algorithm) pair of algo_order.
algonames = ['FIFO', 'CAKE', 'FQ-CoDel', 'CoDel', 'FQ-PIE', 'PIE']
# Map every algo_order entry to its display name by repeating each name twice.
# `algonames` stays a plain list instead of being rebound to a one-shot
# iterator that would be left exhausted in the namespace.
algotoname = dict(zip(algo_order, chain.from_iterable((name, name) for name in algonames)))


In [None]:
from itertools import batched

# Print a LaTeX table comparing each two-level ('age-*') variant with its base
# algorithm. Times are multiplied by 1000 and the 95% CI is 1.96 * SEM.
print('\\hline')
# Every backslash is escaped: a bare '\m' is an invalid escape sequence and
# raises a SyntaxWarning on current Python versions.
print('\\multirow{2}{*}{Algorithm} & \\multicolumn{2}{c|}{Two-Level version} & \\multicolumn{2}{c|}{Base version} & \\multirow{2}{*}{Gain} \\\\')
# print('\\hline')
print(' & {Mean FCT} & {95\\% CI} &  {Mean FCT} & {95\\% CI} &   \\\\')
print('\\hline\n\\hline')
# data.index follows algo_order, so consecutive rows form (variant, base) pairs.
for x, y in batched(data.index, 2):
    print(
        f'{algotoname[x]} & '
        f'{data.loc[x, "mean_time"]*1000:.4g} & {data.loc[x, "sem_time"] * 1.96 *1000:.3g} &'
        f'{data.loc[y, "mean_time"]*1000:.4g} & {data.loc[y, "sem_time"] * 1.96 *1000:.3g} & '
        f'{data.loc[x, "gain"]*100:.2g} \\% \\\\'
    )
    print('\\hline')

In [None]:
# Mean completion time per flow size, one column per algorithm, exported as
# tab-separated text. `threshold` is only used by the commented-out plot below.
threshold = 6332751
# df = logs1g[(logs1g['start_at'] > 3600) & (logs1g['start_at'] + logs1g['time'] < 6*3600 - 60)]
df = logs10g[(logs10g['start_at'] > 1800) & (logs10g['start_at'] + logs10g['time'] < 4*3600 - 60)]
# df = df[~df['algorithm'].str.contains('pie') | df['date'].str.contains('newpie2')]
df = df[~df['date'].str.contains('newpie1')]


# Rows are indexed by 'size'; 'algorithm' is pivoted into the columns.
df = df.groupby(['algorithm', 'size'], observed=True).agg(mean_time=('time', 'mean')).unstack('algorithm')

print(df.to_csv(sep='\t', index=True))

# fig, ax = plt.subplots(figsize=(18, 8))
# df.plot(kind='line', y='mean_time', rot=0, ylabel='time (s)', color=color, ax=ax, linewidth=0.5)

# ax.axvline(x=threshold, linestyle='--', color='red', alpha=0.5)
# ax.set_xscale('log')
# ax.set_yscale('log')

# fig.suptitle('Completion time distribution')
# fig.tight_layout()

In [None]:
# thresholds = [90475270, 6332751, 13746699]
# df = logs1g[(logs1g['start_at'] > 3600) & (logs1g['start_at'] + logs1g['time'] < 6*3600 - 60)]

# fig, axs = plt.subplots(len(df['date'].cat.categories),1, figsize=(18, 16), sharex=True)
# for (date, dfg), ax, thx in zip(df.groupby('date', observed=True), axs, thresholds):
#     dfg = dfg.groupby(['algorithm', 'size'], observed=False).agg(mean_time=('time', 'mean'), sem_time=('time', 'sem')).unstack('algorithm')
#     dfg = dfg.rolling(3, center=True).mean()
#     dfg.plot(kind='line', y='mean_time', rot=0, ylabel='time (s)', color=color, ax=ax, linewidth=0.5)

#     ax.set_title(date)
#     ax.axvline(x=thx, linestyle='--', color='red', alpha=0.5)
#     ax.set_xscale('log')
#     ax.set_yscale('log')

# fig.suptitle('Completion time distribution - no extra delay')
# fig.tight_layout()

# thresholds = [19500715, 19500715]
# df = logs10g[(logs10g['start_at'] > 360) & (logs10g['start_at'] + logs10g['time'] < 4*3600 - 6)]

# fig, [axs] = plt.subplots(len(df['date'].cat.categories),1, figsize=(18, 8), sharex=True, squeeze=False)
# for (date, dfg), ax, thx in zip(df.groupby('date', observed=True), axs, thresholds):
#     dfg = dfg.groupby(['algorithm', 'size'], observed=False).agg(mean_time=('time', 'mean'), sem_time=('time', 'sem')).unstack('algorithm')
#     dfg = dfg.rolling(3, center=True).mean()
#     dfg.plot(kind='line', y='mean_time', rot=0, ylabel='time (s)', color=color, ax=ax, linewidth=0.5)

#     ax.set_title(date)
#     ax.axvline(x=thx, linestyle='--', color='red', alpha=0.5)
#     ax.set_xscale('log')
#     ax.set_yscale('log')

# Disabled together with the plotting code above: this cell no longer creates
# a figure, so these calls would retitle the `fig` left over from an earlier
# cell. Re-enable them when the plotting code above is restored.
# fig.suptitle('Completion time distribution')
# fig.tight_layout()

In [None]:
# Count flows per (date, algorithm) in each size bucket.
# df = logs1g[(logs1g['start_at'] > 3600) & (logs1g['start_at'] + logs1g['time'] < 6*3600 - 60)]
# `df` is the same object as logs10g, so the 'size_cat' column below is added
# to logs10g itself.
df = logs10g
bins = [0, 1_000_000, 10_000_000, 90475270, 800_000_000, 6_000_000_000, np.inf]
labels = ['<1M', '1M-10M', '10M-90M', '90M-800M', '800M-6G', '6G+']
df['size_cat'] = pd.cut(df['size'], bins=bins, labels=labels)

df.groupby(['date', 'algorithm', 'size_cat'], observed=True).size().unstack()

In [None]:
import re

# Captures (run directory, algorithm, node) from a procstat result path such as
# '<...>/<run>/closedloop_<algorithm>_procstat-<client|server>.json'.
testname_match_antler = re.compile(r'.*/([^/]+)/closedloop_([a-z\-]+)_procstat-(client|server)\.json')


def load_procstat(filename):
    """Load one procstat JSON file into a DataFrame.

    Parameters
    ----------
    filename
        Path to a file named like
        ``<run>/closedloop_<algorithm>_procstat-<client|server>.json`` holding
        records with a 'time' and a '/proc/stat' field.

    Returns
    -------
    DataFrame with the decoded records plus:
    ``time`` rebased so the earliest sample is zero, ``filename``, ``date``
    (the run directory name), ``algorithm`` and ``node`` as ordered
    categoricals, and ``cpu_user`` / ``cpu_system`` taken from whitespace
    fields 1 and 3 of the '/proc/stat' string.

    Raises
    ------
    ValueError
        If ``filename`` does not follow the expected naming scheme. The check
        runs before the file is read so a stray file fails fast and is named.
    """
    match = testname_match_antler.match(filename)
    if match is None:
        raise ValueError(f'unexpected procstat file name: {filename!r}')
    date, algo, node = match.groups()

    with open(filename, 'rb') as f:
        df = pd.DataFrame(orjson.loads(f.read()))

    # Express timestamps as offsets from the first sample of this file.
    df['time'] = pd.to_datetime(df['time'], utc=True)
    df['time'] = df['time'] - df['time'].min()
    df['filename'] = filename
    df['date'] = date
    df['algorithm'] = pd.Categorical([algo]*len(df), categories=algo_order, ordered=True)
    df['node'] = pd.Categorical([node]*len(df), categories=['client', 'server'], ordered=True)

    # presumably the aggregate 'cpu ...' line of /proc/stat, where field 1 is
    # user time and field 3 is system time -- verify against the collector.
    fields = df['/proc/stat'].str.split()
    df['cpu_user'] = fields.str[1].astype(int)
    df['cpu_system'] = fields.str[3].astype(int)
    return df
# Load every procstat file matching the glob and stack them into one frame.
# Note that pd.concat raises if the glob matches no file.
pstats = pd.concat(load_procstat(filename) for filename in glob.glob('../tests/results/6h-*-heavy7/*_procstat-*.json'))
pstats.info()

In [None]:
# fig, axs = plt.subplots(4,1, figsize=(15, 25), sharex=True, sharey=True)
# for ax, col, node in zip(axs, ['cpu_user', 'cpu_system', 'cpu_user', 'cpu_system'], ['client', 'client', 'server', 'server']):
#     for algo, data in pstats[pstats['node'] == node].groupby('algorithm', observed=False):
#         df = data.sort_values('time')
#         df[['cpu_system', 'cpu_user']] = df[['cpu_system', 'cpu_user']].diff().fillna(0)/100/20
#         df['cpu_total'] = df['cpu_user'] + df['cpu_system']
#         df.set_index('time', inplace=True)
#         # df = df[['cpu_system', 'cpu_user']]
#         # df = df[['cpu_system', 'cpu_user']].resample('5min').mean()
#         df.plot(y=col, ax=ax, label=algo)
#         ax.set_title(f'{node} {col}')

In [None]:
def tot_cpu(series):
    """Return the spread of a series: its maximum minus its minimum."""
    lowest = series.min()
    highest = series.max()
    return highest - lowest
    
# Counter growth (tot_cpu) of user/system CPU per (node, algorithm), divided
# by a run length in seconds and by 100.
# observed=False is spelled out, like in the other groupbys of this notebook:
# both keys are categoricals and relying on the default emits a FutureWarning.
# NOTE(review): the divisor assumes a 4 h run while the procstat files are
# loaded from '6h-*' directories, and /100 is presumably the tick rate --
# confirm both.
pstats.groupby(['node', 'algorithm'], observed=False).agg(cpu_user=('cpu_user', tot_cpu), cpu_system=('cpu_system', tot_cpu)).unstack('node') / (4*3600) / 100

In [None]:
# Mean completion time of all flows finished by time t, per algorithm, for 200
# evenly spaced values of t; the dashed lines mark the final value.
# df = logs[(logs['setup'] == setup_renames['long2']) & (logs['seed'] == '15867')]
# df = logs1g[logs1g['date'] == '6h-pareto1.2-heavy']
# `df` is the same object as logs10g, so 'ended_at' is added to logs10g itself.
df = logs10g#[logs10g['date'] == '4h-10g-2']
df['ended_at'] = df['start_at'] + df['time']
timepoints = np.linspace(0, df['ended_at'].max(), 200)
algos = df['algorithm'].cat.categories
partial_sems = np.zeros((0, len(algos)))

# Collect one row per time point and stack once at the end, instead of
# re-copying the growing array on every iteration.
partial_mean_rows = []
for t in timepoints:
    # c = df[(df['ended_at'] <= t) & (df['ended_at'] > 3600) & (df['ended_at'] < 6*3600 - 20)]
    c = df[df['ended_at'] <= t]
    c = c.groupby('algorithm', observed=False).agg(mean_time=('time', 'mean'), sem_time=('time', 'sem'))
    partial_mean_rows.append(c.loc[algos, 'mean_time'])
    # partial_sems = np.vstack([partial_sems, c.loc[algos, 'sem_time']])
# The empty leading block keeps the (n, len(algos)) float shape.
partial_means = np.vstack([np.zeros((0, len(algos))), *partial_mean_rows])

fig, ax = plt.subplots(sharey=True)
for i, a in enumerate(algos):
    ax.plot(timepoints, partial_means[:, i], label=a, color=color[i])
    # lower, upper = partial_means[:, i] - 1.96 * partial_sems[:, i], partial_means[:, i] + 1.96 * partial_sems[:, i]
    # ax.fill_between(timepoints, lower, upper, alpha=0.3, color=color[i])
    ax.axhline(partial_means[-1, i], linestyle='--', color=color[i], alpha=0.5)
ax.legend(loc='lower right')
ax.set_ylabel('mean completion time (s)')
ax.set_xlabel('elapsed time (s)')
ax.set_title('Expanding mean completion time per connection, for runs with the same seed')
fig.tight_layout()
# df.sort_values('start_at').groupby('algorithm', observed=True)['start_at', 'time'].expanding().mean().unstack().plot()

In [None]:
# Side-by-side view, one column per algorithm, of the flow sizes at positions
# 18400..18419 (ordered by start time) for actor 33 in the second 'date'
# category of the 1gbps logs.
df = logs1g[logs1g['date'] == logs1g['date'].cat.categories[1]]
df = df[df['actor'] == 33].sort_values('start_at')
g = df.groupby(['algorithm'], observed=True)['size'].apply(lambda x: list(x)[18400:18420])
pd.DataFrame(dict(g))

In [None]:
# Count flows per size bucket among the first 18000 flows (by start time) of
# every (date, algorithm, actor) in the second 'date' category.
# `df` is the same object as logs1g here, so 'size_cat' is added to logs1g.
df = logs1g
bins = [0, 1_000_000, 10_000_000, 55000000, 90475270, 500_000_000, 800_000_000, np.inf]
labels = ['<1M', '1M-10M', '10M-55M', "55M-86M", '86M-500M', '500M-800M', '>800M']
df['size_cat'] = pd.cut(df['size'], bins=bins, labels=labels)
df = logs1g[logs1g['date'] == logs1g['date'].cat.categories[1]]

# A stable sort followed by groupby.head selects the same rows as a per-group
# nsmallest, without groupby.apply: apply's handling of the grouping columns
# is deprecated, and the groupby below needs 'date' and 'algorithm' to remain
# regular columns.
df = (
    df.sort_values('start_at', kind='stable')
    .groupby(['date', 'algorithm', 'actor'], observed=True)
    .head(18000)
    .reset_index(drop=True)
)
df.groupby(['date', 'algorithm', 'size_cat'], observed=True).size().unstack()

In [None]:
# Print the number of distinct flow sizes, then draw a 30-bin histogram of the
# completion times with a logarithmic count axis.
# df = logs1g[logs1g['date'] == logs1g['date'].cat.categories[1]]
df = logs10g
print(len(df['size'].unique()))
df['time'].plot(kind='hist', bins=30, log=True)

In [None]:
# `df` is the same object as logs1g, so 'throughput' (size divided by
# completion time) is added to logs1g itself.
df = logs1g
df['throughput'] = df['size'] / df['time']

def fairness_jaine(x):
    """Return Jain's fairness index of ``x``: (sum x)^2 / (n * sum x^2)."""
    total = np.sum(x)
    sum_of_squares = np.sum(x ** 2)
    return total ** 2 / (len(x) * sum_of_squares)

# One fairness value per (date, algorithm), drawn as bars grouped by date with
# one bar per algorithm.
df = df.groupby(['date', 'algorithm'], observed=True).agg(fairness=('throughput', fairness_jaine))
df.unstack('algorithm').plot(kind='bar', y='fairness', rot=0, ylabel='fairness index', color=color)

In [None]:
# Lorenz curves of per-flow throughput, one panel per 'setup' and one line per
# algorithm. For each curve, 100 evenly spaced samples are printed as
# tab-separated text, followed by the point of maximum distance below the
# diagonal.
# `df` starts as the same object as logs1g, so 'throughput' is added to logs1g.
df = logs1g
df['throughput'] = df['size'] / df['time']
df = df[df['throughput'] > 0]

fig, axs = plt.subplots(1, 2, figsize=(15, 6), sharey=True, width_ratios=(1,1))
# The grouping key is 'setup' even though the loop variable is called `date`;
# zip stops after len(axs) groups.
for (date, dfd), ax in zip(df.groupby('setup', observed=True), axs):
    for (algorithm, dfg), c in zip(dfd.groupby('algorithm', observed=False), color):
        dfg = dfg.sort_values('throughput')
        # Cumulative share of total throughput, flows ordered by throughput.
        lor = dfg['throughput'].cumsum()
        lor /= lor.max()
        # Fraction of flows to the left of each position.
        x = np.arange(len(lor)) / len(lor)

        idx = np.linspace(0, len(lor) - 1, 100).astype(int)
        # NOTE: this rebinds the notebook-level name `data`, which the LaTeX
        # table cell earlier in the notebook reads.
        data = pd.DataFrame({'lorenz': lor.iloc[idx], 'x': x[idx]})

        print(data.to_csv(sep='\t', index=False))

        imax = np.argmax(x-lor)
        print(x[imax], lor.iloc[imax])

        ax.plot(x, lor, label=f'{algorithm}', color=c)
        ax.legend(loc='upper left')
    ax.set_title(date)


In [None]:
def lorenz_curve_gap(x):
    """Return the largest gap between the diagonal and the Lorenz curve of ``x``.

    ``x`` is sorted ascending and its cumulative sum is normalised by its
    maximum to form the curve; position ``i`` of ``n`` is compared against the
    abscissa ``i / n``.
    """
    ascending = x.sort_values()
    lorenz = ascending.cumsum()
    lorenz = lorenz / lorenz.max()
    n = len(lorenz)
    population_share = np.arange(n) / n
    return np.max(population_share - lorenz)

# for df in [logs1g, logs10g]:
#     df['throughput'] = df['size'] / df['time']
#     df = df[df['throughput'] > 0]

#     df = df.groupby(['date', 'algorithm'], observed=True).agg(lorenz_gap=('throughput', lorenz_curve_gap))
#     df.unstack('algorithm').plot(kind='bar', y='lorenz_gap', rot=0, ylabel='Lorenz curve gap', color=color)

In [None]:
# Lorenz-curve gap of per-flow throughput for each algorithm, exported as
# tab-separated text together with the algorithm's 'tab20' colour.
# df = logs1g[(logs1g['start_at'] > 3600) & (logs1g['start_at'] + logs1g['time'] < 6 * 3600 - 60)]
df = logs10g[(logs10g['start_at'] > 1800) & (logs10g['start_at'] + logs10g['time'] < 4*3600 - 60)]
df = df[~df['date'].str.contains('newpie1')]

# df = df[~df['algorithm'].str.contains('pie') | df['date'].str.contains('newpie2')]


# assign() builds a new frame; writing a column into the filtered slice
# directly triggers pandas' SettingWithCopyWarning.
df = df.assign(throughput=df['size'] / df['time'])
df = df[df['throughput'] > 0]
df = df.groupby('algorithm', observed=True).agg(lorenz_gap=('throughput', lorenz_curve_gap))
# Colour of each algorithm: its position in algo_order indexes 'tab20'.
df['color'] = df.index.map(lambda x: mpl.colors.to_hex(mpl.colormaps['tab20'].colors[algo_order.index(x)]).replace('#', '0x'))
print(df.to_csv(sep='\t', index=True, header=False))