In [1128]:
import numpy as np
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt

In [1129]:
# Experiment selection shared by the cells below.
mode = "remote"          # storage-mode sub-directory that the result globs descend into
dataset = "librispeech"  # which dataset's runs to analyse
# Samples per batch for the selected dataset; numerator of the throughput metric.
batch_size = dict(librispeech=512, imagenet=512)[dataset]

In [1130]:
features = ["sim_compute_time", "threadpool_size", "block_size", 'io_time', 'processing_time', 'load_time', 'compute_time', 'completion_time', "memory", "throughput", 'num_batches']


def plot_rlt(dirs, samples_per_batch=None):
    """Summarise each experiment directory as one row of metrics.

    Despite its name this function does not plot anything: it reads the
    ``.npy`` traces stored in every directory and returns the aggregated
    numbers, one row per directory, in the same order as ``features``.

    Parameters
    ----------
    dirs : iterable of str
        Result directories. The last path component is parsed as
        ``<key>=<sim_compute_time>[-<key>=<block_size>...]``.
    samples_per_batch : int, optional
        Numerator of the throughput metric. Defaults to the notebook-level
        ``batch_size`` so existing ``plot_rlt(dirs)`` calls keep working.

    Returns
    -------
    list of list
        ``[sim_compute_time, threadpool_size, block_size, io_time,
        processing_time, load_time, compute_time, completion_time, memory,
        throughput, num_batches]`` for every directory.

    Raises
    ------
    IndexError, ValueError
        If the directory name does not carry a parsable ``sim_compute_time``
        or ``opt_config`` is malformed.
    FileNotFoundError
        If one of the expected ``.npy`` files is missing.
    """
    if samples_per_batch is None:
        samples_per_batch = batch_size  # notebook-level configuration

    data = []
    for run_dir in dirs:
        # basename/normpath tolerate trailing slashes and non-POSIX separators.
        name_parts = os.path.basename(os.path.normpath(run_dir)).split('-')
        sim_compute_time = float(name_parts[0].split('=')[1])

        # Load once; the same array provides both the batch count and the mean.
        load_times = np.load(f"{run_dir}/data_load_time.npy")
        num_batches = len(load_times)
        load_time = np.mean(load_times)
        compute_time = np.mean(np.load(f"{run_dir}/compute_time.npy"))
        completion_time = load_time + compute_time
        throughput = samples_per_batch / completion_time

        # NOTE: allow_pickle=True unpickles arbitrary objects; only load
        # memory.npy files produced by trusted experiment runs.
        memory_usage = np.load(f"{run_dir}/memory.npy", allow_pickle=True)
        memory_totals = []
        for row in memory_usage:
            values = list(row.values())
            if len(values) < 2:
                # Samples with fewer than two entries are skipped.
                continue
            memory_totals.append(sum(int(v) for v in values))
        if memory_totals:
            # 90th percentile of the per-sample totals, scaled by 1 / 1024**2
            # (bytes -> MiB if the samples are in bytes -- TODO confirm units).
            memory = np.quantile(memory_totals, 0.9, axis=0) / (1024 ** 2)
        else:
            # No usable sample: report NaN instead of feeding an empty list
            # to np.quantile.
            memory = float('nan')

        try:
            block_size = int(name_parts[1].split('=')[1])
        except (IndexError, ValueError):
            # No (or non-integer) block-size segment in the directory name.
            block_size = 1

        threadpool_size = 1
        opt_config_path = f"{run_dir}/opt_config"
        if os.path.exists(opt_config_path):
            # First line is "<threadpool_size>,<block_size>" and overrides the
            # values derived from the directory name.
            with open(opt_config_path, 'r') as f:
                threadpool_size, block_size = (
                    int(field) for field in f.readline().strip('\n').split(',')
                )

        # Mean io_time is divided by the block size; the result is then
        # subtracted from the mean processing time.
        io_time = np.mean(np.load(f"{run_dir}/io_time.npy")) / block_size
        processing_time = np.mean(np.load(f"{run_dir}/processing_time.npy"))
        processing_time -= io_time

        data.append([sim_compute_time, threadpool_size, block_size, io_time, processing_time, load_time, compute_time, completion_time, memory, throughput, num_batches])
    return data

# Exp1
Evaluate the dataset merging algorithm with our job and dataset placement algorithm:
- Scenario: 1-job/1-worker
- Datasets: ImageNet, LibriSpeech
- Baselines:
    - No merging: load individual files
    - Fixed Block Size: enumerate and benchmark multiple block sizes until the early-stop condition is met (<= 5% improvement for 3 block sizes).
- Metrics:
    - Data loading time
    - Job completion time
    - Memory utilization

## Baseline 1: No merging

In [1131]:
# Baseline 1 (no merging): gather the rows of all three repeated runs, then
# average every metric per sim_compute_time.
# The frame is built once from the collected rows instead of being grown with
# pd.concat from an empty seed frame inside the loop.
baseline1_rows = []
for run in range(1, 4):
    run_dirs = glob.glob(f"./experiments/exp1/baseline1/{dataset}/run{run}/{mode}/sim*")
    baseline1_rows.extend(plot_rlt(run_dirs))

baseline1 = (
    pd.DataFrame(baseline1_rows, columns=features)
    .groupby('sim_compute_time')
    .mean()
    .sort_index()   # sim_compute_time is the index after the groupby
    .reset_index()
)
baseline1

Unnamed: 0,sim_compute_time,threadpool_size,block_size,io_time,processing_time,load_time,compute_time,completion_time,memory,throughput,num_batches
0,0.1,1.0,1.0,0.003198,0.004467,0.707698,0.175737,0.883435,13035.052083,579.556418,56.0
1,0.25,1.0,1.0,0.003195,0.004482,0.58286,0.324327,0.907187,14110.608073,564.383937,56.0
2,0.5,1.0,1.0,0.003187,0.004476,0.364627,0.573956,0.938583,12492.80599,545.519699,56.0
3,0.75,1.0,1.0,0.003141,0.004542,0.153752,0.837865,0.991617,15706.785286,516.340359,56.0
4,1.0,1.0,1.0,0.002451,0.004539,0.140482,1.072967,1.213449,17696.001432,421.942493,56.0


## Baseline 2: Fixed Block Size

In [1132]:
# Baseline 2 (fixed block size): one row per result directory, ordered by
# simulated compute time and then by block size.
dirs = glob.glob("./experiments/exp1/baseline2/sim*")
data = pd.DataFrame(plot_rlt(dirs), columns=features).sort_values(
    by=['sim_compute_time', 'block_size']
)

In [1133]:
# NOTE(review): `baseline2` is never filled -- the per-sim_compute_time
# selection below is commented out -- so this cell only rebuilds and displays
# the raw `data` table.
baseline2 = pd.DataFrame()
for i in range(1):
    dirs = glob.glob("./experiments/exp1/baseline2/sim*")
    data = pd.DataFrame(plot_rlt(dirs), columns=features).sort_values(
        by=['sim_compute_time', 'block_size']
    )
    # Disabled: keep, per sim_compute_time, the row with the lowest completion_time.
    # rlt = pd.DataFrame()
    # for name, group in data.groupby(by="sim_compute_time"):
    #     group.sort_values(by="completion_time", inplace=True)
    #     group = group.iloc[0].to_frame().T
    #     rlt = pd.concat([rlt, group])
    # baseline2 = pd.concat([baseline2, rlt], axis=0)

data

Unnamed: 0,sim_compute_time,threadpool_size,block_size,io_time,processing_time,load_time,compute_time,completion_time,memory,throughput,num_batches


## Ours

In [1134]:
# Disabled exploratory plot: for every imagenet*.csv under ours/{mode}/sim*, it
# would draw batch_completion_time against the row position, one line per
# sim* directory. This cell currently executes nothing.
# for f in glob.glob(f"./experiments/exp1/ours/{mode}/sim*/imagenet*.csv"):
#     df = pd.read_csv(f)
#     df['block_size'] = df['block_size'].astype(np.int32)
#     x = np.arange(df.shape[0])

#     plt.plot(x, df['batch_completion_time'], label=f.split('/')[-2])
#     # plt.xticks(x, df['block_size'], rotation=90)

# plt.legend()
# plt.xlabel("block_size(# samples)")
# plt.ylabel("batch completion time(s)")
# plt.grid()
# plt.show()

In [1135]:
# Ours: within every run keep, per sim_compute_time, the configuration with
# the lowest completion_time; then average those winners across the three runs.
# Rows are tagged with their run and collected into a single frame, which
# avoids growing DataFrames with pd.concat from empty seeds and sorting
# groupby sub-frames in place.
ours_rows = []
for run in range(1, 4):
    run_dirs = glob.glob(f"./experiments/exp1/ours/{dataset}/run{run}/{mode}/sim*")
    ours_rows.extend([run, *row] for row in plot_rlt(run_dirs))

ours_all_runs = pd.DataFrame(ours_rows, columns=['run', *features])

# After sorting, the first row of every (run, sim_compute_time) pair is the
# fastest configuration; ties are broken by the smaller block_size.
ours_best = (
    ours_all_runs
    .sort_values(by=['completion_time', 'block_size'])
    .drop_duplicates(subset=['run', 'sim_compute_time'], keep='first')
)

ours = (
    ours_best
    .drop(columns='run')
    .groupby('sim_compute_time')
    .mean()
    .sort_index()   # sim_compute_time is the index after the groupby
    .reset_index()
)
ours

Unnamed: 0,sim_compute_time,threadpool_size,block_size,io_time,processing_time,load_time,compute_time,completion_time,memory,throughput,num_batches
0,0.1,2.333333,160.0,0.001356,0.005845,0.600211,0.211097,0.811309,15287.46875,631.197156,56.0
1,0.25,1.333333,173.333333,0.001366,0.006276,0.495142,0.364163,0.859304,17154.53125,596.15197,56.0
2,0.5,1.333333,60.0,0.001165,0.00679,0.279895,0.619317,0.899212,16746.341146,569.466436,56.0
3,0.75,1.0,100.333333,0.001488,0.005862,0.154941,0.85235,1.007291,16309.809896,508.320178,56.0
4,1.0,1.0,20.666667,0.001956,0.00476,0.143283,1.079808,1.22309,18447.119531,418.627818,56.0


In [1136]:
# Relative throughput gain of ours over baseline 1, in percent.
# Align the two tables on sim_compute_time instead of on row position, so a
# setting missing from either table shows up as NaN rather than being
# compared against a different setting.
ours_throughput = ours.set_index('sim_compute_time')['throughput']
baseline1_throughput = baseline1.set_index('sim_compute_time')['throughput']
100 * (ours_throughput - baseline1_throughput) / baseline1_throughput

0    8.910390
1    5.628798
2    4.389711
3   -1.553274
4   -0.785575
Name: throughput, dtype: float64