In [134]:
import numpy as np
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt

In [135]:
# Notebook-wide settings.
# `mode` and `dataset` are only referenced by the commented-out glob patterns
# in the cells below; `batch_size` is read by plot_rlt() to compute throughput.
mode = "local"
dataset = "ucf"
batch_size = 64

In [136]:
# Column names for the rows produced by plot_rlt(); the order here must match
# the order in which plot_rlt() appends values to each row.
features = [
    'sim_compute_time',
    'threadpool_size',
    'block_size',
    'io_time',
    'processing_time',
    'load_time',
    'compute_time',
    'completion_time',
    'memory',
    'throughput',
    'num_batches',
]
def plot_rlt(dirs, samples_per_batch=None):
    """Summarise experiment run directories into one metric row per directory.

    The last path component of every directory is parsed as
    ``<key>=<float>[-<key>=<int>...]``: the value of the first field becomes
    ``sim_compute_time`` and, when present and an integer, the value of the
    second field becomes ``block_size``.

    Files read from each directory:
      * ``data_load_time.npy``, ``compute_time.npy``, ``io_time.npy`` and
        ``processing_time.npy`` -- numeric arrays that are averaged.
      * ``memory.npy`` -- pickled array of mappings; the values of every
        mapping with at least two entries are summed.
      * ``opt_config`` (optional) -- first line ``<threadpool_size>,<block_size>``;
        when the file exists it overrides both values.

    Args:
        dirs: Iterable of run-directory paths.
        samples_per_batch: Number of samples per batch used for the throughput
            figure. ``None`` (the default) falls back to the module-level
            ``batch_size``.

    Returns:
        A list with one row (list) per directory, in the same order as the
        module-level ``features`` list: sim_compute_time, threadpool_size,
        block_size, io_time, processing_time, load_time, compute_time,
        completion_time, memory, throughput, num_batches.

    Raises:
        FileNotFoundError: If one of the required ``.npy`` files is missing.
        IndexError, ValueError: If the directory name or ``opt_config`` does
            not have the expected layout.
    """
    # Only touch the notebook-level global when the caller gave no explicit value.
    per_batch = batch_size if samples_per_batch is None else samples_per_batch

    data = []
    for run_dir in dirs:
        # normpath drops a trailing separator, and basename honours the
        # platform's separator, so the name survives "path/" and non-POSIX paths.
        name_fields = os.path.basename(os.path.normpath(run_dir)).split('-')
        sim_compute_time = float(name_fields[0].split('=')[1])

        # Load once; the array gives both the batch count and the mean.
        load_times = np.load(f"{run_dir}/data_load_time.npy")
        num_batches = len(load_times)
        load_time = np.mean(load_times)
        compute_time = np.mean(np.load(f"{run_dir}/compute_time.npy"))
        completion_time = load_time + compute_time
        throughput = per_batch / completion_time

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only use
        # this on memory.npy files produced by trusted experiment runs.
        memory_usage = np.load(f"{run_dir}/memory.npy", allow_pickle=True)
        # Rows with fewer than two entries are skipped.
        memory_totals = [
            sum(int(value) for value in row.values())
            for row in memory_usage
            if len(row.values()) >= 2
        ]
        if memory_totals:
            # 90th percentile, scaled by 1024**2 (presumably bytes -> MiB; confirm).
            memory = np.quantile(memory_totals, 0.9, axis=0) / (1024 ** 2)
        else:
            # No usable sample: report NaN instead of feeding an empty list to np.quantile.
            memory = float('nan')

        # The block size is optional in the directory name; default to 1 when
        # the second field is absent or not an integer.
        try:
            block_size = int(name_fields[1].split('=')[1])
        except (IndexError, ValueError):
            block_size = 1

        threadpool_size = 1
        if os.path.exists(f"{run_dir}/opt_config"):
            with open(f"{run_dir}/opt_config", 'r') as f:
                threadpool_field, block_field = f.readline().strip('\n').split(',')
            # The tuned configuration takes precedence over the directory name.
            threadpool_size, block_size = int(threadpool_field), int(block_field)

        # Mean I/O time is divided by the block size, then removed from the
        # mean processing time so the two columns do not overlap.
        io_time = np.mean(np.load(f"{run_dir}/io_time.npy")) / block_size
        processing_time = np.mean(np.load(f"{run_dir}/processing_time.npy"))
        processing_time -= io_time

        data.append([sim_compute_time, threadpool_size, block_size, io_time, processing_time, load_time, compute_time, completion_time, memory, throughput, num_batches])
    return data

# Exp1
Evaluate the dataset merging algorithm with our job and dataset placement algorithm:
- Scenario: 1-job/1-worker
- Datasets: ImageNet, LibriSpeech
- Baselines:
    - No merging: load individual files
    - Fixed Size Block: enumerate and benchmark multiple block sizes until the early-stop condition is met (<= 5% improvement for 3 block sizes).
- Metrics:
    - Data loading time
    - Job completion time
    - Memory utilization

## Baseline 1: No merging

In [137]:
# Collect one frame per pass and concatenate once instead of growing a frame
# inside the loop. The glob pattern does not depend on `i`, so every pass
# reads the same run directories.
baseline1_frames = []
for i in range(2):
    # dirs = glob.glob(f"./experiments/exp1/baseline1/{dataset}/{mode}/sim*")
    run_dirs = glob.glob(f"./experiments/exp1/baseline1/sim*")
    baseline1_frames.append(pd.DataFrame(plot_rlt(run_dirs), columns=features))

# Average every metric per simulated compute time and restore a flat index.
baseline1 = (
    pd.concat(baseline1_frames, axis=0)
    .groupby('sim_compute_time')
    .mean()
    .sort_values(by='sim_compute_time')
    .reset_index()
)
baseline1

Unnamed: 0,sim_compute_time,threadpool_size,block_size,io_time,processing_time,load_time,compute_time,completion_time,memory,throughput,num_batches
0,0.25,1.0,1.0,0.002491,0.300646,2.523287,0.278032,2.801319,5985.320312,22.846382,50.0
1,0.5,1.0,1.0,0.000839,0.309906,2.444054,0.531464,2.975517,4738.578125,21.508867,50.0
2,0.75,1.0,1.0,0.000785,0.307792,2.042232,0.780861,2.823093,4704.085938,22.670167,50.0


## Baseline 2: Fixed Block Size

In [138]:
# Fixed-block-size runs stored under the "local" sub-directory, ordered by
# simulated compute time and then block size.
dirs = glob.glob(f"./experiments/exp1/baseline2/local/sim*")
data = pd.DataFrame(plot_rlt(dirs), columns=features).sort_values(
    by=['sim_compute_time', 'block_size']
)

In [139]:
# NOTE(review): the per-group selection below is commented out, so `baseline2`
# is never filled and stays an empty frame; the cell only displays `data`.
baseline2 = pd.DataFrame()
for i in range(1):
    # dirs = glob.glob(f"./experiments/exp1/baseline2/{dataset}/{mode}/sim*")
    dirs = glob.glob(f"./experiments/exp1/baseline2/sim*")
    data = pd.DataFrame(plot_rlt(dirs), columns=features).sort_values(
        by=['sim_compute_time', 'block_size']
    )
    # rlt = pd.DataFrame()
    # for name, group in data.groupby(by="sim_compute_time"):
    #     group.sort_values(by="completion_time", inplace=True)
    #     group = group.iloc[0].to_frame().T
    #     rlt = pd.concat([rlt, group])
    # baseline2 = pd.concat([baseline2, rlt], axis=0)

data

Unnamed: 0,sim_compute_time,threadpool_size,block_size,io_time,processing_time,load_time,compute_time,completion_time,memory,throughput,num_batches
0,0.5,1,100,0.002666,0.308846,2.470931,0.52913,3.00006,8378.591406,21.332904,50


## Ours

In [140]:
# for f in glob.glob(f"./experiments/exp1/ours/{mode}/sim*/imagenet*.csv"):
#     df = pd.read_csv(f)
#     df['block_size'] = df['block_size'].astype(np.int32)
#     x = np.arange(df.shape[0])

#     plt.plot(x, df['batch_completion_time'], label=f.split('/')[-2])
#     # plt.xticks(x, df['block_size'], rotation=90)

# plt.legend()
# plt.xlabel("block_size(# samples)")
# plt.ylabel("batch completion time(s)")
# plt.grid()
# plt.show()

In [141]:
ours = pd.DataFrame()
for i in range(3):
    # dirs = glob.glob(f"./experiments/exp1/ours/{dataset}/{mode}/sim*")
    dirs = glob.glob(f"./experiments/exp1/ours/sim*")
    data = pd.DataFrame(plot_rlt(dirs), columns=features).sort_values(
        by=['sim_compute_time', 'block_size']
    )

    # For every simulated compute time keep only the run with the lowest
    # completion time, as a one-row frame.
    fastest_rows = [
        group.sort_values(by="completion_time").iloc[0].to_frame().T
        for _key, group in data.groupby(by="sim_compute_time")
    ]
    # The empty seed frame keeps the result an (empty) DataFrame when no run
    # directory matched.
    rlt = pd.concat([pd.DataFrame(), *fastest_rows])
    ours = pd.concat([ours, rlt], axis=0)

# Average the selected rows per simulated compute time and restore a flat index.
ours = (
    ours.groupby('sim_compute_time')
    .mean()
    .sort_values(by='sim_compute_time')
    .reset_index()
)
ours

Unnamed: 0,sim_compute_time,threadpool_size,block_size,io_time,processing_time,load_time,compute_time,completion_time,memory,throughput,num_batches
0,0.25,1.0,20.0,0.001695,0.31008,2.712361,0.27964,2.992001,5901.867188,21.390369,50.0
1,0.5,1.0,100.0,0.002938,0.307529,2.376442,0.530405,2.906847,8154.984375,22.016981,50.0
2,0.75,1.0,20.0,0.001658,0.308603,2.179043,0.781038,2.960082,6631.863281,21.621025,50.0


In [142]:
# Relative throughput change of `ours` versus `baseline1`, in percent.
# NOTE(review): the two Series are aligned on their index labels, not on
# sim_compute_time -- confirm both frames list the same compute times in the
# same order.
throughput_delta = ours['throughput'] - baseline1['throughput']
100 * throughput_delta / baseline1['throughput']

0   -6.373060
1    2.362349
2   -4.627852
Name: throughput, dtype: float64