# Dataloading Example

This notebook shows how to load `_resources.csv` and `_runtimes.csv` files from `/data`.

In [1]:
import pandas as pd
import itertools

## Read resource files

Resource files have the following structure:

- _datetime_: Timestamp when the resource snapshot was taken
- _epoch_: Current epoch of the trainer
- _memory\_used_: (CPU-)RAM used
- _memory\_free_: (CPU-)RAM available

<code>for i in range(number_of_gpus):</code>
- _gpu\_{i}\_memory\_used_: VRAM of GPU i used
- _gpu\_{i}\_memory\_free_: VRAM of GPU i available

In [2]:
def read_resources(file= 'ocp-metrics/s2ef/gemnet_t/1658534487_stage2_8gpus__resources.csv', num_gpus=8):
    """Load a header-less ``*_resources.csv`` snapshot log into a DataFrame.

    Parameters
    ----------
    file : str or path-like
        CSV file to read. The file is read with ``header=None``, i.e. the
        first line is treated as data.
    num_gpus : int, default 8
        Number of GPUs recorded in the file. Every GPU contributes two
        columns (``gpu_{i}_memory_used`` and ``gpu_{i}_memory_free``), so the
        file is expected to have ``4 + 2 * num_gpus`` columns. A mismatch is
        not detected here; pass the value that matches the run.

    Returns
    -------
    pandas.DataFrame
        Columns ``datetime``, ``epoch``, ``memory_used``, ``memory_free``
        followed by the per-GPU used/free pairs in GPU order.
    """
    base_headers = ["datetime", "epoch", "memory_used", "memory_free"]
    # One (used, free) pair per GPU, flattened in GPU order.
    gpu_headers = [
        f"gpu_{i}_memory_{kind}"
        for i in range(num_gpus)
        for kind in ("used", "free")
    ]

    return pd.read_csv(file, header=None, names=base_headers + gpu_headers)

## Read runtime files

Runtime files have the following structure:

- _rank_: Rank of device (=GPU)
- _epoch_: Current epoch of the trainer
- _epoch\_time_: Total time of epoch (in s)
- _dataloading\_time_: Time spent during dataloading (in s)
- _forward\_time_: Time spent during forward pass (in s)
- _backward\_time_: Time spent during backward pass (in s)

In [3]:
def read_runtimes(file = "ocp-metrics/s2ef/gemnet_t/1658138522_stage1_8gpus_runtimes.csv"):
    """Load a header-less ``*_runtimes.csv`` file into a DataFrame.

    The file is read with ``header=None`` and its columns are labelled, in
    order: ``rank``, ``epoch``, ``epoch_time``, ``dataloading_time``,
    ``forward_time``, ``backward_time``.

    Parameters
    ----------
    file : str or path-like
        CSV file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the six columns listed above.
    """
    column_names = [
        "rank",
        "epoch",
        "epoch_time",
        "dataloading_time",
        "forward_time",
        "backward_time",
    ]
    return pd.read_csv(file, header=None, names=column_names)

In [4]:
def stats_resources(file = 'ocp-metrics/s2ef/gemnet_t/1658534487_stage2_8gpus__resources.csv'):
    """Summarise used host and GPU memory over all snapshots of a resource file.

    Parameters
    ----------
    file : str or path-like
        Resource CSV, loaded through ``read_resources``.

    Returns
    -------
    pandas.DataFrame
        Rows ``mean`` and ``std``; columns ``gpu_memory`` (per-snapshot sum of
        all ``gpu_{i}_memory_used`` columns) and ``cpu_memory`` (the
        ``memory_used`` column).
    """
    snapshots = read_resources(file)

    # Discard the "free" readings and the non-memory bookkeeping columns.
    free_columns = [name for name in snapshots.columns if name.endswith('free')]
    used_only = snapshots.drop(columns=free_columns + ["datetime", "epoch"])

    # After the drop the first column is "memory_used" (host RAM); all
    # remaining columns are the per-GPU "used" readings.
    gpu_columns = used_only.columns[1:]
    usage = pd.DataFrame({
        "gpu_memory": used_only[gpu_columns].sum(axis=1),
        "cpu_memory": used_only["memory_used"],
    })
    return usage.apply(['mean', 'std'])
    


In [37]:

def stats_runtimes(file = "ocp-metrics/s2ef/gemnet_t/1658138522_stage1_8gpus_runtimes.csv"):
    """Average per-epoch runtime statistics of a runtimes file over all epochs.

    Parameters
    ----------
    file : str or path-like
        Runtime CSV, loaded through ``read_runtimes``.

    Returns
    -------
    pandas.Series
        Mean over epochs of:

        - ``epoch_time``: largest ``epoch_time`` among the rows of an epoch
        - ``std_per_gpu``: standard deviation of ``epoch_time`` within an epoch
        - ``dataloading_time_percentage``, ``forward_time_percentage``,
          ``backward_time_percentage``, ``rest_time_percentage``: each phase's
          epoch-mean time divided by the epoch-mean ``epoch_time`` (a fraction
          in the 0-1 sense, not multiplied by 100). ``rest_time`` is whatever
          part of ``epoch_time`` the three measured phases do not account for.
    """
    by_epoch = read_runtimes(file).groupby("epoch")

    slowest = by_epoch.max()[['epoch_time']]
    spread = by_epoch.std()[['epoch_time']].rename(columns = {"epoch_time":"std_per_gpu"})
    stats = pd.concat([slowest, spread], axis = 1)

    mean_times = by_epoch.mean()
    phase_times = {
        "dataloading_time": mean_times['dataloading_time'],
        "forward_time": mean_times['forward_time'],
        "backward_time": mean_times['backward_time'],
        # Remainder of the epoch not attributed to any measured phase.
        "rest_time": mean_times['epoch_time'] - mean_times['dataloading_time'] - mean_times['forward_time'] - mean_times['backward_time'],
    }
    shares = pd.DataFrame({
        phase + "_percentage": seconds / mean_times['epoch_time']
        for phase, seconds in phase_times.items()
    })

    return pd.concat([stats, shares], axis = 1).mean()


In [38]:
# Runtime summary for the default stage-1 runtimes file, averaged over all epochs.
stats_runtimes()

epoch_time                     5157.134857
std_per_gpu                       0.102046
dataloading_time_percentage       0.000473
forward_time_percentage           0.441708
backward_time_percentage          0.449657
rest_time_percentage              0.108162
dtype: float64

In [39]:
# Mean and standard deviation of used GPU / host memory for the default stage-2 resources file.
stats_resources()

Unnamed: 0,gpu_memory,cpu_memory
mean,293040.601449,47877000000.0
std,7005.278007,2493555000.0
