In [21]:
from typing import Iterable, List, NamedTuple, Tuple
import pandas as pd
import os
from common import subdirs_of

In [22]:
def convert_percentage_to_float(s: str) -> float:
    return float(s.strip('%')) / 100


def read_experiment_data(path: str) -> pd.DataFrame:
    # the table header is not consistent, so we don't want pandas to detect and process the header (1st row of csv)
    # but let it use the hard-coded column names below.
    return pd.read_csv(
        path,
        index_col=False,
        header=None,
        skiprows=1,
        names=[
            '# relative_time', 'cycles_done', 'cur_item', 'corpus_count', 'pending_total', 'pending_favs', 'bit_cvg',
            'shw_cvg', 'saved_crashes', 'saved_hangs', 'max_depth', 'execs_per_sec', 'total_execs', 'edges_found'
        ],
        converters={
            'bit_cvg': convert_percentage_to_float,
            'shw_cvg': convert_percentage_to_float,
        }
    )


In [23]:
# for preview purpose only
read_experiment_data('/home/peter/archives/combined/libfuzzer/dagisel/aarch64/0/default/plot_data')

Unnamed: 0,# relative_time,cycles_done,cur_item,corpus_count,pending_total,pending_favs,bit_cvg,shw_cvg,saved_crashes,saved_hangs,max_depth,execs_per_sec,total_execs,edges_found
0,61,0,0,9,9,1,0.29233,0.00319,0,0,2,225.67,20791,19158
1,66,0,0,9,9,1,0.29233,0.00319,0,0,2,223.81,21919,19158
2,71,0,0,9,9,1,0.29233,0.00319,0,0,2,220.54,23039,19158
3,76,0,0,9,9,1,0.29233,0.00319,0,0,2,220.66,24167,19158
4,81,0,0,9,9,1,0.29233,0.00319,0,0,2,220.60,25287,19158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50725,259178,0,21928,23992,21872,324,0.79321,0.02574,0,0,27,67.21,34779453,51984
50726,259184,0,21928,23993,21873,324,0.79323,0.02574,0,0,27,79.58,34779798,51985
50727,259189,0,21928,23993,21873,324,0.79323,0.02574,0,0,27,76.73,34780168,51985
50728,259194,0,21928,23993,21873,324,0.79323,0.02574,0,0,27,69.27,34780512,51985


In [24]:
# for preview purpose only
read_experiment_data('/home/peter/archives/combined/aflplusplus/dagisel/ve/0/default/plot_data')

Unnamed: 0,# relative_time,cycles_done,cur_item,corpus_count,pending_total,pending_favs,bit_cvg,shw_cvg,saved_crashes,saved_hangs,max_depth,execs_per_sec,total_execs,edges_found
0,65,0,643,703,680,247,0.1707,0.0,7,19,4,5149.52,165453,11188
1,70,0,609,729,705,245,0.1717,0.0,12,19,4,5859.50,195079,11255
2,75,0,377,750,723,255,0.1731,0.0,12,19,4,4322.63,217609,11347
3,80,0,377,768,741,255,0.1745,0.0,12,19,4,1648.54,234407,11438
4,85,0,719,790,758,263,0.1757,0.0,13,19,5,5245.60,264431,11514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50743,259182,17,6403,12376,167,2,0.4087,0.0,1614,177,61,5568.61,1132816346,26784
50744,259187,17,7964,12376,167,2,0.4087,0.0,1614,177,61,5592.57,1132844290,26784
50745,259192,17,11735,12376,167,2,0.4087,0.0,1614,177,61,5099.69,1132870860,26784
50746,259197,17,9455,12376,167,2,0.4087,0.0,1614,177,61,5438.35,1132894801,26784


In [25]:
class Experiment(NamedTuple):
    fuzzer: str
    isel: str
    arch: str
    replicate_id: int
    data: pd.DataFrame

def iterate_over_all_experiments(dir: str, allow_missing_data: bool = False) -> Iterable[Experiment]:
    for fuzzer_dir in subdirs_of(dir):
        fuzzer = fuzzer_dir.name
        for isel_dir in subdirs_of(fuzzer_dir.path):
            isel = isel_dir.name
            for arch_dir in subdirs_of(isel_dir.path):
                arch = arch_dir.name
                for replicate_dir in subdirs_of(arch_dir.path):
                    replicate_id = int(replicate_dir.name)
                    plot_data_path = os.path.join(replicate_dir.path, 'default', 'plot_data')
                    try:
                        yield Experiment(fuzzer, isel, arch, replicate_id, read_experiment_data(plot_data_path))
                    except FileNotFoundError:
                        if not allow_missing_data:
                            raise


In [26]:
def combine_last_row_of_each_experiment_data(experiments: Iterable[Experiment], columns: List[str]) -> pd.DataFrame:
    return pd.DataFrame(
        columns=['fuzzer', 'isel', 'arch', 'replicate', *columns],
        data=(
            [
                exp.fuzzer, 
                exp.isel, 
                exp.arch, 
                exp.replicate_id, 
                *exp.data.tail(1)[columns].values.flatten().tolist()
            ]
            for exp in experiments
        )
    )
    # df = df[df[prop] == value]

    # if df.shape[0] == 0:
    #     raise Exception(f"Experiment {replicate_dir.path} does not have a record where {prop} = {value}")

In [28]:
df = combine_last_row_of_each_experiment_data(
    iterate_over_all_experiments(
        '/home/peter/archives/combined',
        allow_missing_data=True
    ),
    columns=['# relative_time', 'total_execs', 'bit_cvg', 'shw_cvg']
)

df

Unnamed: 0,fuzzer,isel,arch,replicate,# relative_time,total_execs,bit_cvg,shw_cvg
0,libfuzzer,dagisel,aarch64,0,259196.0,3.478068e+07,0.79323,0.02574
1,libfuzzer,dagisel,aarch64,1,259196.0,4.188732e+07,0.80125,0.02539
2,libfuzzer,dagisel,aarch64,2,259196.0,3.677318e+07,0.79765,0.02545
3,libfuzzer,dagisel,aarch64_32,0,259196.0,3.486497e+07,0.78960,0.02536
4,libfuzzer,dagisel,aarch64_32,1,259196.0,4.159030e+07,0.79477,0.02534
...,...,...,...,...,...,...,...,...
233,aflplusplus,dagisel,ve,4,259198.0,1.304326e+09,0.50230,0.00000
234,aflplusplus,dagisel,ve,1,259198.0,1.246483e+09,0.42570,0.00000
235,aflplusplus,dagisel,ve,3,259198.0,1.729388e+09,0.50070,0.00000
236,aflplusplus,dagisel,ve,0,259198.0,1.132901e+09,0.40870,0.00000


In [29]:
df.to_csv('last_row_of_each_experiment.csv', index=False)