In [122]:
# System
import os
from collections import defaultdict
import csv
import re
import progressbar

# Data Analysis
import pandas
import tabulate
import seaborn
import numpy

In [17]:
DATA_DIR = "./data/"
# Print one ready-to-paste `DATA_SOURCE = "..."` line per sub-directory of
# DATA_DIR. os.listdir() returns entries in arbitrary order, so the listing is
# sorted to make this cell's output reproducible across runs and machines.
for experiment_dir in sorted(os.listdir(DATA_DIR)):
    full_path = os.path.join(DATA_DIR, experiment_dir)
    if os.path.isdir(full_path):
        print(f'DATA_SOURCE = "{full_path}"')

# Experiment directory that the rest of the notebook loads its CSV files from.
DATA_SOURCE = "./data/experiments-09-18"


In [9]:
# Directory whose CSV files are loaded by to_pandas().
# NOTE(review): same value as the assignment at the end of the directory-listing
# cell; presumably repeated so this cell can be run on its own — confirm and
# keep only one of the two.
DATA_SOURCE = "./data/experiments-09-18"

## Creating DataFrame

In [297]:
# NOTE(review): ERR is not referenced by the helpers visible in this part of the
# notebook (to_value() returns the string 'ERR' instead) — confirm it is used elsewhere.
ERR = -1
# Read by to_value() to encode a timed-out measurement; rebound later by the
# loading cell and by to_pandas().
TIMEOUT = -1

def to_operation(src):
    """Translate a CSV column header into a canonical operation name.

    Only the text after the last '-' in ``src`` is inspected. Substring rules
    are tried first, in a fixed order, followed by exact-match rules.

    Returns one of 'runtime', 'intersection', 'union', 'transform', 'parsing',
    'result', 'inclusion', 'emptiness' or 'complement'.

    For an unrecognised header a diagnostic line is printed and the trailing
    ``assert False`` fails.
    """
    suffix = src.split('-')[-1]

    # (needles, operation): matches when any needle occurs inside the suffix.
    # Order matters - the first matching rule wins.
    substring_rules = (
        (('runtime',), 'runtime'),
        (('inter',), 'intersection'),
        (('union',), 'union'),
        (('construction', 'conversion'), 'transform'),
        (('parsing',), 'parsing'),
        (('result',), 'result'),
    )
    for needles, operation in substring_rules:
        if any(needle in suffix for needle in needles):
            return operation

    # These spellings must match the whole suffix, not just a part of it.
    exact_rules = {
        'inclusion_check': 'inclusion',
        'inclusion': 'inclusion',
        'emptiness_check': 'emptiness',
        'emptiness': 'emptiness',
        'compl': 'complement',
        'complementation': 'complement',
        'complement': 'complement',
    }
    if suffix in exact_rules:
        return exact_rules[suffix]

    print(f"{src} unhandled")
    assert False

def to_tool_and_lang(tool):
    """Map a CSV column header to a ``(tool, language)`` pair.

    Headers containing 'mata-bin' yield ``(None, None)``; to_pandas() skips
    columns whose tool is falsy. Every other header is matched by substring
    against an ordered list of markers, and the first hit wins.

    For an unrecognised header a diagnostic line is printed and the trailing
    ``assert False`` fails.
    """
    if 'mata-bin' in tool:
        return None, None

    # Order matters: the more specific markers ('java-automata',
    # 'pyautomata-lib', 'pymata') must be tried before the generic
    # 'automata' and 'mata' markers that they contain.
    markers = (
        ('awali', ('awali', 'c++')),
        ('mona', ('mona', 'c++')),
        ('vata', ('vata', 'c++')),
        ('java-brics', ('brics', 'java')),
        ('java-automata', ('(j)alib', 'java')),
        ('pyfado', ('fado', 'python')),
        ('pyautomata-lib', ('(py)alib', 'python')),
        ('pymata', ('(py)mata', 'python')),
        ('automata', ('automata', 'c#')),
        ('mata', ('mata', 'c++')),
    )
    for marker, name_and_lang in markers:
        if marker in tool:
            return name_and_lang

    print(f"{tool} unhandled")
    assert False

def to_bench(bench):
    """Map a benchmark input path to a short benchmark-family label.

    The path is matched by substring against an ordered list of markers and
    the label of the first hit is returned.

    For an unrecognised path a diagnostic line is printed and the trailing
    ``assert False`` fails.
    """
    # Order matters: 'cox/inter' has to be tried before the more general
    # 'intersect', which it may share a path with.
    families = (
        ('automata_inclusion', 'aut_inclusion'),
        ('cox/diff', 'bc_cox_diff'),
        ('cox/inter', 'bc_cox_inter'),
        ('intersect', 'bc_intersect'),
        ('email_filter', 'email_filter'),
        ('z3-noodler', 'z3_noodler'),
        ('presburger', 'presburger'),
    )
    for marker, label in families:
        if marker in bench:
            return label

    print(f"{bench} unhandled")
    assert False

def to_value(val):
    """Convert one raw CSV cell into the value stored in the 'time' column.

    Parameters
    ----------
    val : str
        Raw cell text; surrounding whitespace is ignored.

    Returns
    -------
    float
        When the text parses as a float.
    str
        'EMPTY' / 'NOT EMPTY' unchanged, 'true' / 'false' normalised to lower
        case, or 'ERR'.
    float
        ``numpy.nan`` for 'MISSING'.
    number
        For 'TIMEOUT' / 'TO': the module-level ``TIMEOUT`` (read at call time)
        if it is negative, otherwise its negation.

    Raises
    ------
    AssertionError
        If the text matches none of the cases above.
    """
    val = val.strip()
    try:
        return float(val)
    except ValueError:
        pass  # not numeric - fall through to the symbolic markers below
    if val in ['EMPTY', "NOT EMPTY"]:
        return val
    elif val in ('false', 'False'):
        return 'false'
    elif val in ('true', 'True'):
        return 'true'
    elif val == 'ERR':
        return 'ERR'
    elif val == 'MISSING':
        # numpy.NAN was removed in NumPy 2.0; numpy.nan works on every version.
        return numpy.nan
    elif val == 'TIMEOUT' or val == 'TO':
        # TIMEOUT may hold either sign (to_pandas() stores the positive number
        # parsed from a file name); normalise so the result is never positive.
        return TIMEOUT if TIMEOUT < 0 else -TIMEOUT
    # Raised explicitly (rather than `assert False`) so the check also fires
    # when Python runs with -O; the exception type is unchanged.
    raise AssertionError(f"{val} unhandled")

In [298]:
# Column names, in order, of the long-format DataFrame built by to_pandas().
HEADERS = ["bench", "input", "tool", "lang", "op", "time"]
# Value to_value() returns for timed-out runs until to_pandas() replaces it
# with the number parsed from a CSV file name.
TIMEOUT = -60
# Raw string so that `\d` reaches the regex engine verbatim; in a normal string
# literal it is an invalid escape sequence (SyntaxWarning on Python 3.12+).
TIMEOUT_REGEX = re.compile(r"timeout-(\d+)")
# NOTE(review): not used by the code visible in this part of the notebook.
processed = defaultdict(set)
def to_pandas(src_dir):
    """Load every ``*.csv`` file in ``src_dir`` into one long-format DataFrame.

    Each file is read as ';'-delimited CSV. The first row is the header; in
    every following row the first cell is the benchmark input path and each
    remaining cell is one measurement for the header of the same column. One
    output record ``[bench, input, tool, lang, op, value]`` is produced per
    measurement. Columns for which to_tool_and_lang() returns a falsy tool
    are skipped.

    Side effect: when a file name contains ``timeout-<digits>``, the
    module-level ``TIMEOUT`` is set to that integer before the file is
    parsed. A file without that marker keeps whatever ``TIMEOUT`` was in
    effect before it.

    Parameters
    ----------
    src_dir : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        Frame with the columns listed in ``HEADERS``.
    """
    global TIMEOUT
    data = []
    # os.listdir() returns entries in arbitrary order. Sorting makes both the
    # row order of the result and the TIMEOUT carry-over between files
    # reproducible.
    for csv_source in progressbar.progressbar(sorted(os.listdir(src_dir))):
        if not csv_source.endswith('.csv'):
            continue
        if timeout := TIMEOUT_REGEX.search(csv_source):
            TIMEOUT = int(timeout.group(1))
        with open(os.path.join(src_dir, csv_source), 'r', newline='') as csvfile:
            csv_reader = csv.reader(csvfile, delimiter=';')
            # An empty file has no header row: skip it explicitly instead of
            # wrapping the whole parse loop in `except StopIteration`.
            head = next(csv_reader, None)
            if head is None:
                continue
            for row in csv_reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to record.
                    continue
                inputs = row[0]
                bench = to_bench(inputs)
                for i, val in enumerate(row[1:], 1):
                    tool, lang = to_tool_and_lang(head[i])
                    if not tool:
                        continue
                    op = to_operation(head[i])
                    data.append([bench, inputs, tool, lang, op, to_value(val)])
    return pandas.DataFrame(data, columns=HEADERS)
# Load all CSV files found under DATA_SOURCE into a single long-format frame
# and print its text representation.
df = to_pandas(DATA_SOURCE)
print(df)

100% (28 of 28) |########################| Elapsed Time: 0:00:00 Time:  0:00:00


               bench                                              input  \
0      aut_inclusion  /home/experiments/nfa-bench/benchmarks/automat...   
1      aut_inclusion  /home/experiments/nfa-bench/benchmarks/automat...   
2      aut_inclusion  /home/experiments/nfa-bench/benchmarks/automat...   
3      aut_inclusion  /home/experiments/nfa-bench/benchmarks/automat...   
4      aut_inclusion  /home/experiments/nfa-bench/benchmarks/automat...   
...              ...                                                ...   
81699     z3_noodler  /home/experiments/nfa-bench/benchmarks/z3-nood...   
81700     z3_noodler  /home/experiments/nfa-bench/benchmarks/z3-nood...   
81701     z3_noodler  /home/experiments/nfa-bench/benchmarks/z3-nood...   
81702     z3_noodler  /home/experiments/nfa-bench/benchmarks/z3-nood...   
81703     z3_noodler  /home/experiments/nfa-bench/benchmarks/z3-nood...   

           tool lang            op      time  
0      automata   c#       runtime      0.98  
1    

In [316]:
def to_table(df, rows):
    """Print a table of per-tool "mean, median" times grouped by ``df[rows]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``rows``, 'tool' and 'time'.
    rows : str
        Name of the column whose distinct values become the table rows;
        the value 'result' is excluded.

    Only float/int entries of 'time' that are >= 0 contribute to the mean and
    median. A (row, tool) pair that is absent from ``df`` is shown as '-';
    a pair that is present but has no such entry is shown as a padded
    '-' for each statistic. The table is printed via tabulate; nothing is
    returned.
    """
    tools = ['mata', 'awali', 'mona', 'vata',  'automata', 'brics', '(j)alib', 'fado', '(py)alib', '(py)mata',]

    def is_number(value):
        # Strings such as 'ERR' or 'true' are not measurements.
        return isinstance(value, (float, int))

    def aggregation(series):
        # Negative numbers encode timeouts; NaN fails both comparisons below
        # and therefore lands in neither list.
        timeouts = [value for value in series if is_number(value) and value < 0]
        times = [value for value in series if is_number(value) and value >= 0]
        if times:
            mean_text = f"{round(numpy.mean(times), 2):0.2f}"
            median_text = f"{round(numpy.median(times), 2):0.2f}"
        else:
            # Padded placeholders keep the columns of the printed table aligned.
            mean_text = f"{'-':<4}"
            median_text = f"{'-':<5}"
        return mean_text, median_text, f"{len(timeouts):>5}"

    # One table row per distinct label, pre-filled with '-' for every tool.
    table = {}
    for label in set(df[rows]):
        if label != 'result':
            table[label] = [label] + ['-'] * len(tools)

    for (label, tool), group in df.groupby([rows, 'tool']):
        if label == 'result':
            continue
        mean_text, median_text, _timeout_count = aggregation(group['time'])
        # +1 because column 0 of every row holds the row label.
        table[label][tools.index(tool) + 1] = ", ".join([mean_text, median_text])

    rendered = tabulate.tabulate(sorted(table.values()), headers=[rows] + tools)
    print(rendered)
# Print one summary table with a row per operation, a blank separator line,
# then one with a row per benchmark family.
to_table(df, rows='op')
print()
to_table(df, rows='bench')

op            mata        awali       mona        vata        automata    brics       (j)alib     fado        (py)alib    (py)mata
------------  ----------  ----------  ----------  ----------  ----------  ----------  ----------  ----------  ----------  ----------
complement    0.08, 0.00  0.01, 0.00  -           0.07, 0.00  -           0.03, 0.01  0.04, 0.03  0.00, 0.00  0.00, 0.00  0.01, 0.00
emptiness     0.16, 0.00  0.39, 0.00  -           0.00, 0.00  -           0.00, 0.00  0.00, 0.00  0.01, 0.00  0.00, 0.00  0.24, 0.00
inclusion     0.08, 0.00  0.96, 0.01  -           0.90, 0.04  -           0.35, 0.01  0.07, 0.04  0.35, 0.00  0.00, 0.00  0.12, 0.00
intersection  0.50, 0.07  0.54, 0.13  0.22, 0.13  0.23, 0.06  -           1.07, 0.01  0.19, 0.08  3.48, 0.07  0.13, 0.00  0.61, 0.00
parsing       0.56, 0.08  0.48, 0.02  0.55, 0.08  0.55, 0.08  -           -           -           -           -           -
runtime       1.06, 0.09  1.08, 0.02  -   , -     1.06, 0.15  1.05, 0.23  2.66, 