In [1]:
import itertools
import os
import random
import json
from json import JSONEncoder

from frozendict import frozendict

def _default(self, obj):
    return getattr(obj.__class__, "to_json", _default.default)(obj)

_default.default = JSONEncoder().default
JSONEncoder.default = _default

from pathos import multiprocessing
import logging
import copy
from pathlib import Path
from typing import Tuple
import re
from z3.z3 import Solver, And, Or, Not, Bool, Int, sat

baselines: Path = Path("/home/austin/git/Sugarlyzer-Results/clang/varbugs/baseline.json")
experimental_results: Path = Path("/home/austin/git/Sugarlyzer-Results/clang/varbugs/desugared.json")

with open(baselines) as f:
    baselines = json.load(f)

with open(experimental_results) as f:
    experimental_results = json.load(f)

lonely_baselines = copy.deepcopy(baselines)
lonely_experimental_results = copy.deepcopy(experimental_results)

class IntRange:
    def __init__(self, lower_bound_inclusive, upper_bound_exclusive):
        self.lower_bound_inclusive = lower_bound_inclusive
        self.upper_bound_exclusive = upper_bound_exclusive

    def __contains__(self, item):
        return isinstance(item, int) and (self.lower_bound_inclusive <= item < self.upper_bound_exclusive)

    def __repr__(self):
        return f"IntRange({self.lower_bound_inclusive, self.upper_bound_exclusive})"

    def __str__(self):
        return f"[{self.lower_bound_inclusive}:{self.upper_bound_exclusive})"

    def to_json(self):
        return str(self)


for e in experimental_results:
    toks = e['original_line'].split(':')
    try:
        e['original_line'] = IntRange(int(toks[0]), int(toks[1]) + 1)
    except Exception as ex:
        e['original_line'] = []
    #print('\t'.join(["experimental", *[str(s) for s in e.values()]]).replace("\n", ""))

    if e['function_line_range'] == 'ERROR':
        e['function_line_range'] = []
    else:
        toks = e['function_line_range'].split(':')
        try:
            e['function_line_range'] = IntRange(int(toks[1]), int(toks[2]) + 1)
        except Exception as ex:
            logging.exception(f"e was {e}")
    e['presence_condition'] = str(e['presence_condition'])

print(f"We have {len(baselines)} baseline results.")
print(f"We have {len(experimental_results)} experimental results.")

We have 78 baseline results.
We have 108 experimental results.


In [2]:
def match_stats(baseline_result: dict, experimental_result: dict) -> Tuple:
    """
    Returns a vector of different match information.
    (a, b, c)
    a = True iff baseline and experimental have the same line number, message, and file.
    b = True iff baseline and experimental have the same message, file, and baseline is within experimental's function scope.
    c = True iff baseline's configuration is compatible with experimental's presence condition.
    """

    a = (baseline_result['message'] == experimental_result['sanitized_message'] and \
         baseline_result['input_line'] in experimental_result['original_line'] and\
         baseline_result['input_file'].split('.')[0] == experimental_result['input_file'].split('.')[0])

    b = (baseline_result['message'] == experimental_result['sanitized_message'] and \
         baseline_result['input_line'] in experimental_result['function_line_range'] and\
         baseline_result['input_file'].split('.')[0] == experimental_result['input_file'].split('.')[0])

    c = False

    if experimental_result['presence_condition'] != 'None' and (a or b):  # Don't bother doing this expensive step when the file and line number are different.
        baseline_var_mapping = {}
        for var in baseline_result['configuration']:
            if var.startswith('DEF'):
                baseline_var_mapping[re.sub(r"^DEF_(.*)", "\1", var)] = True
            elif var.startswith('UNDEF'):
                baseline_var_mapping[re.sub(r"^UNDEF_(.*)", "\1", var)] = False
            else:
                raise RuntimeError(f"Don't know how to handle variable {var}")

        s = Solver()
        for var, val in baseline_var_mapping.items():
            var = Bool(var)
            if val:
                s.add(var)
            else:
                s.add(Not(var))

        for mat in re.findall("DEF_[a-zA-Z0-9_]+", experimental_result['presence_condition']):
            exec(f"{mat} = Bool('{mat}')")

        for mat in re.findall("USE_[a-zA-Z0-9_]+", experimental_result['presence_condition']):
            exec(f"{mat} = Int('{mat}')")

        while True:
            try:
                s.add(eval(experimental_result['presence_condition']))  # TODO Definitely need to do more transformation here.
                break
            except NameError as ne:
                var = re.search("name '(.*)' is not defined", str(ne))
                exec(f"{var.group(1)} = Int('{var.group(1)}')")

        c = s.check() == sat
    return a, b, c

def tupleize(func, args): return func(*args), tuple(args)

summary = {}

result_hierarchy = {(True, True, True): 0, (False, True, True): 0, (True, False, True): 0, (True, True, False): 0, (False, True, False): 0, (False, False, True): 0, (True, False, False): 0, (False, False, False): 0}

counter = 0

report = []
for b in baselines:
    results = [(b, e, match_stats(b, e)) for e in experimental_results]
    found = False
    for r in result_hierarchy.keys():
        for res in results:
            if res[2] == r:
                found = True
                result_hierarchy[r] += 1
                if (r == (False, False, False)):
                    report.append({"baseline": res[0]})
                break
        if found:
            break
import os
print("Current working directory is " + str(Path(os.curdir).absolute()))
with open('./not_matched.json', 'w') as f:
    json.dump(report, f, indent=2)
[print(f"{k}: {v}") for k, v in result_hierarchy.items()]

Current working directory is /home/austin/git/Sugarlyzer/scripts
(True, True, True): 61
(False, True, True): 5
(True, False, True): 0
(True, True, False): 4
(False, True, False): 1
(False, False, True): 0
(True, False, False): 0
(False, False, False): 7


[None, None, None, None, None, None, None, None]

In [20]:
result_hierarchy = {(True, True, True): 0, (False, True, True): 0, (True, False, True): 0, (True, True, False): 0, (False, True, False): 0, (False, False, True): 0, (True, False, False): 0, (False, False, False): 0}

for e in experimental_results:
    results = [(b, e, match_stats(b, e)) for b in baselines]
    found = False
    for r in result_hierarchy.keys():
        for res in results:
            if res[2] == r:
                found = True
                result_hierarchy[r] += 1
                break
        if found:
            break

[print(f"{k}: {v}") for k, v in result_hierarchy.items()]


(True, True, True): 37
(False, True, True): 12
(True, False, True): 0
(True, True, False): 6
(False, True, False): 3
(False, False, True): 0
(True, False, False): 0
(False, False, False): 50


[None, None, None, None, None, None, None, None]

At this point in the notebook, we have a few structures.
- summary: A dictionary mapping 3-tuples corresponding to results to a list of pairs of results.
- lonely_baselines: A list of baseline results for which no matching experimental result was found.
- lonely_experimental_results: A list of experimental results for which no matching baseline was found.

# Sample

This code randomly samples a result from each classification and prints it for inspection.

In [None]:
print(json.dumps({"summary": {str(k): len(summary[k]) for k in summary.keys()}}))

In [None]:
import random
for k, v in filter(lambda k: (k[0][0] or k[0][1]) and not k[0][2], summary.items()): # == str((False, False, False)), summary.items()):
    print(str(k))
    print(json.dumps(random.sample(v, k=max(1, len(v))), indent=2))
    print("-----------------------------------------------")
#{k: v for k, v in summary.items() if k != str((False, False, False))}}, indent=4))
print(f"Lonely baselines: {len(lonely_baselines)}, Lonely exps: {len(lonely_experimental_results)}")

In [None]:
print(f"Types of lonely baselines: \n" + json.dumps([s for s in sorted(lonely_baselines, key = lambda x: x['sanitized_message'])], indent=2))

In [None]:
print(json.dumps([e for e in experimental_results if "BUSYBOX/eef" in e['input_file']], indent=2))