In [48]:
import itertools
import random
import json
from json import JSONEncoder

def _default(self, obj):
    return getattr(obj.__class__, "to_json", _default.default)(obj)

_default.default = JSONEncoder().default
JSONEncoder.default = _default

from pathos import multiprocessing
import logging
import copy
from pathlib import Path
from typing import Tuple
import re
from z3.z3 import Solver, And, Or, Not, Bool, Int, sat

baselines: Path = Path("/home/austin/git/Sugarlyzer_results/baseline.json")
experimental_results: Path = Path("/home/austin/git/Sugarlyzer_results/desugared.json")

with open(baselines) as f:
    baselines = json.load(f)

with open(experimental_results) as f:
    experimental_results = json.load(f)

lonely_baselines = copy.deepcopy(baselines)
lonely_experimental_results = copy.deepcopy(experimental_results)

class IntRange:
    def __init__(self, lower_bound_inclusive, upper_bound_exclusive):
        self.lower_bound_inclusive = lower_bound_inclusive
        self.upper_bound_exclusive = upper_bound_exclusive

    def __contains__(self, item):
        return isinstance(item, int) and (self.lower_bound_inclusive <= item < self.upper_bound_exclusive)

    def __repr__(self):
        return f"IntRange({self.lower_bound_inclusive, self.upper_bound_exclusive})"

    def __str__(self):
        return f"[{self.lower_bound_inclusive}:{self.upper_bound_exclusive})"

    def to_json(self):
        return str(self)

def match_stats(baseline_result: dict, experimental_result: dict) -> Tuple:
    """
    Returns a vector of different match information.
    (a, b, c)
    a = True iff baseline and experimental have the same line number, message, and file.
    b = True iff baseline and experimental have the same message, file, and baseline is within experimental's function scope.
    c = True iff baseline's configuration is compatible with experimental's presence condition.
    """

    a = (baseline_result['message'] == experimental_result['sanitized_message'] and \
         baseline_result['input_line'] in experimental_result['original_line'] and\
         baseline_result['input_file'].split('.')[0] == experimental_result['input_file'].split('.')[0])

    b = (baseline_result['message'] == experimental_result['sanitized_message'] and \
         baseline_result['input_line'] in experimental_result['function_line_range'] and\
         baseline_result['input_file'].split('.')[0] == experimental_result['input_file'].split('.')[0])

    c = False

    if experimental_result['presence_condition'] != 'None' and (a or b):  # Don't bother doing this expensive step when the file and line number are different.
        baseline_var_mapping = {}
        for var in baseline_result['configuration']:
            if var.startswith('DEF'):
                baseline_var_mapping[re.sub(r"^DEF_(.*)", "\1", var)] = True
            elif var.startswith('UNDEF'):
                baseline_var_mapping[re.sub(r"^UNDEF_(.*)", "\1", var)] = False
            else:
                raise RuntimeError(f"Don't know how to handle variable {var}")

        s = Solver()
        for var, val in baseline_var_mapping.items():
            var = Bool(var)
            if val:
                s.add(var)
            else:
                s.add(Not(var))

        for mat in re.findall("DEF_[a-zA-Z0-9_]+", experimental_result['presence_condition']):
            exec(f"{mat} = Bool('{mat}')")

        for mat in re.findall("USE_[a-zA-Z0-9_]+", experimental_result['presence_condition']):
            exec(f"{mat} = Int('{mat}')")

        while True:
            try:
                s.add(eval(experimental_result['presence_condition']))  # TODO Definitely need to do more transformation here.
                break
            except NameError as ne:
                var = re.search("name '(.*)' is not defined", str(ne))
                exec(f"{var.group(1)} = Int('{var.group(1)}')")

        c = s.check() == sat
    return a, b, c

for e in experimental_results:
    toks = e['original_line'].split(':')
    try:
        e['original_line'] = IntRange(int(toks[0]), int(toks[1]) + 1)
    except Exception as ex:
        e['original_line'] = []
    #print('\t'.join(["experimental", *[str(s) for s in e.values()]]).replace("\n", ""))

    if e['function_line_range'] == 'ERROR':
        e['function_line_range'] = []
    else:
        toks = e['function_line_range'].split(':')
        try:
            e['function_line_range'] = IntRange(int(toks[1]), int(toks[2]) + 1)
        except Exception as ex:
            logging.exception(f"e was {e}")
    e['presence_condition'] = str(e['presence_condition'])

def tupleize(func, args): return func(*args), tuple(args)

summary = {}
for result, input in multiprocessing.Pool().starmap(tupleize,
                                                    (zip(itertools.cycle([match_stats]), itertools.product(baselines, experimental_results)))):
    if result not in summary:
        summary[result] = []
    summary[result].append(input)

    match result:
        case (a, b, _) if a or b:
            lonely_baselines = [l for l in lonely_baselines if l != input[0]]
            lonely_experimental_results = [l for l in lonely_experimental_results if l != input[1]]

At this point in the notebook, we have a few structures.
- summary: A dictionary mapping 3-tuples corresponding to results to a list of pairs of results.
- lonely_baselines: A list of baseline results for which no matching experimental result was found.
- lonely_experimental_results: A list of experimental results for which no matching baseline was found.

# Sample

This code randomly samples a result from each classification and prints it for inspection.

In [70]:
import random

print(json.dumps({"summary": {str(k): len(summary[k]) for k in summary.keys()}}))
for k, v in filter(lambda k: k[0][0] or k[0][1], summary.items()): # == str((False, False, False)), summary.items()):
    print(str(k))
    print(json.dumps(random.sample(v, k=min(1, len(v))), indent=2))
    print("-----------------------------------------------")
#{k: v for k, v in summary.items() if k != str((False, False, False))}}, indent=4))
print(f"Lonely baselines: {len(lonely_baselines)}, Lonely exps: {len(lonely_experimental_results)}")

{"summary": {"(True, True, True)": 82, "(False, False, False)": 8312, "(False, True, True)": 24, "(True, True, False)": 6}}
(True, True, True)
[
  [
    {
      "id": "5",
      "input_file": "/targets/VarBugsPatches/BUSYBOX/b62bd7b261b.c",
      "input_line": 24,
      "original_line": "ERROR",
      "function_line_range": "GLOBAL:1:46",
      "message": "Dereference of undefined pointer value (loaded from variable 'p')",
      "sanitized_message": "Dereference of undefined pointer value (loaded from variable 'p')",
      "presence_condition": null,
      "feasible": null,
      "configuration": [
        "DEF_ENABLE_FEATURE_MDEV_CONF",
        "DEF_ENABLE_FEATURE_MDEV_RENAME",
        "DEF_ENABLE_FEATURE_MDEV_RENAME_REGEXP"
      ],
        [
          11,
          16,
          23,
          24,
          43,
          44
        ]
      ],
    },
    {
      "id": "53",
      "input_file": "/targets/VarBugsPatches/BUSYBOX/b62bd7b261b.desugared.c",
      "input_line": 3265,
      "

In [85]:
print(f"Types of lonely baselines: \n" + json.dumps([s for s in sorted(lonely_baselines, key = lambda x: x['sanitized_message']) if "'sc'" in s['message']], indent=2))

Types of lonely baselines: 
[
  {
    "id": "1",
    "input_file": "/targets/VarBugsPatches/APACHE/2a6cbfa00e0.c",
    "input_line": 16,
    "original_line": "ERROR",
    "function_line_range": "GLOBAL:1:23",
    "message": "Potential leak of memory pointed to by 'sc'",
    "sanitized_message": "Potential leak of memory pointed to by 'sc'",
    "presence_condition": null,
    "feasible": null,
    "configuration": [
      "UNDEF_SHARED_MODULE"
    ],
      [
        6,
        16,
        21
      ]
    ],
  }
]


In [83]:
print(json.dumps([e for e in experimental_results if "'sc'" in e['sanitized_message'] and '2a6cb' in e['input_file']], indent=2))

[
  {
    "id": "5",
    "input_file": "/targets/VarBugsPatches/APACHE/2a6cbfa00e0.desugared.c",
    "input_line": 1575,
    "original_line": [],
    "function_line_range": [],
    "message": "Potential leak of memory pointed to by '__sc_1084'",
    "sanitized_message": "Potential leak of memory pointed to by 'sc'",
    "presence_condition": "None",
    "feasible": false,
    "configuration": "None",
      [
        1553,
        1560,
        1562,
        1566,
        1569,
        1572,
        1575,
        1633,
        1634
      ]
    ],
  },
  {
    "id": "6",
    "input_file": "/targets/VarBugsPatches/APACHE/2a6cbfa00e0.desugared.c",
    "input_line": 1610,
    "original_line": [],
    "function_line_range": [],
    "message": "Potential leak of memory pointed to by '__sc_1085'",
    "sanitized_message": "Potential leak of memory pointed to by 'sc'",
    "presence_condition": "And(And((DEF__FORTIFY_SOURCE) ,And( (_FORTIFY_SOURCE > 0) ,And( (DEF___OPTIMIZE__) , (__OPTIMIZE__ >