In [1]:
from smith.smith.code_analyzer import CCodeAnalyzer
import json
import tempfile
from pathlib import Path
import os
from glob import glob
from tqdm import tqdm


In [2]:
# Root directory of the input patch files, laid out as FILE_DIR/<category>/<name>.json.
FILE_DIR = "syzbot-data"
# Root directory that receives the extracted per-function JSON files.
FUNCTION_DIR = "syzbot-function"


In [3]:
def parse_code(code: str):
    """Run ``CCodeAnalyzer`` over a C source string and return its declarations.

    The analyzer is constructed from a file path, so ``code`` is first written
    to a temporary ``.c`` file that exists only for the duration of the
    analysis.

    Args:
        code: Source text to analyze.

    Returns:
        The analyzer's ``declarations`` attribute, read after the temporary
        file has been closed (and therefore removed).
    """
    source_bytes = code.encode()

    with tempfile.NamedTemporaryFile(suffix='.c') as source_file:
        source_file.write(source_bytes)
        # Push the buffered bytes to disk before the analyzer opens the file by name.
        source_file.flush()

        # Second argument is an empty list; presumably extra parser/compiler
        # arguments -- confirm against CCodeAnalyzer.
        analyzer = CCodeAnalyzer(Path(source_file.name), [])
        analyzer.initialize()

    return analyzer.declarations

def _function_declarations(code):
    """Parse ``code`` and return its ``"FUNCTION_DECL"`` mapping, or ``{}`` when there is none."""
    declarations = parse_code(code)
    return declarations["FUNCTION_DECL"] if "FUNCTION_DECL" in declarations else {}


def get_changed_functions(filename):
    """Return the functions that a patch modified or removed.

    ``filename`` is a JSON file holding a list of patched source files, each an
    object with ``"old_contents"`` and ``"new_contents"`` source strings. Both
    versions are parsed and compared function by function, keyed by the names
    found in the old version.

    Args:
        filename: Path of the patch JSON file.

    Returns:
        A list of ``(old_code, new_code)`` tuples, one per function whose code
        differs between the two versions. ``new_code`` is ``None`` when the
        function no longer exists in the new contents. Functions that appear
        only in the new contents are not reported.
    """
    # JSON text is UTF-8; be explicit instead of relying on the locale default.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    changed = []
    for patch_file in data:
        old_functions = _function_declarations(patch_file['old_contents'])
        new_functions = _function_declarations(patch_file['new_contents'])

        for name, old_info in old_functions.items():
            new_info = new_functions.get(name)
            if new_info is None:
                # Present before the patch, absent afterwards.
                changed.append((old_info.code, None))
            elif new_info.code != old_info.code:
                changed.append((old_info.code, new_info.code))

    return changed
        
# Smoke test on a single patch file; the cell displays the resulting
# (old_code, new_code) pairs. The path is built from FILE_DIR so it follows
# the config cell if the data root is moved.
get_changed_functions(
    f"{FILE_DIR}/KASAN__double-free_or_invalid-free/ddb673727990990c6ded3e7cc220e39abfc244ab.json"
)


[('static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)\n{\n\tstruct tun_struct *tun;\n\tstruct tun_file *tfile = file->private_data;\n\tstruct net_device *dev;\n\tint err;\n\n\tif (tfile->detached)\n\t\treturn -EINVAL;\n\n\tdev = __dev_get_by_name(net, ifr->ifr_name);\n\tif (dev) {\n\t\tif (ifr->ifr_flags & IFF_TUN_EXCL)\n\t\t\treturn -EBUSY;\n\t\tif ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)\n\t\t\ttun = netdev_priv(dev);\n\t\telse if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)\n\t\t\ttun = netdev_priv(dev);\n\t\telse\n\t\t\treturn -EINVAL;\n\n\t\tif (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=\n\t\t    !!(tun->flags & IFF_MULTI_QUEUE))\n\t\t\treturn -EINVAL;\n\n\t\tif (tun_not_capable(tun))\n\t\t\treturn -EPERM;\n\t\terr = security_tun_dev_open(tun->security);\n\t\tif (err < 0)\n\t\t\treturn err;\n\n\t\terr = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER);\n\t\tif (err < 0)\n\t\t\treturn err;\n\n\t\tif (tun->f

In [4]:
# Collect the changed functions of every patch file, grouped by category, and
# write each (vulnerable, benign) pair to its own JSON file under FUNCTION_DIR.
dataset = {}
# glob returns matches in file-system order; sort so that reruns fill
# `dataset` in a stable order.
for file_path in tqdm(sorted(glob(f"{FILE_DIR}/*/*.json"))):
    # The glob pattern fixes the layout to FILE_DIR/<category>/<name>.json, so
    # the category is the name of the file's parent directory.
    category_dir, file_name = os.path.split(file_path)
    category = os.path.basename(category_dir)

    # Mirror the category directory under FUNCTION_DIR. Building the path from
    # its parts (rather than str.replace on the whole path) swaps only the root
    # directory, even if a category or file name contains the FILE_DIR text.
    output_dir = os.path.join(FUNCTION_DIR, category)
    os.makedirs(output_dir, exist_ok=True)

    # Output path prefix: the input file name with its extension stripped.
    function_path = os.path.join(output_dir, os.path.splitext(file_name)[0])

    changed = get_changed_functions(file_path)
    if not changed:
        continue

    # Accumulate into a list owned by `dataset` instead of aliasing `changed`.
    dataset.setdefault(category, []).extend(changed)

    for i, (vulnerable, benign) in enumerate(changed):
        # One output file per changed function: <name>-<index>.json
        with open(f"{function_path}-{i}.json", "w") as f:
            function_data = {
                "vulnerable": vulnerable,
                "benign": benign,
            }
            json.dump(function_data, f)


100%|██████████| 3385/3385 [05:11<00:00, 10.87it/s]


In [5]:
# Sanity check: number of patch JSON files matched by the input pattern.
input_pattern = f"{FILE_DIR}/*/*.json"
print(len(glob(input_pattern)))


3385
