In [1]:
!sudo apt install linux-tools-generic bpftool

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'linux-tools-common' instead of 'bpftool'
linux-tools-generic is already the newest version (6.2.0.39.39).
linux-tools-common is already the newest version (6.2.0-39.40).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.


In [2]:
%load_ext autoreload
%autoreload now

from utils import *

In [3]:
from pathlib import Path


def get_linux_tools_path(parent=Path("/usr/lib/linux-tools")):
    """Return the directory of the newest linux-tools release under ``parent``.

    Sub-directories are ordered by the numeric components of their names
    (e.g. ``6.2.0-100-generic`` ranks above ``6.2.0-39-generic``), not by
    plain string comparison, which would rank ``-9-`` above ``-100-``.

    Args:
        parent: Directory that holds one sub-directory per installed
            linux-tools version. Accepts a ``str`` or ``Path``.

    Returns:
        ``Path`` of the sub-directory with the highest version.

    Raises:
        FileNotFoundError: If ``parent`` is not a directory or contains no
            sub-directories.
    """
    import re  # local import so this definition does not depend on other cells

    parent = Path(parent)
    if not parent.is_dir():
        raise FileNotFoundError("No linux-tools found")

    def version_key(path):
        # Compare the numbers in the name as integers; fall back to the name
        # itself so that ties are broken deterministically.
        numbers = tuple(int(n) for n in re.findall(r"\d+", path.name))
        return numbers, path.name

    versions = [x for x in parent.iterdir() if x.is_dir()]
    if not versions:
        raise FileNotFoundError("No linux-tools found")

    # Entries from iterdir() already include ``parent``; no need to re-join.
    return max(versions, key=version_key)


def get_bpftool_path():
    """Return the path of the ``bpftool`` entry inside the linux-tools directory.

    Raises:
        Exception: If no ``bpftool`` entry exists in the directory returned by
            ``get_linux_tools_path()``.
    """
    candidate = get_linux_tools_path().joinpath("bpftool")
    if candidate.exists():
        return candidate
    raise Exception("bpftool not found")


# Resolve the bpftool binary once; later cells use this path to invoke it.
bpftool_path = get_bpftool_path()
bpftool_path  # bare expression: show the resolved path as the cell output

PosixPath('/usr/lib/linux-tools/6.2.0-39-generic/bpftool')

In [4]:
import subprocess

# For every BTF blob under data/, produce a C header, a raw text dump and a
# JSON dump next to it. Existing outputs are kept, so the cell can be re-run.
for file in Path("data").glob("**/*.btf"):
    for ext, cmd in [
            (".h", ["format", "c"]),
            (".txt", ["format", "raw"]),
            (".json", ["--json"]),
        ]:
        result = file.with_suffix(ext)
        if result.exists():
            print(f"{result} already exists")
            continue

        # Write to a temporary sibling and rename only on success. A plain
        # shell redirect would leave an empty or truncated file behind when
        # bpftool fails, and the exists() check above would then skip it
        # forever.
        partial = result.with_name(result.name + ".partial")
        try:
            with open(partial, "wb") as out:
                # Argument list (no shell): paths with spaces or shell
                # metacharacters are passed through verbatim.
                subprocess.run(
                    [str(bpftool_path), "btf", "dump", "file", str(file), *cmd],
                    stdout=out,
                    check=True,  # raise CalledProcessError on a non-zero exit
                )
            partial.replace(result)
        finally:
            # Only still present if bpftool failed or the rename did not happen.
            if partial.exists():
                partial.unlink()

data/18.04-x86/5.4.0-91-generic.h already exists
data/18.04-x86/5.4.0-91-generic.txt already exists
data/18.04-x86/5.4.0-91-generic.json already exists
data/18.04-x86/5.0.0-65-generic.h already exists
data/18.04-x86/5.0.0-65-generic.txt already exists
data/18.04-x86/5.0.0-65-generic.json already exists
data/18.04-x86/5.3.0-76-generic.h already exists
data/18.04-x86/5.3.0-76-generic.txt already exists
data/18.04-x86/5.3.0-76-generic.json already exists
data/18.04-x86/4.15.0-213-generic.h already exists
data/18.04-x86/4.15.0-213-generic.txt already exists
data/18.04-x86/4.15.0-213-generic.json already exists
data/18.04-x86/4.18.0-25-generic.h already exists
data/18.04-x86/4.18.0-25-generic.txt already exists
data/18.04-x86/4.18.0-25-generic.json already exists
data/20.04-x86/5.13.0-52-generic.h already exists
data/20.04-x86/5.13.0-52-generic.txt already exists
data/20.04-x86/5.13.0-52-generic.json already exists
data/20.04-x86/5.11.0-46-generic.h already exists
data/20.04-x86/5.11.0-46-g

In [5]:
class BTFNormalizer:
    """Rewrite raw BTF type records into self-contained dictionaries.

    ``raw_data`` is a list of type records in which the record with id ``n``
    is stored at index ``n - 1``. Normalizing a record drops its ``id`` and
    other bookkeeping fields and replaces ``*_id`` references with the
    (shallowly) normalized record they point to.
    """

    # Kinds whose referenced types are always expanded, even when the record
    # itself was reached through another type.
    RECURSE_KINDS = {
        Kind.CONST,
        Kind.VOLATILE,
        Kind.RESTRICT,
        Kind.PTR,
        Kind.FUNC,
        Kind.FUNC_PROTO,
        Kind.ARRAY,
    }

    def __init__(self, raw_data):
        self.raw_data = raw_data

    def normalize_int(self, elem):
        """Remove the layout fields of an INT record (mutates ``elem``)."""
        assert elem["bits_offset"] == 0
        for key in ("bits_offset", "encoding", "nr_bits", "size"):
            del elem[key]

    @staticmethod
    def uint2sint(u, nbytes):
        """Reinterpret ``u`` as a two's-complement integer ``nbytes`` wide."""
        width = nbytes * 8
        modulus = 1 << width
        value = u & (modulus - 1)
        sign_bit = 1 << (width - 1)
        return value - modulus if value >= sign_bit else value

    def normalize_enum(self, elem, recurse):
        """Normalize an ENUM record (mutates ``elem``).

        With ``recurse`` set, the values of an ``"UNSIGNED"`` enum are
        reinterpreted as signed integers of the enum's size; without it the
        value list is dropped. ``vlen`` and ``encoding`` are always removed.
        """
        assert elem["vlen"] == len(elem["values"])
        del elem["vlen"]

        if not recurse:
            del elem["values"]
        elif elem["encoding"] == "UNSIGNED":
            signed_values = []
            for entry in elem["values"]:
                updated = dict(entry)
                updated["val"] = self.uint2sint(entry["val"], elem["size"])
                signed_values.append(updated)
            elem["values"] = signed_values
        del elem["encoding"]

    def normalize_type_id(self, elem, recurse):
        """Replace ``type_id`` / ``ret_type_id`` with the referenced record.

        The ``*_id`` key is always removed; the expanded ``type`` /
        ``ret_type`` entry is only added when ``recurse`` is set.
        """
        for prefix in ("type", "ret_type"):
            id_key = prefix + "_id"
            if id_key in elem:
                if recurse:
                    elem[prefix] = self.normalize_impl(elem[id_key])
                del elem[id_key]

    def get_new_list(self, old_list):
        """Return normalized copies of a ``params`` / ``members`` list.

        Each new item starts with ``name``, keeps the remaining fields except
        ``type_id``, and ends with the expanded ``type``. The first
        ``"(anon)"`` item keeps its name; later ones become ``"(anon-1)"``,
        ``"(anon-2)"``, ... so that names stay distinct.
        """
        anon_seen = 0
        rebuilt = []
        for item in old_list:
            label = item["name"]
            if label == "(anon)":
                if anon_seen:
                    label = f"(anon-{anon_seen})"
                anon_seen += 1

            entry = {"name": label}
            for key, value in item.items():
                if key != "name" and key != "type_id":
                    entry[key] = value
            entry["type"] = self.normalize_impl(item["type_id"])
            rebuilt.append(entry)
        return rebuilt

    def normalize_list(self, elem, recurse):
        """Normalize the ``params`` / ``members`` list of ``elem`` in place.

        ``vlen`` is checked against the list length and removed. With
        ``recurse`` set the list is rebuilt via ``get_new_list``; otherwise
        the list is dropped (together with ``size`` for ``members``).
        """
        for list_key in ("params", "members"):
            if list_key not in elem:
                continue

            entries = elem[list_key]
            assert len(entries) == elem["vlen"]
            del elem["vlen"]

            if recurse:
                elem[list_key] = self.get_new_list(entries)
                continue

            del elem[list_key]
            if list_key == "members":
                del elem["size"]

    def normalize_impl(self, type_id, recurse=False):
        """Return a normalized copy of the record with id ``type_id``.

        Id ``0`` maps to a synthetic ``void`` record. References are expanded
        when ``recurse`` is set or the record's kind is in ``RECURSE_KINDS``.
        The record stored in ``raw_data`` is not modified.
        """
        if type_id == 0:
            return {"name": "void", "kind": "VOID"}

        raw = self.raw_data[type_id - 1]
        assert raw["id"] == type_id

        kind = raw["kind"]

        # Some kinds are always expanded, whatever the caller asked for.
        expand = recurse or kind in self.RECURSE_KINDS

        # Work on a shallow copy so the raw record stays intact.
        elem = dict(raw)
        del elem["id"]

        if kind == Kind.INT:
            self.normalize_int(elem)
        elif kind == Kind.ARRAY:
            del elem["index_type_id"]
        elif kind == Kind.ENUM:
            self.normalize_enum(elem, expand)
        elif kind == Kind.FUNC:
            assert elem["linkage"] == "static"
            del elem["linkage"]
        elif kind in (Kind.PTR, Kind.FUNC_PROTO):
            assert elem["name"] == "(anon)"
            del elem["name"]

        self.normalize_type_id(elem, expand)
        self.normalize_list(elem, expand)

        return elem

    def normalize(self, type_id):
        """Normalize the record ``type_id`` with its references expanded."""
        return self.normalize_impl(type_id, recurse=True)

In [6]:
import json

# Normalize every type of each JSON dump and store the result as JSON Lines
# (one normalized type per line) next to the source file.
for json_path in Path("data").glob("**/*.json"):
    jsonl_path = json_path.with_suffix(".jsonl")

    with open(json_path) as src:
        data = json.load(src)["types"]

    normalizer = BTFNormalizer(data)
    # Type ids are 1-based and contiguous, so walk 1..len(data).
    result = [normalizer.normalize(type_id) for type_id in range(1, len(data) + 1)]

    print(f"Writing {jsonl_path}")
    with open(jsonl_path, "w") as dst:
        dst.writelines(json.dumps(elem) + "\n" for elem in result)

Writing data/18.04-x86/5.0.0-65-generic.jsonl
Writing data/18.04-x86/5.4.0-91-generic.jsonl
Writing data/18.04-x86/5.3.0-76-generic.jsonl
Writing data/18.04-x86/4.15.0-213-generic.jsonl
Writing data/18.04-x86/4.18.0-25-generic.jsonl
Writing data/20.04-x86/5.13.0-52-generic.jsonl
Writing data/20.04-x86/5.11.0-46-generic.jsonl
Writing data/20.04-x86/5.4.0-170-generic.jsonl
Writing data/20.04-x86/5.8.0-63-generic.jsonl
Writing data/20.04-x86/5.15.0-92-generic.jsonl
Writing data/16.04-x86/4.15.0-142-generic.jsonl
Writing data/16.04-x86/4.4.0-210-generic.jsonl
Writing data/16.04-x86/4.8.0-58-generic.jsonl
Writing data/16.04-x86/4.13.0-45-generic.jsonl
Writing data/16.04-x86/4.10.0-42-generic.jsonl
