In [6]:
# Install bpftool (packaged with linux-tools); a later cell looks for the
# binary under /usr/lib/linux-tools/<version>/bpftool.
!sudo apt install linux-tools-generic bpftool

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'linux-tools-common' instead of 'bpftool'
linux-tools-generic is already the newest version (6.2.0.39.39).
linux-tools-common is already the newest version (6.2.0-39.40).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.


In [2]:
import os


def system(cmd):
    """Echo a shell command, run it through `os.system`, and report its status.

    Args:
        cmd: Command line handed to the shell verbatim. No quoting is applied
            here, so callers must quote arguments that may contain spaces or
            shell metacharacters.

    Returns:
        The raw value returned by `os.system`; 0 means the command succeeded.
    """
    print(cmd)
    status = os.system(cmd)
    if status != 0:
        # Make failures visible instead of silently carrying on.
        print(f"command failed with status {status}: {cmd}")
    return status

In [3]:
from pathlib import Path

# Directory that holds the BTF archives and every file the cells below derive from them.
data_path = Path("data/ubuntu-18.04-x86")

In [4]:
def unzip_all(path, suffix=".tar.xz"):
    """Extract every archive in `path` whose extracted file is not present yet.

    An archive ``<stem><suffix>`` is considered extracted when ``path / <stem>``
    exists; in that case a notice is printed and the archive is skipped.

    Args:
        path: Directory (a `pathlib.Path`) that contains the archives and
            receives the extracted files.
        suffix: File-name suffix that identifies an archive.
    """
    # Local import keeps this cell runnable on its own.
    import shlex

    for archive in path.glob(f"*{suffix}"):
        extracted = path / archive.name.removesuffix(suffix)
        if not extracted.exists():
            # Quote both paths: the command goes through a shell, so spaces or
            # metacharacters in a directory name would otherwise split it.
            system(
                f"tar -xf {shlex.quote(str(archive))} -C {shlex.quote(str(path))}"
            )
        else:
            print(f"{extracted} already exists")


# Extract all archives in the data directory; ones already extracted are only reported.
unzip_all(data_path)

data/ubuntu-18.04-x86/5.0.0-65-generic.btf already exists
data/ubuntu-18.04-x86/4.18.0-25-generic.btf already exists
data/ubuntu-18.04-x86/5.3.0-76-generic.btf already exists
data/ubuntu-18.04-x86/4.15.0-213-generic.btf already exists
data/ubuntu-18.04-x86/5.4.0-91-generic.btf already exists


In [5]:
def get_linux_tools_path(parent=Path("/usr/lib/linux-tools")):
    """Return the sub-directory of `parent` that carries the highest version.

    Directory names are compared "naturally": runs of digits are compared as
    integers, so ``6.2.0-39-generic`` ranks above ``6.2.0-9-generic`` (a plain
    string sort would order them the other way round).

    Args:
        parent: Directory that contains one sub-directory per installed
            linux-tools version.

    Returns:
        Path of the highest-versioned sub-directory.

    Raises:
        Exception: If `parent` contains no sub-directory.
    """
    import re

    parent = Path(parent)
    versions = [entry for entry in parent.iterdir() if entry.is_dir()]
    if not versions:
        raise Exception("No linux-tools found")

    def version_key(path):
        # Splitting on a captured digit run alternates text / digits, so the
        # same list position always holds the same type and lists compare safely.
        return [
            int(token) if token.isdigit() else token
            for token in re.split(r"(\d+)", path.name)
        ]

    return max(versions, key=version_key)


def get_bpftool_path():
    """Locate the bpftool binary inside the directory chosen by `get_linux_tools_path`.

    Returns:
        Path to the ``bpftool`` entry.

    Raises:
        Exception: If that directory has no ``bpftool`` entry.
    """
    candidate = get_linux_tools_path() / "bpftool"
    if candidate.exists():
        return candidate
    raise Exception("bpftool not found")


# Resolve bpftool once; the dump cell interpolates this path into its shell commands.
bpftool_path = get_bpftool_path()

In [6]:
# For every extracted BTF file, produce three bpftool renderings next to it:
# a C header (.h), the raw listing (.txt) and the JSON dump (.json).
# NOTE(review): the shell redirection creates the target file even when
# bpftool fails, so a failed dump would be reported as "already exists" on
# the next run — check for empty outputs if a later cell misbehaves.
for file in data_path.glob("*.btf"):
    for ext, cmd in [(".h", "format c"), (".txt", "format raw"), (".json", "--json")]:
        # with_suffix swaps the trailing ".btf" for the target extension.
        result = file.with_suffix(ext)
        if not result.exists():
            system(f"{bpftool_path} btf dump file {file} {cmd} > {result}")
        else:
            print(f"{result} already exists")

data/ubuntu-18.04-x86/5.4.0-91-generic.h already exists
data/ubuntu-18.04-x86/5.4.0-91-generic.txt already exists
data/ubuntu-18.04-x86/5.4.0-91-generic.json already exists
data/ubuntu-18.04-x86/5.0.0-65-generic.h already exists
data/ubuntu-18.04-x86/5.0.0-65-generic.txt already exists
data/ubuntu-18.04-x86/5.0.0-65-generic.json already exists
data/ubuntu-18.04-x86/5.3.0-76-generic.h already exists
data/ubuntu-18.04-x86/5.3.0-76-generic.txt already exists
data/ubuntu-18.04-x86/5.3.0-76-generic.json already exists
data/ubuntu-18.04-x86/4.15.0-213-generic.h already exists
data/ubuntu-18.04-x86/4.15.0-213-generic.txt already exists
data/ubuntu-18.04-x86/4.15.0-213-generic.json already exists
data/ubuntu-18.04-x86/4.18.0-25-generic.h already exists
data/ubuntu-18.04-x86/4.18.0-25-generic.txt already exists
data/ubuntu-18.04-x86/4.18.0-25-generic.json already exists


In [7]:
# Collect the JSON dumps in sorted path order (plain string ordering of the paths).
json_paths = sorted(file for file in data_path.glob("*.json"))

# Bare expression so the notebook displays the list.
json_paths

[PosixPath('data/ubuntu-18.04-x86/4.15.0-213-generic.json'),
 PosixPath('data/ubuntu-18.04-x86/4.18.0-25-generic.json'),
 PosixPath('data/ubuntu-18.04-x86/5.0.0-65-generic.json'),
 PosixPath('data/ubuntu-18.04-x86/5.3.0-76-generic.json'),
 PosixPath('data/ubuntu-18.04-x86/5.4.0-91-generic.json')]

In [8]:
import json


def load_json(json_path):
    """Parse the JSON document stored at `json_path` and return the decoded object.

    The file is read as UTF-8 explicitly, so the result does not depend on
    the locale's default text encoding.

    Args:
        json_path: Path (str or path-like) of the file to read.

    Returns:
        Whatever `json.load` decodes from the file.
    """
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)


# Parse every dump eagerly.
# NOTE(review): `jsons` is not referenced again in the visible part of the
# notebook (BTF re-reads the file from its path) — confirm it is still needed.
jsons = [load_json(p) for p in json_paths]

In [9]:
from enum import Enum


class Kind(str, Enum):
    """BTF type kinds, spelled like the ``'kind'`` field of a record in the JSON dump.

    Mixing in `str` lets members be compared directly with the plain strings
    loaded from JSON (e.g. ``elem['kind'] == Kind.INT``).
    """

    INT = "INT"
    PTR = "PTR"
    ARRAY = "ARRAY"
    STRUCT = "STRUCT"
    UNION = "UNION"
    ENUM = "ENUM"
    FWD = "FWD"
    TYPEDEF = "TYPEDEF"
    VOLATILE = "VOLATILE"
    CONST = "CONST"
    RESTRICT = "RESTRICT"
    FUNC = "FUNC"
    FUNC_PROTO = "FUNC_PROTO"
    VAR = "VAR"
    DATASEC = "DATASEC"
    FLOAT = "FLOAT"
    DECL_TAG = "DECL_TAG"
    TYPE_TAG = "TYPE_TAG"
    ENUM64 = "ENUM64"

In [201]:
from functools import cache


class BTF:
    """Indexable view over the ``'types'`` list of one bpftool BTF JSON dump.

    Records are the plain dicts loaded from JSON. `normalize` turns a record
    into an id-free form in which references to other types are replaced by
    nested records, so records from two different dumps can be compared with
    ``==``.

    Memoised results are stored on the instance rather than in a class-level
    `functools.cache`; a class-level cache is keyed on ``self`` and would keep
    every instance (and its parsed dump) alive for as long as the class
    exists. The cached dicts are shared between calls and must be treated as
    read-only.
    """

    # Kinds whose referenced types are always expanded, even when the caller
    # asked for a shallow (recurse=False) result.
    RECURSE_KINDS = {Kind.CONST, Kind.VOLATILE, Kind.RESTRICT,
                     Kind.PTR, Kind.FUNC, Kind.FUNC_PROTO, Kind.ARRAY}

    def __init__(self, path, raw_data=None):
        """Wrap the dump stored at `path`.

        Args:
            path: Location of the JSON dump. `short_name` reads ``path.name``,
                so a `pathlib.Path` is expected.
            raw_data: Already-parsed dump (a dict with a ``'types'`` list).
                When omitted, the file at `path` is parsed with `load_json`.
        """
        self.path = path
        if raw_data is None:
            raw_data = load_json(path)
        self._raw_data = raw_data['types']

        # Per-instance memo tables for normalize() and filter_on_kind().
        self._normalized = {}
        self._by_kind = {}

    def __getitem__(self, id):
        """Return the raw record for type `id`; id 0 yields a synthetic VOID record."""
        if id == 0:
            return {'id': 0, 'name': 'void', 'kind': 'VOID'}
        # The lookup relies on ids being consecutive starting at 1; the assert
        # guards that assumption.
        e = self._raw_data[id - 1]
        assert e['id'] == id
        return e

    def __len__(self):
        """Number of records in the dump (the synthetic id 0 is not counted)."""
        return len(self._raw_data)

    def __iter__(self):
        """Iterate over the raw records in dump order."""
        return iter(self._raw_data)

    @property
    def short_name(self):
        """File name of the dump up to (not including) its first ``-``."""
        return self.path.name.split("-")[0]

    def print(self):
        """Print the file path, one sample record per kind, and a per-kind count.

        For each kind the first record encountered is shown both raw and
        normalised. Counts are listed from most to least frequent.
        """
        from collections import defaultdict

        print(f"File: {self.path}")

        print("Sample:")
        kinds = defaultdict(int)
        for e in self:
            # First record of this kind: show it as a sample.
            if e['kind'] not in kinds:
                print(f"\t{e['id']:6} ({e['kind']:10}): {e}")
                print(f"\t{'':18}-> {self.normalize(e['id'])}")
            kinds[e['kind']] += 1

        kinds = sorted(kinds.items(), key=lambda x: x[1], reverse=True)
        print("Kinds:")
        print(f"\t{dict(kinds)}")

        print()

    def filter_on_kind(self, kind):
        """Map name -> normalised record for every named record of `kind`.

        Records named ``'(anon)'`` are skipped. If several records share a
        name, the last one in the dump wins. The mapping is built once per
        kind and then reused.
        """
        if kind not in self._by_kind:
            self._by_kind[kind] = {
                e['name']: self.normalize(e['id']) for e in self
                if e['kind'] == kind and e['name'] != '(anon)'
            }
        return self._by_kind[kind]

    def get_by_kind_name(self, kind, name):
        """Return the normalised record of `kind` called `name`.

        Raises:
            KeyError: If the dump has no such named record.
        """
        return self.filter_on_kind(kind)[name]

    def normalize(self, type_id, recurse=True):
        """Return an id-free copy of record `type_id`, suitable for ``==`` comparison.

        The record's own ``'id'`` is dropped. When expanding, each reference
        (``'type_id'``, ``'ret_type_id'`` and the ``'type_id'`` of every
        param/member) is replaced by the referenced record, itself normalised
        with ``recurse=False``. When not expanding, the references are dropped
        together with ``'params'``/``'members'`` (and ``'size'`` for records
        that have members) and the ENUM value list.

        Args:
            type_id: Id of the record to normalise (0 is the synthetic VOID).
            recurse: Whether to expand references. Kinds in `RECURSE_KINDS`
                are expanded even when this is False.

        Returns:
            The normalised dict. It is memoised and shared: do not mutate it.
        """
        cache_key = (type_id, recurse)
        if cache_key in self._normalized:
            return self._normalized[cache_key]

        # Shallow copy: only top-level keys are deleted or replaced below, so
        # the raw record is left untouched.
        elem = self[type_id].copy()

        # Recurse into types for certain kinds
        recurse = recurse or elem['kind'] in self.RECURSE_KINDS

        # Remove redundant fields
        del elem['id']

        kind = elem['kind']
        if kind == Kind.INT:
            assert elem['bits_offset'] == 0
            del elem['bits_offset']
            del elem['encoding']
            del elem['nr_bits']
            del elem['size']
        elif kind == Kind.ARRAY:
            del elem['index_type_id']
        elif kind == Kind.ENUM:
            if not recurse:
                assert elem['vlen'] == len(elem['values'])
                del elem['values']
                del elem['vlen']
                del elem['encoding']
        elif kind == Kind.FUNC:
            assert elem['linkage'] == 'static'
            del elem['linkage']
        elif kind in (Kind.PTR, Kind.FUNC_PROTO):
            assert elem['name'] == '(anon)'
            del elem['name']

        # Normalize types
        for ref_key in ['type', 'ret_type']:
            ref_id_key = f"{ref_key}_id"

            if ref_id_key not in elem:
                continue

            if recurse:
                elem[ref_key] = self.normalize(elem[ref_id_key], recurse=False)
            del elem[ref_id_key]

        for list_key in ['params', 'members']:
            if list_key not in elem:
                continue

            assert len(elem[list_key]) == elem['vlen']
            del elem['vlen']

            if recurse:
                # Rebuild each item with its type reference expanded.
                elem[list_key] = [
                    {
                        **{k: v for k, v in item.items() if k != 'type_id'},
                        'type': self.normalize(item['type_id'], recurse=False)
                    }
                    for item in elem[list_key]
                ]
            else:
                del elem[list_key]
                if list_key == 'members':
                    del elem['size']

        self._normalized[cache_key] = elem
        return elem


# Wrap the first two dumps of the sorted list (4.15.0 and 4.18.0 in the output shown).
d1 = BTF(json_paths[0])
d2 = BTF(json_paths[1])

d1.print()

# Bare expression: the notebook displays the normalised record.
d1.get_by_kind_name(Kind.STRUCT, "task_struct")
# d1.get_by_kind_name(Kind.FUNC, "vfs_read")

File: data/ubuntu-18.04-x86/4.15.0-213-generic.json
Sample:
	     1 (INT       ): {'id': 1, 'kind': 'INT', 'name': 'long unsigned int', 'size': 8, 'bits_offset': 0, 'nr_bits': 64, 'encoding': '(none)'}
	                  -> {'kind': 'INT', 'name': 'long unsigned int'}
	     2 (CONST     ): {'id': 2, 'kind': 'CONST', 'name': '(anon)', 'type_id': 1}
	                  -> {'kind': 'CONST', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	     3 (VOLATILE  ): {'id': 3, 'kind': 'VOLATILE', 'name': '(anon)', 'type_id': 1}
	                  -> {'kind': 'VOLATILE', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	     4 (ARRAY     ): {'id': 4, 'kind': 'ARRAY', 'name': '(anon)', 'type_id': 1, 'index_type_id': 20, 'nr_elems': 2}
	                  -> {'kind': 'ARRAY', 'name': '(anon)', 'nr_elems': 2, 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	     5 (PTR       ): {'id': 5, 'kind': 'PTR', 'name': '(anon)', 'type_id': 8}
	               

{'kind': 'STRUCT',
 'name': 'task_struct',
 'size': 9152,
 'members': [{'name': 'thread_info',
   'bits_offset': 0,
   'type': {'kind': 'STRUCT', 'name': 'thread_info'}},
  {'name': 'state',
   'bits_offset': 128,
   'type': {'kind': 'VOLATILE',
    'name': '(anon)',
    'type': {'kind': 'INT', 'name': 'long int'}}},
  {'name': 'stack',
   'bits_offset': 192,
   'type': {'kind': 'PTR', 'type': {'name': 'void', 'kind': 'VOID'}}},
  {'name': 'usage',
   'bits_offset': 256,
   'type': {'kind': 'TYPEDEF', 'name': 'atomic_t'}},
  {'name': 'flags',
   'bits_offset': 288,
   'type': {'kind': 'INT', 'name': 'unsigned int'}},
  {'name': 'ptrace',
   'bits_offset': 320,
   'type': {'kind': 'INT', 'name': 'unsigned int'}},
  {'name': 'wake_entry',
   'bits_offset': 384,
   'type': {'kind': 'STRUCT', 'name': 'llist_node'}},
  {'name': 'on_cpu',
   'bits_offset': 448,
   'type': {'kind': 'INT', 'name': 'int'}},
  {'name': 'cpu',
   'bits_offset': 480,
   'type': {'kind': 'INT', 'name': 'unsigned in

In [202]:
import sys


def print_as_list(name, s, num=10):
    """Print `name`, the size of `s`, and at most `num` of its leading items.

    Args:
        name: Label printed in front of the summary.
        s: Sized iterable to summarise.
        num: Maximum number of items shown.
    """
    total = len(s)
    preview = list(s)[:num]
    print(f"{name} ({total}): {preview}")


class FileLogger:
    """Context manager that tees everything written to `sys.stdout` into a file.

    The log file ``output/<name>`` is opened (and truncated) when the object
    is created. Inside the ``with`` block `sys.stdout` is this object; on exit
    the stream captured at construction time is restored and the file closed.
    """

    def __init__(self, name):
        """Open ``output/<name>`` for writing, creating parent directories as needed."""
        # Remember the stream that is current now so it can be restored later.
        self.stdout = sys.stdout

        file_path = Path("output") / name
        file_path.parent.mkdir(parents=True, exist_ok=True)
        self.log = open(file_path, "w", encoding="utf-8")

    def write(self, message):
        """Write `message` to both streams and return the count reported by the log file."""
        self.stdout.write(message)
        return self.log.write(message)

    def flush(self):
        """Flush both streams; needed because `print(..., flush=True)` calls it on `sys.stdout`."""
        self.stdout.flush()
        self.log.flush()

    def __enter__(self):
        sys.stdout = self
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore first so later prints never reach a closed file.
        sys.stdout = self.stdout
        self.log.close()


def _check_diff(d_old, d_new, kind, diff):
    f_old = d_old.filter_on_kind(kind)
    f_new = d_new.filter_on_kind(kind)

    print_as_list(f"Old {kind}", f_old.keys())
    print_as_list(f"New {kind}", f_new.keys())

    print_as_list(f"Removed {kind}", f_old.keys() - f_new.keys())
    print_as_list(f"Added {kind}", f_new.keys() - f_old.keys())

    common = {
        name: (f_old[name], f_new[name])
        for name in f_old.keys() & f_new.keys()
    }
    print_as_list(f"Common {kind}", common.keys())

    changed = {
        name: (old, new)
        for name, (old, new) in common.items()
        if old != new
    }
    print_as_list(f"Changed {kind}", changed.keys())

    for name, (old, new) in changed.items():
        reason = diff(old, new)
        lines = reason.strip().split("\n")
        print(f"{kind:12}{name}")
        for line in lines:
            print(f"{'':20} {line}")


def check_diff(d_old, d_new, kind, diff):
    """Run `_check_diff` while teeing its output to a per-comparison log file.

    The log is named ``<old short_name>-<new short_name>-<kind.name>.txt`` and
    is handled by `FileLogger`.
    """
    log_name = "{}-{}-{}.txt".format(d_old.short_name, d_new.short_name, kind.name)
    with FileLogger(log_name):
        _check_diff(d_old, d_new, kind, diff)

In [203]:
def diff_struct(old, new):
    """Describe how a normalised record with ``'members'`` changed between two dumps.

    Members are matched by name. The description reports, in this order:
    added members, removed members, a reordering of the common members, and
    members whose type changed. "Layout changed" is reported only when none of
    those apply but an offset or the total size differs. Members are listed in
    declaration order, so the text is the same on every run.

    NOTE(review): members that share a name (e.g. several ``'(anon)'``
    members) collapse into one entry; changes to the shadowed ones are not
    reported — confirm whether that matters for the dumps at hand.

    Args:
        old: Record with ``'members'`` (each having ``'name'``, ``'type'``,
            ``'bits_offset'``) and ``'size'``.
        new: Same shape, from the newer dump.

    Returns:
        Multi-line description, never empty.

    Raises:
        AssertionError: If no difference could be identified.
    """
    result = ""

    old_members = {m['name']: m for m in old['members']}
    new_members = {m['name']: m for m in new['members']}

    # added field (in the order the new record declares them)
    added_members = [name for name in new_members if name not in old_members]
    if added_members:
        result += "Added fields:\n"
        for name in added_members:
            result += f"{name:>20}: {new_members[name]['type']}\n"

    # removed field (in the order the old record declared them)
    removed_members = [name for name in old_members if name not in new_members]
    if removed_members:
        result += "Removed fields:\n"
        for name in removed_members:
            result += f"{name:>20}: {old_members[name]['type']}\n"

    # fields reordered
    old_common = [name for name in old_members if name in new_members]
    new_common = [name for name in new_members if name in old_members]
    if old_common != new_common:
        result += "Fields reordered:\n"
        result += f"{'':>20} {list(old_members)}\n"
        result += f"{'':>20} {list(new_members)}\n"

    # fields changed type
    changed_types = {
        name: (old_members[name]['type'], new_members[name]['type'])
        for name in old_common
        if old_members[name]['type'] != new_members[name]['type']
    }
    if changed_types:
        result += "Field type changed:\n"
        for name, (old_type, new_type) in changed_types.items():
            result += f"{name:>20}: {old_type}\n"
            result += f"{'':>20}->{new_type}\n"

    # fields changed offset
    old_offset = {name: member['bits_offset']
                  for name, member in old_members.items()}
    new_offset = {name: member['bits_offset']
                  for name, member in new_members.items()}
    layout_changed = old_offset != new_offset or old['size'] != new['size']
    # Only mentioned when nothing more specific explains the difference.
    if layout_changed and result == "":
        result += "Layout changed\n"

    assert result, f"\n{old}\n{new}"
    return result


# Report STRUCT differences between d1 and d2; the output is also written under output/.
check_diff(d1, d2, Kind.STRUCT, diff_struct)

Old Kind.STRUCT (7329): ['list_head', 'hlist_head', 'hlist_node', 'callback_head', 'jump_entry', 'static_key', 'static_key_true', 'static_key_false', 'file_operations', 'atomic_notifier_head']
New Kind.STRUCT (8439): ['list_head', 'hlist_head', 'hlist_node', 'callback_head', 'file_system_type', 'jump_entry', 'static_key', 'static_key_true', 'static_key_false', 'file_operations']
Removed Kind.STRUCT (272): ['blk_issue_stat', 'hotplug_slot_info', 'spi_flash_read_message', 'tpm_cmd_t', 'request_list', 'fuse_out', 'cpufreq_user_policy', 'dpc_rp_pio_regs', 'property_set', 'trace_event_data_offsets_bpf_map_delete_elem']
Added Kind.STRUCT (1382): ['ib_dm_alloc_attr', 'mctrl_gpios', 'irq_bypass_producer', 'vfio_iommu', 'x86_perf_regs', 'cpuid_regs', 'cfg80211_pmsr_capabilities', 'xarray', 'fw_priv', 'sctp_globals']
Common Kind.STRUCT (7057): ['rpc_version', 'config_s', 'ncsi_cmd_sp_pkt', 'aml_resource_spi_serialbus', 'ib_udata', 'nla_policy', 'trace_event_raw_mm_shrink_slab_start', 'clk_notifi

In [204]:
# UNION records are compared with the struct differ, which only reads
# 'members', 'size' and each member's 'name', 'type' and 'bits_offset'.
check_diff(d1, d2, Kind.UNION, diff_struct)

Old Kind.UNION (99): ['fpregs_state', 'irq_stack_union', 'sigval', 'kernfs_node_id', 'key_payload', 'thread_union', 'perf_mem_data_src', 'flowi_uli', 'ethtool_flow_union', 'perf_capabilities']
New Kind.UNION (126): ['fpregs_state', 'sigval', '__sifields', 'kernfs_node_id', 'thread_union', 'key_payload', 'perf_mem_data_src', 'bpf_attr', 'flowi_uli', 'ethtool_flow_union']
Removed Kind.UNION (6): ['ipmi_smi_info_union', 'irq_stack_union', 'c', 'ec_response_get_next_data', 'fuse_dentry', 'tpm2_cmd_params']
Added Kind.UNION (33): ['uvh_rh_gam_alias210_redirect_config_2_mmr_u', 'uvh_apicid', 'sidtab_entry_inner', 'hv_gpa_page_range', 'ec_response_get_next_data_v1', 'tls_crypto_context', 'uvh_rh_gam_config_mmr_u', 'uv1h_lb_target_physical_apic_id_mask_u', 'uv3h_gr0_gam_gr_config_u', 'bau_payload_header']
Common Kind.UNION (93): ['tpacket_req_u', 'mon_data_bits', 'acpi_descriptor', 'irte_ga_lo', 'entry_union', 'hsw_tsx_tuning', 'l2_cache', 'tcp_cc_info', 'l3_cache', 'ftrace_code_union']
Change

In [205]:
def diff_func(old, new):
    """Describe how a normalised FUNC record changed between two dumps.

    Parameters are matched by name. The description reports, in this order:
    added parameters, removed parameters, a reordering of the common
    parameters, parameters whose type changed, and a changed return type.
    Parameters are listed in declaration order, so the text is the same on
    every run.

    NOTE(review): parameters that share a name collapse into one entry, so
    changes to the shadowed ones are not reported — confirm whether unnamed
    parameters occur in the dumps at hand.

    Args:
        old: Record whose ``'type'`` holds ``'params'`` (each with ``'name'``
            and ``'type'``) and ``'ret_type'``.
        new: Same shape, from the newer dump.

    Returns:
        Multi-line description, never empty.

    Raises:
        AssertionError: If no difference could be identified.
    """
    result = ""

    old_params = {p['name']: p for p in old['type']['params']}
    new_params = {p['name']: p for p in new['type']['params']}

    # params added (in the order the new prototype declares them)
    added_params = [name for name in new_params if name not in old_params]
    if added_params:
        result += "Added params:\n"
        for name in added_params:
            result += f"{name:>20}: {new_params[name]['type']}\n"

    # params removed (in the order the old prototype declared them)
    removed_params = [name for name in old_params if name not in new_params]
    if removed_params:
        result += "Removed params:\n"
        for name in removed_params:
            result += f"{name:>20}: {old_params[name]['type']}\n"

    # params reordered
    old_common = [name for name in old_params if name in new_params]
    new_common = [name for name in new_params if name in old_params]
    if old_common != new_common:
        result += "Params reordered:\n"
        result += f"{'':>20} {list(old_params)}\n"
        result += f"{'':>20} {list(new_params)}\n"

    # params changed type
    changed_types = {
        name: (old_params[name]['type'], new_params[name]['type'])
        for name in old_common
        if old_params[name]['type'] != new_params[name]['type']
    }
    if changed_types:
        result += "Param type changed:\n"
        for name, (old_type, new_type) in changed_types.items():
            result += f"{name:>20}: {old_type}\n"
            result += f"{'':>20}->{new_type}\n"

    # changed return value
    old_ret = old['type']['ret_type']
    new_ret = new['type']['ret_type']
    if old_ret != new_ret:
        result += "Return type changed:\n"
        result += f"{'':>20}: {old_ret}\n"
        result += f"{'':>20}->{new_ret}\n"

    assert result, f"\n{old}\n{new}"
    return result


# Report FUNC differences (parameters and return type) between d1 and d2.
check_diff(d1, d2, Kind.FUNC, diff_func)

In [None]:
def diff_enum(old, new):
    """Describe how a normalised ENUM record changed between two dumps.

    Enumerators are matched by name. The description reports, in this order:
    added enumerators, removed enumerators, and enumerators whose value
    changed. Enumerators are listed in declaration order, so the text is the
    same on every run.

    Args:
        old: Record with a ``'values'`` list of ``{'name', 'val'}`` entries.
        new: Same shape, from the newer dump.

    Returns:
        Multi-line description, never empty.

    Raises:
        AssertionError: If no difference could be identified.
    """
    result = ""

    old_values = {v['name']: v for v in old['values']}
    new_values = {v['name']: v for v in new['values']}

    # added value (in the order the new enum declares them)
    added_values = [name for name in new_values if name not in old_values]
    if added_values:
        result += "Added values:\n"
        for name in added_values:
            result += f"{'':8}{name:40}: {new_values[name]['val']}\n"

    # removed value (in the order the old enum declared them)
    removed_values = [name for name in old_values if name not in new_values]
    if removed_values:
        result += "Removed values:\n"
        for name in removed_values:
            result += f"{'':8}{name:40}: {old_values[name]['val']}\n"

    # values changed
    changed_values = {
        name: (old_values[name]['val'], new_values[name]['val'])
        for name in old_values
        if name in new_values
        and old_values[name]['val'] != new_values[name]['val']
    }
    if changed_values:
        result += "Value changed:\n"
        for name, (old_val, new_val) in changed_values.items():
            result += f"{'':8}{name:40}: {old_val} -> {new_val}\n"

    assert result, f"\n{old}\n{new}"

    return result


# Report ENUM differences (added, removed and renumbered enumerators) between d1 and d2.
check_diff(d1, d2, Kind.ENUM, diff_enum)

Old Kind.ENUM (1153): ['system_states', 'timespec_type', 'x86_hardware_subarch', 'l1tf_mitigations', 'x86_legacy_i8042_state', 'fixed_addresses', 'uprobe_task_state', 'apic_intr_mode_id', 'pcpu_fc', 'numa_stat_item']
New Kind.ENUM (1178): ['system_states', 'timespec_type', 'x86_hardware_subarch', 'l1tf_mitigations', 'x86_legacy_i8042_state', 'fixed_addresses', 'uprobe_task_state', 'apic_intr_mode_id', 'pcpu_fc', 'numa_stat_item']
Removed Kind.ENUM (7): ['hmm_update_event', 'nvdimm_security_state', 'userfaultfd_state', 'si_sm_result', 'evdev_clock_type', 'pcie_data_rate', 'memmap_context']
Added Kind.ENUM (32): ['uv_memprotect', 'ramfs_param', 'rproc_crash_type', 'vfio_notify_type', 'input_clock_type', 'ioc_running', 'net_dm_attr', 'perf_event_sample_format', 'devlink_trap_action', 'ip_conntrack_status']
Common Kind.ENUM (1146): ['audit_nlgrps', 'tg_state_flags', 'rseq_cpu_id_state', 'devkmsg_log_bits', 'opal_mbr_done_flag', 'cgroup_filetype', 'iommu_fault_type', 'devlink_dpipe_header_i