In [6]:
!sudo apt install linux-tools-generic bpftool

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'linux-tools-common' instead of 'bpftool'
linux-tools-generic is already the newest version (6.2.0.39.39).
linux-tools-common is already the newest version (6.2.0-39.40).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.


In [1]:
import os


def system(cmd):
    """Echo a shell command, run it through ``os.system`` and report its status.

    Args:
        cmd: Command line as a single string; it is interpreted by the shell.

    Returns:
        Whatever ``os.system(cmd)`` returns. ``0`` means the command
        succeeded; any other value is a platform-specific failure status.
        Existing callers that ignore the return value are unaffected.
    """
    print(cmd)
    # Hand the status back instead of discarding it so callers can detect
    # a failed command.
    return os.system(cmd)

In [2]:
from pathlib import Path

# Directory that the later cells scan for archives, .btf files and generated dumps.
data_path = Path("data/ubuntu-18.04-x86")

In [17]:
def unzip_all(path, suffix=".tar.xz"):
    """Extract every ``*<suffix>`` archive in ``path`` that is not unpacked yet.

    An archive named ``X<suffix>`` is considered unpacked when ``path / X``
    already exists; in that case a notice is printed and ``tar`` is not run.

    Args:
        path: ``pathlib.Path`` of the directory holding the archives; it is
            also the extraction target (``tar -C``).
        suffix: File-name suffix identifying archives.
    """
    # Local import keeps this cell self-contained.
    import shlex

    for file in path.glob(f"*{suffix}"):
        stem = file.name.removesuffix(suffix)
        result = path / stem
        if result.exists():
            print(f"{result} already exists")
            continue
        # Quote both paths: the command goes through the shell, so a path
        # containing spaces or metacharacters would otherwise be split or
        # misinterpreted. Plain paths are left unchanged by shlex.quote.
        system(f"tar -xf {shlex.quote(str(file))} -C {shlex.quote(str(path))}")


# Unpack any archive in data_path whose extracted file is not present yet.
unzip_all(data_path)

data/ubuntu-18.04-x86/5.0.0-65-generic.btf already exists
data/ubuntu-18.04-x86/4.18.0-25-generic.btf already exists
data/ubuntu-18.04-x86/5.3.0-76-generic.btf already exists
data/ubuntu-18.04-x86/4.15.0-213-generic.btf already exists
data/ubuntu-18.04-x86/5.4.0-91-generic.btf already exists


In [11]:
def get_linux_tools_path(parent=Path("/usr/lib/linux-tools")):
    """Return the sub-directory of ``parent`` that carries the highest version.

    Directory names are compared by their numeric components, so
    ``5.15.0-100-generic`` ranks above ``5.15.0-91-generic``. A plain string
    sort would order them the other way round.

    Args:
        parent: Directory whose sub-directories are named after versions.
            Defaults to the location used by the original notebook.

    Returns:
        ``pathlib.Path`` of the selected sub-directory.

    Raises:
        Exception: ``parent`` contains no sub-directory.
        OSError: ``parent`` itself cannot be listed (raised by ``iterdir``).
    """
    # Local import keeps this cell self-contained.
    import re

    def version_key(directory):
        # "6.2.0-39-generic" -> ([6, 2, 0, 39], "6.2.0-39-generic").
        # The name is only a tie-breaker between equal numeric parts.
        numbers = [int(part) for part in re.findall(r"\d+", directory.name)]
        return numbers, directory.name

    versions = [x for x in Path(parent).iterdir() if x.is_dir()]
    if not versions:
        raise Exception("No linux-tools found")
    # iterdir() already yields paths that include `parent`, so the winning
    # entry is returned as is rather than joined onto `parent` again.
    return max(versions, key=version_key)


def get_bpftool_path():
    """Return the path of the ``bpftool`` entry inside the linux-tools directory.

    The directory comes from ``get_linux_tools_path()``.

    Raises:
        Exception: no ``bpftool`` entry exists in that directory.
    """
    candidate = get_linux_tools_path() / "bpftool"
    if candidate.exists():
        return candidate
    raise Exception("bpftool not found")


# Resolved once here; the dump cell interpolates it into its shell commands.
bpftool_path = get_bpftool_path()

In [18]:
# Dump every .btf file in data_path with bpftool in three variants
# ("format c" -> .h, "format raw" -> .txt, "--json" -> .json), skipping
# outputs that already exist.
# NOTE(review): the output file is produced by shell redirection, so it
# presumably gets created even when bpftool fails; a failed dump would then be
# reported as "already exists" on the next run — verify.
for file in data_path.glob("*.btf"):
    for ext, cmd in [(".h", "format c"), (".txt", "format raw"), (".json", "--json")]:
        # Output sits next to the input, with the ".btf" suffix replaced by ext.
        result = file.with_suffix(ext)
        if not result.exists():
            system(f"{bpftool_path} btf dump file {file} {cmd} > {result}")
        else:
            print(f"{result} already exists")

data/ubuntu-18.04-x86/5.4.0-91-generic.h already exists
data/ubuntu-18.04-x86/5.4.0-91-generic.txt already exists
data/ubuntu-18.04-x86/5.4.0-91-generic.json already exists
data/ubuntu-18.04-x86/5.0.0-65-generic.h already exists
data/ubuntu-18.04-x86/5.0.0-65-generic.txt already exists
data/ubuntu-18.04-x86/5.0.0-65-generic.json already exists
data/ubuntu-18.04-x86/5.3.0-76-generic.h already exists
data/ubuntu-18.04-x86/5.3.0-76-generic.txt already exists
data/ubuntu-18.04-x86/5.3.0-76-generic.json already exists
data/ubuntu-18.04-x86/4.15.0-213-generic.h already exists
data/ubuntu-18.04-x86/4.15.0-213-generic.txt already exists
data/ubuntu-18.04-x86/4.15.0-213-generic.json already exists
data/ubuntu-18.04-x86/4.18.0-25-generic.h already exists
data/ubuntu-18.04-x86/4.18.0-25-generic.txt already exists
data/ubuntu-18.04-x86/4.18.0-25-generic.json already exists


In [20]:
# All JSON dumps in data_path, in sorted path order.
json_paths = sorted(data_path.glob("*.json"))

json_paths

[PosixPath('data/ubuntu-18.04-x86/4.15.0-213-generic.json'),
 PosixPath('data/ubuntu-18.04-x86/4.18.0-25-generic.json'),
 PosixPath('data/ubuntu-18.04-x86/5.0.0-65-generic.json'),
 PosixPath('data/ubuntu-18.04-x86/5.3.0-76-generic.json'),
 PosixPath('data/ubuntu-18.04-x86/5.4.0-91-generic.json')]

In [126]:
import json


def load_json(json_path):
    """Parse the JSON document stored at ``json_path``.

    Args:
        json_path: Path (string or path-like) of the file to read.

    Returns:
        The decoded JSON value.

    Raises:
        OSError: the file cannot be opened.
        json.JSONDecodeError: the content is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)


# Parsed dumps, index-aligned with json_paths.
jsons = [load_json(p) for p in json_paths]

In [86]:
from enum import Enum


class Kind(str, Enum):
    """Names of BTF type kinds, as compared against the ``'kind'`` field of dump entries.

    The ``str`` mix-in makes each member compare equal to the plain string
    with the same spelling, so ``elem['kind'] == Kind.INT`` works on entries
    loaded from JSON.
    """

    # Every member's value is identical to its name.
    INT = "INT"
    PTR = "PTR"
    ARRAY = "ARRAY"
    STRUCT = "STRUCT"
    UNION = "UNION"
    ENUM = "ENUM"
    FWD = "FWD"
    TYPEDEF = "TYPEDEF"
    VOLATILE = "VOLATILE"
    CONST = "CONST"
    RESTRICT = "RESTRICT"
    FUNC = "FUNC"
    FUNC_PROTO = "FUNC_PROTO"
    VAR = "VAR"
    DATASEC = "DATASEC"
    FLOAT = "FLOAT"
    DECL_TAG = "DECL_TAG"
    TYPE_TAG = "TYPE_TAG"
    ENUM64 = "ENUM64"

In [179]:
import copy
from functools import cache


class BTF:
    """Type table loaded from one BTF JSON dump, with helpers to normalize entries.

    The dump must be a mapping whose ``'types'`` value is a list of type
    dictionaries. The entry at list position ``i`` must carry ``'id' == i + 1``
    (checked in ``__getitem__``); id ``0`` is answered with a synthetic
    ``void`` entry.

    ``filter_on_kind`` and ``_normalize_sub_elem`` memoize their results in
    dictionaries owned by the instance. ``functools.cache`` on the methods
    would key a class-level cache on ``self`` and keep every instance, and
    its whole parsed dump, alive for as long as the class exists.

    Attributes:
        path: The path given to the constructor; used for loading and display.
    """

    # Kinds that are expanded further when they appear as a referenced type.
    # Any other referenced kind is reduced to a stub without its own
    # references (see `_normalize_sub_elem`).
    RECURSE_KINDS = {Kind.CONST, Kind.VOLATILE,
                     Kind.PTR, Kind.FUNC, Kind.FUNC_PROTO, Kind.ARRAY}

    def __init__(self, path, raw_data=None):
        """Wrap a dump.

        Args:
            path: Location of the JSON dump.
            raw_data: Already parsed dump (the whole document, including the
                ``'types'`` key). When ``None`` the file at ``path`` is loaded.
        """
        self.path = path
        if raw_data is None:
            raw_data = load_json(path)
        self._raw_data = raw_data['types']
        # Per-instance memoization tables (kind -> dict, type id -> dict).
        self._kind_cache = {}
        self._sub_elem_cache = {}

    def __getitem__(self, id):
        """Return the raw entry with the given type id (``0`` yields ``void``)."""
        if id == 0:
            return {'id': 0, 'name': 'void', 'kind': 'VOID'}
        # Ids are 1-based while the list is 0-based.
        e = self._raw_data[id - 1]
        assert e['id'] == id
        return e

    def __len__(self):
        """Number of entries in the dump (the synthetic ``void`` is not counted)."""
        return len(self._raw_data)

    def __iter__(self):
        """Iterate over the raw entries in dump order."""
        return iter(self._raw_data)

    def print(self):
        """Print the file name, one raw/normalized sample per kind, and kind counts."""
        print(f"File: {self.path}")

        print("Sample:")
        kind_counts = {}
        for e in self:
            kind = e['kind']
            # The first entry seen for a kind serves as its sample.
            if kind not in kind_counts:
                print(f"\t{e['id']:6} ({kind:10}): {e}")
                print(f"\t{'':18}-> {self.normalize(e)}")
            kind_counts[kind] = kind_counts.get(kind, 0) + 1

        by_frequency = sorted(kind_counts.items(), key=lambda x: x[1], reverse=True)
        print("Kinds:")
        print(f"\t{dict(by_frequency)}")

        print()

    def filter_on_kind(self, kind):
        """Return ``{name: normalized entry}`` for all named entries of ``kind``.

        Entries named ``'(anon)'`` are skipped. When several entries share a
        name, the last one in dump order wins. The returned dictionary is the
        memoized object itself, so treat it as read-only.
        """
        try:
            return self._kind_cache[kind]
        except KeyError:
            pass
        named = {
            e['name']: self.normalize(e) for e in self
            if e['kind'] == kind and e['name'] != '(anon)'
        }
        self._kind_cache[kind] = named
        return named

    def get_by_kind_name(self, kind, name):
        """Return the normalized entry of ``kind`` called ``name`` (``KeyError`` if absent)."""
        return self.filter_on_kind(kind)[name]

    def _normalize_sub_elem(self, type_id):
        """Normalize the entry referenced by ``type_id``, memoized per instance.

        Kinds listed in ``RECURSE_KINDS`` are normalized recursively; all
        other kinds are normalized with ``recurse=False``.
        """
        try:
            return self._sub_elem_cache[type_id]
        except KeyError:
            pass
        elem = self[type_id]
        recurse = elem['kind'] in self.RECURSE_KINDS
        normalized = self.normalize(elem, recurse=recurse)
        self._sub_elem_cache[type_id] = normalized
        return normalized

    def normalize(self, elem, recurse=True):
        """Return a copy of ``elem`` with ids removed and references inlined.

        Args:
            elem: Raw entry from the dump; it is not modified.
            recurse: When true, ``type_id`` / ``ret_type_id`` and the
                ``type_id`` of each param/member are replaced by the
                normalized referenced entry. When false, those references and
                the ``params`` / ``members`` / enum ``values`` lists are
                dropped, leaving a stub.

        Returns:
            A new dictionary. Nested normalized entries come from the
            per-instance cache and may be shared between results, so treat
            them as read-only.
        """
        elem = copy.deepcopy(elem)

        # Remove redundant fields
        del elem['id']
        kind = elem['kind']
        if kind == Kind.INT:
            assert elem['bits_offset'] == 0
            del elem['bits_offset']
            del elem['encoding']
            del elem['nr_bits']
            del elem['size']
        if kind == Kind.ARRAY:
            del elem['index_type_id']
        if kind == Kind.ENUM:
            if not recurse:
                assert elem['vlen'] == len(elem['values'])
                del elem['values']
                del elem['vlen']
                # Tolerate dumps whose ENUM entries carry no 'encoding' field.
                elem.pop('encoding', None)
        if kind == Kind.FUNC:
            assert elem['linkage'] == 'static'
            del elem['linkage']

        # Normalize types
        for key in ['type', 'ret_type']:
            id_key = f"{key}_id"

            if id_key not in elem:
                continue

            if recurse:
                elem[key] = self._normalize_sub_elem(elem[id_key])
            del elem[id_key]

        for list_key in ['params', 'members']:
            if list_key not in elem:
                continue

            assert len(elem[list_key]) == elem['vlen']
            del elem['vlen']

            if recurse:
                for item in elem[list_key]:
                    item['type'] = self._normalize_sub_elem(item['type_id'])
                    del item['type_id']
            else:
                del elem[list_key]
                if list_key == 'members':
                    del elem['size']

        return elem


# Wrap the first two dumps of json_paths, reusing the already parsed JSON.
d1 = BTF(json_paths[0], jsons[0])
d2 = BTF(json_paths[1], jsons[1])

d1.print()

# Alternative lookups, kept for reference:
# d1.get_by_kind_name(Kind.STRUCT, "task_struct")
# d1.get_by_kind_name(Kind.STRUCT, "uprobe_task")
d1.get_by_kind_name(Kind.FUNC, "vfs_read")

File: data/ubuntu-18.04-x86/4.15.0-213-generic.json
Sample:
	     1 (INT       ): {'id': 1, 'kind': 'INT', 'name': 'long unsigned int', 'size': 8, 'bits_offset': 0, 'nr_bits': 64, 'encoding': '(none)'}
	                  -> {'kind': 'INT', 'name': 'long unsigned int'}
	     2 (CONST     ): {'id': 2, 'kind': 'CONST', 'name': '(anon)', 'type_id': 1}
	                  -> {'kind': 'CONST', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	     3 (VOLATILE  ): {'id': 3, 'kind': 'VOLATILE', 'name': '(anon)', 'type_id': 1}
	                  -> {'kind': 'VOLATILE', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	     4 (ARRAY     ): {'id': 4, 'kind': 'ARRAY', 'name': '(anon)', 'type_id': 1, 'index_type_id': 20, 'nr_elems': 2}
	                  -> {'kind': 'ARRAY', 'name': '(anon)', 'nr_elems': 2, 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	     5 (PTR       ): {'id': 5, 'kind': 'PTR', 'name': '(anon)', 'type_id': 8}
	               

{'kind': 'FUNC',
 'name': 'vfs_read',
 'type': {'kind': 'FUNC_PROTO',
  'name': '(anon)',
  'params': [{'name': 'file',
    'type': {'kind': 'PTR',
     'name': '(anon)',
     'type': {'kind': 'STRUCT', 'name': 'file'}}},
   {'name': 'buf',
    'type': {'kind': 'PTR',
     'name': '(anon)',
     'type': {'kind': 'INT', 'name': 'char'}}},
   {'name': 'count', 'type': {'kind': 'TYPEDEF', 'name': 'size_t'}},
   {'name': 'pos',
    'type': {'kind': 'PTR',
     'name': '(anon)',
     'type': {'kind': 'TYPEDEF', 'name': 'loff_t'}}}],
  'ret_type': {'kind': 'TYPEDEF', 'name': 'ssize_t'}}}

In [180]:
def diff_struct(old, new):
    """Describe how two normalized STRUCT entries differ.

    The checks run in this order and the first one that matches is reported:
    members added/removed, members reordered, member type changed, member
    offset changed, total size changed.

    Args:
        old: Normalized struct with ``'members'`` (each with ``'name'``,
            ``'type'``, ``'bits_offset'``) and ``'size'``.
        new: The entry to compare against, same layout.

    Returns:
        A message; the type-change message spans several lines.

    Raises:
        AssertionError: none of the checks found a difference.
    """
    def fmt_names(names):
        # Sets have no stable iteration order; sort so that the message is
        # reproducible from run to run. The set-style braces are kept.
        return "{" + ", ".join(repr(n) for n in sorted(names)) + "}"

    # NOTE(review): members sharing a name (for instance several '(anon)'
    # members, if the dump contains them) collapse to the last one here.
    old_members = {m['name']: m for m in old['members']}
    new_members = {m['name']: m for m in new['members']}

    # fields added or removed
    added_members = new_members.keys() - old_members.keys()
    removed_members = old_members.keys() - new_members.keys()
    if added_members and removed_members:
        return (f"Fields added and removed: "
                f"{fmt_names(added_members)} {fmt_names(removed_members)}")
    if added_members:
        return f"Added fields: {fmt_names(added_members)}"
    if removed_members:
        return f"Removed fields: {fmt_names(removed_members)}"

    # fields reordered: only the order of names matters, so report the names
    # rather than dumping both complete member dictionaries.
    if list(old_members) != list(new_members):
        return f"Fields reordered: {list(old_members)} -> {list(new_members)}"

    # fields changed type
    changed_types = {
        name: (old_members[name]['type'], new_members[name]['type'])
        for name in old_members
        if old_members[name]['type'] != new_members[name]['type']
    }
    if changed_types:
        result = "Field type changed:\n"
        for name, (old_type, new_type) in changed_types.items():
            result += f"{name:20}: {old_type} \n{'':20}->{new_type}\n"
        return result

    # same type but different size: a shifted offset means that some member
    # changed size although its normalized type did not.
    for name in old_members:
        if old_members[name]['bits_offset'] != new_members[name]['bits_offset']:
            return "Field size changed despite same type"

    if old['size'] != new['size']:
        return "Field size changed despite same type"

    # Explicit raise: unlike `assert False`, it is not stripped under -O.
    raise AssertionError(f"\n{old}\n{new}")



def print_collection(name, s, num=10):
    """Print one line with a label, the size of ``s`` and its first ``num`` items.

    Args:
        name: Label placed at the start of the line.
        s: Sized iterable to summarize.
        num: Maximum number of items shown.
    """
    total = len(s)
    preview = list(s)[:num]
    print(f"{name} ({total}): {preview}")


def check_diff(d_old, d_new, kind):
    """Print a comparison of all named entries of one kind between two dumps.

    Prints summaries of the old, new, removed, added and common names, then
    the names whose normalized entries differ, followed by one explanation
    per changed entry.

    Args:
        d_old: Object providing ``filter_on_kind(kind)`` for the older dump.
        d_new: Same for the newer dump.
        kind: ``"STRUCT"`` or ``"FUNC"``; any other value raises ``KeyError``
            after the summaries up to "Common" have been printed.
    """
    f_old = d_old.filter_on_kind(kind)
    f_new = d_new.filter_on_kind(kind)
    old_names, new_names = f_old.keys(), f_new.keys()

    summaries = (
        ("Old", old_names),
        ("New", new_names),
        ("Removed", old_names - new_names),
        ("Added", new_names - old_names),
    )
    for label, names in summaries:
        print_collection(f"{label} {kind}", names)

    # Pair up the entries present on both sides.
    common = {}
    for name in old_names & new_names:
        common[name] = (f_old[name], f_new[name])
    print_collection(f"Common {kind}", common.keys())

    explainers = {"STRUCT": diff_struct, "FUNC": diff_func}
    diff = explainers[kind]

    changed = {}
    for name, pair in common.items():
        if pair[0] != pair[1]:
            changed[name] = pair
    print_collection(f"Changed {kind}", changed.keys())

    for name, (old, new) in changed.items():
        # First line goes next to the name, continuation lines are indented.
        headline, *continuation = diff(old, new).split("\n")
        print(f"{kind:10}{name:30}: {headline}")
        for line in continuation:
            print(f"{'':40} {line}")

def diff_func(old, new):
    """Describe how two normalized FUNC entries differ.

    The checks run in this order and the first one that matches is reported:
    parameters added/removed, parameters reordered, parameter type changed,
    return type changed.

    Args:
        old: Normalized function whose ``'type'`` is a FUNC_PROTO with
            ``'params'`` (each with ``'name'`` and ``'type'``) and
            ``'ret_type'``.
        new: The entry to compare against, same layout.

    Returns:
        A message; the type-change message spans several lines.

    Raises:
        AssertionError: an entry's ``'type'`` is not a FUNC_PROTO, or none of
            the checks found a difference.
    """
    def fmt_names(names):
        # Sets have no stable iteration order; sort so that the message is
        # reproducible from run to run. The set-style braces are kept.
        return "{" + ", ".join(repr(n) for n in sorted(names)) + "}"

    assert old['type']['kind'] == Kind.FUNC_PROTO
    assert new['type']['kind'] == Kind.FUNC_PROTO
    # NOTE(review): parameters sharing a name collapse to the last one here.
    old_params = {p['name']: p for p in old['type']['params']}
    new_params = {p['name']: p for p in new['type']['params']}

    # params added or removed
    added_params = new_params.keys() - old_params.keys()
    removed_params = old_params.keys() - new_params.keys()
    if added_params and removed_params:
        return (f"Params added and removed: "
                f"{fmt_names(added_params)} {fmt_names(removed_params)}")
    if added_params:
        return f"Added params: {fmt_names(added_params)}"
    if removed_params:
        return f"Removed params: {fmt_names(removed_params)}"

    # params reordered: only the order of names matters, so report the names
    # rather than dumping both complete parameter dictionaries.
    if list(old_params) != list(new_params):
        return f"Params reordered: {list(old_params)} -> {list(new_params)}"

    # params changed type
    changed_types = {
        name: (old_params[name]['type'], new_params[name]['type'])
        for name in old_params
        if old_params[name]['type'] != new_params[name]['type']
    }
    if changed_types:
        result = "Param type changed:\n"
        for name, (old_type, new_type) in changed_types.items():
            result += f"{name:20}: {old_type} \n{'':20}->{new_type}\n"
        return result

    # return type changed
    old_ret = old['type']['ret_type']
    new_ret = new['type']['ret_type']
    if old_ret != new_ret:
        return f"Return type changed: {old_ret} -> {new_ret}"

    # Explicit raise: unlike `assert False`, it is not stripped under -O.
    raise AssertionError(f"\n{old}\n{new}")

# Compare the FUNC entries of d1 and d2; the STRUCT comparison is left disabled.
# check_diff(d1, d2, 'STRUCT')
check_diff(d1, d2, 'FUNC')

Old FUNC (42719): ['copy_bootdata', 'early_make_pgtable', 'reset_early_page_tables', '__startup_secondary_64', '__startup_64', 'x86_64_start_reservations', 'x86_64_start_kernel', '__early_make_pgtable', 'read_cr3_pa', 'clear_page']
New FUNC (44531): ['copy_bootdata', 'early_make_pgtable', 'reset_early_page_tables', '__startup_secondary_64', '__startup_64', 'x86_64_start_reservations', 'x86_64_start_kernel', '__early_make_pgtable', 'read_cr3_pa', 'clear_page']
Removed FUNC (1801): ['SYSC_newfstatat', 'SyS_fchown', 'sme_map_range_encrypted', 'SyS_pwrite64', 'SyS_sched_get_priority_max', 'hv_pci_init', 'compat_SyS_clock_adjtime', 'acpi_gpiochip_alloc_event', 'compat_SyS_old_getrlimit', 'serial8250_get_baud_rate']
Added FUNC (3613): ['dev_direct_xmit', 'genpd_perf_state_show', '__bpf_trace_ext4_fsmap_class', 'dma_direct_free', '__bpf_trace_rtc_alarm_irq_enable', '__x64_sys_brk', '__x64_sys_setregid16', '__device_add_disk', 'ip6mr_seq_read', 'smack_sk_free_security']
Common FUNC (40918): ['