In [8]:
%load_ext autoreload
%autoreload now

from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from pathlib import Path
import json


class BTF:
    """In-memory view of a type dump stored as a JSON Lines file.

    Every line of the file is parsed as one JSON object. Each entry is
    expected to carry a ``"kind"`` key; most, but not all, also carry a
    ``"name"`` key.
    """

    def __init__(self, path):
        """Load all entries from ``path`` (a ``str`` or ``Path``)."""
        self.path = Path(path)
        self.data = self.read_jsonl(self.path)
        # Lazily filled cache used by filter_on_kind: kind -> {name -> entry}.
        self.data_by_kind = {}

    @staticmethod
    def read_jsonl(jsonl_path):
        """Parse a ``.jsonl`` file into a list with one object per line.

        Args:
            jsonl_path: ``str`` or ``Path`` of an existing ``.jsonl`` file.

        Returns:
            The decoded objects in file order.

        Raises:
            AssertionError: if the file is missing or lacks the ``.jsonl`` suffix.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        # Accept plain strings too, so the helper is usable on its own.
        jsonl_path = Path(jsonl_path)
        assert jsonl_path.exists(), f"{jsonl_path} does not exist"
        assert jsonl_path.suffix == ".jsonl", f"{jsonl_path} is not a .jsonl file"

        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(jsonl_path, encoding="utf-8") as f:
            # Blank lines (e.g. a trailing newline) carry no entry; skip them
            # instead of letting json.loads fail on an empty string.
            return [json.loads(line) for line in f if line.strip()]

    @property
    def short_name(self):
        """Version prefix of the file name without its trailing ``.0``.

        For ``5.13.0-52-generic.jsonl`` this is ``"5.13"``.
        """
        linux_version = self.path.name.split("-")[0]
        assert linux_version.endswith(".0")
        return linux_version[:-2]

    def print(self):
        """Print the file path, the first entry seen of each kind, and per-kind counts."""
        from collections import Counter

        print(f"File: {self.path}")

        kinds = Counter()
        print("Sample:")
        for e in self.data:
            # The first time a kind shows up, print that entry as its sample.
            if e["kind"] not in kinds:
                print(f"\t{e['kind']:10}: {e}")
            kinds[e["kind"]] += 1

        # most_common() orders by count, highest first.
        print(f"Kinds: {dict(kinds.most_common())}")

        print()

    def filter_on_kind(self, kind):
        """Return ``{name: entry}`` for all named entries of ``kind``.

        Entries without a ``"name"`` key and entries named ``"(anon)"`` are
        left out. If several entries share a name, the last one in the file
        wins. The mapping is built once per kind and then served from the cache.
        """
        if kind not in self.data_by_kind:
            self.data_by_kind[kind] = {
                e["name"]: e
                for e in self.data
                # .get(): some kinds (e.g. PTR) have no "name" key at all.
                if e["kind"] == kind and e.get("name") not in (None, "(anon)")
            }
        return self.data_by_kind[kind]

    def get(self, kind, name):
        """Return the entry of ``kind`` called ``name``; raises ``KeyError`` if absent."""
        return self.filter_on_kind(kind)[name]


# Load the 5.13.0-52-generic dump from the 20.04-x86 data directory.
btf1 = BTF("data/20.04-x86/5.13.0-52-generic.jsonl")


# Other lookups to try (move one to the last line of the cell to display it):
# btf1.get(Kind.UNION, "intel_x86_pebs_dse")
# btf1.get(Kind.STRUCT, "task_struct")
# btf1.get(Kind.FUNC, "vfs_read")
# Display the ENUM entry named "nf_ip_hook_priorities".
btf1.get(Kind.ENUM, "nf_ip_hook_priorities")

{'kind': 'ENUM',
 'name': 'nf_ip_hook_priorities',
 'size': 4,
 'values': [{'name': 'NF_IP_PRI_FIRST', 'val': -2147483648},
  {'name': 'NF_IP_PRI_RAW_BEFORE_DEFRAG', 'val': -450},
  {'name': 'NF_IP_PRI_CONNTRACK_DEFRAG', 'val': -400},
  {'name': 'NF_IP_PRI_RAW', 'val': -300},
  {'name': 'NF_IP_PRI_SELINUX_FIRST', 'val': -225},
  {'name': 'NF_IP_PRI_CONNTRACK', 'val': -200},
  {'name': 'NF_IP_PRI_MANGLE', 'val': -150},
  {'name': 'NF_IP_PRI_NAT_DST', 'val': -100},
  {'name': 'NF_IP_PRI_FILTER', 'val': 0},
  {'name': 'NF_IP_PRI_SECURITY', 'val': 50},
  {'name': 'NF_IP_PRI_NAT_SRC', 'val': 100},
  {'name': 'NF_IP_PRI_SELINUX_LAST', 'val': 225},
  {'name': 'NF_IP_PRI_CONNTRACK_HELPER', 'val': 300},
  {'name': 'NF_IP_PRI_CONNTRACK_CONFIRM', 'val': 2147483647},
  {'name': 'NF_IP_PRI_LAST', 'val': 2147483647}]}

In [10]:
# Load a second dump and print one sample entry per kind plus the per-kind counts.
btf2 = BTF("data/20.04-x86/5.15.0-92-generic.jsonl")
btf2.print()

File: data/20.04-x86/5.15.0-92-generic.jsonl
Sample:
	INT       : {'kind': 'INT', 'name': 'long unsigned int'}
	CONST     : {'kind': 'CONST', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	VOLATILE  : {'kind': 'VOLATILE', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	ARRAY     : {'kind': 'ARRAY', 'name': '(anon)', 'nr_elems': 2, 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	PTR       : {'kind': 'PTR', 'type': {'kind': 'CONST', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'char'}}}
	TYPEDEF   : {'kind': 'TYPEDEF', 'name': '__s8', 'type': {'kind': 'INT', 'name': 'signed char'}}
	ENUM      : {'kind': 'ENUM', 'name': '(anon)', 'size': 4, 'values': [{'name': 'false', 'val': 0}, {'name': 'true', 'val': 1}]}
	FUNC_PROTO: {'kind': 'FUNC_PROTO', 'params': [{'name': '(anon)', 'type': {'kind': 'INT', 'name': 'int'}}], 'ret_type': {'name': 'void', 'kind': 'VOID'}}
	STRUCT    : {'kind': 'STRUCT', 'name': '(anon)', 'size': 4, 'member

In [11]:
import sys


class FileLogger:
    """Context manager that redirects ``sys.stdout`` into a log file.

    While the ``with`` block is active, everything written to ``sys.stdout``
    goes to the log file and, if ``print_to_stdout`` is true, also to the
    stream that was ``sys.stdout`` when the block was entered.
    """

    def __init__(self, path, print_to_stdout=True):
        """Create (or truncate) the log file at ``path``.

        Args:
            path: ``str`` or ``Path`` of the log file; missing parent
                directories are created.
            print_to_stdout: when true, echo every message to the real stdout.
        """
        self.print_to_stdout = print_to_stdout
        self.stdout = sys.stdout

        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        self.log = open(path, "w")

    def write(self, message):
        """Write ``message`` to the log (and optionally the real stdout)."""
        if self.print_to_stdout:
            self.stdout.write(message)
        return self.log.write(message)

    def flush(self):
        """Flush the underlying streams.

        Needed because ``print(..., flush=True)`` and other callers invoke
        ``sys.stdout.flush()``; without it they raise ``AttributeError`` while
        this object is installed as ``sys.stdout``.
        """
        if self.print_to_stdout:
            self.stdout.flush()
        self.log.flush()

    def __enter__(self):
        # Remember the stream that is current *now*, so __exit__ restores
        # exactly what it replaced. Skip this if we are already installed,
        # otherwise write() would recurse into itself.
        if sys.stdout is not self:
            self.stdout = sys.stdout
        sys.stdout = self
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore stdout first so a failure while closing is still visible.
        sys.stdout = self.stdout
        self.log.close()

In [12]:
from collections import defaultdict
from dataclasses import dataclass


def print_as_list(name, s, num=10):
    """Print ``name``, the size of ``s`` and a preview of its first ``num`` items.

    ``s`` must support both ``len()`` and iteration.
    """
    total = len(s)
    preview = list(s)[:num]
    print(f"{name} ({total}): {preview}")


def diff_dict(old, new):
    """Split the keys of two mappings into added, removed and shared.

    Returns:
        A tuple ``(added, removed, common)`` where ``added`` holds the items
        only in ``new`` (in ``new`` order), ``removed`` the items only in
        ``old`` (in ``old`` order) and ``common`` maps every shared key, in
        ``old`` order, to the pair ``(old_value, new_value)``.
    """
    added = {}
    for key, value in new.items():
        if key not in old:
            added[key] = value

    removed, common = {}, {}
    for key, value in old.items():
        if key in new:
            common[key] = (value, new[key])
        else:
            removed[key] = value

    return added, removed, common


@dataclass(frozen=True)
class DiffResult:
    """Outcome of comparing the entries of one kind between two dumps.

    The first three fields are the mappings returned by ``diff_dict`` (they
    are dicts, not sets); ``changed`` and ``reasons`` are derived from the
    shared entries that differ.
    """

    # Entries present only in the new dump: name -> entry.
    added: dict[str, dict]
    # Entries present only in the old dump: name -> entry.
    removed: dict[str, dict]
    # Entries present in both dumps: name -> (old entry, new entry).
    common: dict[str, tuple]
    # Shared entries that differ: name -> lines of the textual diff report.
    changed: dict[str, list]
    # Number of changed entries per report heading (e.g. "Added fields").
    reasons: dict[str, int]


def check_diff_impl(d_old, d_new, kind, diff_fn):
    """Compare all named entries of ``kind`` in two dumps and print a report.

    Args:
        d_old, d_new: objects exposing ``filter_on_kind(kind)`` that returns
            a ``{name: entry}`` mapping.
        kind: the entry kind to compare.
        diff_fn: ``diff_fn(old_entry, new_entry)`` returning a text report
            whose heading lines are unindented and whose detail lines start
            with a tab.

    Returns:
        A ``DiffResult`` with the added/removed/common mappings, the report
        lines of every changed entry and the per-heading counts.
    """
    old_entries = d_old.filter_on_kind(kind)
    new_entries = d_new.filter_on_kind(kind)
    for label, entries in (("Old", old_entries), ("New", new_entries)):
        print_as_list(f"{label} {kind}", entries.keys())

    added, removed, common = diff_dict(old_entries, new_entries)
    for label, group in (("Added", added), ("Removed", removed), ("Common", common)):
        print_as_list(f"{label} {kind}", group)

    # Only shared entries that actually differ get a report.
    changed = {}
    for name, (before, after) in common.items():
        if before != after:
            changed[name] = diff_fn(before, after).strip().split("\n")
    print_as_list(f"Changed {kind}", changed.keys())

    # Unindented lines are section headings; count each once per entry.
    reasons = defaultdict(int)
    for report in changed.values():
        for line in report:
            if line.startswith("\t"):
                continue
            reasons[line.rstrip(":")] += 1
    print_as_list(f"Reasons {kind}", reasons.items())

    for name, report in changed.items():
        print(name)
        for line in report:
            print(f"\t{line}")

    return DiffResult(added, removed, common, changed, reasons)

In [13]:
def diff_struct(old, new):
    """Describe how an entry with ``members`` differs between two dumps.

    Members are matched by name. The report can contain these sections, in
    this order: added fields, removed fields, reordered fields and fields
    whose type changed. ``"Layout changed"`` is reported only when none of
    those apply but a member offset or the total size differs.

    Raises:
        AssertionError: if no difference could be described.
    """

    def by_name(entry):
        return {member["name"]: member for member in entry["members"]}

    before, after = by_name(old), by_name(new)
    added, removed, common = diff_dict(before, after)

    sections = []

    # Fields that exist on one side only.
    for header, members in (
        ("Added fields:\n", added),
        ("Removed fields:\n", removed),
    ):
        if members:
            sections.append(header)
            sections.extend(
                f"\t{name:20}: {member['type']}\n" for name, member in members.items()
            )

    # Shared fields whose relative order differs.
    kept_old_order = [name for name in before if name in common]
    kept_new_order = [name for name in after if name in common]
    if kept_old_order != kept_new_order:
        sections.append("Fields reordered:\n")
        sections.append(f"\t{'':20} {list(before)}\n")
        sections.append(f"\t{'':20} {list(after)}\n")

    # Shared fields whose type differs.
    retyped = [
        (name, was["type"], now["type"])
        for name, (was, now) in common.items()
        if was["type"] != now["type"]
    ]
    if retyped:
        sections.append("Field type changed:\n")
        for name, old_type, new_type in retyped:
            sections.append(f"\t{name:20}: {old_type}\n")
            sections.append(f"\t{'':20}->{new_type}\n")

    # Offsets and size are only a fallback explanation, used when nothing
    # above accounts for the difference.
    offsets_before = {name: member["bits_offset"] for name, member in before.items()}
    offsets_after = {name: member["bits_offset"] for name, member in after.items()}
    same_layout = offsets_before == offsets_after and old["size"] == new["size"]

    result = "".join(sections)
    if not same_layout and result == "":
        result = "Layout changed\n"

    assert result, f"\n{old}\n{new}"
    return result


# Example usage: check_diff_impl(btf1, btf2, Kind.STRUCT, diff_struct)

In [14]:
# Example usage (unions reuse the struct differ): check_diff_impl(btf1, btf2, Kind.UNION, diff_struct)

In [15]:
def diff_func(old, new):
    """Describe how a function entry differs between two dumps.

    Both entries must hold their prototype under ``"type"``, with a
    ``"params"`` list and a ``"ret_type"``. Parameters are matched by name.
    The report can contain these sections, in this order: added params,
    removed params, reordered params, params whose type changed, and a
    changed return type.

    Raises:
        AssertionError: if no difference could be described.
    """

    def by_name(entry):
        return {param["name"]: param for param in entry["type"]["params"]}

    before, after = by_name(old), by_name(new)
    added, removed, common = diff_dict(before, after)

    sections = []

    # Parameters that exist on one side only.
    for header, params in (
        ("Added params:\n", added),
        ("Removed params:\n", removed),
    ):
        if params:
            sections.append(header)
            sections.extend(
                f"\t{name:20}: {param['type']}\n" for name, param in params.items()
            )

    # Shared parameters whose relative order differs.
    kept_old_order = [name for name in before if name in common]
    kept_new_order = [name for name in after if name in common]
    if kept_old_order != kept_new_order:
        sections.append("Params reordered:\n")
        sections.append(f"\t{'':20} {list(before)}\n")
        sections.append(f"\t{'':20} {list(after)}\n")

    # Shared parameters whose type differs.
    retyped = [
        (name, was["type"], now["type"])
        for name, (was, now) in common.items()
        if was["type"] != now["type"]
    ]
    if retyped:
        sections.append("Param type changed:\n")
        for name, old_type, new_type in retyped:
            sections.append(f"\t{name:20}: {old_type}\n")
            sections.append(f"\t{'':20}->{new_type}\n")

    # Return type.
    old_ret = old["type"]["ret_type"]
    new_ret = new["type"]["ret_type"]
    if old_ret != new_ret:
        sections.append("Return type changed:\n")
        sections.append(f"\t{'':20}: {old_ret}\n")
        sections.append(f"\t{'':20}->{new_ret}\n")

    result = "".join(sections)
    assert result, f"\n{old}\n{new}"
    return result


# Example usage: check_diff_impl(btf1, btf2, Kind.FUNC, diff_func)

In [16]:
def diff_enum(old, new):
    """Describe how an enum entry differs between two dumps.

    Enumerators are matched by name. The report can contain these sections,
    in this order: added values, removed values and values whose number
    changed. If none of those apply, the differences the by-name comparison
    cannot see are reported instead: a changed ``size`` and a changed
    enumerator order.

    Args:
        old, new: enum entries with a ``"values"`` list of
            ``{"name": ..., "val": ...}`` items and, optionally, a ``"size"``.

    Returns:
        The report text; heading lines are unindented, detail lines start
        with a tab.

    Raises:
        AssertionError: if no difference could be described.
    """
    result = ""

    old_values = {v["name"]: v for v in old["values"]}
    new_values = {v["name"]: v for v in new["values"]}

    added, removed, common = diff_dict(old_values, new_values)

    # added value
    if added:
        result += "Added values:\n"
        for name, value in added.items():
            result += f"\t{name:40}: {value['val']}\n"

    # removed value
    if removed:
        result += "Removed values:\n"
        for name, value in removed.items():
            result += f"\t{name:40}: {value['val']}\n"

    # values changed
    changed_values = {
        name: (old_value["val"], new_value["val"])
        for name, (old_value, new_value) in common.items()
        if old_value["val"] != new_value["val"]
    }
    if changed_values:
        result += "Value changed:\n"
        for name, (old_val, new_val) in changed_values.items():
            result += f"\t{name:40}: {old_val} -> {new_val}\n"

    # Fallbacks, used only when nothing above explains the difference (the
    # same approach diff_struct takes for "Layout changed"). Without them an
    # enum that differs only in size or enumerator order trips the assert.
    if result == "":
        old_size, new_size = old.get("size"), new.get("size")
        if old_size != new_size:
            result += "Size changed:\n"
            result += f"\t{'':40}: {old_size} -> {new_size}\n"

        if list(old_values) != list(new_values):
            result += "Values reordered:\n"
            result += f"\t{'':40} {list(old_values)}\n"
            result += f"\t{'':40} {list(new_values)}\n"

    assert result, f"\n{old}\n{new}"
    return result


# Example usage: check_diff_impl(btf1, btf2, Kind.ENUM, diff_enum)

In [17]:
def check_diff(d_old, d_new, kind, print_to_stdout=False):
    """Diff one kind of entry between two dumps, logging the report to a file.

    The report goes to ``output/<data dir name>/<old>-><new>.<KIND>.log``;
    both dumps must live in the same directory.

    Args:
        d_old, d_new: the dumps to compare (need ``path`` and ``short_name``).
        kind: one of ``Kind.STRUCT``, ``Kind.UNION``, ``Kind.FUNC`` or
            ``Kind.ENUM``; any other value raises ``KeyError``.
        print_to_stdout: also echo the report to stdout while logging.

    Returns:
        The ``DiffResult`` produced by ``check_diff_impl``.
    """
    data_dir = d_old.path.parent
    assert data_dir == d_new.path.parent

    log_name = f"{d_old.short_name}->{d_new.short_name}.{kind.name}.log"
    log_path = Path("output", data_dir.name, log_name)
    print(f"Logging to {log_path}")

    # Unions have the same member structure as structs, so they share a differ.
    differs = {
        Kind.STRUCT: diff_struct,
        Kind.UNION: diff_struct,
        Kind.FUNC: diff_func,
        Kind.ENUM: diff_enum,
    }
    diff_fn = differs[kind]

    with FileLogger(log_path, print_to_stdout=print_to_stdout):
        return check_diff_impl(d_old, d_new, kind, diff_fn)

In [47]:
import pandas as pd


def diff_btf(btf1, btf2):
    """Summarise the STRUCT, FUNC, ENUM and UNION diffs of two dumps.

    Returns:
        A one-row DataFrame. For each kind (lower-cased as a prefix) it has
        ``<kind>-common``, ``<kind>-added``, ``<kind>-removed`` and
        ``<kind>-changed`` counts plus one ``<kind>-reason-<heading>`` column
        per report heading that occurred.
    """
    print(f"Diffing {btf1.short_name} and {btf2.short_name}")

    row = {}
    for kind in (Kind.STRUCT, Kind.FUNC, Kind.ENUM, Kind.UNION):
        outcome = check_diff(btf1, btf2, kind)
        prefix = kind.name.lower()
        row.update(
            {
                f"{prefix}-common": len(outcome.common),
                f"{prefix}-added": len(outcome.added),
                f"{prefix}-removed": len(outcome.removed),
                f"{prefix}-changed": len(outcome.changed),
            }
        )
        for reason, count in outcome.reasons.items():
            row[f"{prefix}-reason-{reason}"] = count

    return pd.DataFrame([row])


# Example: diff two 18.04-x86 dumps; the returned one-row DataFrame is the cell output.
diff_btf(
    BTF("data/18.04-x86/4.15.0-213-generic.jsonl"),
    BTF("data/18.04-x86/4.18.0-25-generic.jsonl")
)

Diffing 4.15 and 4.18
Logging to output/18.04-x86/4.15->4.18.STRUCT.log
Logging to output/18.04-x86/4.15->4.18.FUNC.log
Logging to output/18.04-x86/4.15->4.18.ENUM.log
Logging to output/18.04-x86/4.15->4.18.UNION.log


Unnamed: 0,struct-common,struct-added,struct-removed,struct-changed,struct-reason-Field type changed,struct-reason-Added fields,struct-reason-Removed fields,struct-reason-Fields reordered,struct-reason-Layout changed,func-common,...,enum-reason-Added values,enum-reason-Removed values,enum-reason-Value changed,union-common,union-added,union-removed,union-changed,union-reason-Added fields,union-reason-Removed fields,union-reason-Field type changed
0,7193,426,136,785,189,354,261,27,189,40918,...,57,23,25,96,6,3,4,3,1,1


In [19]:
def get_json_paths(path):
    """Return the ``*.jsonl`` files directly inside ``path`` in version order.

    Files are ordered by the dotted version that starts the file name
    (``4.15.0-213-generic.jsonl`` sorts as ``(4, 15, 0)``), compared
    numerically so that ``4.4`` comes before ``4.15``. Files with the same
    version are ordered by the number after the first dash when there is
    one, then by file name, so the result does not depend on directory
    listing order.

    Args:
        path: directory to scan, as a ``str`` or ``Path``.

    Raises:
        ValueError: if a file name does not start with a dotted integer version.
    """

    def sort_key(file):
        release, _, rest = file.stem.partition("-")
        version = tuple(map(int, release.split(".")))
        build = rest.split("-")[0]
        # Tie-breakers only: they never reorder files with different versions.
        return version, int(build) if build.isdigit() else -1, file.name

    return sorted(Path(path).glob("*.jsonl"), key=sort_key)


# List the 18.04-x86 dumps in ascending version order.
get_json_paths(Path("data/18.04-x86"))

[PosixPath('data/18.04-x86/4.15.0-213-generic.jsonl'),
 PosixPath('data/18.04-x86/4.18.0-25-generic.jsonl'),
 PosixPath('data/18.04-x86/5.0.0-65-generic.jsonl'),
 PosixPath('data/18.04-x86/5.3.0-76-generic.jsonl'),
 PosixPath('data/18.04-x86/5.4.0-91-generic.jsonl')]

In [28]:
def diff_all_btf_files(paths):
    """Diff every consecutive pair of dumps, plus the first against the last.

    Args:
        paths: dump files in the order they should be compared.

    Returns:
        A DataFrame with one ``diff_btf`` row per compared pair and a
        ``name`` column of the form ``"<old>-><new>"``. It is empty when
        fewer than two paths are given, because there is nothing to compare.
    """
    paths = list(paths)

    pairs = list(zip(paths[:-1], paths[1:]))
    # With exactly two files the first-vs-last pair is the consecutive pair
    # again, so only add the overall comparison when it is a new one.
    if len(paths) > 2:
        pairs.append((paths[0], paths[-1]))

    frames = []
    for path1, path2 in pairs:
        btf1 = BTF(path1)
        btf2 = BTF(path2)
        df = diff_btf(btf1, btf2)
        df["name"] = f"{btf1.short_name}->{btf2.short_name}"
        frames.append(df)

    # pd.concat rejects an empty list, and concatenating once avoids
    # re-copying the accumulated frame on every iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Diff the 18.04-x86 dumps pairwise in version order and display the summary table.
diff_all_btf_files(get_json_paths(Path("data/18.04-x86")))

Diffing 4.15 and 4.18
Logging to output/18.04-x86/4.15->4.18.STRUCT.log
Logging to output/18.04-x86/4.15->4.18.FUNC.log
Logging to output/18.04-x86/4.15->4.18.ENUM.log
Logging to output/18.04-x86/4.15->4.18.UNION.log
Diffing 4.18 and 5.0
Logging to output/18.04-x86/4.18->5.0.STRUCT.log
Logging to output/18.04-x86/4.18->5.0.FUNC.log
Logging to output/18.04-x86/4.18->5.0.ENUM.log
Logging to output/18.04-x86/4.18->5.0.UNION.log
Diffing 5.0 and 5.3
Logging to output/18.04-x86/5.0->5.3.STRUCT.log
Logging to output/18.04-x86/5.0->5.3.FUNC.log
Logging to output/18.04-x86/5.0->5.3.ENUM.log
Logging to output/18.04-x86/5.0->5.3.UNION.log
Diffing 5.3 and 5.4
Logging to output/18.04-x86/5.3->5.4.STRUCT.log
Logging to output/18.04-x86/5.3->5.4.FUNC.log
Logging to output/18.04-x86/5.3->5.4.ENUM.log
Logging to output/18.04-x86/5.3->5.4.UNION.log
Diffing 4.15 and 5.4
Logging to output/18.04-x86/4.15->5.4.STRUCT.log
Logging to output/18.04-x86/4.15->5.4.FUNC.log
Logging to output/18.04-x86/4.15->5.4.EN

Unnamed: 0,struct-common,struct-added,struct-removed,struct-changed,struct-reason-Field type changed,struct-reason-Added fields,struct-reason-Removed fields,struct-reason-Fields reordered,struct-reason-Layout changed,func-common,...,enum-reason-Value changed,union-common,union-added,union-removed,union-changed,union-reason-Added fields,union-reason-Removed fields,union-reason-Field type changed,name,union-reason-Layout changed
0,7193,426,136,785,189,354,261,27,189,40918,...,25,96,6,3,4,3,1.0,1.0,4.15->4.18,
1,7507,347,112,813,184,437,208,35,191,43288,...,37,100,8,2,3,2,1.0,1.0,4.18->5.0,1.0
2,7762,432,92,655,112,352,183,24,168,44821,...,36,106,7,2,5,4,1.0,2.0,5.0->5.3,
3,8160,279,34,369,62,193,88,10,100,46561,...,19,111,15,2,1,1,,,5.3->5.4,
4,7057,1382,272,1293,361,779,451,69,214,39572,...,65,93,33,6,6,5,1.0,2.0,4.15->5.4,


In [31]:
# Build one summary frame per distro directory under data/ and concatenate
# them once at the end, rather than re-copying a growing frame on every pass.
distro_frames = []
for path in sorted(Path("data").glob("*")):
    if not path.is_dir():
        continue

    json_paths = get_json_paths(path)
    if len(json_paths) < 2:
        # A diff needs two dumps; skip directories that cannot provide them
        # instead of failing part-way through the run.
        print(f"Skipping {path}: fewer than two .jsonl dumps")
        continue

    print(f"Diffing {path}")
    df = diff_all_btf_files(json_paths)
    df["distro"] = path.name
    distro_frames.append(df)

# pd.concat rejects an empty list, so fall back to an empty frame.
result = (
    pd.concat(distro_frames, ignore_index=True) if distro_frames else pd.DataFrame()
)

result

Diffing data/16.04-x86
Diffing 4.4 and 4.8
Logging to output/16.04-x86/4.4->4.8.STRUCT.log
Logging to output/16.04-x86/4.4->4.8.FUNC.log
Logging to output/16.04-x86/4.4->4.8.ENUM.log
Logging to output/16.04-x86/4.4->4.8.UNION.log
Diffing 4.8 and 4.10
Logging to output/16.04-x86/4.8->4.10.STRUCT.log
Logging to output/16.04-x86/4.8->4.10.FUNC.log
Logging to output/16.04-x86/4.8->4.10.ENUM.log
Logging to output/16.04-x86/4.8->4.10.UNION.log
Diffing 4.10 and 4.13
Logging to output/16.04-x86/4.10->4.13.STRUCT.log
Logging to output/16.04-x86/4.10->4.13.FUNC.log
Logging to output/16.04-x86/4.10->4.13.ENUM.log
Logging to output/16.04-x86/4.10->4.13.UNION.log
Diffing 4.13 and 4.15
Logging to output/16.04-x86/4.13->4.15.STRUCT.log
Logging to output/16.04-x86/4.13->4.15.FUNC.log
Logging to output/16.04-x86/4.13->4.15.ENUM.log
Logging to output/16.04-x86/4.13->4.15.UNION.log
Diffing 4.4 and 4.15
Logging to output/16.04-x86/4.4->4.15.STRUCT.log
Logging to output/16.04-x86/4.4->4.15.FUNC.log
Logging

Unnamed: 0,struct-common,struct-added,struct-removed,struct-changed,struct-reason-Added fields,struct-reason-Field type changed,struct-reason-Removed fields,struct-reason-Layout changed,struct-reason-Fields reordered,func-common,...,union-removed,union-changed,union-reason-Added fields,union-reason-Removed fields,union-reason-Field type changed,name,func-reason-Params reordered,union-reason-Layout changed,distro,union-reason-Fields reordered
0,6072,410,138,828,404,165,286,214,35,35475,...,1,2,2,1.0,1.0,4.4->4.8,,,16.04-x86,
1,6432,228,50,640,247,112,113,279,9,37538,...,1,4,3,2.0,,4.8->4.10,,,16.04-x86,
2,6582,384,78,852,381,218,199,253,22,38096,...,1,9,6,2.0,1.0,4.10->4.13,5.0,2.0,16.04-x86,
3,6881,374,85,721,339,145,164,234,13,40100,...,0,5,2,2.0,1.0,4.13->4.15,1.0,2.0,16.04-x86,
4,5958,1297,252,1391,780,428,459,271,50,34361,...,2,11,7,3.0,1.0,4.4->4.15,3.0,2.0,16.04-x86,
5,7193,426,136,785,354,189,261,189,27,40918,...,3,4,3,1.0,1.0,4.15->4.18,3.0,,18.04-x86,
6,7507,347,112,813,437,184,208,191,35,43288,...,2,3,2,1.0,1.0,4.18->5.0,2.0,1.0,18.04-x86,
7,7762,432,92,655,352,112,183,168,24,44821,...,2,5,4,1.0,2.0,5.0->5.3,,,18.04-x86,
8,8160,279,34,369,193,62,88,100,10,46561,...,2,1,1,,,5.3->5.4,,,18.04-x86,
9,7057,1382,272,1293,779,361,451,214,69,39572,...,6,6,5,1.0,2.0,4.15->5.4,2.0,,18.04-x86,


In [30]:
# Transpose the summary: one row per metric, one column per (distro, name) pair.
result.set_index(['distro', 'name']).T.reset_index()

distro,index,18.04-x86,18.04-x86,18.04-x86,18.04-x86,18.04-x86,20.04-x86,20.04-x86,20.04-x86,20.04-x86,20.04-x86,16.04-x86,16.04-x86,16.04-x86,16.04-x86,16.04-x86
name,Unnamed: 1_level_1,4.15->4.18,4.18->5.0,5.0->5.3,5.3->5.4,4.15->5.4,5.4->5.8,5.8->5.11,5.11->5.13,5.13->5.15,5.4->5.15,4.4->4.8,4.8->4.10,4.10->4.13,4.13->4.15,4.4->4.15
0,struct-common,7193.0,7507.0,7762.0,8160.0,7057.0,8248.0,8566.0,8983.0,9099.0,7944.0,6072.0,6432.0,6582.0,6881.0,5958.0
1,struct-added,426.0,347.0,432.0,279.0,1382.0,474.0,513.0,233.0,283.0,1438.0,410.0,228.0,384.0,374.0,1297.0
2,struct-removed,136.0,112.0,92.0,34.0,272.0,191.0,156.0,96.0,117.0,495.0,138.0,50.0,78.0,85.0,252.0
3,struct-changed,785.0,813.0,655.0,369.0,1293.0,896.0,771.0,476.0,656.0,1569.0,828.0,640.0,852.0,721.0,1391.0
4,struct-reason-Field type changed,189.0,184.0,112.0,62.0,361.0,190.0,164.0,111.0,140.0,429.0,165.0,112.0,218.0,145.0,428.0
5,struct-reason-Added fields,354.0,437.0,352.0,193.0,779.0,464.0,439.0,272.0,365.0,983.0,404.0,247.0,381.0,339.0,780.0
6,struct-reason-Removed fields,261.0,208.0,183.0,88.0,451.0,257.0,232.0,154.0,163.0,554.0,286.0,113.0,199.0,164.0,459.0
7,struct-reason-Fields reordered,27.0,35.0,24.0,10.0,69.0,33.0,26.0,23.0,20.0,75.0,35.0,9.0,22.0,13.0,50.0
8,struct-reason-Layout changed,189.0,191.0,168.0,100.0,214.0,226.0,166.0,73.0,162.0,244.0,214.0,279.0,253.0,234.0,271.0
9,func-common,40918.0,43288.0,44821.0,46561.0,39572.0,46310.0,48669.0,50994.0,51214.0,44189.0,35475.0,37538.0,38096.0,40100.0,34361.0
