In [167]:
%reload_ext autoreload
%autoreload now

from btf_utils import Kind

In [168]:
%%html

<style>
th {
    text-align: left;
}
</style>

In [169]:
from pathlib import Path
import json


class BTF:
    """In-memory view of a BTF dump stored as a JSON Lines file.

    Every non-blank line of the file is parsed as one JSON object ("entry").
    The code below relies on each entry having a ``"kind"`` key; a ``"name"``
    key is optional (some kinds in the dumps carry no name at all).

    Attributes:
        path: ``pathlib.Path`` of the loaded file.
        data: list of parsed entries, in file order.
        data_by_kind: lazily filled cache ``{kind: {name: entry}}`` used by
            :meth:`filter`.
    """

    def __init__(self, path):
        """Load the ``.jsonl`` file at ``path`` (a ``str`` or ``Path``)."""
        self.path = Path(path)
        self.data = self.read_jsonl(self.path)
        self.data_by_kind = {}

    @staticmethod
    def read_jsonl(jsonl_path):
        """Parse ``jsonl_path`` (a ``Path``) and return the list of entries.

        Blank lines are skipped so that a stray empty line (for example at the
        end of the file) does not abort the load with a JSON decode error.
        """
        assert jsonl_path.exists()
        assert jsonl_path.suffix == ".jsonl"

        with open(jsonl_path) as f:
            return [json.loads(line) for line in f if line.strip()]

    @property
    def short_name(self):
        """Version prefix of the file name with the trailing ``.0`` removed.

        For ``5.13.0-52-generic.jsonl`` this is ``"5.13"``. The text before the
        first ``-`` must end in ``.0``; anything else trips the assertion.
        """
        linux_version = self.path.name.split("-")[0]
        assert linux_version.endswith(".0")
        return linux_version[:-2]

    def print(self):
        """Print the file path, the first entry seen of each kind, and per-kind counts."""
        from collections import defaultdict

        print(f"File: {self.path}")

        kinds = defaultdict(int)
        print("Sample:")
        for e in self.data:
            # Only the first entry of each kind is shown as a sample.
            if e["kind"] not in kinds:
                print(f"\t{e['kind']:10}: {e}")
            kinds[e["kind"]] += 1

        # Most frequent kinds first.
        kinds = sorted(kinds.items(), key=lambda x: x[1], reverse=True)
        print(f"Kinds: {dict(kinds)}")

        print()

    def filter(self, kind, name_filter=None):
        """Return ``{name: entry}`` for the named entries of ``kind``.

        Entries without a ``"name"`` key, or named ``"(anon)"``, are left out.
        If several entries of the same kind share a name, the last one in file
        order wins.

        Args:
            kind: value compared against each entry's ``"kind"``.
            name_filter: optional predicate on the name; when given, only the
                names for which it returns a truthy value are kept.

        Returns:
            The cached per-kind dict itself when ``name_filter`` is falsy
            (callers should not mutate it), otherwise a new filtered dict.
        """
        if kind not in self.data_by_kind:
            self.data_by_kind[kind] = {
                e["name"]: e
                for e in self.data
                # .get(): some kinds have no "name" key, which used to raise
                # KeyError here; treat them like anonymous entries instead.
                if e["kind"] == kind and e.get("name", "(anon)") != "(anon)"
            }
        if not name_filter:
            return self.data_by_kind[kind]
        else:
            return {k: v for k, v in self.data_by_kind[kind].items() if name_filter(k)}

    def get(self, kind, name):
        """Return the entry of ``kind`` called ``name``; raises ``KeyError`` if absent."""
        return self.filter(kind)[name]


# Load the 5.13 dump and display one named ENUM entry.
btf1 = BTF("data/20.04-x86/5.13.0-52-generic.jsonl")


# Other lookups that can be tried the same way:
# btf1.get(Kind.UNION, "intel_x86_pebs_dse")
# btf1.get(Kind.STRUCT, "task_struct")
# btf1.get(Kind.FUNC, "vfs_read")
btf1.get(Kind.ENUM, "nf_ip_hook_priorities")

{'kind': 'ENUM',
 'name': 'nf_ip_hook_priorities',
 'size': 4,
 'values': [{'name': 'NF_IP_PRI_FIRST', 'val': -2147483648},
  {'name': 'NF_IP_PRI_RAW_BEFORE_DEFRAG', 'val': -450},
  {'name': 'NF_IP_PRI_CONNTRACK_DEFRAG', 'val': -400},
  {'name': 'NF_IP_PRI_RAW', 'val': -300},
  {'name': 'NF_IP_PRI_SELINUX_FIRST', 'val': -225},
  {'name': 'NF_IP_PRI_CONNTRACK', 'val': -200},
  {'name': 'NF_IP_PRI_MANGLE', 'val': -150},
  {'name': 'NF_IP_PRI_NAT_DST', 'val': -100},
  {'name': 'NF_IP_PRI_FILTER', 'val': 0},
  {'name': 'NF_IP_PRI_SECURITY', 'val': 50},
  {'name': 'NF_IP_PRI_NAT_SRC', 'val': 100},
  {'name': 'NF_IP_PRI_SELINUX_LAST', 'val': 225},
  {'name': 'NF_IP_PRI_CONNTRACK_HELPER', 'val': 300},
  {'name': 'NF_IP_PRI_CONNTRACK_CONFIRM', 'val': 2147483647},
  {'name': 'NF_IP_PRI_LAST', 'val': 2147483647}]}

In [170]:
# Load the 5.15 dump and print its path, one sample entry per kind, and per-kind counts.
btf2 = BTF("data/20.04-x86/5.15.0-92-generic.jsonl")
btf2.print()

File: data/20.04-x86/5.15.0-92-generic.jsonl
Sample:
	INT       : {'kind': 'INT', 'name': 'long unsigned int'}
	CONST     : {'kind': 'CONST', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	VOLATILE  : {'kind': 'VOLATILE', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	ARRAY     : {'kind': 'ARRAY', 'name': '(anon)', 'nr_elems': 2, 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	PTR       : {'kind': 'PTR', 'type': {'kind': 'CONST', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'char'}}}
	TYPEDEF   : {'kind': 'TYPEDEF', 'name': '__s8', 'type': {'kind': 'INT', 'name': 'signed char'}}
	ENUM      : {'kind': 'ENUM', 'name': '(anon)', 'size': 4, 'values': [{'name': 'false', 'val': 0}, {'name': 'true', 'val': 1}]}
	FUNC_PROTO: {'kind': 'FUNC_PROTO', 'params': [{'name': '(anon)', 'type': {'kind': 'INT', 'name': 'int'}}], 'ret_type': {'name': 'void', 'kind': 'VOID'}}
	STRUCT    : {'kind': 'STRUCT', 'name': '(anon)', 'size': 4, 'member

In [171]:
from dataclasses import dataclass


@dataclass(frozen=True)
class DiffResult:
    """Immutable result of comparing the entries of one kind between two dumps.

    The first three fields hold the dicts produced by ``diff_dict`` (they are
    mappings keyed by entry name, not sets), so ``len()`` and iteration work
    on the names.
    """

    # name -> entry, present only in the new dump.
    added: dict[str, dict]
    # name -> entry, present only in the old dump.
    removed: dict[str, dict]
    # name -> (old entry, new entry), present in both dumps.
    common: dict[str, tuple]
    # name -> report lines for entries present in both dumps but not equal.
    changed: dict[str, list[str]]
    # reason label -> number of changed entries that reported it.
    reasons: dict[str, int]


def print_as_list(name, s):
    """Print ``name``, the number of items in iterable ``s``, and the items as a list."""
    items = [*s]
    count = len(items)
    print(f"{name} ({count}): {items}")


def diff_dict(old, new):
    """Split the keys of two dicts into added, removed and shared groups.

    Returns:
        A tuple ``(added, removed, common)`` where
        ``added`` maps keys found only in ``new`` to their value (in ``new`` order),
        ``removed`` maps keys found only in ``old`` to their value (in ``old`` order),
        ``common`` maps keys found in both to ``(old value, new value)`` (in ``old`` order).
    """
    added, removed, common = {}, {}, {}

    for key, new_value in new.items():
        if key not in old:
            added[key] = new_value

    for key, old_value in old.items():
        if key in new:
            common[key] = (old_value, new[key])
        else:
            removed[key] = old_value

    return added, removed, common


def check_diff_impl(dict1, dict2, kind, diff_fn, all_reasons):
    """Compare two ``{name: entry}`` dicts, print a report, and return a DiffResult.

    Args:
        dict1: old entries keyed by name.
        dict2: new entries keyed by name.
        kind: label interpolated into the printed headings.
        diff_fn: called as ``diff_fn(old, new)`` for every shared name whose
            entries are not equal; must return a report string whose headline
            lines do not start with a tab and whose detail lines do.
        all_reasons: iterable of every headline ``diff_fn`` may emit; each is
            stripped and used as a counter key, so unused reasons show as 0.

    Returns:
        ``DiffResult(added, removed, common, changed, reasons)``.

    Raises:
        KeyError: if ``diff_fn`` emits a headline not listed in ``all_reasons``.
    """
    added, removed, common = diff_dict(dict1, dict2)
    print_as_list(f"Added {kind}", added)
    print_as_list(f"Removed {kind}", removed)

    # Report lines for every shared entry that differs between the two dumps.
    changed = {}
    for name, (old, new) in common.items():
        if old != new:
            changed[name] = diff_fn(old, new).strip().split("\n")
    print_as_list(f"Changed {kind}", changed.keys())

    # Count headlines only; tab-indented lines are per-field details.
    reasons = dict.fromkeys((r.strip() for r in all_reasons), 0)
    for lines in changed.values():
        for line in lines:
            if line.startswith("\t"):
                continue
            reasons[line] += 1
    print_as_list(f"Reasons {kind}", reasons.items())

    # Full per-entry report, indented one level under the entry name.
    for name, lines in changed.items():
        print(name)
        for line in lines:
            print(f"\t{line}")

    return DiffResult(added, removed, common, changed, reasons)

In [172]:
from enum import Enum


class StructChange(str, Enum):
    """Headline labels that diff_struct writes for each category of member change."""

    # Every value ends with a newline so it can be appended directly to the
    # report string; the stripped value is what gets counted as a reason.
    ADD = "Field added\n"
    REMOVE = "Field removed\n"
    TYPE = "Field type changed\n"
    LAYOUT = "Layout changed\n"


def diff_struct(old, new):
    """Describe how the members of ``new`` differ from those of ``old``.

    Both arguments are entries with a ``"members"`` list (each member having
    ``"name"``, ``"type"`` and ``"bits_offset"``) and a ``"size"``. Members are
    matched by name, so members sharing a name (for example several
    ``"(anon)"`` ones) collapse to the last one listed.

    Returns:
        A report string: one ``StructChange`` headline per detected category,
        followed by tab-indented detail lines for added, removed and retyped
        members (layout changes have a headline only).

    Raises:
        AssertionError: if none of the categories applies.
    """
    old_members = {member["name"]: member for member in old["members"]}
    new_members = {member["name"]: member for member in new["members"]}
    added, removed, common = diff_dict(old_members, new_members)

    chunks = []

    # Members that exist only in the new definition.
    if added:
        chunks.append(StructChange.ADD.value)
        for name, member in added.items():
            chunks.append(f"\t{name:20}: {member['type']}\n")

    # Members that exist only in the old definition.
    if removed:
        chunks.append(StructChange.REMOVE.value)
        for name, member in removed.items():
            chunks.append(f"\t{name:20}: {member['type']}\n")

    # Members kept under the same name but with a different type.
    retyped = {}
    for name, (before, after) in common.items():
        if before["type"] != after["type"]:
            retyped[name] = (before["type"], after["type"])
    if retyped:
        chunks.append(StructChange.TYPE.value)
        for name, (before_type, after_type) in retyped.items():
            chunks.append(f"\t{name:20}: {before_type}\n")
            chunks.append(f"\t{'':20}->{after_type}\n")

    # Layout: any difference in the name -> bit offset mapping, or in total size.
    old_offsets = {name: member["bits_offset"] for name, member in old_members.items()}
    new_offsets = {name: member["bits_offset"] for name, member in new_members.items()}
    layout_unchanged = old_offsets == new_offsets and old["size"] == new["size"]
    if not layout_unchanged:
        chunks.append(StructChange.LAYOUT.value)

    result = "".join(chunks)
    assert result, f"\n{old}\n{new}"
    return result


# check_diff_impl(
#     btf1.filter(Kind.STRUCT),
#     btf2.filter(Kind.STRUCT),
#     Kind.STRUCT,
#     diff_struct,
#     StructChange,
# )

In [173]:
# check_diff_impl(
#     btf1.filter(Kind.UNION),
#     btf2.filter(Kind.UNION),
#     Kind.UNION,
#     diff_struct,
#     StructChange,
# )

In [174]:
from enum import Enum


class FuncChange(str, Enum):
    """Headline labels that diff_func writes for each category of signature change."""

    # Every value ends with a newline so it can be appended directly to the
    # report string; the stripped value is what gets counted as a reason.
    ADD = "Param added\n"
    REMOVE = "Param removed\n"
    TYPE = "Param type changed\n"
    REORDER = "Param reordered\n"
    RETURN = "Return type changed\n"


def diff_func(old, new):
    """Describe how the signature of ``new`` differs from that of ``old``.

    Both arguments are entries whose ``"type"`` holds a ``"params"`` list
    (each parameter having ``"name"`` and ``"type"``) and a ``"ret_type"``.
    Parameters are matched by name, so parameters sharing a name (for example
    several ``"(anon)"`` ones) collapse to the last one listed.

    Returns:
        A report string: one ``FuncChange`` headline per detected category,
        each followed by tab-indented detail lines.

    Raises:
        AssertionError: if none of the categories applies.
    """
    old_params = {param["name"]: param for param in old["type"]["params"]}
    new_params = {param["name"]: param for param in new["type"]["params"]}
    added, removed, common = diff_dict(old_params, new_params)

    chunks = []

    # Parameters that exist only in the new signature.
    if added:
        chunks.append(FuncChange.ADD.value)
        for name, param in added.items():
            chunks.append(f"\t{name:20}: {param['type']}\n")

    # Parameters that exist only in the old signature.
    if removed:
        chunks.append(FuncChange.REMOVE.value)
        for name, param in removed.items():
            chunks.append(f"\t{name:20}: {param['type']}\n")

    # Shared parameters whose position in the parameter list moved. Positions
    # are absolute, so an insertion or removal before a shared parameter also
    # counts as a move.
    old_positions = {name: pos for pos, name in enumerate(old_params) if name in common}
    new_positions = {name: pos for pos, name in enumerate(new_params) if name in common}
    if old_positions != new_positions:
        chunks.append(FuncChange.REORDER.value)
        chunks.append(f"\t{'':20} {list(old_params)}\n")
        chunks.append(f"\t{'':20} {list(new_params)}\n")

    # Shared parameters whose type differs.
    retyped = {}
    for name, (before, after) in common.items():
        if before["type"] != after["type"]:
            retyped[name] = (before["type"], after["type"])
    if retyped:
        chunks.append(FuncChange.TYPE.value)
        for name, (before_type, after_type) in retyped.items():
            chunks.append(f"\t{name:20}: {before_type}\n")
            chunks.append(f"\t{'':20}->{after_type}\n")

    # Return type.
    old_ret = old["type"]["ret_type"]
    new_ret = new["type"]["ret_type"]
    if old_ret != new_ret:
        chunks.append(FuncChange.RETURN.value)
        chunks.append(f"\t{'':20}: {old_ret}\n")
        chunks.append(f"\t{'':20}->{new_ret}\n")

    result = "".join(chunks)
    assert result, f"\n{old}\n{new}"
    return result


# check_diff_impl(
#     btf1.filter(Kind.FUNC),
#     btf2.filter(Kind.FUNC),
#     Kind.FUNC,
#     diff_func,
#     FuncChange,
# )

In [175]:
from enum import Enum


class EnumChange(str, Enum):
    """Headline labels that diff_enum writes for each category of enumerator change."""

    # Every value ends with a newline so it can be appended directly to the
    # report string; the stripped value is what gets counted as a reason.
    ADD = "Elem added\n"
    REMOVE = "Elem removed\n"
    VALUE = "Value changed\n"


def diff_enum(old, new):
    """Describe how the enumerators of ``new`` differ from those of ``old``.

    Both arguments are entries with a ``"values"`` list whose items have
    ``"name"`` and ``"val"``. Enumerators are matched by name.

    Returns:
        A report string: one ``EnumChange`` headline per detected category,
        each followed by tab-indented detail lines.

    Raises:
        AssertionError: if none of the categories applies.
    """
    old_values = {item["name"]: item for item in old["values"]}
    new_values = {item["name"]: item for item in new["values"]}
    added, removed, common = diff_dict(old_values, new_values)

    chunks = []

    # Enumerators that exist only in the new definition.
    if added:
        chunks.append(EnumChange.ADD.value)
        for name, item in added.items():
            chunks.append(f"\t{name:40}: {item['val']}\n")

    # Enumerators that exist only in the old definition.
    if removed:
        chunks.append(EnumChange.REMOVE.value)
        for name, item in removed.items():
            chunks.append(f"\t{name:40}: {item['val']}\n")

    # Enumerators kept under the same name but with a different numeric value.
    renumbered = {}
    for name, (before, after) in common.items():
        if before["val"] != after["val"]:
            renumbered[name] = (before["val"], after["val"])
    if renumbered:
        chunks.append(EnumChange.VALUE.value)
        for name, (old_val, new_val) in renumbered.items():
            chunks.append(f"\t{name:40}: {old_val} -> {new_val}\n")

    result = "".join(chunks)
    assert result, f"\n{old}\n{new}"
    return result


# check_diff_impl(
#     btf1.filter(Kind.ENUM),
#     btf2.filter(Kind.ENUM),
#     Kind.ENUM,
#     diff_enum,
#     EnumChange,
# )

In [176]:
import sys


class FileLogger:
    """Context manager that sends everything written to ``sys.stdout`` into a file.

    The target file (and its parent directories) is created when the object is
    constructed; redirection is active only inside the ``with`` block. The
    object exposes ``write`` and ``flush`` so it can stand in for ``sys.stdout``
    when ``print(..., flush=True)`` or ``sys.stdout.flush()`` is called.

    Args:
        path: ``pathlib.Path`` of the log file; it is opened in ``"w"`` mode,
            so an existing file is truncated.
    """

    def __init__(self, path):
        print(f"Logging to {path}")

        self.stdout = sys.stdout
        path.parent.mkdir(parents=True, exist_ok=True)
        self.log = open(path, "w")

    def write(self, message):
        """Write ``message`` to the log file and return the number of characters written."""
        return self.log.write(message)

    def flush(self):
        """Flush the log file; a no-op once the file has been closed."""
        if not self.log.closed:
            self.log.flush()

    def __enter__(self):
        # Remember whatever stdout is right now, so that the stream active at
        # entry (not at construction) is the one restored on exit.
        self.stdout = sys.stdout
        sys.stdout = self
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore first so that stdout is usable even if close() fails.
        sys.stdout = self.stdout
        self.log.close()

In [177]:
def check_diff(dict1, dict2, kind, output_path):
    """Diff two ``{name: entry}`` dicts of ``kind`` and log the report to ``output_path``.

    The comparison function and the set of reason labels are chosen from
    ``kind`` (unions are compared with the struct logic). Everything printed
    during the comparison goes to the log file instead of the notebook.

    Returns:
        The ``DiffResult`` produced by ``check_diff_impl``.

    Raises:
        KeyError: if ``kind`` is not STRUCT, UNION, FUNC or ENUM.
    """
    struct_like = (diff_struct, StructChange)
    handlers = {
        Kind.STRUCT: struct_like,
        Kind.UNION: struct_like,
        Kind.FUNC: (diff_func, FuncChange),
        Kind.ENUM: (diff_enum, EnumChange),
    }
    diff_fn, all_reasons = handlers[kind]

    with FileLogger(output_path):
        return check_diff_impl(dict1, dict2, kind, diff_fn, all_reasons)

In [178]:
def get_lsm(btf):
    """Return the set of ``security_<member>`` names built from ``security_hook_heads``.

    Looks up the STRUCT named ``security_hook_heads`` in ``btf`` and prefixes
    each of its member names with ``security_``.

    Raises:
        KeyError: if the dump has no ``security_hook_heads`` struct.
    """
    hook_heads = btf.get(Kind.STRUCT, "security_hook_heads")
    hook_names = set()
    for member in hook_heads["members"]:
        hook_names.add(f"security_{member['name']}")
    return hook_names


# get_lsm(btf1)

In [179]:
import pandas as pd


def diff_btf(btf1, btf2, name):
    """Diff two BTF dumps category by category and return the counts as a DataFrame.

    For each category below, the matching entries of both dumps are compared
    with ``check_diff``; the detailed report goes to
    ``output/<dump directory name>/<name>-<category>.log`` and the counts are
    collected into a single-column table, which is also written to
    ``output/<dump directory name>/<name>.txt``.

    Args:
        btf1: the old dump.
        btf2: the new dump; must live in the same directory as ``btf1``.
        name: label for this comparison; used as the column name and as the
            prefix of the output file names.

    Returns:
        A DataFrame with one column called ``name`` and a two-level row index
        ``(category, metric)``; metrics are Old, New, Common, Added, Removed,
        Changed and one ``- <reason>`` row per reason label.

    Raises:
        AssertionError: if the two dumps come from different directories.
    """
    print(f"Diffing {name}")

    # The output directory is derived from btf1 only, so both dumps have to
    # come from the same directory for the result to be filed correctly.
    assert btf1.path.parent == btf2.path.parent, (
        f"dumps come from different directories: {btf1.path.parent} vs {btf2.path.parent}"
    )
    path = Path("output") / btf1.path.parent.name

    # A hook present in either dump is considered, so additions and removals show up.
    lsm_hooks = get_lsm(btf1) | get_lsm(btf2)

    results = {}
    for key, kind, name_filter in [
        ("trace_event", Kind.STRUCT, lambda k: k.startswith("trace_event_raw_")),
        ("raw_tp", Kind.FUNC, lambda k: k.startswith("perf_trace_")),
        ("lsm", Kind.FUNC, lambda k: k in lsm_hooks),
        ("func", Kind.FUNC, None),
        ("struct", Kind.STRUCT, None),
        ("union", Kind.UNION, None),
        ("enum", Kind.ENUM, None),
    ]:
        dict1 = btf1.filter(kind, name_filter)
        dict2 = btf2.filter(kind, name_filter)

        output_path = path / f"{name}-{key}.log"
        diff_result = check_diff(dict1, dict2, kind, output_path)
        results[(key, "Old")] = len(dict1)
        results[(key, "New")] = len(dict2)
        results[(key, "Common")] = len(diff_result.common)
        results[(key, "Added")] = len(diff_result.added)
        results[(key, "Removed")] = len(diff_result.removed)
        results[(key, "Changed")] = len(diff_result.changed)
        for k, v in diff_result.reasons.items():
            results[(key, f"- {k}")] = v

    df = pd.DataFrame(
        results.values(),
        index=pd.MultiIndex.from_tuples(results.keys()),
        columns=[name],
    )
    # The output directory already exists here: the log files above were
    # written into it.
    df.to_string(path / f"{name}.txt")
    return df


# Compare one pair of dumps from the 18.04-x86 directory; the returned table
# is the cell's displayed value.
diff_btf(
    BTF("data/18.04-x86/4.15.0-213-generic.jsonl"),
    BTF("data/18.04-x86/4.18.0-25-generic.jsonl"),
    "4.15->4.18",
)

Diffing 4.15->4.18
Logging to output/18.04-x86/4.15->4.18-trace_event.log
Logging to output/18.04-x86/4.15->4.18-raw_tp.log
Logging to output/18.04-x86/4.15->4.18-lsm.log
Logging to output/18.04-x86/4.15->4.18-func.log
Logging to output/18.04-x86/4.15->4.18-struct.log
Logging to output/18.04-x86/4.15->4.18-union.log
Logging to output/18.04-x86/4.15->4.18-enum.log


Unnamed: 0,Unnamed: 1,4.15->4.18
trace_event,Old,417
trace_event,New,433
trace_event,Common,405
trace_event,Added,28
trace_event,Removed,12
...,...,...
enum,Removed,25
enum,Changed,66
enum,- Elem added,57
enum,- Elem removed,23


In [180]:
def get_json_paths(path):
    """Return the ``*.jsonl`` files directly inside ``path``, ordered by version.

    The version is the dot-separated number before the first ``-`` of the file
    name (``4.15.0-213-generic.jsonl`` sorts as ``(4, 15, 0)``), compared
    numerically so that 4.10 comes after 4.8.

    Raises:
        ValueError: if that part of a file name is not made of integers.
    """

    def version_key(file):
        release = file.stem.split("-")[0]
        return tuple(int(part) for part in release.split("."))

    candidates = list(path.glob("*.jsonl"))
    candidates.sort(key=version_key)
    return candidates


# Show the version-ordered dump files found for 18.04-x86.
get_json_paths(Path("data/18.04-x86"))

[PosixPath('data/18.04-x86/4.15.0-213-generic.jsonl'),
 PosixPath('data/18.04-x86/4.18.0-25-generic.jsonl'),
 PosixPath('data/18.04-x86/5.0.0-65-generic.jsonl'),
 PosixPath('data/18.04-x86/5.3.0-76-generic.jsonl'),
 PosixPath('data/18.04-x86/5.4.0-91-generic.jsonl')]

In [181]:
def diff_all_btf_files(path):
    """Diff every consecutive pair of dumps in ``path``, plus first versus last.

    The dumps are taken in the order given by ``get_json_paths``. Each
    consecutive pair is compared under the label ``<old short name>-><new
    short name>``; a final comparison of the first and last dump is labelled
    ``Total``. The per-comparison tables are joined column-wise, written to
    ``output/<directory name>.txt`` and returned.

    Raises:
        IndexError: if ``path`` contains no ``.jsonl`` file.
    """
    json_paths = get_json_paths(path)

    frames = []
    # Step-by-step comparisons between neighbouring versions.
    for older_path, newer_path in zip(json_paths, json_paths[1:]):
        older = BTF(older_path)
        newer = BTF(newer_path)
        label = f"{older.short_name}->{newer.short_name}"
        frames.append(diff_btf(older, newer, label))

    # Overall comparison across the whole range.
    first = BTF(json_paths[0])
    last = BTF(json_paths[-1])
    frames.append(diff_btf(first, last, "Total"))

    combined = pd.concat(frames, axis=1)
    combined.to_string(Path("output") / f"{path.name}.txt")
    return combined


# Run every comparison for the 18.04-x86 dumps and display the combined table.
diff_all_btf_files(Path("data/18.04-x86"))

Diffing 4.15->4.18
Logging to output/18.04-x86/4.15->4.18-trace_event.log
Logging to output/18.04-x86/4.15->4.18-raw_tp.log
Logging to output/18.04-x86/4.15->4.18-lsm.log
Logging to output/18.04-x86/4.15->4.18-func.log
Logging to output/18.04-x86/4.15->4.18-struct.log
Logging to output/18.04-x86/4.15->4.18-union.log
Logging to output/18.04-x86/4.15->4.18-enum.log
Diffing 4.18->5.0
Logging to output/18.04-x86/4.18->5.0-trace_event.log
Logging to output/18.04-x86/4.18->5.0-raw_tp.log
Logging to output/18.04-x86/4.18->5.0-lsm.log
Logging to output/18.04-x86/4.18->5.0-func.log
Logging to output/18.04-x86/4.18->5.0-struct.log
Logging to output/18.04-x86/4.18->5.0-union.log
Logging to output/18.04-x86/4.18->5.0-enum.log
Diffing 5.0->5.3
Logging to output/18.04-x86/5.0->5.3-trace_event.log
Logging to output/18.04-x86/5.0->5.3-raw_tp.log
Logging to output/18.04-x86/5.0->5.3-lsm.log
Logging to output/18.04-x86/5.0->5.3-func.log
Logging to output/18.04-x86/5.0->5.3-struct.log
Logging to output/1

Unnamed: 0,Unnamed: 1,4.15->4.18,4.18->5.0,5.0->5.3,5.3->5.4,Total
trace_event,Old,417,433,445,468,417
trace_event,New,433,445,468,477,477
trace_event,Common,405,430,444,468,402
trace_event,Added,28,15,24,9,75
trace_event,Removed,12,3,1,0,15
...,...,...,...,...,...,...
enum,Removed,25,11,18,7,35
enum,Changed,66,64,72,33,130
enum,- Elem added,57,57,65,31,121
enum,- Elem removed,23,13,14,2,34


In [182]:
# Run the full comparison for every sub-directory of data/ (in sorted order)
# and keep each combined table under its directory name.
result = {}
for path in sorted(Path("data").glob("*")):
    if path.is_dir():
        print(f"Diffing {path}")
        df = diff_all_btf_files(path)
        result[path.name] = df

Diffing data/16.04-x86
Diffing 4.4->4.8
Logging to output/16.04-x86/4.4->4.8-trace_event.log
Logging to output/16.04-x86/4.4->4.8-raw_tp.log
Logging to output/16.04-x86/4.4->4.8-lsm.log
Logging to output/16.04-x86/4.4->4.8-func.log
Logging to output/16.04-x86/4.4->4.8-struct.log
Logging to output/16.04-x86/4.4->4.8-union.log
Logging to output/16.04-x86/4.4->4.8-enum.log
Diffing 4.8->4.10
Logging to output/16.04-x86/4.8->4.10-trace_event.log
Logging to output/16.04-x86/4.8->4.10-raw_tp.log
Logging to output/16.04-x86/4.8->4.10-lsm.log
Logging to output/16.04-x86/4.8->4.10-func.log
Logging to output/16.04-x86/4.8->4.10-struct.log
Logging to output/16.04-x86/4.8->4.10-union.log
Logging to output/16.04-x86/4.8->4.10-enum.log
Diffing 4.10->4.13
Logging to output/16.04-x86/4.10->4.13-trace_event.log
Logging to output/16.04-x86/4.10->4.13-raw_tp.log
Logging to output/16.04-x86/4.10->4.13-lsm.log
Logging to output/16.04-x86/4.10->4.13-func.log
Logging to output/16.04-x86/4.10->4.13-struct.log


In [183]:
# Place the per-directory tables side by side; the dict keys (directory
# names) become the outer column level.
df = pd.concat(result, axis=1)

# Drop the rows in which every cell equals 0.
df = df.loc[(df != 0).any(axis=1)]

# Persist the combined table so later cells can start from the file.
df.to_pickle("output/diff.pkl")

In [187]:
# Reload the table saved by the previous cell. Only unpickle files produced
# by this notebook: unpickling untrusted data can execute arbitrary code.
df = pd.read_pickle("output/diff.pkl")

# Cells are about to be replaced by formatted strings, so use object dtype.
df = df.astype(object)

# only keep the 20.04-x86 column
kinds = ["struct", "func", "trace_event", "lsm"]
distros = ["20.04-x86"]
df = df.loc[kinds, distros]

# Rewrite every count as "<count> (<percent of the Old count>\%)". The "Old"
# row itself is left numeric because it is the denominator for its column.
for kind in kinds:
    for row in df.loc[kind, :].index:
        if row == "Old":
            continue
        for col in df.columns:
            val = df.loc[(kind, row), col]
            percentage = val / df.loc[(kind, "Old"), col] * 100
            # "\\%" yields a literal backslash + percent sign (an escaped
            # percent for LaTeX); a bare "\%" is an invalid escape sequence.
            df.loc[(kind, row), col] = f"{val} ({percentage:4.1f}\\%)"

# The totals were only needed as denominators / context; keep the change rows.
df = df.drop(index=["Old", "New", "Common"], level=1)

df.to_latex("output/diff.tex")

df

Unnamed: 0_level_0,Unnamed: 1_level_0,20.04-x86,20.04-x86,20.04-x86,20.04-x86,20.04-x86
Unnamed: 0_level_1,Unnamed: 1_level_1,5.4->5.8,5.8->5.11,5.11->5.13,5.13->5.15,Total
struct,Added,529 ( 6.3\%),513 ( 5.9\%),233 ( 2.6\%),283 ( 3.1\%),1426 (17.0\%)
struct,Removed,203 ( 2.4\%),156 ( 1.8\%),96 ( 1.1\%),117 ( 1.3\%),440 ( 5.2\%)
struct,Changed,973 (11.6\%),771 ( 8.8\%),476 ( 5.2\%),656 ( 7.1\%),1513 (18.0\%)
struct,- Field added,485 ( 5.8\%),439 ( 5.0\%),272 ( 3.0\%),365 ( 4.0\%),944 (11.2\%)
struct,- Field removed,315 ( 3.8\%),232 ( 2.7\%),154 ( 1.7\%),163 ( 1.8\%),533 ( 6.3\%)
struct,- Field type changed,219 ( 2.6\%),164 ( 1.9\%),111 ( 1.2\%),140 ( 1.5\%),400 ( 4.8\%)
struct,- Layout changed,897 (10.7\%),720 ( 8.3\%),438 ( 4.8\%),605 ( 6.6\%),1408 (16.8\%)
func,Added,3564 ( 7.4\%),3080 ( 6.2\%),1343 ( 2.6\%),2121 ( 4.1\%),8958 (18.6\%)
func,Removed,2000 ( 4.2\%),1076 ( 2.2\%),755 ( 1.5\%),1123 ( 2.1\%),3804 ( 7.9\%)
func,Changed,932 ( 1.9\%),1176 ( 2.4\%),549 ( 1.1\%),640 ( 1.2\%),2385 ( 5.0\%)
