In [40]:
%load_ext autoreload
%autoreload now

from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
from pathlib import Path
import json


class BTF:
    """In-memory view of one BTF dump stored as a JSON-lines file.

    Every non-blank line of the file is parsed as one JSON object and kept
    in ``self.data``. Entries are indexed lazily by kind and name through
    ``filter_on_kind``.
    """

    def __init__(self, path):
        """Load the ``.jsonl`` file at ``path`` (str or Path)."""
        self.path = Path(path)
        self.data = self.read_jsonl(self.path)
        # Cache: kind -> {name: entry}, filled on demand by filter_on_kind().
        self.data_by_kind = {}

    @staticmethod
    def read_jsonl(jsonl_path):
        """Parse a JSON-lines file into a list of decoded objects.

        Args:
            jsonl_path: str or Path of an existing file with a ``.jsonl`` suffix.

        Returns:
            A list with one decoded JSON value per non-blank line.

        Raises:
            AssertionError: if the file does not exist or has another suffix.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        jsonl_path = Path(jsonl_path)
        assert jsonl_path.exists(), f"{jsonl_path} does not exist"
        assert jsonl_path.suffix == ".jsonl", f"{jsonl_path} is not a .jsonl file"

        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(jsonl_path, encoding="utf-8") as f:
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            return [json.loads(line) for line in f if line.strip()]

    @property
    def short_name(self):
        """Leading version of the file name without its trailing ``.0``.

        For ``5.13.0-52-generic.jsonl`` this is ``"5.13"``. Asserts that the
        part of the file name before the first ``-`` ends with ``.0``.
        """
        linux_version = self.path.name.split("-")[0]
        assert linux_version.endswith(".0")
        return linux_version[:-2]

    def print(self):
        """Print the file path, the first entry of each kind, and kind counts."""
        from collections import defaultdict

        print(f"File: {self.path}")

        counts = defaultdict(int)
        print("Sample:")
        for e in self.data:
            # Only the first entry seen for a kind is shown as its sample.
            if e["kind"] not in counts:
                print(f"\t{e['kind']:10}: {e}")
            counts[e["kind"]] += 1

        # Most frequent kinds first.
        by_frequency = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        print(f"Kinds: {dict(by_frequency)}")

        print()

    def filter_on_kind(self, kind):
        """Return ``{name: entry}`` for all named entries of ``kind``.

        Entries named ``"(anon)"`` and entries that carry no ``"name"`` key
        at all are skipped because they cannot be looked up by name. When
        several entries share a name the last one wins. The result is cached
        per kind, and the cached dict itself is returned.
        """
        if kind not in self.data_by_kind:
            by_name = {}
            for e in self.data:
                if e["kind"] != kind:
                    continue
                name = e.get("name")
                if name is None or name == "(anon)":
                    continue
                by_name[name] = e
            self.data_by_kind[kind] = by_name
        return self.data_by_kind[kind]

    def get(self, kind, name):
        """Return the entry of ``kind`` called ``name``; KeyError if absent."""
        return self.filter_on_kind(kind)[name]


# Load one dump and look up a single entry by kind and name as a smoke test.
btf1 = BTF("data/20.04-x86/5.13.0-52-generic.jsonl")


# Other lookups to try:
# btf1.get(Kind.UNION, "intel_x86_pebs_dse")
# btf1.get(Kind.STRUCT, "task_struct")
# btf1.get(Kind.FUNC, "vfs_read")
btf1.get(Kind.ENUM, "nf_ip_hook_priorities")

{'kind': 'ENUM',
 'name': 'nf_ip_hook_priorities',
 'size': 4,
 'values': [{'name': 'NF_IP_PRI_FIRST', 'val': -2147483648},
  {'name': 'NF_IP_PRI_RAW_BEFORE_DEFRAG', 'val': -450},
  {'name': 'NF_IP_PRI_CONNTRACK_DEFRAG', 'val': -400},
  {'name': 'NF_IP_PRI_RAW', 'val': -300},
  {'name': 'NF_IP_PRI_SELINUX_FIRST', 'val': -225},
  {'name': 'NF_IP_PRI_CONNTRACK', 'val': -200},
  {'name': 'NF_IP_PRI_MANGLE', 'val': -150},
  {'name': 'NF_IP_PRI_NAT_DST', 'val': -100},
  {'name': 'NF_IP_PRI_FILTER', 'val': 0},
  {'name': 'NF_IP_PRI_SECURITY', 'val': 50},
  {'name': 'NF_IP_PRI_NAT_SRC', 'val': 100},
  {'name': 'NF_IP_PRI_SELINUX_LAST', 'val': 225},
  {'name': 'NF_IP_PRI_CONNTRACK_HELPER', 'val': 300},
  {'name': 'NF_IP_PRI_CONNTRACK_CONFIRM', 'val': 2147483647},
  {'name': 'NF_IP_PRI_LAST', 'val': 2147483647}]}

In [42]:
# Load a second dump and print its path, one sample entry per kind, and kind counts.
btf2 = BTF("data/20.04-x86/5.15.0-92-generic.jsonl")
btf2.print()

File: data/20.04-x86/5.15.0-92-generic.jsonl
Sample:
	INT       : {'kind': 'INT', 'name': 'long unsigned int'}
	CONST     : {'kind': 'CONST', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	VOLATILE  : {'kind': 'VOLATILE', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	ARRAY     : {'kind': 'ARRAY', 'name': '(anon)', 'nr_elems': 2, 'type': {'kind': 'INT', 'name': 'long unsigned int'}}
	PTR       : {'kind': 'PTR', 'type': {'kind': 'CONST', 'name': '(anon)', 'type': {'kind': 'INT', 'name': 'char'}}}
	TYPEDEF   : {'kind': 'TYPEDEF', 'name': '__s8', 'type': {'kind': 'INT', 'name': 'signed char'}}
	ENUM      : {'kind': 'ENUM', 'name': '(anon)', 'size': 4, 'values': [{'name': 'false', 'val': 0}, {'name': 'true', 'val': 1}]}
	FUNC_PROTO: {'kind': 'FUNC_PROTO', 'params': [{'name': '(anon)', 'type': {'kind': 'INT', 'name': 'int'}}], 'ret_type': {'name': 'void', 'kind': 'VOID'}}
	STRUCT    : {'kind': 'STRUCT', 'name': '(anon)', 'size': 4, 'member

In [43]:
from enum import Enum


class Reason(str, Enum):
    """Header lines used to label each category of change in a diff report.

    Every value ends with a newline so it can be concatenated directly into
    the report text built by the ``diff_*`` functions.
    """

    ADD = "Sub added\n"
    REMOVE = "Sub removed\n"
    REORDER = "Sub reordered\n"
    TYPE = "Sub type changed\n"
    RETURN = "Return type changed\n"
    LAYOUT = "Layout changed\n"
    VALUE = "Value changed\n"

    @staticmethod
    def for_kind(kind):
        """Return a new list of the reasons reported for ``kind``.

        Handles ``Kind.STRUCT``, ``Kind.UNION``, ``Kind.ENUM`` and
        ``Kind.FUNC``; any other kind yields ``None``.
        """
        # Every supported kind can gain or lose sub-elements.
        membership = [Reason.ADD, Reason.REMOVE]
        # Structs, unions and functions additionally track order and types.
        ordered = membership + [Reason.REORDER, Reason.TYPE]

        if kind in (Kind.STRUCT, Kind.UNION):
            return ordered + [Reason.LAYOUT]
        elif kind == Kind.ENUM:
            return membership + [Reason.VALUE]
        elif kind == Kind.FUNC:
            return ordered + [Reason.RETURN]
        return None

In [44]:
from dataclasses import dataclass


@dataclass(frozen=True)
class DiffResult:
    """Immutable outcome of comparing all entries of one kind in two dumps.

    The first three fields hold the dicts produced by ``diff_dict`` (not
    sets); ``len()`` of each gives the corresponding count.
    """

    # name -> entry that exists only in the new dump
    added: dict
    # name -> entry that exists only in the old dump
    removed: dict
    # name -> (old_entry, new_entry) for names present in both dumps
    common: dict
    # name -> list of report lines, for common entries that differ
    changed: dict
    # stripped reason header -> number of changed entries reporting it
    reasons: dict[str, int]


def print_as_list(name, s, num=10):
    """Print ``name``, the size of ``s`` and at most its first ``num`` items.

    ``s`` must support ``len()`` and iteration (e.g. a dict or a keys view).
    """
    total = len(s)
    preview = list(s)[:num]
    print(f"{name} ({total}): {preview}")


def diff_dict(old, new):
    """Split the keys of two dicts into added, removed and common.

    Returns:
        A tuple ``(added, removed, common)`` where ``added`` maps keys found
        only in ``new`` to their value (in ``new`` order), ``removed`` maps
        keys found only in ``old`` to their value (in ``old`` order), and
        ``common`` maps shared keys to ``(old_value, new_value)`` (in
        ``old`` order).
    """
    added = {}
    for key, value in new.items():
        if key not in old:
            added[key] = value

    removed = {}
    common = {}
    # A single pass over `old` sorts each key into removed or common.
    for key, value in old.items():
        if key in new:
            common[key] = (value, new[key])
        else:
            removed[key] = value

    return added, removed, common


def check_diff_impl(d_old, d_new, kind, diff_fn):
    """Compare all named entries of ``kind`` in two dumps and print a report.

    Args:
        d_old, d_new: objects exposing ``filter_on_kind(kind)`` that returns
            a ``{name: entry}`` dict.
        kind: the kind to compare; also passed to ``Reason.for_kind``.
        diff_fn: called as ``diff_fn(old_entry, new_entry)`` for every common
            entry that differs; returns newline-separated report text in
            which detail lines start with a tab.

    Returns:
        A ``DiffResult`` with the added/removed/common dicts, the per-name
        report lines, and how many changed entries reported each reason.
    """
    entries_old = d_old.filter_on_kind(kind)
    entries_new = d_new.filter_on_kind(kind)
    print_as_list(f"Old {kind}", entries_old.keys())
    print_as_list(f"New {kind}", entries_new.keys())

    added, removed, common = diff_dict(entries_old, entries_new)
    for label, group in (("Added", added), ("Removed", removed), ("Common", common)):
        print_as_list(f"{label} {kind}", group)

    # Only entries that actually differ get a textual diff.
    changed = {}
    for name, (old, new) in common.items():
        if old != new:
            changed[name] = diff_fn(old, new).strip().split("\n")
    print_as_list(f"Changed {kind}", changed.keys())

    # Start every reason that can occur for this kind at zero.
    reasons = {}
    for possible in Reason.for_kind(kind):
        reasons[possible.strip()] = 0

    # Lines without a leading tab are reason headers; tabbed lines are details.
    for report in changed.values():
        for line in report:
            if line.startswith("\t"):
                continue
            reasons[line] += 1
    print_as_list(f"Reasons {kind}", reasons.items())

    for name, report in changed.items():
        print(name)
        for line in report:
            print(f"\t{line}")

    return DiffResult(added, removed, common, changed, reasons)

In [45]:
def diff_struct(old, new):
    """Describe how the members of one struct/union entry differ from another.

    Both entries need a ``"members"`` list (each member having ``"name"``,
    ``"type"`` and ``"bits_offset"``) and a ``"size"``. Members are matched
    by name.

    Returns:
        Report text: one ``Reason`` header per detected category, each
        followed by tab-indented detail lines. ``Reason.LAYOUT`` is emitted
        only when offsets or size differ and no other category matched.

    Raises:
        AssertionError: if no difference of any category is found.
    """
    members_before = {m["name"]: m for m in old["members"]}
    members_after = {m["name"]: m for m in new["members"]}
    added, removed, common = diff_dict(members_before, members_after)

    parts = []

    # Members that exist only in the new entry.
    if added:
        parts.append(Reason.ADD)
        parts.extend(
            f"\t{name:20}: {member['type']}\n" for name, member in added.items()
        )

    # Members that exist only in the old entry.
    if removed:
        parts.append(Reason.REMOVE)
        parts.extend(
            f"\t{name:20}: {member['type']}\n" for name, member in removed.items()
        )

    # Shared members appearing in a different relative order.
    shared_order_before = [n for n in members_before if n in common]
    shared_order_after = [n for n in members_after if n in common]
    if shared_order_before != shared_order_after:
        parts.append(Reason.REORDER)
        parts.append(f"\t{'':20} {list(members_before)}\n")
        parts.append(f"\t{'':20} {list(members_after)}\n")

    # Shared members whose type differs.
    retyped = {}
    for name, (before, after) in common.items():
        if before["type"] != after["type"]:
            retyped[name] = (before["type"], after["type"])
    if retyped:
        parts.append(Reason.TYPE)
        for name, (type_before, type_after) in retyped.items():
            parts.append(f"\t{name:20}: {type_before}\n")
            parts.append(f"\t{'':20}->{type_after}\n")

    # Pure layout change: reported only when nothing above explains the diff.
    offsets_before = {name: m["bits_offset"] for name, m in members_before.items()}
    offsets_after = {name: m["bits_offset"] for name, m in members_after.items()}
    layout_changed = offsets_before != offsets_after or old["size"] != new["size"]
    if layout_changed and not parts:
        parts.append(Reason.LAYOUT)

    result = "".join(parts)
    assert result, f"\n{old}\n{new}"
    return result


# check_diff_impl(
#     BTF("data/18.04-x86/4.15.0-213-generic.jsonl"),
#     BTF("data/18.04-x86/4.18.0-25-generic.jsonl"),
#     Kind.STRUCT,
#     diff_struct,
# )

In [46]:
# check_diff_impl(
#     BTF("data/18.04-x86/4.15.0-213-generic.jsonl"),
#     BTF("data/18.04-x86/4.18.0-25-generic.jsonl"),
#     Kind.UNION,
#     diff_struct,
# )

In [47]:
def diff_func(old, new):
    """Describe how the signature of one function entry differs from another.

    Both entries need ``entry["type"]["params"]`` (each parameter having
    ``"name"`` and ``"type"``) and ``entry["type"]["ret_type"]``. Parameters
    are matched by name.

    Returns:
        Report text: one ``Reason`` header per detected category, each
        followed by tab-indented detail lines.

    Raises:
        AssertionError: if no difference of any category is found.
    """
    proto_before = old["type"]
    proto_after = new["type"]
    old_params = {p["name"]: p for p in proto_before["params"]}
    new_params = {p["name"]: p for p in proto_after["params"]}

    added, removed, common = diff_dict(old_params, new_params)

    chunks = []

    # Parameters present only in the new signature.
    if added:
        chunks.append(Reason.ADD)
        for name, param in added.items():
            chunks.append(f"\t{name:20}: {param['type']}\n")

    # Parameters present only in the old signature.
    if removed:
        chunks.append(Reason.REMOVE)
        for name, param in removed.items():
            chunks.append(f"\t{name:20}: {param['type']}\n")

    # Shared parameters appearing in a different relative order.
    kept_in_old_order = [n for n in old_params if n in common]
    kept_in_new_order = [n for n in new_params if n in common]
    if kept_in_old_order != kept_in_new_order:
        chunks.append(Reason.REORDER)
        chunks.append(f"\t{'':20} {list(old_params)}\n")
        chunks.append(f"\t{'':20} {list(new_params)}\n")

    # Shared parameters whose type differs.
    retyped = [
        (name, before["type"], after["type"])
        for name, (before, after) in common.items()
        if before["type"] != after["type"]
    ]
    if retyped:
        chunks.append(Reason.TYPE)
        for name, old_type, new_type in retyped:
            chunks.append(f"\t{name:20}: {old_type}\n")
            chunks.append(f"\t{'':20}->{new_type}\n")

    # Return type.
    old_ret = proto_before["ret_type"]
    new_ret = proto_after["ret_type"]
    if old_ret != new_ret:
        chunks.append(Reason.RETURN)
        chunks.append(f"\t{'':20}: {old_ret}\n")
        chunks.append(f"\t{'':20}->{new_ret}\n")

    result = "".join(chunks)
    assert result, f"\n{old}\n{new}"
    return result

# check_diff_impl(
#     BTF("data/18.04-x86/4.15.0-213-generic.jsonl"),
#     BTF("data/18.04-x86/4.18.0-25-generic.jsonl"),
#     Kind.FUNC,
#     diff_func,
# )

In [48]:
def diff_enum(old, new):
    """Describe how the enumerators of one enum entry differ from another.

    Both entries need a ``"values"`` list whose items have ``"name"`` and
    ``"val"``. Enumerators are matched by name.

    Returns:
        Report text: one ``Reason`` header per detected category, each
        followed by tab-indented detail lines.

    Raises:
        AssertionError: if no difference of any category is found.
    """
    enumerators_before = {v["name"]: v for v in old["values"]}
    enumerators_after = {v["name"]: v for v in new["values"]}
    added, removed, common = diff_dict(enumerators_before, enumerators_after)

    lines = []

    # Enumerators that only one side has, new side first.
    for header, group in ((Reason.ADD, added), (Reason.REMOVE, removed)):
        if not group:
            continue
        lines.append(header)
        for name, enumerator in group.items():
            lines.append(f"\t{name:40}: {enumerator['val']}\n")

    # Enumerators on both sides whose numeric value differs.
    revalued = {}
    for name, (before, after) in common.items():
        if before["val"] != after["val"]:
            revalued[name] = (before["val"], after["val"])
    if revalued:
        lines.append(Reason.VALUE)
        for name, (old_val, new_val) in revalued.items():
            lines.append(f"\t{name:40}: {old_val} -> {new_val}\n")

    result = "".join(lines)
    assert result, f"\n{old}\n{new}"
    return result


# check_diff_impl(
#     BTF("data/18.04-x86/4.15.0-213-generic.jsonl"),
#     BTF("data/18.04-x86/4.18.0-25-generic.jsonl"),
#     Kind.ENUM,
#     diff_enum,
# )

In [49]:
import sys


class FileLogger:
    """Context manager that redirects ``sys.stdout`` into a log file.

    The file (and any missing parent directory) is created when the object
    is constructed. Inside the ``with`` block everything written to
    ``sys.stdout`` goes to the file; on exit the previous stdout is restored
    and the file is closed.
    """

    def __init__(self, path):
        """Announce the target on the current stdout and open it for writing.

        Args:
            path: str or Path of the log file; existing content is truncated.
        """
        print(f"Logging to {path}")

        path = Path(path)
        self.stdout = sys.stdout
        path.parent.mkdir(parents=True, exist_ok=True)
        self.log = open(path, "w")

    def write(self, message):
        """Write ``message`` to the log file and return the character count."""
        return self.log.write(message)

    def flush(self):
        """Flush the log file.

        Needed because code may call ``sys.stdout.flush()`` or
        ``print(..., flush=True)`` while this object stands in for stdout.
        """
        self.log.flush()

    def __enter__(self):
        # Remember whatever stdout is active right now so that nested or
        # delayed use restores the correct stream.
        self.stdout = sys.stdout
        sys.stdout = self
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore stdout first so a failure while closing cannot leave the
        # process printing into a dead stream.
        sys.stdout = self.stdout
        self.log.close()

In [50]:
def check_diff(btf1, btf2, kind, output_path):
    """Diff all entries of ``kind`` between two dumps, logging to a file.

    Args:
        btf1, btf2: the old and new dump; their ``path`` attributes must
            share the same parent directory.
        kind: one of ``Kind.STRUCT``, ``Kind.UNION``, ``Kind.FUNC`` or
            ``Kind.ENUM``; selects the per-entry diff function.
        output_path: log file that receives everything ``check_diff_impl``
            prints (handed to ``FileLogger``).

    Returns:
        The ``DiffResult`` produced by ``check_diff_impl``.

    Raises:
        AssertionError: if the two dumps live in different directories.
        KeyError: if ``kind`` has no diff function.
    """
    assert btf1.path.parent == btf2.path.parent

    # Unions share the struct implementation (same dispatch as before).
    diff_fn = {
        Kind.STRUCT: diff_struct,
        Kind.UNION: diff_struct,
        Kind.FUNC: diff_func,
        Kind.ENUM: diff_enum,
    }[kind]
    with FileLogger(output_path):
        return check_diff_impl(btf1, btf2, kind, diff_fn)

In [51]:
import pandas as pd


def diff_btf(btf1, btf2):
    """Diff structs, functions, enums and unions between two dumps.

    Writes one ``<old>-><new>-<KIND>.log`` file per kind and a summary
    ``<old>-><new>.txt`` under ``output/<name of the dumps' directory>/``.

    Args:
        btf1, btf2: the old and new dump; both must come from the same
            directory.

    Returns:
        A one-column DataFrame named ``<old>-><new>``. Its index is a
        MultiIndex of ``(kind, metric)`` where metric is ``common``,
        ``added``, ``removed``, ``changed`` or a reason header.

    Raises:
        AssertionError: if the two dumps live in different directories.
    """
    name = f"{btf1.short_name}->{btf2.short_name}"
    print(f"Diffing {name}")

    # Both dumps must come from the same directory, since the output
    # location is derived from btf1 only.
    assert btf1.path.parent == btf2.path.parent
    path = Path("output") / btf1.path.parent.name

    results = {}
    for kind in [Kind.STRUCT, Kind.FUNC, Kind.ENUM, Kind.UNION]:
        output_path = path / f"{name}-{kind.name}.log"
        diff_result = check_diff(btf1, btf2, kind, output_path)

        # Lower-case label used as the first level of the result index.
        label = kind.name.lower()
        results[(label, "common")] = len(diff_result.common)
        results[(label, "added")] = len(diff_result.added)
        results[(label, "removed")] = len(diff_result.removed)
        results[(label, "changed")] = len(diff_result.changed)
        for reason, count in diff_result.reasons.items():
            results[(label, reason)] = count

    df = pd.DataFrame(
        results.values(),
        index=pd.MultiIndex.from_tuples(results.keys()),
        columns=[name]
    )
    # The output directory already exists here: FileLogger created it for
    # the per-kind logs above.
    df.to_string(path / f"{name}.txt")
    return df


# Example run: diff two dumps from the 18.04 data directory and show the summary frame.
diff_btf(
    BTF("data/18.04-x86/4.15.0-213-generic.jsonl"),
    BTF("data/18.04-x86/4.18.0-25-generic.jsonl")
)

Diffing 4.15->4.18
Logging to output/18.04-x86/4.15->4.18-STRUCT.log
Logging to output/18.04-x86/4.15->4.18-FUNC.log
Logging to output/18.04-x86/4.15->4.18-ENUM.log
Logging to output/18.04-x86/4.15->4.18-UNION.log


Unnamed: 0,Unnamed: 1,4.15->4.18
struct,common,7193
struct,added,426
struct,removed,136
struct,changed,785
struct,Sub added,354
struct,Sub removed,261
struct,Sub reordered,27
struct,Sub type changed,189
struct,Layout changed,189
func,common,40918


In [52]:
def get_json_paths(path):
    """Return the ``*.jsonl`` files directly inside ``path``, oldest version first.

    Files are ordered by the dotted numeric version before the first ``-``
    of the file name (``4.15.0-213-generic.jsonl`` -> ``(4, 15, 0)``). Files
    with the same version are ordered by the numeric field after the first
    ``-`` when there is one, then by file name, so the result does not
    depend on the directory listing order.

    Args:
        path: a ``Path`` to the directory to scan.

    Raises:
        ValueError: if a file name does not start with a dotted numeric version.
    """

    def sort_key(file):
        fields = file.stem.split("-")
        version = tuple(map(int, fields[0].split(".")))
        # Second dash-separated field, e.g. "213" in "4.15.0-213-generic".
        has_build = len(fields) > 1 and fields[1].isdigit()
        build = int(fields[1]) if has_build else -1
        return version, build, file.name

    return sorted(path.glob("*.jsonl"), key=sort_key)


# Show the version-sorted dump files of one data directory.
get_json_paths(Path("data/18.04-x86"))

[PosixPath('data/18.04-x86/4.15.0-213-generic.jsonl'),
 PosixPath('data/18.04-x86/4.18.0-25-generic.jsonl'),
 PosixPath('data/18.04-x86/5.0.0-65-generic.jsonl'),
 PosixPath('data/18.04-x86/5.3.0-76-generic.jsonl'),
 PosixPath('data/18.04-x86/5.4.0-91-generic.jsonl')]

In [57]:
def diff_all_btf_files(path):
    """Diff every consecutive pair of dumps in ``path``, plus first vs. last.

    The combined summary is written to ``output/<directory name>.txt``.

    Args:
        path: a ``Path`` to a directory holding the ``.jsonl`` dumps.

    Returns:
        A DataFrame with one column per compared pair, as produced by
        ``diff_btf``. If the directory holds fewer than two dumps there is
        nothing to compare: a note is printed, no file is written and an
        empty DataFrame is returned.
    """
    paths = get_json_paths(path)
    if len(paths) < 2:
        print(f"Skipping {path}: need at least two .jsonl files, found {len(paths)}")
        return pd.DataFrame()

    pairs = list(zip(paths[:-1], paths[1:]))
    # Overall first -> last comparison. With exactly two dumps it is the same
    # as the only consecutive pair, so it is not repeated; repeating it would
    # produce two identically named columns.
    overall = (paths[0], paths[-1])
    if overall not in pairs:
        pairs.append(overall)

    results = []
    for path1, path2 in pairs:
        results.append(diff_btf(BTF(path1), BTF(path2)))

    df = pd.concat(results, axis=1)
    df.to_string(Path("output") / f"{path.name}.txt")
    return df


# Example run over one data directory; the result is the per-pair summary frame.
diff_all_btf_files(Path("data/18.04-x86"))

Diffing 4.15->4.18
Logging to output/18.04-x86/4.15->4.18-STRUCT.log
Logging to output/18.04-x86/4.15->4.18-FUNC.log
Logging to output/18.04-x86/4.15->4.18-ENUM.log
Logging to output/18.04-x86/4.15->4.18-UNION.log
Diffing 4.18->5.0
Logging to output/18.04-x86/4.18->5.0-STRUCT.log
Logging to output/18.04-x86/4.18->5.0-FUNC.log
Logging to output/18.04-x86/4.18->5.0-ENUM.log
Logging to output/18.04-x86/4.18->5.0-UNION.log
Diffing 5.0->5.3
Logging to output/18.04-x86/5.0->5.3-STRUCT.log
Logging to output/18.04-x86/5.0->5.3-FUNC.log
Logging to output/18.04-x86/5.0->5.3-ENUM.log
Logging to output/18.04-x86/5.0->5.3-UNION.log
Diffing 5.3->5.4
Logging to output/18.04-x86/5.3->5.4-STRUCT.log
Logging to output/18.04-x86/5.3->5.4-FUNC.log
Logging to output/18.04-x86/5.3->5.4-ENUM.log
Logging to output/18.04-x86/5.3->5.4-UNION.log
Diffing 4.15->5.4
Logging to output/18.04-x86/4.15->5.4-STRUCT.log
Logging to output/18.04-x86/4.15->5.4-FUNC.log
Logging to output/18.04-x86/4.15->5.4-ENUM.log
Logging 

Unnamed: 0,Unnamed: 1,4.15->4.18,4.18->5.0,5.0->5.3,5.3->5.4,4.15->5.4
struct,common,7193,7507,7762,8160,7057
struct,added,426,347,432,279,1382
struct,removed,136,112,92,34,272
struct,changed,785,813,655,369,1293
struct,Sub added,354,437,352,193,779
struct,Sub removed,261,208,183,88,451
struct,Sub reordered,27,35,24,10,69
struct,Sub type changed,189,184,112,62,361
struct,Layout changed,189,191,168,100,214
func,common,40918,43288,44821,46561,39572


In [58]:
# Run the full diff for every sub-directory of data/, keyed by directory name.
result = {}
for path in sorted(Path("data").glob("*")):
    # Plain files next to the per-release directories are ignored.
    if not path.is_dir():
        continue
    print(f"Diffing {path}")
    df = diff_all_btf_files(path)
    result[path.name] = df

Diffing data/16.04-x86
Diffing 4.4->4.8
Logging to output/16.04-x86/4.4->4.8-STRUCT.log
Logging to output/16.04-x86/4.4->4.8-FUNC.log
Logging to output/16.04-x86/4.4->4.8-ENUM.log
Logging to output/16.04-x86/4.4->4.8-UNION.log
Diffing 4.8->4.10
Logging to output/16.04-x86/4.8->4.10-STRUCT.log
Logging to output/16.04-x86/4.8->4.10-FUNC.log
Logging to output/16.04-x86/4.8->4.10-ENUM.log
Logging to output/16.04-x86/4.8->4.10-UNION.log
Diffing 4.10->4.13
Logging to output/16.04-x86/4.10->4.13-STRUCT.log
Logging to output/16.04-x86/4.10->4.13-FUNC.log
Logging to output/16.04-x86/4.10->4.13-ENUM.log
Logging to output/16.04-x86/4.10->4.13-UNION.log
Diffing 4.13->4.15
Logging to output/16.04-x86/4.13->4.15-STRUCT.log
Logging to output/16.04-x86/4.13->4.15-FUNC.log
Logging to output/16.04-x86/4.13->4.15-ENUM.log
Logging to output/16.04-x86/4.13->4.15-UNION.log
Diffing 4.4->4.15
Logging to output/16.04-x86/4.4->4.15-STRUCT.log
Logging to output/16.04-x86/4.4->4.15-FUNC.log
Logging to output/16.0

In [59]:
# Place the per-directory frames side by side; the keys of `result` become the outer column level.
pd.concat(result, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,16.04-x86,16.04-x86,16.04-x86,16.04-x86,16.04-x86,18.04-x86,18.04-x86,18.04-x86,18.04-x86,18.04-x86,20.04-x86,20.04-x86,20.04-x86,20.04-x86,20.04-x86
Unnamed: 0_level_1,Unnamed: 1_level_1,4.4->4.8,4.8->4.10,4.10->4.13,4.13->4.15,4.4->4.15,4.15->4.18,4.18->5.0,5.0->5.3,5.3->5.4,4.15->5.4,5.4->5.8,5.8->5.11,5.11->5.13,5.13->5.15,5.4->5.15
struct,common,6072,6432,6582,6881,5958,7193,7507,7762,8160,7057,8193,8566,8983,9099,7956
struct,added,410,228,384,374,1297,426,347,432,279,1382,529,513,233,283,1426
struct,removed,138,50,78,85,252,136,112,92,34,272,203,156,96,117,440
struct,changed,828,640,852,721,1391,785,813,655,369,1293,973,771,476,656,1513
struct,Sub added,404,247,381,339,780,354,437,352,193,779,485,439,272,365,944
struct,Sub removed,286,113,199,164,459,261,208,183,88,451,315,232,154,163,533
struct,Sub reordered,35,9,22,13,50,27,35,24,10,69,34,26,23,20,74
struct,Sub type changed,165,112,218,145,428,189,184,112,62,361,219,164,111,140,400
struct,Layout changed,214,279,253,234,271,189,191,168,100,214,229,166,73,162,246
func,common,35475,37538,38096,40100,34361,40918,43288,44821,46561,39572,46181,48669,50994,51214,44377
