# Function Duplication

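This notebook counts duplicated and colliding function names in each LTS kernel version, writes a per-version listing of the affected function groups, and generates the LaTeX table that summarizes the counts.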

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [5]:
from depsurf import (
    Version,
    VersionGroup,
    FuncGroup,
    FuncGroups,
    CollisionType,
    VERSION_DEFAULT,
)
from utils import OUTPUT_PATH
from typing import Dict, TextIO


def print_group(group: FuncGroup, file: TextIO):
    """Write a plain-text summary of one function group to ``file``.

    The first line is the group name, followed by the number of functions in
    parentheses when the group holds more than one. Every function in
    ``group.funcs`` then gets an indented line with its file, its location,
    a ``(global)``/``(static)`` marker taken from ``external`` and an
    ``(inline)``/``(not inline)`` marker taken from ``inline_actual``.
    """
    count_suffix = f" ({group.num_funcs})" if group.num_funcs > 1 else ""
    print(f"{group.name}{count_suffix}", file=file)

    for entry in group.funcs:
        linkage = "global" if entry.external else "static"
        inlining = "inline" if entry.inline_actual else "not inline"
        print(
            f"  {entry.file} at {entry.loc} ({linkage}) ({inlining})",
            file=file,
        )


def count_dup(v: Version):
    """Bucket the function groups of one kernel version by collision type.

    Groups are loaded from ``v.func_groups_path`` and stored per collision
    type, keyed by group name. For the INCLUDE_DUP, STATIC_STATIC and
    STATIC_GLOBAL types a text report is written to
    ``OUTPUT_PATH / "dup" / v.name / "<type name, lowercased>.txt"``, listing
    groups with the most functions first, and a "Saved ..." line is printed.

    Returns a dict mapping every ``CollisionType`` member (including those for
    which no report is written) to the number of groups in that bucket.
    """
    groups_by_type: Dict[CollisionType, Dict[str, FuncGroup]] = {}
    for kind in CollisionType:
        groups_by_type[kind] = {}

    for grp in FuncGroups.from_dump(v.func_groups_path).iter_groups():
        groups_by_type[grp.collision_type][grp.name] = grp

    out_dir = OUTPUT_PATH / "dup" / v.name
    out_dir.mkdir(parents=True, exist_ok=True)

    # Only these collision types get a per-version report on disk.
    reported = (
        CollisionType.INCLUDE_DUP,
        CollisionType.STATIC_STATIC,
        CollisionType.STATIC_GLOBAL,
    )
    for kind in reported:
        bucket = groups_by_type[kind]
        report_path = out_dir / f"{kind.name.lower()}.txt"
        with open(report_path, "w") as out:
            largest_first = sorted(
                bucket.values(), key=lambda g: g.num_funcs, reverse=True
            )
            for grp in largest_first:
                print_group(grp, file=out)
        print(f"Saved {len(bucket):5} groups to {report_path}", flush=True)

    return {kind: len(bucket) for kind, bucket in groups_by_type.items()}


# count_dup(VERSION_DEFAULT)

In [6]:
# Run the duplication count for every LTS kernel version; the Version objects
# themselves are the keys, mapped to the per-type counts from count_dup.
data = {v: count_dup(v) for v in VersionGroup.LTS}

[       groups.py:47 ] INFO: Loading funcs from /Users/szhong/Code/DepSurf/data/dataset/func_groups/4.4.0-21-generic-amd64.jsonl


Saved  4042 groups to /Users/szhong/Code/DepSurf/data/output/dup/4.4.0-21-generic-amd64/include_dup.txt
Saved   404 groups to /Users/szhong/Code/DepSurf/data/output/dup/4.4.0-21-generic-amd64/static_static.txt
Saved    10 groups to /Users/szhong/Code/DepSurf/data/output/dup/4.4.0-21-generic-amd64/static_global.txt


[       groups.py:47 ] INFO: Loading funcs from /Users/szhong/Code/DepSurf/data/dataset/func_groups/4.15.0-20-generic-amd64.jsonl


Saved  4809 groups to /Users/szhong/Code/DepSurf/data/output/dup/4.15.0-20-generic-amd64/include_dup.txt
Saved   398 groups to /Users/szhong/Code/DepSurf/data/output/dup/4.15.0-20-generic-amd64/static_static.txt
Saved    26 groups to /Users/szhong/Code/DepSurf/data/output/dup/4.15.0-20-generic-amd64/static_global.txt


[       groups.py:47 ] INFO: Loading funcs from /Users/szhong/Code/DepSurf/data/dataset/func_groups/5.4.0-26-generic-amd64.jsonl


Saved  5461 groups to /Users/szhong/Code/DepSurf/data/output/dup/5.4.0-26-generic-amd64/include_dup.txt
Saved   411 groups to /Users/szhong/Code/DepSurf/data/output/dup/5.4.0-26-generic-amd64/static_static.txt
Saved    27 groups to /Users/szhong/Code/DepSurf/data/output/dup/5.4.0-26-generic-amd64/static_global.txt


[       groups.py:47 ] INFO: Loading funcs from /Users/szhong/Code/DepSurf/data/dataset/func_groups/5.15.0-25-generic-amd64.jsonl


Saved  6162 groups to /Users/szhong/Code/DepSurf/data/output/dup/5.15.0-25-generic-amd64/include_dup.txt
Saved   444 groups to /Users/szhong/Code/DepSurf/data/output/dup/5.15.0-25-generic-amd64/static_static.txt
Saved    26 groups to /Users/szhong/Code/DepSurf/data/output/dup/5.15.0-25-generic-amd64/static_global.txt


[       groups.py:47 ] INFO: Loading funcs from /Users/szhong/Code/DepSurf/data/dataset/func_groups/6.8.0-31-generic-amd64.jsonl


Saved  7418 groups to /Users/szhong/Code/DepSurf/data/output/dup/6.8.0-31-generic-amd64/include_dup.txt
Saved   498 groups to /Users/szhong/Code/DepSurf/data/output/dup/6.8.0-31-generic-amd64/static_static.txt
Saved    29 groups to /Users/szhong/Code/DepSurf/data/output/dup/6.8.0-31-generic-amd64/static_global.txt


In [4]:
from utils import save_latex, center_cell
import pandas as pd


# Build one table column per kernel version in `data`: rows are the collision
# types, cells are the (possibly abbreviated) group counts.
table = {}
for version, dup_counts in data.items():
    col = {}
    for dup_type, count in dup_counts.items():
        # Abbreviate thousands as e.g. "4.0k". The bound is inclusive so that
        # exactly 1000 renders as "1.0k" like its neighbours, not "1000".
        col[dup_type] = f"{count / 1000:.1f}k" if count >= 1000 else count

    col_name = center_cell(version.short_version)
    table[("Linux Kernel Version", col_name)] = col

df = pd.DataFrame(table)
# One left-aligned label column plus one right-aligned column per version,
# derived from the frame so the spec stays valid if the version list changes.
column_format = "l|" + "r" * len(df.columns)
latex = df.to_latex(multicolumn_format="c", column_format=column_format)
save_latex(latex, "dup")
df

[  utils_latex.py:118] INFO: Saved dup to /Users/szhong/Code/DepSurf/paper/tabs/dup.tex


Unnamed: 0_level_0,Linux Kernel Version,Linux Kernel Version,Linux Kernel Version,Linux Kernel Version,Linux Kernel Version
Unnamed: 0_level_1,\multicolumn{1}{c}{4.4},\multicolumn{1}{c}{4.15},\multicolumn{1}{c}{5.4},\multicolumn{1}{c}{5.15},\multicolumn{1}{c}{6.8}
Unique Global,17.2k,20.1k,22.7k,26.6k,31.5k
Unique Static,35.7k,41.7k,48.2k,53.3k,60.2k
Static Duplication,4.0k,4.8k,5.5k,6.2k,7.4k
Static-Static Collision,404,398,411,444,498
Static-Global Collision,10,26,27,26,29
