Analysis pinned to Kernel version 5.10

In [7]:
# TODO: make the regexes stricter.

In [8]:
import re
import itertools
from itertools import islice
from subprocess import run

from pathlib import Path
import sqlite3

# Kernel source tree root: the current working directory.
KERNEL = Path(".")
# Directory holding the survey's output files and its SQLite database.
OUTDIR = Path("../function_survey/output/")
# Individual output files under OUTDIR (all_calls and test_targets are
# written by the cscope cells further down).
all_calls = OUTDIR/"cscope_all_calls.txt"
kernel_tags = OUTDIR/"kernel_tags"
test_targets = OUTDIR/"cscope_test_targets"
all_c_code = OUTDIR/"all_c_code.txt"
# Directory searched recursively for "*.c.blame" files by the blame parsers.
blame_files = Path("../blame")


# Shared SQLite connection and cursor used by the query cells below.
connection = sqlite3.connect(OUTDIR/"function_survey.db")
cursor = connection.cursor()

In [9]:
def parse(filename, expression):
    """Match every line of a text file against a regular expression.

    Returns one entry per line, in file order: the result of
    ``re.match(expression, line)``, which is ``None`` for lines that do
    not match.
    """
    matches = []
    with open(filename) as source:
        for text in source:
            matches.append(re.match(expression, text))
    return matches
    
def head(iterable, n=10):
    """Return the first ``n`` items of ``iterable`` as a list (default 10)."""
    leading_items = islice(iterable, n)
    return [*leading_items]

## Blame-Parser

In [None]:
def strict_match(pattern, line, flags=0):
    """Match ``pattern`` at the start of ``line``, failing loudly on a mismatch.

    Args:
        pattern: regular expression handed to ``re.match``.
        line: text to match against.
        flags: ``re`` flags forwarded to ``re.match``.

    Returns:
        The ``re.Match`` object.

    Raises:
        ValueError: if ``pattern`` does not match ``line``.
    """
    m = re.match(pattern, line, flags)
    if m is None:
        # Raise a real exception instance: ``raise (ValueError, msg)`` would
        # try to raise a tuple and fail with TypeError instead.
        raise ValueError(f"{pattern} did not match {line}")
    return m

def blame_contents(match):
    """Split the ``contents`` group of a blame-line match on ``|``."""
    raw_contents = match["contents"]
    return raw_contents.split("|")

def parse_blame_file(filename):
    """Lazily yield the ``|``-separated fields of each line of a blame file.

    Each line is run through ``strict_match`` with a pattern of the form
    ``<40 word characters>;<previous file name>;<TAB><contents>`` and the
    ``contents`` group is split by ``blame_contents``.
    """
    line_pattern = "".join((
        r"^(?P<hash>\w{40});",
        r"(?P<previous_file_name>[^\s;]*);",
        r"\t(?P<contents>.*)$",
    ))
    with open(filename) as blame:
        for raw_line in blame:
            yield blame_contents(strict_match(line_pattern, raw_line))

In [None]:
def parse_file(filename):
    """Map every function defined in a blame file to the names it calls.

    The token stream from ``parse_blame_file`` is walked with a small state
    machine: ``begin_function`` -> ``DECL|function|<name>`` -> first
    ``block`` token -> body tokens -> ``end_function``.  Inside a body, a
    ``name`` token immediately followed by an opening ``argument_list`` token
    is recorded as a call.

    Returns:
        dict mapping function name -> list of called names, in call order and
        with repeats.  If a file defines the same name more than once, the
        last definition wins.
    """
    functions = {}
    state = "Not in function"  # TODO: switch to an enum
    function_name = None
    called_functions = []
    # Most recent ``name`` token while it can still turn out to be a call.
    current_name = None

    for contents in parse_blame_file(filename):
        kind = contents[0]
        if state == "Not in function":
            if kind == "begin_function":
                called_functions = []
                function_name = None
                current_name = None
                state = "Function declaration"
            # else: ignore current line
        elif state == "Function declaration":
            # The length guard avoids an IndexError on a malformed DECL line.
            if kind == "DECL" and len(contents) >= 3 and contents[1] == "function":
                function_name = contents[2].split()[0]
                state = "Function declaration breakdown"
            # else: some functions have specifiers/names on the line(s) before the declaration
            # for example find_pa in arch/alpha/boot/bootp.c, line 41
        elif state == "Function declaration breakdown":
            if kind == "block":
                state = "Function Body"
            # else: ignore specifiers/arguments
        elif state == "Function Body":
            if kind == "name":
                current_name = contents[1]
                continue
            # Any non-name token settles whether the pending name was a call.
            # TODO: figure out what can be after a name token
            if (current_name is not None
                    and kind == "argument_list"
                    # "()" is accepted as well as "(" so that calls without
                    # arguments are counted, as parse_function_body does.
                    and contents[1:2] in (["("], ["()"])):
                called_functions.append(current_name)
            elif kind == "end_function":
                # Handled even right after a name token, so the function is
                # always recorded and the parser leaves the body state.
                functions[function_name] = called_functions
                state = "Not in function"
            # else: ignore anything that is not a call/end of function
            current_name = None
        else:
            assert False, f"invalid state {state}"

    return functions

def parse_kernel():
    """Run ``parse_file`` over every ``*.c.blame`` file under ``blame_files``.

    Returns a dict mapping each blame file path to its ``parse_file`` result.
    """
    return {
        blame_path: parse_file(blame_path)
        for blame_path in blame_files.rglob("*.c.blame")
    }

In [None]:
# How does cregit handle ifdefs in functions?

In [None]:
# file (path), item_type (function/include/ifdef/etc), name

# Call table
# caller_id, callee_name

# Functions table
# function_id, function_name

# file contents
# file, item_type, item_id


def parse_whole(filename):
    """Yield one tuple per ``begin_*``/``end_*`` item found in a blame file.

    Functions yield ``("function", name, specifiers, callees)``, includes
    yield ``("include", file)``, and every other item yields
    ``(item, "skipped")``.  ``begin_unit``, ``end_unit``, empty tokens and
    tokens outside any item are ignored.
    """
    tokens = parse_blame_file(filename)

    # The helpers below pull from the same iterator, so each item's lines are
    # consumed before this loop sees the next token.
    for kind, *_ in tokens:
        if kind in ("begin_unit", "end_unit", ""):
            continue
        if not kind.startswith("begin_"):
            continue
        item = kind[len("begin_"):]
        if item == "function":
            yield ("function", *parse_function(lines_in_item(tokens, item)))
        elif item == "include":
            yield ("include", *parse_include(lines_in_item(tokens, item)))
        else:
            yield skip(tokens, item)

def parse_function(lines):
    """Parse the lines of one function item into ``(name, specifiers, callees)``.

    The declaration parser and the body parser share one iterator, so the
    body parser starts where the declaration parser stopped.
    """
    token_iter = iter(lines)
    name, specifiers = parse_function_decl(token_iter)
    return (name, specifiers, parse_function_body(token_iter))

def parse_function_decl(lines):
    """Consume a function header from ``lines``; return ``(name, specifiers)``.

    Reading stops after the ``parameter_list|)`` token, which marks the end
    of the function header.  ``specifier`` tokens are collected, the name is
    the first word of the third field of the ``DECL|function|...`` token, and
    every other token is ignored.
    """
    declared_name = None
    found_specifiers = []
    for kind, *fields in lines:
        if kind == "parameter_list" and fields == [")"]:
            break
        if kind == "specifier":
            # are there other specifiers than static
            assert len(fields) == 1
            found_specifiers.extend(fields)
        elif kind == "DECL":
            assert fields[0] == "function", (kind, fields)
            declared_name = fields[1].split()[0]
        # "name" tokens could give the return type if that turns out to be useful.
        # TODO: check what other declaration parts end up here
    assert declared_name is not None
    return declared_name, found_specifiers

def parse_function_body(lines):
    """Return the names called inside a function body, in call order.

    Args:
        lines: iterator of token field lists positioned at the body's opening
            ``["block", "{"]`` token.

    Returns:
        list of names that were immediately followed by an opening
        ``argument_list`` token (``(`` or ``()``); repeats are kept.

    Raises:
        AssertionError: if the first token is not ``["block", "{"]``,
            including when ``lines`` is already exhausted.
    """
    # A default of None turns an exhausted iterator into the assertion below
    # rather than a StopIteration.
    first = next(lines, None)
    # check if this holds for empty blocks
    assert first == ["block", "{"], "function body must start with block"

    callees = []
    prev_name = None
    for start, *rest in lines:
        if start == "name":
            assert len(rest) == 1
            prev_name = rest[0]
            continue
        # assume that argument lists always follow function names directly;
        # an opening parenthesis with no pending name (e.g. a call through an
        # expression) is not recorded, so no None ends up in the result.
        if (start == "argument_list"
                and prev_name is not None
                and rest in (["("], ["()"])):
            callees.append(prev_name)
        prev_name = None
    return callees
            
def parse_include(lines):
    """Validate the three token lines of an include item and return its file.

    ``lines`` must be exactly ``["include", "#"]``, ``["directive",
    "include"]`` and ``["file", <name>]``; the result is the one-element list
    ``[<name>]``.
    """
    assert len(lines) == 3, lines
    hash_line, directive_line, file_line = lines[0], lines[1], lines[2]
    assert hash_line == ["include", "#"]
    assert directive_line == ["directive", "include"]
    kind, *file_fields = file_line
    assert kind == "file" and len(file_fields) == 1
    return file_fields
    
def skip(lines, item):
    """Consume ``lines`` through the ``end_<item>`` marker; return ``(item, "skipped")``."""
    _discarded = lines_in_item(lines, item)
    return item, "skipped"
        
def lines_in_item(lines, item):
    """Return every line up to, but excluding, the ``end_<item>`` marker.

    Args:
        lines: an *iterator* of token field lists, positioned just after the
            ``begin_<item>`` marker.  It is advanced past the end marker, so
            a caller sharing the iterator resumes right after the item.
        item: item name, e.g. ``"function"`` or ``"include"``.

    Returns:
        list of the field lists between the markers.

    Raises:
        ValueError: if the input ends before ``end_<item>`` is seen.  (An
            escaping StopIteration would become an opaque RuntimeError inside
            a calling generator.)
        AssertionError: if a begin/end marker of another item is encountered.
    """
    end_marker = f"end_{item}"
    result = []
    for contents in lines:
        start, *_ = contents
        if start == end_marker:
            return result
        assert not start.startswith("end_"), "end of different item found"
        assert not start.startswith("begin_"), "start of different item found"
        result.append(contents)
    raise ValueError(f"input ended before {end_marker} was found")
            

def parse_kernel_2():
    """Yield ``(path, parse_file(path))`` for every ``*.c.blame`` file under ``blame_files``.

    NOTE(review): this calls ``parse_file``, not ``parse_whole`` - confirm
    that is intended.
    """
    blame_paths = blame_files.rglob("*.c.blame")
    yield from ((blame_path, parse_file(blame_path)) for blame_path in blame_paths)

In [None]:
# Try parse_whole on a single blame file and materialise every item it yields.
list(parse_whole(
    blame_files/"arch/alpha/kernel/bugs.c.blame")
)

In [None]:
# Parse every blame file into {blame file path: parse result}.
kernel_files_2 = dict(parse_kernel_2())
# Preview the first ten entries of the dict that was just built
# (``kernel_files`` is not assigned until a later cell).
dict(islice(kernel_files_2.items(), 10))

In [None]:
# Number of blame files parsed (kernel_files_2 is keyed by file path).
len(kernel_files_2)

In [None]:
# Total number of entries across all per-file results (iterating a per-file dict yields its function names).
len([func for file in kernel_files_2.values() for func in file]) #529075 same number of functions found

In [None]:
def reset_cregit_db():
    """Ensure both cregit tables exist, then delete all of their rows.

    Runs on the module-level ``cursor`` and does not call ``commit``.
    """
    statements = (
        # all functions
        "CREATE TABLE IF NOT EXISTS cregit_functions (file, name)",
        # all calls from one function to another
        "CREATE TABLE IF NOT EXISTS cregit_calls (file, caller, callee)",
        # clear tables
        "DELETE FROM cregit_functions",
        "DELETE FROM cregit_calls",
    )
    for statement in statements:
        cursor.execute(statement)

In [None]:
def _source_path(blame_path):
    """Return the source path for a blame file, relative to ``blame_files``.

    Drops the ``blame_files`` prefix and the trailing ``.blame`` suffix so
    that paths can be compared between methods.  Deriving the prefix from
    ``blame_files`` keeps this correct if that directory is changed.
    """
    return str(blame_path.relative_to(blame_files).with_suffix(""))


kernel_files = parse_kernel()
reset_cregit_db()
# One row per (file, caller, distinct callee).
cursor.executemany("INSERT INTO cregit_calls VALUES (?,?,?)", (
    (_source_path(file), caller, callee)
    for file, functions in kernel_files.items()
    for caller, callees in functions.items()
    for callee in set(callees))
)
# One row per (file, function name).
cursor.executemany("INSERT INTO cregit_functions VALUES (?,?)", (
    (_source_path(file), function_name)
    for file, file_functions in kernel_files.items()
    for function_name in file_functions
))

In [None]:
# First 10 callers whose name contains "test" (SQLite's LIKE is case-insensitive for ASCII by default)
print(head(cursor.execute("SELECT caller FROM cregit_calls WHERE caller LIKE '%test%'")))
# Number of distinct callees invoked by those callers (i.e. "tested" functions)
head(cursor.execute("SELECT COUNT(DISTINCT callee) FROM cregit_calls WHERE caller LIKE '%test%'"))

In [None]:
# Names of all callees invoked from a caller whose name contains "test".
tested_functions = {
    row[0]
    for row in cursor.execute("SELECT callee FROM cregit_calls WHERE caller LIKE '%test%'")
}

In [None]:
# Total number of rows in cregit_functions (one per (file, function) inserted above)
print(head(cursor.execute("SELECT COUNT(*) FROM cregit_functions")))

In [None]:
# Ratio of distinct function names to total rows (the *1.0 forces floating-point division).
print(head(cursor.execute("SELECT COUNT(DISTINCT name)*1.0/COUNT(*) FROM cregit_functions")))

In [None]:
# Number of function names that occur more than three times in cregit_functions.
list(cursor.execute(
    "WITH A AS (SELECT name FROM cregit_functions GROUP BY name HAVING COUNT(name)>3) SELECT COUNT(*) FROM A"))

In [None]:
# The 20 most frequently repeated function names, with their occurrence counts.
list(cursor.execute(
    "SELECT name, COUNT(name) FROM cregit_functions GROUP BY name ORDER BY COUNT(name) DESC LIMIT 20"))

In [None]:
# Commit the pending transaction on the shared connection.
connection.commit()

# Comparing Ctags and Cregit

In [None]:
# List each table's column names: PRAGMA table_info returns one row per column, with the name at index 1.
[(table_name, [x[1] for x in cursor.execute("PRAGMA table_info([%s])" % table_name)])
 for table_name in ("ctags", "cregit_functions", "cregit_calls")]

In [None]:
# Number of ctags rows whose token_type is 'function'.
# NOTE(review): the trailing remark looks like it predates the token_type filter - confirm.
head(cursor.execute("SELECT COUNT(*) FROM ctags WHERE token_type='function'")) # includes variables, macros etc.

In [None]:
# Number of rows in cregit_functions, for comparison with the ctags count.
head(cursor.execute("SELECT COUNT(*) FROM cregit_functions"))

In [None]:
# Count joined rows where a ctags entry and a cregit function share both file and name.
head(cursor.execute("""
SELECT COUNT(*) FROM ctags JOIN cregit_functions ON
ctags.file = cregit_functions.file
AND ctags.name = cregit_functions.name
"""))

In [None]:
# First 10 (file, name) pairs found by cregit that have no matching ctags row.
head(cursor.execute("""
SELECT file, name FROM cregit_functions
EXCEPT SELECT file, name FROM ctags"""))

In [None]:
# Peek at the first 10 rows of cregit_functions.
head(cursor.execute(
    "SELECT file, name FROM cregit_functions"))

In [None]:
# First 10 (file, name) pairs present in ctags (any token type) but absent from cregit_functions.
head(cursor.execute(
    "SELECT file, name FROM ctags EXCEPT SELECT file, name FROM cregit_functions"))

In [None]:
# First 10 (file, name) pairs found by cregit but not by ctags (same query as an earlier cell).
head(cursor.execute(
    "SELECT file, name FROM cregit_functions EXCEPT SELECT file, name FROM ctags"))

## Cscope

In [None]:
def parse_cscope(filename):
    """Match each line of a cscope output file as ``<path> <funcname> <linenum> <usage_line>``.

    Returns the per-line list produced by ``parse``.
    """
    line_format = r"(?P<path>^[^ ]+) (?P<funcname>[^ ]+) (?P<linenum>\d+) (?P<usage_line>.*)"
    matches = parse(filename, line_format)
    return matches

In [None]:
# get all function calls
!cscope -RL2 ".*" > {all_calls}
# get all calls from a function with test in the name
!cscope -RL2 ".*test.*" > {test_targets}

In [None]:
# Every (function name, path) pair, and every bare function name, in the all-calls cscope output.
cscope_call_matches = parse_cscope(all_calls)
called_functions = {(m["funcname"], m["path"]) for m in cscope_call_matches}
called_function_names = {m["funcname"] for m in cscope_call_matches}

In [None]:
# Every (function name, path) pair, and every bare function name, in the test-targets cscope output.
cscope_test_matches = parse_cscope(test_targets)
tested_functions = {(m["funcname"], m["path"]) for m in cscope_test_matches}
tested_function_names = {m["funcname"] for m in cscope_test_matches}

In [None]:
# Names present in the test-targets output but missing from the all-calls output: their count, then the names.
len(tested_function_names - called_function_names), tested_function_names - called_function_names
# before the math.c fix, there were 187 functions in this set

In [None]:
# Functions detected by cscope (as called from the tests) but not by ctags.
# NOTE(review): func_names and functions are not defined in the cells shown here - presumably the ctags results; confirm.
len(tested_function_names - func_names), len(tested_functions - functions)

In [None]:
target = KERNEL

!(cd {target}; cscope -RL0 ".*")

In [None]:
# The results of this cell seem to indicate that a single directory (arch/sh/math-emu) is tripping up cscope.
# Removing that directory lets cscope run unimpeded.
def test_cscope(path):
    """Recursively narrow down which subdirectory makes cscope emit errors.

    Runs ``cscope -RL0 ".*"`` in each immediate subdirectory of ``path``,
    printing each one as it is tried.  When the captured stderr is anything
    other than empty or the "no source files found" notice, the stderr is
    printed and the search descends into that subdirectory.  A ``path`` that
    is not a directory is reported and not searched.
    """
    if not path.is_dir():
        print("fail on file:",path)
        return
    harmless_stderr = {b'', b'cscope: no source files found\n'}
    subdirectories = (entry for entry in path.iterdir() if entry.is_dir())
    for subdir in subdirectories:
        print(subdir)
        outcome = run('cscope -RL0 ".*"',cwd=subdir, capture_output=True, shell=True)
        # return code is a better way to identify if errors
        # could just list all dirs/files in kernel, and echo stderr to each
        # find -d (gives list of directories)
        if outcome.stderr in harmless_stderr:
            continue
        print(subdir, repr(outcome.stderr))
        test_cscope(subdir)

# Start the search at the kernel root.
test_cscope(KERNEL)

In [None]:
# Run the same cscope command in the current working directory, capturing stdout/stderr,
# and display the resulting CompletedProcess.
result = run('cscope -RL0 ".*"', capture_output=True, shell=True)
result