In [4]:
import numpy as np
import pandas as pd
from sbfl.base import SBFL

# TODO: add functionality to convert gcov result files to X, y
# TODO: add functionality to convert Cobertura result files to X, y

# Toy coverage matrix: one row per test (t0..t2), one column per program
# element (e1..e4). A truthy cell means the test on that row covered the
# element in that column.
X = np.array(
    [
        # e1 e2 e3 e4
        [1, 0, 1, 0],  # t0
        [0, 0, 1, 1],  # t1
        [1, 1, 0, 0],  # t2
    ],
    dtype=bool,
)

# Test outcomes, one entry per row of X: True = PASS, False = FAIL.
y = np.array([True, False, True])  # t0 PASS, t1 FAIL, t2 PASS

X, y

(array([[ True, False,  True, False],
        [False, False,  True,  True],
        [ True,  True, False, False]]),
 array([ True, False,  True]))

In [4]:
# Calculate the suspiciousness scores with the Ochiai formula.
# X is the test-by-element coverage matrix and y the pass/fail vector.
sbfl = SBFL(formula='Ochiai')
sbfl.fit(X, y)
# The saved output shows four values, i.e. presumably one score per column of X.
print(sbfl.scores_)

[0.         0.         0.70710678 1.        ]


In [5]:
# Index level names for the score table.
names = ['file', 'method']

# One (file, method) label per program element; presumably in the same order
# as the columns of X (the saved scores line up that way).
elements = [
    ('file1.py', 'method1'),
    *(('file2.py', f'method{n}') for n in (2, 3, 4)),
]

# Build a score table indexed by (file, method).
df = sbfl.to_frame(elements=elements, names=names)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,score
file,method,Unnamed: 2_level_1
file1.py,method1,0.0
file2.py,method2,0.0
file2.py,method3,0.707107
file2.py,method4,1.0


In [6]:
# Highest suspiciousness score per file.
# DataFrame.max(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported replacement. sort=False keeps
# the files in their order of first appearance in df.
df.groupby(level='file', sort=False).max()

Unnamed: 0_level_0,score
file,Unnamed: 1_level_1
file1.py,0.0
file2.py,1.0


# SBFL engine - gcov integration

In [2]:
import os  # needed for os.path.join / os.listdir below; not imported by any earlier cell

from sbfl.utils import gcov_files_to_frame, get_sbfl_scores_from_frame

# Directory holding the gcov reports of each test, keyed by test ID.
coverage_dirs = {
    '51': "sample/yara-buggy#1/yara-buggy#1-51",
    '52': "sample/yara-buggy#1/yara-buggy#1-52",
    '53': "sample/yara-buggy#1/yara-buggy#1-53",
}

# For every test, collect the paths of the gcov files found in its directory.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# file list (and anything derived from it) reproducible across runs/machines.
coverage_files = {
    test: [
        os.path.join(cov_dir, fn)
        for fn in sorted(os.listdir(cov_dir))
        if fn.endswith('gcov')
    ]
    for test, cov_dir in coverage_dirs.items()
}

# Build the coverage table. In the saved output it is indexed by (source, line)
# with one column per test ID.
# NOTE(review): only_covered=True presumably drops lines that no test executed
# -- confirm against gcov_files_to_frame.
cov_df = gcov_files_to_frame(coverage_files, only_covered=True)
cov_df

Unnamed: 0_level_0,Unnamed: 1_level_0,51,52,53
source,line,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
threading.c,132,0,0,4
threading.c,135,0,0,4
threading.c,138,0,0,4
threading.c,142,0,0,4
threading.c,145,0,0,4
...,...,...,...,...
grammar.y,1258,0,0,4
grammar.y,1260,0,0,4
grammar.y,1479,0,0,4
grammar.y,1482,0,0,4


In [12]:
# IDs of the tests treated as failing; presumably they must match the column
# labels of cov_df.
# NOTE(review): the original author marked this list as "unknown". The saved
# output below also shows score 1.0 for lines that cov_df reports as covered
# only by test '53' -- confirm which tests actually fail.
failing_tests = ['51', '52']

# Combine the coverage information (cov_df) with the failing-test list to
# compute the SBFL scores.
score_df = get_sbfl_scores_from_frame(
    cov_df,
    failing_tests,
    sbfl=SBFL(formula='Ochiai'),
)

score_df

Unnamed: 0_level_0,Unnamed: 1_level_0,score
source,line,Unnamed: 2_level_1
threading.c,132,1.0
threading.c,135,1.0
threading.c,138,1.0
threading.c,142,1.0
threading.c,145,1.0
...,...,...
grammar.y,1258,1.0
grammar.y,1260,1.0
grammar.y,1479,1.0
grammar.y,1482,1.0


In [13]:
# Rank the lines from highest to lowest score.
score_df.sort_values('score', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,score
source,line,Unnamed: 2_level_1
threading.c,132,1.0
re_lexer.c,1860,1.0
re_lexer.c,1710,1.0
re_lexer.c,1713,1.0
re_lexer.c,1714,1.0
...,...,...
ahocorasick.c,790,0.0
ahocorasick.c,789,0.0
ahocorasick.c,788,0.0
ahocorasick.c,787,0.0
