## Variable declarations

In [91]:
# JSON file loaded into `bugs_data`; indexed below as
# bugs_data[project][bug_id] with the keys "buggy_commit", "buggyMethods"
# and "stack_trace_files".
data_file_path = "/Users/lorenapacheco/Concordia/Masters/BugReportsMining/data/merged_data_production_bug_reports.json"
# JSON file loaded into `gzoltar_data`; indexed below as
# gzoltar_data[project][bug_id][file][method] -> list of covered line numbers.
gzoltar_data_file_path = "/Users/lorenapacheco/Concordia/Masters/BugReportsMining/data/coverage_data_gzoltar.json"
# Directory holding one local git clone per studied project. It must end with
# a path separator because paths are built by plain string concatenation.
projects_dir = "/Users/lorenapacheco/Concordia/Masters/open_source_repos_being_studied/"
# Destination of the CSV written by create_coverage_percent_file.
output_file_path = "/Users/lorenapacheco/Concordia/Masters/BugReportsMining/data/coverage_percentages_defects4j.csv"



# Maps the project key used in the data files to an "owner/repo" slug
# (presumably the GitHub repository -- confirm). Only the part after the "/"
# is used below, as the name of the clone directory inside `projects_dir`.
projects_github = {
    "Cli": "apache/commons-cli",
    "Closure": "google/closure-compiler",
    "Codec": "apache/commons-codec",
    "Collections": "apache/commons-collections",
    "Compress": "apache/commons-compress",
    "Csv": "apache/commons-csv",
    "Gson": "google/gson",
    "JacksonCore": "FasterXML/jackson-core",
    "JacksonDatabind": "FasterXML/jackson-databind",
    "Jsoup": "jhy/jsoup",
    "JxPath": "apache/commons-jxpath",
    "Mockito": "mockito/mockito",
    "Time" : "JodaOrg/joda-time",
    "fastjson": "alibaba/fastjson",
    "junit4": "junit-team/junit4"
}

## General methods

In [92]:
import json
import os

def json_file_to_dict(file):
    """Load a JSON file and return its parsed content.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (whatever ``json.load`` produces for
        the file's top-level value).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; being explicit avoids depending on the
    # platform's default encoding. The context manager closes the file.
    with open(file, 'r', encoding='utf-8') as fp:
        return json.load(fp)

def dict_to_json_file(file, dic):
    """Serialize ``dic`` as pretty-printed JSON into ``file``.

    Missing parent directories are created. Keys are sorted and the output is
    indented with four spaces.

    Args:
        file: Destination path. May be a bare file name, in which case the
            file is written to the current working directory.
        dic: JSON-serializable object to write.

    Raises:
        OSError: If the directory or the file cannot be created.
        TypeError: If ``dic`` contains values that ``json`` cannot serialize.
    """
    folder = os.path.dirname(file)
    # dirname() is "" for a bare file name and makedirs("") raises, so only
    # create the directory when there is one. exist_ok avoids the
    # check-then-create race.
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(file, 'w', encoding='utf-8') as fp:
        json.dump(dic, fp, sort_keys=True, indent=4)


In [97]:
import csv
import os


def read_file_lines(file_path, project_path):
    """Return the lines of a source file, line terminators included.

    Args:
        file_path: Path of the file, appended verbatim to ``project_path``.
        project_path: Prefix prepended to ``file_path`` by plain string
            concatenation (pass ``""`` when ``file_path`` is already complete).

    Returns:
        A list with one string per line. Bytes that are not valid UTF-8 are
        dropped instead of raising.
    """
    with open(project_path + file_path, 'r', encoding='utf-8', errors='ignore') as source_file:
        return source_file.readlines()


def count_lines_of_code_for_coverage(file_path, project_path, covered_lines, begin_line=0, end_line=-1):
    """Estimate how many lines of a source file count as code for coverage.

    Every line listed in ``covered_lines`` counts as code. Any other line is
    counted only if it survives these heuristics: it is not inside a ``/* */``
    comment, is not a ``//`` comment, is not blank, is not a lone ``}``, does
    not start a branch (``if``/``else``) or a loop (``for``/``while``), and
    ends with ``{``, ``}``, ``;`` or ``,``.

    Args:
        file_path: Path of the file, appended verbatim to ``project_path``.
        project_path: Prefix prepended to ``file_path``.
        covered_lines: Collection of 1-based line numbers reported as covered.
        begin_line: First 1-based line to inspect; values below 1 mean "from
            the first line".
        end_line: Last 1-based line to inspect (inclusive); ``-1`` means "to
            the end of the file". Values past the end of the file are clamped.

    Returns:
        The number of lines in the inspected range that count as code.
    """
    lines = read_file_lines(file_path, project_path)
    if end_line == -1:
        end_line = len(lines)

    # Clamp the range to the file. Without this the default begin_line=0
    # produced index -1, which processed the last line of the file a second
    # time (and raised IndexError on an empty file).
    first_pos = max(begin_line - 1, 0)
    last_pos = min(end_line, len(lines)) - 1

    # Build the set once so each membership test is O(1).
    covered = set(covered_lines)

    count = 0
    multiline_comment_active = False

    for i in range(first_pos, last_pos + 1):
        stripped = lines[i].strip()

        # If it is covered, it is a code line
        if i + 1 in covered:
            count += 1
            continue

        # Ignore blocks inside multiline comments. A comment opened and closed
        # on the same line must not leave comment mode switched on; the [2:]
        # keeps the "*" of the opener from being read as part of a closer.
        if stripped.startswith("/*"):
            multiline_comment_active = "*/" not in stripped[2:]
            continue
        if multiline_comment_active:
            if "*/" in stripped:
                multiline_comment_active = False
            continue

        # Ignore single line comments
        if stripped.startswith("//"):
            continue

        # Remove end-of-line comments
        code = stripped.split("//")[0].strip()

        # Ignore empty lines and lines holding only a closing brace
        if code == "" or code == "}":
            continue

        # Ignore conditional lines (branches)
        if code.startswith(("if ", "else ", "else:")):
            continue

        # Ignore loop definition lines
        if code.startswith(("for ", "while ")):
            continue

        # A statement split over several lines is counted once, on the line
        # that terminates it; continuation lines do not end with these tokens.
        if code.endswith(("{", "}", ";", ",")):
            count += 1

    return count

def create_coverage_percent_file(obj, output_path=None):
    """Write the per-bug average coverage figures to a CSV file.

    Args:
        obj: Mapping ``project -> bug_id -> averages`` where ``averages`` has
            the keys ``"average_coverage_buggy_files"``,
            ``"average_coverage_stack_trace_files"`` and
            ``"average_all_files_coverage"``.
        output_path: Destination CSV path. Defaults to the module-level
            ``output_file_path`` when omitted.

    Raises:
        OSError: If the destination cannot be opened for writing.
        KeyError: If an entry lacks one of the expected keys.
    """
    if output_path is None:
        output_path = output_file_path

    # The csv module requires newline='' so it controls line endings itself;
    # otherwise platforms that translate "\n" emit blank rows between records.
    with open(output_path, 'w', newline='') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(["Project", "Bug_id", "Average_coverage_buggy_files",
                             "Average_coverage_stack_trace_files", "Average_all_files_coverage"])
        for project, bugs in obj.items():
            for bug_id, averages in bugs.items():
                csv_writer.writerow([project, bug_id,
                                     averages["average_coverage_buggy_files"],
                                     averages["average_coverage_stack_trace_files"],
                                     averages["average_all_files_coverage"]])

def find_file(name, path):
    """Locate a file below ``path`` whose trailing directories match ``name``.

    Args:
        name: File name, optionally preceded by a relative directory (for
            example ``"org/apache/Foo.java"``). When a directory is given, the
            directory containing the match must end with exactly those path
            components.
        path: Root directory of the search.

    Returns:
        The absolute path of the first match found by ``os.walk``, or ``None``
        when no file matches.
    """
    base_name = os.path.basename(name)
    dir_name = os.path.dirname(name)

    # Normalise the requested directory and drop surrounding separators so it
    # can be compared component-wise with each walked directory.
    wanted_dir = os.path.normpath(dir_name).strip(os.sep) if dir_name else ""
    if wanted_dir == os.curdir:
        wanted_dir = ""
    wanted_suffix = os.sep + wanted_dir

    for root, _dirs, files in os.walk(path):
        if base_name not in files:
            continue
        if wanted_dir:
            candidate = os.path.normpath(root)
            # Match on whole path components: a bare string-suffix test would
            # accept ".../xorg" when looking for "org".
            if candidate != wanted_dir and not candidate.endswith(wanted_suffix):
                continue
        return os.path.abspath(os.path.join(root, base_name))
    return None



def _flatten_covered_lines(file_coverage):
    """Concatenate the per-method covered-line lists of one GZoltar file entry."""
    covered = []
    for method_lines in file_coverage.values():
        covered += method_lines
    return covered


def _covered_lines_for_path(bug_coverage, source_path):
    """Return the covered lines of the first GZoltar file whose name occurs in ``source_path``."""
    for gzoltar_file, file_coverage in bug_coverage.items():
        if gzoltar_file in source_path:
            return _flatten_covered_lines(file_coverage)
    return []


def _file_coverage_percentage(file_path, project_path, covered_lines):
    """Return the coverage percentage of a file, or None if it has no countable code lines."""
    lines_of_code = count_lines_of_code_for_coverage(file_path, project_path, covered_lines)
    if lines_of_code == 0:
        # Coverage is undefined for such a file; the caller leaves it out of the average.
        return None
    return 100 * len(covered_lines) / lines_of_code


def _average_or_na(percentages):
    """Return the arithmetic mean of ``percentages``, or "N/A" when there is nothing to average."""
    if not percentages:
        return "N/A"
    return sum(percentages) / len(percentages)


# Driver: for every bug with GZoltar coverage data, check out the buggy commit
# and compute the average line coverage of (1) the buggy files, (2) the project
# files appearing in the stack trace and (3) every file GZoltar reported. The
# results are written to the CSV file at the end.
bugs_data = json_file_to_dict(data_file_path)
gzoltar_data = json_file_to_dict(gzoltar_data_file_path)
coverage_obj = {}

# Skipping bugs in which the gzoltar run failed
# TODO: Run it again in case I have time
failed_gzoltar_runs = {
    "Closure": {"143", "152"},
    "Jsoup": {"5", "6", "34"},
}

for project, project_coverage in gzoltar_data.items():
    print(project)
    repo_name = projects_github[project].split("/")[1]
    project_path = projects_dir + repo_name + "/"
    for bug_id, bug_coverage in project_coverage.items():

        if bug_id in failed_gzoltar_runs.get(project, ()):
            continue
        print(bug_id)

        bug_info = bugs_data[project][bug_id]
        commit_hash = bug_info["buggy_commit"]
        os.chdir(project_path)
        checkout_command = "git checkout  --quiet " + commit_hash
        # A failed checkout would leave another revision on disk and make every
        # figure below meaningless, so the bug is skipped instead.
        if os.system(checkout_command) != 0:
            print("WARNING: could not check out " + commit_hash + " for " + project + " " + bug_id + "; skipping")
            continue

        # TODO: Implement method level

        # Calculating buggy_files average coverage
        buggy_files_percentages = []
        for buggy_file in bug_info["buggyMethods"].keys():
            buggy_file_covered_lines = _covered_lines_for_path(bug_coverage, buggy_file)
            percentage = _file_coverage_percentage(buggy_file, project_path, buggy_file_covered_lines)
            if percentage is not None:
                buggy_files_percentages.append(percentage)
        average_buggy_files_coverage = _average_or_na(buggy_files_percentages)

        # Calculating stack trace files average coverage
        # Files can be repeated in the stack trace (its structure is kept for
        # other purposes), so each distinct file is considered only once.
        st_files_percentages = []
        for st_file in set(bug_info["stack_trace_files"]):
            st_file_path = find_file(st_file, project_path)
            if not st_file_path:  # External file
                continue
            st_file_covered_lines = _covered_lines_for_path(bug_coverage, st_file_path)
            percentage = _file_coverage_percentage(st_file_path, "", st_file_covered_lines)
            if percentage is not None:
                st_files_percentages.append(percentage)
        # "N/A" when only external files appear in the stack trace
        average_st_files_coverage = _average_or_na(st_files_percentages)

        # Calculating all files average coverage - Obs: only considering files with coverage into the sum
        all_files_percentages = []
        gzoltar_file_problems = False
        for gzoltar_file, file_coverage in bug_coverage.items():
            gzoltar_file_path = find_file(gzoltar_file, project_path)
            if gzoltar_file_path is None:
                # Gzoltar file not found. Some files reported by Gzoltar really
                # do not exist in the buggy commit; the cause on the defects4j
                # side still needs to be understood.
                gzoltar_file_problems = True
                continue
            file_covered_lines = _flatten_covered_lines(file_coverage)
            percentage = _file_coverage_percentage(gzoltar_file_path, "", file_covered_lines)
            if percentage is not None:
                all_files_percentages.append(percentage)
        # "N/A" (instead of a ZeroDivisionError) when no reported file could be measured
        average_all_files_coverage = _average_or_na(all_files_percentages)
        if gzoltar_file_problems:
            print("WARNING: Some Gzoltar files were not found in the buggy commit")

        coverage_obj.setdefault(project, {})[bug_id] = {
            "average_coverage_buggy_files": average_buggy_files_coverage,
            "average_coverage_stack_trace_files": average_st_files_coverage,
            "average_all_files_coverage": average_all_files_coverage
        }
create_coverage_percent_file(coverage_obj)
print("Done")


Cli
14
5
Closure
106
125
2
27
34
37
55
Codec
8
Collections
28
Compress
1
12
14
17
18
23
32
Csv
12
4
Gson
12
8
JacksonCore
4
7
JacksonDatabind
104
14
15
17
25
28
32
58
61
69
70
72
9
90
91
93
98
Jsoup
54
78
80
82
84
90
JxPath
5
Mockito
17
25
30
31
4
5
Time
10
Done
