## Vars declaration

In [2]:
# NOTE(review): these are absolute paths on the author's machine; adjust them
# before running the notebook anywhere else.
# Folder holding the bug report files (*.txt and *.json) scanned by this notebook.
bug_reports_path = "/Users/lorenapacheco/Concordia/Masters/bug_mining-2"
# Root of the local defects4j checkout (its framework/projects/ folder is read).
defects4j_path = "/Users/lorenapacheco/Concordia/Masters/defects4j"
# Folder where bug_reports_with_logs_data.json is written and re-read.
output_path = "/Users/lorenapacheco/Concordia/Masters/BugReportsMining/defects4j/"

# Matches a line containing "Exception" followed by one or more lines that start
# (after optional whitespace) with "at ", i.e. a Java-style stack trace.
stack_trace_regex = r'(?m)^.*?Exception.*(?:\n+^\s*at .*)+'
# Matches a log level keyword followed by whitespace and a dotted identifier,
# which is captured in the named group "class".
logs_regex = r'(ERROR|INFO|WARN|DEBUG|FATAL)\s+(?P<class>\w+(\.\w+)*)'

# Maps each defects4j project id to its GitHub repository ("owner/name").
# An empty string marks a project that is not looked up on GitHub.
defects4j_projects_github = {
    "Cli": "apache/commons-cli",
    "Closure": "google/closure-compiler",
    "Codec": "apache/commons-codec",
    "Collections": "apache/commons-collections",
    "Compress": "apache/commons-compress",
    "Csv": "apache/commons-csv",
    "Gson": "google/gson",
    "JacksonCore": "FasterXML/jackson-core",
    "JacksonDatabind": "FasterXML/jackson-databind",
    "Jsoup": "jhy/jsoup",
    "JxPath": "apache/commons-jxpath",
    "Lang": "", # commons-lang and commons-math do not have the old commits on GitHub; the defects4j zip is used instead
    "Math": "",
    "Mockito": "mockito/mockito",
    "Time" : "JodaOrg/joda-time"
}

## Getting the bug reports with log snippets or stack traces

In [3]:
import re
import glob
import os
import json

# Final structure written to disk: project -> bug id -> {"log": [matched snippets]}.
bugs_data = {}
# Filled by find_regex: bug report id -> list of matched log / stack trace snippets.
regex_result = {}

def json_file_to_dict(file):
    """Load a JSON file and return its parsed content.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The deserialized JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding. The `with` block closes the file, so no explicit close() is needed.
    with open(file, 'r', encoding='utf-8') as fp:
        return json.load(fp)

def find_regex(regex, bug_id, text_content):
    """Record every match of `regex` found in `text_content` under `bug_id`.

    Matches are appended, in order of appearance, to the module-level
    `regex_result` dict (bug id -> list of matched strings). No entry is
    created for `bug_id` when the text contains no match.
    """
    for match in re.finditer(regex, text_content):
        regex_result.setdefault(bug_id, []).append(match.group())


def find_logs_and_stack_traces_txt(bug_id, text_content):
    """Search a plain-text bug report for log snippets, then for stack traces.

    Both searches go through `find_regex`, which records the matches for `bug_id`.
    """
    for pattern in (logs_regex, stack_trace_regex):
        find_regex(pattern, bug_id, text_content)

def find_logs_and_stack_traces_json (bug_id, bug_report_json):
    """Search the text fields of a JSON bug report for log snippets and stack traces.

    The "summary" and "description" fields and the "content" of every entry of
    "comments" are searched, in that order, with `logs_regex` and then
    `stack_trace_regex`. Matches are recorded for `bug_id` by `find_regex`.

    Fields that are missing or whose value is not a string (for example a JSON
    null description, or a comment without "content") are skipped instead of
    raising.

    Args:
        bug_id: Identifier under which the matches are recorded.
        bug_report_json: Parsed bug report (a dict).
    """
    texts = [bug_report_json.get(field) for field in ("summary", "description")]
    # `or []` tolerates a report whose "comments" key is present but null.
    for comment in bug_report_json.get("comments") or []:
        texts.append(comment.get("content"))

    for text in texts:
        if not isinstance(text, str):
            continue
        find_regex(logs_regex, bug_id, text)
        find_regex(stack_trace_regex, bug_id, text)

def dict_to_json_file(file, dic, folder):
    """Write `dic` as pretty-printed JSON to `<folder>/<file>.json`.

    The folder (including missing parents) is created when needed and an
    existing file of the same name is overwritten. Keys are sorted and the
    output is indented by 4 spaces.

    Args:
        file: File name without the ".json" extension.
        dic: JSON-serializable object to write.
        folder: Destination directory.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)
    # The `with` block closes the file; no explicit close() is required.
    with open(os.path.join(folder, file + '.json'), 'w') as fp:
        json.dump(dic, fp, sort_keys=True, indent=4)

# txt files
# The real extension is stripped with splitext so that the .txt and .json files
# of one bug share a single key in regex_result. Stripping '.json' from a .txt
# name left keys such as "Cli_1.txt", which were counted separately and later
# overwrote the matches collected from the other file of the same bug.
# sorted() makes the order of the collected snippets reproducible across runs.
for file in sorted(glob.glob(bug_reports_path +'/*.txt')):
    bug_id = os.path.splitext(os.path.basename(file))[0]
    with open(file, 'r') as file_obj:
        file_content = file_obj.read()
    find_logs_and_stack_traces_txt(bug_id, file_content)


# json files
for file in sorted(glob.glob(bug_reports_path +'/*.json')):
    bug_id = os.path.splitext(os.path.basename(file))[0]
    bug_report_json = json_file_to_dict(file)
    find_logs_and_stack_traces_json(bug_id, bug_report_json)

print(str(len(regex_result)) + " bug reports with logs found")

# Regroup the matches as project -> bug id -> {"log": [...]}.
# Report ids are expected to look like "<Project>_<bug id>".
for bug_report_file in regex_result.keys():
    bug_report = bug_report_file.split(".")[0] # Dropping anything after a dot in the id
    project = bug_report.split("_")[0]
    bug_id = bug_report.split("_")[1]
    project_bugs = bugs_data.setdefault(project, {})
    # Extend instead of assigning so that two report ids resolving to the same
    # project/bug pair are merged rather than one replacing the other.
    project_bugs.setdefault(bug_id, {"log": []})["log"].extend(regex_result[bug_report_file])

dict_to_json_file("bug_reports_with_logs_data",bugs_data, output_path)
print("Collected info added to the file defects4j/bug_reports_with_logs_data")


91 bug reports with logs found
Collected info added to the file defects4j/bug_reports_with_logs_data


# Getting the commits for each of these bugs

In [4]:
import os
import csv
import fnmatch

def find_project_folder(project, path):
    """Return every directory below `path` whose name matches `project`, ignoring case.

    Args:
        project: Directory name to look for; shell-style wildcards
            (`*`, `?`, `[...]`) are honoured.
        path: Root directory of the recursive search.

    Returns:
        List of full paths of the matching directories (empty when none match).
    """
    # The glob pattern is used directly. fnmatch.translate() returns a regular
    # expression, and feeding that back to fnmatch.fnmatch() as if it were a
    # glob made the original search match nothing.
    pattern = project.lower()
    results = []

    for root, dirs, _files in os.walk(path):
        for name in dirs:
            # Both sides are lowercased and compared with fnmatchcase so the
            # match is case-insensitive on every platform.
            if fnmatch.fnmatchcase(name.lower(), pattern):
                results.append(os.path.join(root, name))

    return results


# Reload the data saved by the previous step and, for every collected bug,
# copy columns 1-3 of the matching active-bugs.csv row into its entry.
bugs_data = json_file_to_dict(output_path + "bug_reports_with_logs_data.json")
for project, project_bugs in bugs_data.items():
    active_bugs_csv = defects4j_path + "/framework/projects/" + project + "/active-bugs.csv"
    with open(active_bugs_csv, 'r') as file:
        for row in csv.reader(file):
            # Column 0 holds the bug id; rows of bugs that were not collected are skipped.
            if row[0] not in project_bugs:
                continue
            bug_id = row[0]
            entry = project_bugs[bug_id]
            entry["buggy_commit"] = row[1]
            entry["bugfix_commit"] = row[2]
            entry["bug_report"] = row[3]

dict_to_json_file("bug_reports_with_logs_data",bugs_data, output_path)
print("Collected info added to the file defects4j/bug_reports_with_logs_data")

Collected info added to the file defects4j/bug_reports_with_logs_data


## Collecting information about the failing test in each bug

In [5]:
def find_file(name, path):
    """Walk `path` recursively and return the full path of the first file called `name`.

    Returns None when no directory below `path` contains such a file.
    """
    for current_dir, _subdirs, filenames in os.walk(path):
        if name not in filenames:
            continue
        return os.path.join(current_dir, name)
    return None

def read_file_lines(file_name, path):
    """Return the lines of the file called `file_name` found somewhere below `path`.

    The file is located with `find_file` and read as UTF-8, ignoring bytes that
    cannot be decoded. Line endings are kept. An empty list is returned when
    the file is not found.
    """
    located = find_file(file_name, path)
    if not located:
        return []
    with open(located, 'r', encoding='utf-8', errors='ignore') as handle:
        return handle.readlines()

# Reload the data saved by the previous step and attach to every bug the test
# names listed in its file under the project's trigger_tests folder.
bugs_data = json_file_to_dict(output_path + "bug_reports_with_logs_data.json")
for project in bugs_data:
    trigger_tests_dir = defects4j_path + "/framework/projects/" + project + "/trigger_tests/"
    for bug_id in bugs_data[project]:
        lines = read_file_lines(bug_id, trigger_tests_dir)
        # Only lines starting with "--- " are kept; the prefix and the line break are removed.
        bugs_data[project][bug_id] ["defects4j_failing_tests"] = [
            line.replace("--- ", "").replace("\n", "")
            for line in lines
            if line.startswith("--- ")
        ]

dict_to_json_file("bug_reports_with_logs_data",bugs_data, output_path)
print("Collected info added to the file defects4j/bug_reports_with_logs_data")

Collected info added to the file defects4j/bug_reports_with_logs_data


# Checking if these tests were introduced in the bug_fix commit - projects that have the commits on Github

In [6]:
from github import Github

# The access token is read from the GITHUB_TOKEN environment variable so that no
# credential is stored in the notebook. When the variable is not set the client
# is created without a token.
# NOTE(review): a personal access token used to be hardcoded on this line; treat
# it as leaked and revoke it.
g = Github(os.environ.get("GITHUB_TOKEN"))

bugs_data = json_file_to_dict(output_path + "bug_reports_with_logs_data.json")

for project in bugs_data:
    # Projects without a GitHub repository (empty string or unknown id) are skipped here.
    github_repo_id = defects4j_projects_github.get(project, "")
    if github_repo_id != "":
        repo = g.get_repo(github_repo_id)
        for bug_id in bugs_data[project]:
            commit_sha = bugs_data[project][bug_id]["bugfix_commit"]
            commit = repo.get_commit(commit_sha)
            tests_to_be_verified = bugs_data[project][bug_id]["defects4j_failing_tests"]
            flags_list = []
            for test in tests_to_be_verified:
                # Test names look like "<qualified class>::<method>".
                test_file = test.split("::")[0].split(".")[-1]
                test_case = test.split("::")[1]
                # Check if the specific test case was added in the commit
                test_added = False
                for file in commit.files:
                    if file.filename.endswith(test_file + ".java") and (file.status == "modified" or file.status == "added"):
                        # `patch` may be None for a file of the commit (no textual diff available).
                        for patch_line in (file.patch or "").split('\n'):
                            if patch_line.startswith("+") and test_case + "(" in patch_line:
                                test_added = True
                                break
                flags_list.append(test_added)
            # NOTE(review): all([]) is True, so a bug without failing tests is flagged True.
            bugs_data[project][bug_id]["flag_failing_tests_added_in_bugfix"] = all(flags_list)

dict_to_json_file("bug_reports_with_logs_data",bugs_data, output_path)
print("Collected info added to the file defects4j/bug_reports_with_logs_data")

Collected info added to the file defects4j/bug_reports_with_logs_data


# Checking if these tests were introduced in the bug_fix commit - projects that do not have the commits on Github (math and lang)
Requires defects4j to be installed. Run the script get_repos.sh first to download the defects4j version of these repos.

In [7]:
import subprocess

# defects4j project id -> folder of its repository under <defects4j>/project_repos/
projects_list = {
    "Lang": "commons-lang.git",
    "Math": "commons-math.git"
}

bugs_data = json_file_to_dict(output_path + "bug_reports_with_logs_data.json")

# Define the working directory where the repositories will be checked out
# (not used by the code of this cell)
working_dir = "/tmp/"

# Define the bugs and versions for which to obtain the modified lines
for project in projects_list.keys():
    folder = projects_list[project]
    # Built from defects4j_path instead of repeating the absolute path, and
    # passed to git through `cwd` so the notebook's own working directory is
    # left untouched.
    repo_dir = defects4j_path + "/project_repos/" + folder + "/"
    # A project may have no collected bug report at all.
    bugs_details = bugs_data.get(project, {})
    for bug in bugs_details.keys():
        commit_sha = bugs_details[bug]["bugfix_commit"]
        output = subprocess.check_output(["git", "show", commit_sha], cwd=repo_dir)
        # Diffs may contain bytes that are not valid UTF-8; replace them
        # instead of aborting the whole run.
        output_str = output.decode("utf-8", errors="replace")
        # Split the output into lines
        lines = output_str.split("\n")

        # Initialize variables to store file name and diff
        filename = None
        diffs = {}

        # Loop through the lines of the output
        for line in lines:
            # A "diff --git a/" line starts the section of a new file
            if line.startswith("diff --git a/"):
                # Everything after the prefix is kept as the key ("<path> b/<path>")
                filename = line[13:]
                # Reset the diff for the new file
                diffs[filename] = ""
            # "+++" / "---" file header lines are not part of the changes
            elif line.startswith("+++") or line.startswith("---"):
                continue
            # Added or removed line: only recorded once a file section has started,
            # so a stray "+"/"-" line before the first diff cannot raise a KeyError
            elif (line.startswith("+") or line.startswith("-")) and filename:
                diffs[filename] += line + "\n"

        tests_to_be_verified = bugs_details[bug]["defects4j_failing_tests"]
        flags_list = []
        for test in tests_to_be_verified:
            # Test names look like "<qualified class>::<method>".
            test_file = test.split("::")[0].split(".")[-1]
            test_case = test.split("::")[1]
            # Check if the specific test case was added in the commit
            test_added = False
            for file in diffs.keys():
                if file.endswith(test_file + ".java"):
                    for patch_line in diffs[file].split('\n'):
                        if patch_line.startswith("+") and test_case + "(" in patch_line:
                            test_added = True
                            break
            flags_list.append(test_added)
        # NOTE(review): all([]) is True, so a bug without failing tests is flagged True.
        bugs_data[project][bug]["flag_failing_tests_added_in_bugfix"] = all(flags_list)

dict_to_json_file("bug_reports_with_logs_data",bugs_data, output_path)
print("Collected info added to the file defects4j/bug_reports_with_logs_data")

Collected info added to the file defects4j/bug_reports_with_logs_data
