# Extract Bug Reports With Stack Traces - Defects4j 2.0
This is the first script to be executed. It extracts all the basic information needed for the defects4j bugs whose bug reports contain stack traces.
The Chart bugs are skipped because their repo is not git based.

### Vars declaration
Update these values to match the computer on which the code is running and the current requirements.

Required paths:
- bug_reports_textual_info_path: The path to the folder that contains the textual information from the defects4j bug reports. These files were extracted by An Ran and are in this project's repo. They are either a json file or a txt file, depending on the project.
- defects4j_path: The path in which the defects4j 2.0 repo is cloned (https://github.com/rjust/defects4j)
- output_file_path: The path to the file in which the json output by this script will be stored

In [8]:
import os
from secrets import base_path
import importlib


# Every location used by the notebook, built from components relative to
# ``base_path`` so that only ``base_path`` has to change between machines.
paths_dict = {
    key: os.path.join(base_path, *components)
    for key, components in {
        "bug_reports_textual_info_path": ("DeepDiveBugReportsWithLogs", "data", "defects4j-2.0-bug-reports-textual-info", "an-ran-files"),
        "defects4j_path": ("defects4j",),
        "output_file_path": ("DeepDiveBugReportsWithLogs", "data", "bug_reports_with_stack_traces_details.json"),
        "bug_info_csv_path": ("DeepDiveBugReportsWithLogs", "data", "nakhla_bugs_data.csv"),
    }.items()
}

# Multiline pattern: a line containing "Exception", followed by one or more
# lines that start (after optional whitespace) with "at ".
stack_trace_regex = r'(?m)^.*?Exception.*(?:\n+^\s*at .*)+'

# Defects4J project name -> folder name of the project's repository.
# Chart is left out on purpose (its repo is not git based).
defects4j_projects_folder = {
    # "Chart": "jfreechart",
    "Cli": "commons-cli.git",
    "Closure": "closure-compiler.git",
    "Codec": "commons-codec.git",
    "Collections": "commons-collections.git",
    "Compress": "commons-compress.git",
    "Csv": "commons-csv.git",
    "Gson": "gson.git",
    "JacksonCore": "jackson-core.git",
    "JacksonDatabind": "jackson-databind.git",
    "Jsoup": "jsoup.git",
    "JxPath": "commons-jxpath.git",
    "Lang": "commons-lang.git",
    "Math": "commons-math.git",
    "Mockito": "mockito.git",
    "Time": "joda-time.git",
}

### Getting the bug reports with log snippets or stack traces
This block of code uses a regex to look for log snippets and stack traces in the bugs' textual information

In [25]:
import glob
import os
import utils
import importlib
importlib.reload(utils)


def find_stack_traces_in_txt_files (bug_id, text_content):
    """Search the text of one plain-text bug report for stack traces.

    Delegates to ``utils.find_regex_and_add_results_to_dict``, passing the
    module-level ``stack_trace_regex`` pattern and the module-level
    ``regex_result`` dictionary.

    Args:
        bug_id: Identifier of the bug report being scanned.
        text_content: Full text of the bug report.
    """
    # presumably the helper records any matches in regex_result under bug_id -- confirm in utils
    utils.find_regex_and_add_results_to_dict(stack_trace_regex, text_content, regex_result, bug_id)

def find_stack_traces_in_json_files(bug_id, bug_report_json):
    """Search every textual part of one JSON bug report for stack traces.

    The "summary" and "description" fields (when present) are scanned first,
    followed by the "content" of each entry under "comments" (when present).
    Each text is handed to ``utils.find_regex_and_add_results_to_dict``
    together with the module-level ``stack_trace_regex`` and ``regex_result``.

    Args:
        bug_id: Identifier of the bug report being scanned.
        bug_report_json: The bug report, already parsed into a dictionary.
    """
    def _iter_texts():
        # Lazily yield the texts in the order they must be scanned.
        for field_name in ("summary", "description"):
            if field_name in bug_report_json:
                yield bug_report_json[field_name]
        if "comments" in bug_report_json:
            for entry in bug_report_json["comments"]:
                yield entry["content"]

    for text in _iter_texts():
        utils.find_regex_and_add_results_to_dict(stack_trace_regex, text, regex_result, bug_id)

# Scan every bug report file for stack traces and write the matches, grouped
# by project and bug id, to the output JSON file.
#
# regex_result: dictionary handed to the find_stack_traces_* helpers; its keys
#               are later split on "_" into project and bug id.
# bugs_data:    {project: {bug_id: {"stack_traces": ...}}}, the structure that
#               is written to the output file.
bugs_data = {}
regex_result = {}

# txt files
for file in glob.glob(os.path.join(paths_dict["bug_reports_textual_info_path"], "*.txt")):
    # splitext removes only the trailing extension; str.replace would also
    # remove ".txt" appearing anywhere else in the file name.
    bug_id = os.path.splitext(os.path.basename(file))[0]
    if bug_id.startswith("Chart"):
        # Chart bugs are skipped (see the note at the top of the notebook).
        continue
    with open(file, 'r', encoding='utf-8') as file_obj:
        file_content = file_obj.read()
    find_stack_traces_in_txt_files(bug_id, file_content)


# json files
for file in glob.glob(os.path.join(paths_dict["bug_reports_textual_info_path"], "*.json")):
    bug_id = os.path.splitext(os.path.basename(file))[0]
    if bug_id.startswith("Chart"):
        continue
    bug_report_json = utils.json_file_to_dict(file)
    find_stack_traces_in_json_files(bug_id, bug_report_json)

number_of_bug_reports_with_logs = len(regex_result)
print(str(number_of_bug_reports_with_logs) + " bug reports with logs found")

# Regroup the flat "<project>_<bug id>" keys into a nested dictionary.
for bug_report, stack_traces in regex_result.items():
    name_parts = bug_report.split("_")
    project = name_parts[0]
    bug_id = name_parts[1]
    bugs_data.setdefault(project, {})[bug_id] = {"stack_traces": stack_traces}

utils.dict_to_json_file(paths_dict["output_file_path"], bugs_data)
print("Collected info added to the file " + paths_dict["output_file_path"])

91 bug reports with logs found
Collected info added to the file C:\Users\loren\Concordia\Masters\DeepDiveBugReportsWithLogs\data\bug_reports_with_stack_traces_details.json


### Getting the defects4j information for each of these bugs
Extracts the hash of the buggy commit, the hash of the bugfix commit, and the bug report ID and URL.
Note: it requires defects4j to be cloned locally

In [27]:
import os
import csv

# Enrich every collected bug with the data found in the project's
# active-bugs.csv file inside the local defects4j clone: buggy commit,
# bugfix commit, bug report id and bug report url.
bugs_data = utils.json_file_to_dict(paths_dict["output_file_path"])
for project in bugs_data:
    file_path = os.path.join(paths_dict["defects4j_path"], "framework", "projects", project, "active-bugs.csv")
    # newline='' is the way the csv module asks files to be opened, so that
    # line endings inside the file are handled by the csv reader itself.
    with open(file_path, 'r', newline='') as file:
        for row in csv.reader(file):
            # Blank lines are returned as empty lists; skip them instead of
            # failing on row[0]. Rows for bugs we did not collect are ignored.
            if not row or row[0] not in bugs_data[project]:
                continue
            bug_id = row[0]
            bug_entry = bugs_data[project][bug_id]
            bug_entry["buggy_commit"] = row[1]
            bug_entry["bugfix_commit"] = row[2]
            bug_entry["bug_report_id"] = row[3]
            bug_entry["bug_report_url"] = row[4]

utils.dict_to_json_file(paths_dict["output_file_path"], bugs_data)
print("Collected info added to the file " + paths_dict["output_file_path"])

Collected info added to the file C:\Users\loren\Concordia\Masters\DeepDiveBugReportsWithLogs\data\bug_reports_with_stack_traces_details.json


## Collecting information about the defects4j trigger tests for each bug

In [28]:
# For every collected bug, store the list of names found on the "--- " lines
# of its entry in the project's trigger_tests folder.
bugs_data = utils.json_file_to_dict(paths_dict["output_file_path"])
for project in bugs_data:
    for bug_id in bugs_data[project]:
        trigger_tests = []
        bugs_data[project][bug_id]["defects4j_trigger_tests"] = trigger_tests
        path = os.path.join(paths_dict["defects4j_path"], "framework", "projects", project, "trigger_tests")
        # Keep only the lines carrying the "--- " marker; the marker and the
        # line break are removed and the remainder is kept as the test name.
        trigger_tests.extend(
            line.replace("--- ", "").replace("\n", "")
            for line in utils.read_file_lines(bug_id, path)
            if line.startswith("--- ")
        )

utils.dict_to_json_file(paths_dict["output_file_path"], bugs_data)
print("Collected info added to the file " + paths_dict["output_file_path"])

Collected info added to the file C:\Users\loren\Concordia\Masters\DeepDiveBugReportsWithLogs\data\bug_reports_with_stack_traces_details.json


### Getting the bug report creation date for each of these bugs
Extracts the report creation date, using the issue tracker identified from the bug report url.
Note: sourceforge.net has no API available. Since there are only 2 such bugs, their creation dates were manually collected and added to the manually_extracted_dates dictionary

In [29]:
# Creation dates collected by hand, because there is no API for
# https://sourceforge.net and https://code.google.com/archive.
# Layout: {project: {bug_id: creation date}}
manually_extracted_dates = {
    "Chart": {
        "5": "2008-05-01T00:00:00Z",
    },
    "Time": {
        "14": "2012-05-22T00:00:00Z",
    },
    "Mockito": {
        "17": "2009-11-20T00:00:00Z",
        "22": "2014-04-06T00:00:00Z",
        "25": "2010-11-11T00:00:00Z",
        "30": "2010-10-15T00:00:00Z",
        "31": "2010-10-15T00:00:00Z",
        "35": "2009-06-19T00:00:00Z",
    },
}

bugs_data = utils.json_file_to_dict(paths_dict["output_file_path"])

for project, project_bugs in bugs_data.items():
    for bug_id, bug_entry in project_bugs.items():
        bug_report_url = bug_entry["bug_report_url"]
        # The url tells which tracker hosts the report, and therefore which
        # extractor applies; the first matching marker wins.
        if "github" in bug_report_url:
            creation_date = utils.extract_creation_date_from_github_issues(bug_report_url)
        elif "jira" in bug_report_url:
            creation_date = utils.extract_creation_date_from_jira(bug_report_url)
        elif "storage.googleapis.com" in bug_report_url:
            creation_date = utils.extract_creation_date_from_google_code(bug_report_url)
        else:
            # Any other tracker: use the hand-collected date, or None when
            # the bug is not listed there.
            creation_date = manually_extracted_dates.get(project, {}).get(bug_id)
        bug_entry["bug_report_creation_date"] = creation_date
        if not creation_date:
            print("No bug_report_creation_date found for bug " + project + "_" + bug_id)

utils.dict_to_json_file(paths_dict["output_file_path"], bugs_data)
print("Collected info added to the file " + paths_dict["output_file_path"])

Collected info added to the file C:\Users\loren\Concordia\Masters\DeepDiveBugReportsWithLogs\data\bug_reports_with_stack_traces_details.json


### Getting the bug report commit hash
The bug report commit hash is defined as the first commit before the bug report's creation date

In [30]:
# For every collected bug, look up the commit that corresponds to the bug
# report creation date and store it as "bug_report_commit_hash".
bugs_data = utils.json_file_to_dict(paths_dict["output_file_path"])

# Branch passed to the commit lookup for each project; "default" applies to
# every project that is not listed explicitly.
master_branches_dict = {
    "default": "master",
    "Lang": "trunk",
    "Math": "trunk"
}
for project in bugs_data:
    master_branch = master_branches_dict.get(project, master_branches_dict['default'])
    project_repo_path = os.path.join(paths_dict["defects4j_path"], "project_repos", defects4j_projects_folder[project])
    for bug_id in bugs_data[project]:
        bug_report_creation_date = bugs_data[project][bug_id].get("bug_report_creation_date")
        if not bug_report_creation_date:
            # The creation date is stored as None when it could not be
            # determined; a date-based lookup is meaningless in that case, so
            # record the hash as missing instead of passing None along.
            print("No bug_report_creation_date available for bug " + project + "_" + bug_id + ", bug_report_commit_hash set to None")
            bug_report_commit_hash = None
        else:
            bug_report_commit_hash = utils.get_first_commit_before_date(project_repo_path, bug_report_creation_date, master_branch)
        bugs_data[project][bug_id]["bug_report_commit_hash"] = bug_report_commit_hash

utils.dict_to_json_file(paths_dict["output_file_path"], bugs_data)
print("Collected info added to the file " + paths_dict["output_file_path"])

Collected info added to the file C:\Users\loren\Concordia\Masters\DeepDiveBugReportsWithLogs\data\bug_reports_with_stack_traces_details.json
