# Getting the bug location for each bug
Extracting the information about the lines added and deleted in each file in the bugfix commit
Separates the data into code modifications and test modifications

### Vars declaration

In [2]:
import os
from secrets import base_path


paths_dict=  {
        "bugs_with_stack_traces_details_file_path": os.path.join(base_path, "DeepDiveBugReportsWithLogs", "data", "bug_reports_with_stack_traces_details.json"),
        "project_repos_dir":  os.path.join(base_path, "open_source_repos_being_studied"),
        "defects4j_path": os.path.join(base_path, "defects4j")
}

projects_github_id = {
        "Cli": "apache/commons-cli",
        "Closure": "google/closure-compiler",
        "Codec": "apache/commons-codec",
        "Collections": "apache/commons-collections",
        "Compress": "apache/commons-compress",
        "Csv": "apache/commons-csv",
        "Gson": "google/gson",
        "JacksonCore": "FasterXML/jackson-core",
        "JacksonDatabind": "FasterXML/jackson-databind",
        "Jsoup": "jhy/jsoup",
        "JxPath": "apache/commons-jxpath",
        "Mockito": "mockito/mockito",
        "Time" : "JodaOrg/joda-time",
        "fastjson": "alibaba/fastjson",
        "junit4": "junit-team/junit4"
}

### Getting the added and deleted lines for the projects that have the history available in GitHub

In [3]:
from github import Github
from secrets import github_token
import utils
import importlib
importlib.reload(utils)

bugs_data = utils.json_file_to_dict(paths_dict["bugs_with_stack_traces_details_file_path"])

g = Github(github_token)

for project in bugs_data:
        github_repo_id = projects_github_id.get(project, None)
        if github_repo_id:
                repo_owner = projects_github_id[project].split("/")[0]
                repo_name = projects_github_id[project].split("/")[1]
                repo = g.get_repo(f"{repo_owner}/{repo_name}")
                for bug_id in bugs_data[project]:
                        bugfix_commit_sha = bugs_data[project][bug_id]["bugfix_commit"]
                        bugfix_commit_data = repo.get_commit(bugfix_commit_sha)

                        for file in bugfix_commit_data.files:
                                # Getting the modified lines that are not related to tests to later identify the buggy methods
                                if file.filename.endswith(".java"):
                                        file_path = os.path.join(paths_dict["project_repos_dir"], repo_name, file.filename)
                                        added_line_numbers, deleted_line_numbers = utils.get_modified_line_numbers_github_lib(file.patch)
                                        if "test" not in file.filename.lower():
                                                key = "modified_code"
                                        else:
                                                key = "modified_tests"
                                        if key not in bugs_data[project][bug_id].keys():
                                                bugs_data[project][bug_id][key] = {}
                                        if file.filename not in bugs_data[project][bug_id][key].keys():
                                                bugs_data[project][bug_id][key][file.filename] = {}
                                        if added_line_numbers:
                                                bugs_data[project][bug_id][key][file.filename]["added_lines"] = added_line_numbers
                                        if deleted_line_numbers:
                                                bugs_data[project][bug_id][key][file.filename]["deleted_lines"] = deleted_line_numbers
                                        if file.previous_filename:
                                                bugs_data[project][bug_id][key][file.filename]["previous_filename"] = file.previous_filename

utils.dict_to_json_file(paths_dict["bugs_with_stack_traces_details_file_path"],bugs_data)
print("Collected info added to the file data/merged_data_production_bug_reports.json")

Collected info added to the file data/merged_data_production_bug_reports.json


### Getting the added and deleted lines for the projects that do not have the history available in GitHub

For the projects Math and Lang, it is not enough to clone them from GitHub because the old commits are not available there anymore.
Instead, you will need to have defects4j installed and it is necessary to run the script get_repos.sh to download the defects4j version of these repos (.git file)
After downloading the ".git" file via this script, run a `git clone ${PATH_TO_GIT_FILE}` inside the project_repos_dir folder to have these locally.

In [4]:
import subprocess
import utils
import importlib
importlib.reload(utils)


projects_list = {
        "Lang": "commons-lang",
        "Math": "commons-math"
}

bugs_data = utils.json_file_to_dict(paths_dict["bugs_with_stack_traces_details_file_path"])

# Define the working directory where the repositories will be checked out
working_dir = "/tmp/"

# Define the bugs and versions for which to obtain the modified lines
for project in projects_list.keys():
        folder = projects_list[project]
        project_path = os.path.join(paths_dict["project_repos_dir"], folder)
        os.chdir(project_path) # Moving to defects4j folder
        bugs_details = bugs_data[project]
        for bug_id in bugs_details.keys():
                bugfix_commit_sha = bugs_data[project][bug_id]["bugfix_commit"]
                output = subprocess.check_output(["git", "show", bugfix_commit_sha])
                output_str = output.decode("utf-8")
                # Split the output into lines
                lines = output_str.split("\n")

                output = utils.get_modified_line_numbers_git_cli(lines)
                for file_name in output.keys():
                        if file_name.endswith(".java"):
                                if "test" not in file_name.lower():
                                        key = "modified_code"
                                else:
                                        key = "modified_tests"
                                if key not in bugs_data[project][bug_id].keys():
                                        bugs_data[project][bug_id][key] = {}
                                if file_name not in bugs_data[project][bug_id][key].keys():
                                        bugs_data[project][bug_id][key][file_name] = {}
                                if "added" in output[file_name].keys():
                                        bugs_data[project][bug_id][key][file_name]["added_lines"] = output[file_name]["added"]
                                if "deleted" in output[file_name].keys():
                                        bugs_data[project][bug_id][key][file_name]["deleted_lines"] = output[file_name]["deleted"]

utils.dict_to_json_file(paths_dict["bugs_with_stack_traces_details_file_path"],bugs_data)
print("Collected info added to the file data/merged_data_production_bug_reports.json")

Collected info added to the file data/merged_data_production_bug_reports.json
