## Var declarations

In [1]:
# JSON file that the cells below read, enrich and write back
data_file_path = "/Users/lorenapacheco/Concordia/Masters/BugReportsMining/data/merged_data_production_bug_reports.json"
# Presumably the directory holding the local clones of the studied projects - TODO confirm
projects_dir = "/Users/lorenapacheco/Concordia/Masters/open_source_repos_being_studied/"

# Maps each project name (the top-level keys of the JSON data) to its GitHub
# repository in "owner/name" form. An empty string makes the GitHub cell skip
# the project.
projects_github = {
    "Cli": "apache/commons-cli",
    "Closure": "google/closure-compiler",
    "Codec": "apache/commons-codec",
    "Collections": "apache/commons-collections",
    "Compress": "apache/commons-compress",
    "Csv": "apache/commons-csv",
    "Gson": "google/gson",
    "JacksonCore": "FasterXML/jackson-core",
    "JacksonDatabind": "FasterXML/jackson-databind",
    "Jsoup": "jhy/jsoup",
    "JxPath": "apache/commons-jxpath",
    "Lang": "", # commons-lang and commons-math do not have the old commits on GitHub; the Defects4J zip has to be used instead
    "Math": "",
    "Mockito": "mockito/mockito",
    "Time" : "JodaOrg/joda-time",
    "fastjson": "alibaba/fastjson",
    "junit4": "junit-team/junit4"
}

## General functions

In [2]:
import json
import os

def json_file_to_dict(file):
    """Read a JSON file and return its deserialized content.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The `with` statement closes the file on exit, so no explicit close() is
    # needed. The encoding is fixed so the result does not depend on the
    # platform's default locale.
    with open(file, 'r', encoding='utf-8') as fp:
        return json.load(fp)

def dict_to_json_file(file, dic):
    """Write ``dic`` to ``file`` as JSON with sorted keys and 4-space indentation.

    Missing parent directories of ``file`` are created first.

    Args:
        file: Destination path of the JSON file.
        dic: JSON-serializable object to write.

    Raises:
        OSError: If the directories or the file cannot be created.
        TypeError: If ``dic`` contains values that JSON cannot serialize.
    """
    folder = os.path.dirname(file)
    # dirname() is '' for a bare file name and os.makedirs('') raises, so only
    # create directories when the path has a directory component.
    # exist_ok avoids the check-then-create race.
    if folder:
        os.makedirs(folder, exist_ok=True)
    # The `with` statement closes the file; no explicit close() is needed.
    with open(file, 'w', encoding='utf-8') as fp:
        json.dump(dic, fp, sort_keys=True, indent=4)

## Getting the added and deleted lines for the projects that have the history available in GitHub

The method names are extracted separately with Java code, since implementing that step in Python proved problematic.

In [4]:
from github import Github

def get_modified_line_numbers_github_lib(patch):
    """Extract the added and deleted line numbers from a single-file patch.

    Args:
        patch: Unified-diff text of one file, made of hunks introduced by
            ``@@ -old_start[,count] +new_start[,count] @@`` headers. ``None``
            or an empty string is accepted and yields two empty lists.

    Returns:
        A tuple ``(added_line_numbers, deleted_line_numbers)``. Added lines are
        numbered in the new version of the file, deleted lines in the old one.
    """
    added_line_numbers = []
    deleted_line_numbers = []

    # No textual patch available (presumably binary or oversized files when the
    # patch comes from the GitHub API): there is nothing to parse.
    if not patch:
        return added_line_numbers, deleted_line_numbers

    current_line_number = None
    current_removed_line_number = None

    for line in patch.split('\n'):
        if line.startswith("@@"):
            # Hunk header: field 1 is "-old_start[,count]", field 2 is "+new_start[,count]"
            header_fields = line.split(' ')
            current_removed_line_number = int(header_fields[1].split(',')[0][1:])
            current_line_number = int(header_fields[2].split(',')[0][1:])
        elif current_line_number is None:
            # Content before the first hunk header has no line numbers yet
            continue
        elif line.startswith('+') and not line.startswith('+++'):
            added_line_numbers.append(current_line_number)
            current_line_number += 1
        elif line.startswith('-') and not line.startswith('---'):
            deleted_line_numbers.append(current_removed_line_number)
            current_removed_line_number += 1
        elif line.startswith(' '):
            # Context line: present in both versions, so both counters advance
            current_line_number += 1
            current_removed_line_number += 1

    return added_line_numbers, deleted_line_numbers

bugs_data = json_file_to_dict(data_file_path)

# The GitHub token is read from the environment so that no credential is stored
# in the notebook (export GITHUB_TOKEN=<token> before starting Jupyter).
# NOTE: any token that was ever committed with this notebook must be revoked.
access_token = os.environ.get("GITHUB_TOKEN")
if not access_token:
    raise RuntimeError("Set the GITHUB_TOKEN environment variable to a GitHub access token")

g = Github(access_token)

for project in bugs_data:
    # An empty (or missing) repository id means the project is not processed via GitHub
    github_repo_id = projects_github.get(project, "")
    if not github_repo_id:
        continue
    repo = g.get_repo(github_repo_id)
    for bug_id in bugs_data[project]:
        bug_info = bugs_data[project][bug_id]
        commit = repo.get_commit(bug_info["bugfix_commit"])

        for file in commit.files:
            # Only Java sources are of interest
            if not file.filename.endswith(".java"):
                continue
            # file.patch may be None when no textual diff is available; parse it as empty
            added_line_numbers, deleted_line_numbers = get_modified_line_numbers_github_lib(file.patch or "")
            # Keep test changes apart from the other changes so that the buggy
            # methods can later be identified from the non-test ones
            if "test" not in file.filename.lower():
                key = "modified_code"
            else:
                key = "modified_tests"
            file_entry = bug_info.setdefault(key, {}).setdefault(file.filename, {})
            # Empty lists are not stored
            if added_line_numbers:
                file_entry["added_lines"] = added_line_numbers
            if deleted_line_numbers:
                file_entry["deleted_lines"] = deleted_line_numbers

dict_to_json_file(data_file_path,bugs_data)
print("Collected info added to the file data/merged_data_production_bug_reports.json")

Collected info added to the file data/merged_data_production_bug_reports.json


## Getting the added and deleted lines for the projects that do not have the history available in GitHub

Requires Defects4J to be installed. Before running this cell, run the script `get_repos.sh` to download the Defects4J versions of these repositories.

In [5]:
import subprocess

def get_modified_line_numbers_git_show(lines):
    """Collect the added and deleted line numbers per file from `git show` output.

    Args:
        lines: Output of ``git show <commit>`` split into individual lines.

    Returns:
        A dict keyed by the last space-separated token of each ``diff --git``
        header (with git's default prefixes this is ``b/<path>``, i.e. the
        ``b/`` prefix is kept). Each value is ``{"added": [...], "deleted":
        [...]}``: added lines are numbered in the new version of the file,
        deleted lines in the old one. Changed lines that are empty or
        whitespace-only advance the numbering but are not reported.
    """
    file_line_numbers = {}
    current_file = None
    current_line_number = None
    current_removed_line_number = None

    for line in lines:
        if line.startswith("diff --git"):
            current_file = line.split(" ")[-1].strip()
            file_line_numbers[current_file] = {"added": [], "deleted": []}
            # A new file section starts: the previous file's counters must not
            # leak into it, so wait for this file's first hunk header.
            current_line_number = None
            current_removed_line_number = None
        elif line.startswith("@@"):
            # Hunk header: field 1 is "-old_start[,count]", field 2 is "+new_start[,count]"
            header_fields = line.split(' ')
            current_removed_line_number = int(header_fields[1].split(',')[0][1:])
            current_line_number = int(header_fields[2].split(',')[0][1:])
        elif current_file is None or current_line_number is None:
            # Outside of any hunk (commit header, commit message, file headers):
            # there is no file or line number to attribute the line to.
            continue
        elif line.startswith('+') and not line.startswith('+++'):
            if line.lstrip('+').strip():
                file_line_numbers[current_file]["added"].append(current_line_number)
            current_line_number += 1
        elif line.startswith('-') and not line.startswith('---'):
            if line.lstrip('-').strip():
                file_line_numbers[current_file]["deleted"].append(current_removed_line_number)
            current_removed_line_number += 1
        elif line.startswith(' '):
            # Context line: present in both versions, so both counters advance
            current_line_number += 1
            current_removed_line_number += 1

    return file_line_numbers

# Projects whose history is read from the local Defects4J repositories,
# mapped to the repository folder name
projects_list = {
    "Lang": "commons-lang.git",
    "Math": "commons-math.git"
}

bugs_data = json_file_to_dict(data_file_path)


# Define the working directory where the repositories will be checked out
working_dir = "/tmp/"

# Folder containing the Defects4J repositories
defects4j_repos_dir = "/Users/lorenapacheco/Concordia/Masters/defects4j/project_repos/"

# Obtain the modified lines of the bug-fix commit of every bug of these projects
for project, folder in projects_list.items():
    repo_dir = defects4j_repos_dir + folder + "/"
    for bug_id, bug_info in bugs_data[project].items():
        commit_sha = bug_info["bugfix_commit"]
        # Run git inside the repository through `cwd` instead of os.chdir, so
        # the working directory of the notebook itself is left untouched
        raw_output = subprocess.check_output(["git", "show", commit_sha], cwd=repo_dir)
        # Replace bytes that are not valid UTF-8 instead of aborting the whole run
        show_lines = raw_output.decode("utf-8", errors="replace").split("\n")

        modified_by_file = get_modified_line_numbers_git_show(show_lines)
        for file_name, line_numbers in modified_by_file.items():
            # Only Java sources are of interest
            if not file_name.endswith(".java"):
                continue
            if "test" not in file_name.lower():
                key = "modified_code"
            else:
                key = "modified_tests"
            file_entry = bug_info.setdefault(key, {}).setdefault(file_name, {})
            # Both lists are stored even when empty
            if "added" in line_numbers:
                file_entry["added_lines"] = line_numbers["added"]
            if "deleted" in line_numbers:
                file_entry["deleted_lines"] = line_numbers["deleted"]

dict_to_json_file(data_file_path,bugs_data)
print("Collected info added to the file data/merged_data_production_bug_reports.json")

Collected info added to the file data/merged_data_production_bug_reports.json


In [11]:
bugs_data = json_file_to_dict(data_file_path)


# Tally the bugs of every project except Lang and Math, and how many of those
# mention "t-evos" in their "origin" field
count = 0
count_t_evos = 0
for project, project_bugs in bugs_data.items():
    if project in ("Lang", "Math"):
        continue
    for bug_info in project_bugs.values():
        count += 1
        if "t-evos" in bug_info["origin"]:
            count_t_evos += 1
print("Number of bugs being used " + str(count))
print("Number from t-evos " + str(count_t_evos))

Number of bugs being used 73
Number from t-evos 15
