#### **Var definitions**
Change these values as needed for the project being mined.

In [2]:
import os

# Read the token from the environment so that a real credential never has to be
# pasted into (and accidentally committed with) the notebook. The placeholder is
# kept as the fallback, so nothing changes when ACCESS_OAUTH_TOKEN is unset.
access_oauth_token = os.environ.get("ACCESS_OAUTH_TOKEN", "token <PLACE_HOLDER>")
# Keep the trailing slash: issue URLs are built by plain string concatenation.
jira_url = "https://issues.apache.org/jira/"
# Jira project key used in the JQL query.
project_name = "CLI"
# Name of the local git checkout; also the prefix of the exported JSON files.
repo_name = "commons-cli"
# Machine-specific root folder; must end with '/' because the paths below are
# built by concatenation.
base_path = '/Users/lorenapacheco/Concordia/Masters/'
out = base_path + "BugReportsMining/bug-reports/"
proj_path = base_path + repo_name
out_commits_dir = base_path + "BugReportsMining/commits/" + repo_name

#### **Bug reports mining**
Get the IDs of all **Jira tickets** in the project that are **closed**, have the **Bug** issue type, and were resolved as Fixed, Done, or Resolved

In [3]:
import os
import json
import requests
import pprint
from jira import JIRA

# Keys of the matching Jira issues (e.g. 'CLI-137'); filled by
# get_jira_closed_bug_issues().
issue_numbers_list = []

def get_jira_closed_bug_issues(jira_url, project_name):
    """Collect the keys of all closed, resolved bug issues of a Jira project.

    The query is run through the module-level ``jira`` client. The result is
    stored in the module-level ``issue_numbers_list`` (its previous content is
    replaced, so calling the function again does not create duplicates), then
    printed together with the number of issues.

    Args:
        jira_url: Not used by the function body; the server is whatever the
            module-level ``jira`` client was created with. Kept so existing
            calls keep working.
        project_name: Jira project key inserted into the JQL query.

    Returns:
        The module-level ``issue_numbers_list`` holding the issue keys as strings.
    """
    issue_search_str = "project = "+ project_name + " AND issuetype = Bug AND status = Closed AND resolution in (Fixed, Done, Resolved) ORDER BY priority DESC, updated DESC"
    # maxResults=False: per the jira client documentation this fetches all
    # matching issues instead of a single page.
    issues = jira.search_issues(issue_search_str, maxResults=False)
    # Slice assignment keeps the same list object that other cells reference.
    issue_numbers_list[:] = [str(issue) for issue in issues]
    print(issue_numbers_list)
    print("\nNumber of issues: " + str(len(issue_numbers_list)))
    return issue_numbers_list

# The client is created with only the server URL; no credentials are passed here.
jira = JIRA(server=jira_url)
# Fills the module-level issue_numbers_list and prints it.
get_jira_closed_bug_issues(jira_url, project_name)

['CLI-137', 'CLI-193', 'CLI-143', 'CLI-51', 'CLI-17', 'CLI-266', 'CLI-185', 'CLI-252', 'CLI-207', 'CLI-183', 'CLI-224', 'CLI-186', 'CLI-182', 'CLI-241', 'CLI-184', 'CLI-230', 'CLI-159', 'CLI-50', 'CLI-59', 'CLI-9', 'CLI-150', 'CLI-4', 'CLI-41', 'CLI-40', 'CLI-191', 'CLI-151', 'CLI-162', 'CLI-165', 'CLI-164', 'CLI-163', 'CLI-158', 'CLI-154', 'CLI-144', 'CLI-15', 'CLI-6', 'CLI-8', 'CLI-145', 'CLI-61', 'CLI-149', 'CLI-121', 'CLI-71', 'CLI-21', 'CLI-135', 'CLI-134', 'CLI-13', 'CLI-129', 'CLI-38', 'CLI-74', 'CLI-73', 'CLI-72', 'CLI-69', 'CLI-67', 'CLI-65', 'CLI-64', 'CLI-63', 'CLI-62', 'CLI-56', 'CLI-55', 'CLI-52', 'CLI-46', 'CLI-45', 'CLI-44', 'CLI-39', 'CLI-36', 'CLI-33', 'CLI-32', 'CLI-31', 'CLI-30', 'CLI-29', 'CLI-28', 'CLI-26', 'CLI-25', 'CLI-24', 'CLI-23', 'CLI-20', 'CLI-12', 'CLI-11', 'CLI-10', 'CLI-5', 'CLI-3', 'CLI-2', 'CLI-127', 'CLI-308', 'CLI-282', 'CLI-161', 'CLI-248', 'CLI-201', 'CLI-220', 'CLI-204', 'CLI-202', 'CLI-215', 'CLI-209', 'CLI-205', 'CLI-197', 'CLI-203', 'CLI-123', 

#### **Bug reports content extraction**
Extract the content of each of these bug reports

In [4]:
# Per-repository folder for the extracted bug-report JSON files (ends with '/').
output_folder = out + repo_name+ '/'

def get_bug_comments(issue):
    """Convert the comments of a Jira issue into a list of plain dictionaries.

    Args:
        issue: Object exposing ``fields.comment.comments``; every comment must
            provide ``author.key``, ``body`` and ``created``.

    Returns:
        One dictionary per comment, in the original order. Only the author
        login, body and creation date come from Jira; the remaining keys are
        filled with fixed values.
        NOTE(review): the key names look like a GitHub-style comment schema,
        confirm against the consumer of these files.
    """
    def as_record(entry):
        # Fixed-value keys are kept so every record has the same shape.
        return {
            "author": {"login": entry.author.key},
            "authorAssociation": "NONE",
            "body": entry.body,
            "createdAt": entry.created,
            "includesCreatedEdit": False,
            "isMinimized": False,
            "minimizedReason": "",
            "reactionGroups": [],
        }

    return [as_record(entry) for entry in issue.fields.comment.comments]

def get_bug_details(issue_number):
    """Fetch one Jira issue and return its content as a plain dictionary.

    The issue is loaded through the module-level ``jira`` client and its
    comments are converted with ``get_bug_comments``.

    Args:
        issue_number: Issue key (e.g. 'CLI-137'); used for the lookup and for
            the ``id``, ``number`` and ``url`` entries.

    Returns:
        Dictionary with title, body, comments, timestamps, state, url and a
        single label describing the issue type.
    """
    issue = jira.issue(issue_number)
    fields = issue.fields
    issue_type = fields.issuetype

    # The issue type is exposed as the only "label"; no colour is available.
    type_label = {
        "name": issue_type.name,
        "id": issue_type.id,
        "description": issue_type.description,
        "color": None,
    }

    return {
        "title": fields.summary,
        "body": fields.description,
        "comments_content": get_bug_comments(issue),
        "createdAt": fields.created,
        "id": issue_number,
        "labels": [type_label],
        "number": issue_number,
        "state": fields.status.name,
        "updatedAt": fields.updated,
        # jira_url is expected to end with '/', so plain concatenation is used.
        "url": jira_url + "browse/" + issue_number,
    }
    

def dict_to_json_file(file, dic, folder=output_folder):
    """Write a dictionary as pretty-printed JSON to ``<folder>/<file>.json``.

    Args:
        file: File name without the ``.json`` extension.
        dic: JSON-serialisable object to write (keys are sorted, indent is 4).
        folder: Target directory, created if missing. The default is the value
            ``output_folder`` had when this function was defined; later changes
            to ``output_folder`` do not affect it.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)
    # The with-block closes the file; no explicit close() is needed.
    with open(os.path.join(folder, file+'.json'), 'w') as fp:
        json.dump(dic, fp, sort_keys=True, indent=4)
        
# Fetch every collected issue and store it as <repo_name>-<issue key>.json.
for issue_number in issue_numbers_list:
    bug_details = get_bug_details(issue_number)
    # The file-name prefix is what the later filtering step splits on '-'.
    dict_to_json_file(repo_name+'-'+str(issue_number), bug_details)
print ("Extraction completed")
    

Extraction completed


#### **Filtering for logs and stack traces**
Keep only the bug reports that contain **logs** or **stack traces**

In [5]:
import re
import glob
from collections import defaultdict

# Module-level results, filled by find_regex and laid out as
# store[project][bug id] -> matched text (e.g. bug id 'CLI-137').
bug_report_log = defaultdict(dict)
bug_report_stack_trace = defaultdict(dict)

def json_file_to_dict(file):
    """Load ``<file>.json`` and return the parsed content.

    Args:
        file: Path or name without the ``.json`` extension. It is joined onto
            the module-level ``output_folder``; because of how ``os.path.join``
            works, an absolute ``file`` is used as-is.

    Returns:
        The object decoded from the JSON file.
    """
    # The with-block closes the file; no explicit close() is needed.
    with open(os.path.join(output_folder, file+'.json'), 'r') as fp:
        return json.load(fp)

def find_regex(regex, var,file_name, text_content):
    """Record the last match of ``regex`` in ``text_content`` for one bug report.

    The file name is expected to look like ``<project>-<KEY>-<number>``
    (e.g. ``commons-cli-CLI-137``). The match is stored as
    ``var[<project, lower-cased>][<KEY>-<number>]``. The project key is derived
    from ``file_name`` itself, so the function does not depend on a loop
    variable of the calling cell.

    When the text contains several matches, each one overwrites the previous,
    so only the last match is kept. Nothing is stored when there is no match.

    Args:
        regex: Pattern passed to ``re.finditer``.
        var: Two-level mapping that receives the result; ``var[project]`` must
            exist or be created on access (e.g. ``defaultdict(dict)``).
        file_name: Bug-report file name without extension.
        text_content: Text to search.
    """
    matches = [match.group() for match in re.finditer(regex, text_content)]
    if not matches:
        return
    name_parts = file_name.split('-')
    bug_name = name_parts[-2] + '-' + name_parts[-1]
    project = '-'.join(name_parts[:-2]).lower()
    var[project][bug_name] = matches[-1]
            
            
def find_logs_and_stack_traces(file_name, text_content):
    """Search one bug report for log lines and stack traces.

    Two patterns are applied through ``find_regex``:
    - log: a level keyword (ERROR, INFO, WARN, DEBUG or FATAL) followed by
      whitespace and a dotted identifier, recorded in ``bug_report_log``;
    - stack trace: a line containing "Exception" followed by one or more
      lines that start with optional whitespace and "at ", recorded in
      ``bug_report_stack_trace``.

    Args:
        file_name: Bug-report file name without extension.
        text_content: Text of the bug report to scan.
    """
    searches = (
        (r'(ERROR|INFO|WARN|DEBUG|FATAL)\s+(?P<class>\w+(\.\w+)*)', bug_report_log),
        (r'(?m)^.*?Exception.*(?:\n+^\s*at .*)+', bug_report_stack_trace),
    )
    for pattern, store in searches:
        find_regex(pattern, store, file_name, text_content)
    

# For every .json file in the bug-reports folder, scan the title, description
# and comments for logs and stack traces.

# Fallback so the summary below still works when the folder holds no .json
# file (the exported files are named "<repo_name>-<issue key>").
proj = repo_name.lower()
for file in glob.glob(output_folder +'/*.json'):
    # splitext only strips the extension; a plain .replace('.json', '') would
    # also hit a '.json' occurring elsewhere in the path.
    path_without_ext = os.path.splitext(file)[0]
    file_name = os.path.basename(path_without_ext)
    # Drop the trailing "<KEY>-<number>" to get the project part, e.g.
    # "commons-cli-CLI-137" -> "commons-cli". Splitting is used instead of
    # chained replace() calls, which could remove the same text earlier in the name.
    proj = '-'.join(file_name.split('-')[:-2]).lower()
    bug_report = json_file_to_dict(path_without_ext)
    if not bug_report:
        continue

    # Missing or null fields are treated as empty text.
    title = bug_report.get('title') or ""
    description = bug_report.get('body') or ""
    comment_bodies = [
        comment['body']
        for comment in (bug_report.get('comments_content') or [])
        if 'body' in comment and comment['body']
    ]
    text_content = title + '\n' + description + '\n' + '\n'.join(comment_bodies)
    find_logs_and_stack_traces(file_name, text_content)

print("Bug reports with logs:" + str(len(bug_report_log[proj])))
print("Bug reports with stack traces:" + str(len(bug_report_stack_trace[proj])))

Bug reports with logs:0
Bug reports with stack traces:7


#### **Commits search**
Searching for the respective commits

**TODO:** Figure out a way to decide which of the commits to use

In [6]:
from urllib.parse import quote

# Final results: bug id -> {"log" or "stack_trace": matched text, "commit": hash}.
bug_reports_with_logs_commits = defaultdict(dict)
bug_reports_with_stack_traces_commits = defaultdict(dict)
# Number of bug reports for which a commit was found (logs + stack traces).
cont = 0

def get_bug_report_commit(bug, proj_path):
    """Return the first commit hash that ``git log --grep=<bug>`` prints.

    git is run directly with an argument list and ``cwd=proj_path``. Compared
    with building a shell command and redirecting into a temporary file, this
    needs no shell quoting, cannot pick up stale content from a left-over
    ``commit_output`` file, writes nothing into the repository, and does not
    change the working directory of the notebook process.

    NOTE(review): ``--grep`` is a pattern search over commit messages, so
    'CLI-1' also matches messages mentioning 'CLI-10' or 'CLI-137'. The
    matching rule is left unchanged here.

    Args:
        bug: Issue key to search for in commit messages (e.g. 'CLI-137').
        proj_path: Path of the local git repository.

    Returns:
        The first hash printed by ``git log`` (without trailing newline), or
        ``""`` when git prints nothing (no match, or git reported an error).

    Raises:
        FileNotFoundError: If ``proj_path`` does not exist or the ``git``
            executable cannot be found.
    """
    # Imported locally so this cell does not rely on another cell's imports.
    import subprocess

    result = subprocess.run(
        ["git", "log", "--grep=" + bug, "--pretty=format:%H"],
        cwd=proj_path,
        stdout=subprocess.PIPE,  # stderr is left alone so git errors stay visible
        text=True,
    )
    hashes = result.stdout.splitlines()
    return hashes[0] if hashes else ""

# Pair every flagged bug report with a commit; reports for which no commit is
# found are skipped. Logs are processed first, then stack traces.
for evidence_by_bug, evidence_key, commits_by_bug in (
    (bug_report_log[proj], "log", bug_reports_with_logs_commits),
    (bug_report_stack_trace[proj], "stack_trace", bug_reports_with_stack_traces_commits),
):
    for bug in evidence_by_bug.keys():
        commit = get_bug_report_commit(bug, proj_path)
        if commit == "":
            continue
        content = {
            evidence_key: evidence_by_bug[bug],
            "commit": commit.strip()
        }
        commits_by_bug[bug] = content
        cont += 1

# A result file is only written when at least one commit was found for it.
for report_name, commits_by_bug in (
    ("bug_reports_with_logs_commits", bug_reports_with_logs_commits),
    ("bug_reports_with_stack_traces_commits", bug_reports_with_stack_traces_commits),
):
    if commits_by_bug:
        dict_to_json_file(report_name, commits_by_bug, out_commits_dir)

print("Extraction complete")
print("Number obtained: " + str(cont))

Extraction complete
Number obtained: 3
