#### **Bug reports mining**
Getting all the GitHub **issues** from the **FastJson** repo that are **closed** and have the **bug** label

In [43]:
import os
import json
import requests
import pprint

# Value for the Authorization header. The token is read from the GITHUB_TOKEN
# environment variable so the secret does not have to be stored in the notebook;
# the placeholder is kept as a fallback when the variable is unset or empty.
access_oauth_token = "token " + (os.environ.get("GITHUB_TOKEN") or "<YOUR_TOKEN_HERE>")
# Repository to mine, in "owner/name" form.
repo = "alibaba/fastjson"

def get_git_closed_bug_issues(repo, media_type = "application/vnd.github+json"):
    """Collect the numbers of every closed issue labelled ``bug`` in a repository.

    Requests successive pages of ``/repos/<repo>/issues`` until an empty page
    is returned.

    Args:
        repo: Repository in ``owner/name`` form.
        media_type: Value sent in the ``Accept`` header.

    Returns:
        list: Issue numbers, in the order the API returned them.

    Raises:
        requests.exceptions.HTTPError: If the API answers with an error status.
        RuntimeError: If a page is not a JSON list (e.g. an error object).
    """
    issue_numbers_list = []
    current_page = 1

    # Iterate through the pagination.
    while True:
        response = requests.get(
            "https://api.github.com/repos/" + repo + "/issues",
            # Let requests build the query string; a larger page size means
            # fewer round trips for the same result.
            params={
                'state': 'closed',
                'labels': 'bug',
                'per_page': 100,
                'page': current_page
            },
            headers={
                'Accept': media_type,
                'Authorization': access_oauth_token
            }
        )
        # Fail loudly on HTTP errors instead of iterating over an error payload.
        response.raise_for_status()
        issues_json = response.json()
        if not isinstance(issues_json, list):
            raise RuntimeError(
                "Unexpected response while listing issues of " + repo + ": " + repr(issues_json)
            )

        # An empty page marks the end of the pagination.
        if not issues_json:
            break

        issue_numbers_list.extend(issue["number"] for issue in issues_json)
        current_page += 1

    return issue_numbers_list


# Numbers of the closed, bug-labelled issues of the configured repository.
issue_numbers_list = get_git_closed_bug_issues(repo=repo)

#### **Bug reports content extraction**
Extracting the content of each of these bug reports

In [53]:
# Root folder that holds one sub-folder of bug reports per repository.
out = '/Users/lorenapacheco/Concordia/Masters/BugReportsMining/bug-reports/'
# Derive the repository name here: it is needed to build output_folder, and
# defining it only further down makes this cell fail on a fresh kernel.
repo_name = repo.split('/')[1]
# Folder where the JSON files of the current repository are written.
output_folder = out + repo_name + '/'

def git_request_br_json(issue_n, repo, media_type = 'application/vnd.github.v3+json'):
    """Fetch one issue together with all of its comments.

    Args:
        issue_n: Issue number.
        repo: Repository in ``owner/name`` form.
        media_type: Value sent in the ``Accept`` header.

    Returns:
        The issue JSON as a dict with an extra ``comments_content`` key holding
        the comments, or ``None`` when the response has no ``comments_url``.
    """
    request_headers = {
        'Accept': media_type,
        'Authorization': access_oauth_token
    }
    comments_per_page = 100

    retrieved_json = requests.get(
        'https://api.github.com/repos/'+repo+'/issues/'+str(issue_n),
        headers=request_headers
    ).json()

    # Without a comments URL there is nothing more to fetch; the caller treats
    # a falsy return value as "not retrieved".
    if 'comments_url' not in retrieved_json:
        return None

    # Comments are paginated: keep requesting pages until a short or empty one
    # shows up, otherwise long discussions would be truncated.
    retrieved_comments = []
    page = 1
    while True:
        comments_page = requests.get(
            retrieved_json['comments_url'],
            params={'per_page': comments_per_page, 'page': page},
            headers=request_headers
        ).json()

        if not isinstance(comments_page, list):
            # Not a list of comments (e.g. an error object). If nothing was
            # collected yet, store the payload as-is so it ends up in the file.
            if not retrieved_comments:
                retrieved_comments = comments_page
            break

        retrieved_comments.extend(comments_page)
        if len(comments_page) < comments_per_page:
            break
        page += 1

    retrieved_json['comments_content'] = retrieved_comments
    return retrieved_json

def dict_to_json_file(file, dic):
    """Write ``dic`` as pretty-printed JSON to ``<output_folder>/<file>.json``.

    Args:
        file: File name without the ``.json`` extension.
        dic: JSON-serialisable object to dump.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)
    # The with-block closes the file; no explicit close() is needed.
    with open(os.path.join(output_folder, file+'.json'), 'w') as fp:
        json.dump(dic, fp, sort_keys=True, indent=4)


# Prefix of every output file name; it does not change between iterations.
repo_name = repo.split('/')[1]

for issue_number in issue_numbers_list:
    try:
        retrieved_json = git_request_br_json(issue_number, repo)
    # requests raises its own ConnectionError, which is not a subclass of the
    # builtin one, so both have to be named for the handler to fire.
    except (ConnectionError, requests.exceptions.ConnectionError):
        print('ConnectionError', repo, issue_number)
        break

    if not retrieved_json:
        print('Json not retrieved, something might be wrong')
        continue

    dict_to_json_file(repo_name+'-'+str(issue_number), retrieved_json)
else:
    # Reached only when the loop was not aborted by a connection error.
    print ("Extraction completed")
        
        

Extraction completed


#### **Filtering for logs and stack traces**
Filtering only the bug reports with **logs** and **stack traces**

In [162]:
import re
import glob
from collections import defaultdict

# Module-level result stores filled by find_regex: project -> {bug number -> matched text}.
# One store for stack-trace matches ...
bug_report_stack_trace = defaultdict(dict)
# ... and one for log-line matches.
bug_report_log = defaultdict(dict)

def json_file_to_dict(file):
    """Parse ``<file>.json`` and return its content.

    ``file`` is joined onto ``output_folder`` and given the ``.json``
    extension before it is opened for reading.
    """
    json_path = os.path.join(output_folder, file+'.json')
    with open(json_path, 'r') as fp:
        return json.load(fp)

def find_regex(regex, var,file_name, text_content):
    """Record the text matched by ``regex`` for one bug report.

    Args:
        regex: Pattern searched in ``text_content``.
        var: Mapping ``project -> {bug number -> matched text}`` updated in place.
        file_name: Report name in ``<project>-<bug number>`` form.
        text_content: Text of the bug report.

    When the pattern matches several times, the last match is the one kept.
    Nothing is stored when there is no match.
    """
    # Split on the LAST hyphen so project names that contain hyphens still
    # work, and derive the project from the file name rather than a global.
    project_part, _, bug_name = file_name.rpartition('-')
    project = project_part.lower()

    for match in re.finditer(regex, text_content):
        var.setdefault(project, {})[bug_name] = match.group()
            
            
def find_logs_and_stack_traces (file_name, text_content):
    """Scan one bug report for log lines and stack traces.

    Log matches are recorded in ``bug_report_log`` and stack-trace matches in
    ``bug_report_stack_trace``, both through ``find_regex``.
    """
    patterns_and_stores = (
        # A log level followed by a (possibly dotted) identifier.
        (r'(ERROR|INFO|WARN|DEBUG|FATAL)\s+(?P<class>\w+(\.\w+)*)', bug_report_log),
        # A line mentioning "Exception" followed by one or more "at ..." lines.
        (r'(?m)^.*?Exception.*(?:\n+^\s*at .*)+', bug_report_stack_trace),
    )
    for pattern, store in patterns_and_stores:
        find_regex(pattern, store, file_name, text_content)
    

# for every .json in bug-reports folder
for file in glob.glob(out+'/*/*.json'):
    # splitext removes only the trailing extension; str.replace would also
    # rewrite any other '.json' occurring earlier in the path.
    file_name = os.path.splitext(os.path.basename(file))[0]
    # Project = everything before the last hyphen of "<project>-<bug number>".
    proj = file_name.rpartition('-')[0].lower()
    bug_report = json_file_to_dict(os.path.splitext(file)[0])
    if not bug_report:
        continue

    # Missing or null fields count as empty text.
    title = bug_report.get('title') or ""
    description = bug_report.get('body') or ""
    # Anything other than a list of comment objects contributes no text.
    comments = bug_report.get('comments_content')
    if not isinstance(comments, list):
        comments = []
    comment_bodies = [
        comment['body']
        for comment in comments
        if isinstance(comment, dict) and comment.get('body')
    ]
    text_content = title + '\n' + description + '\n' + '\n'.join(comment_bodies)

    find_logs_and_stack_traces(file_name, text_content)

print("Bug reports with logs:" + str(len(bug_report_log['fastjson'])))
print("Bug reports with stack traces:" + str(len(bug_report_stack_trace['fastjson'])))

Bug reports with logs:2
Bug reports with stack traces:47


#### **Commits/PR search**
Searching for the respective commits

In [160]:
from urllib.parse import quote

print(bug_report_log['fastjson'].keys())
print(bug_report_stack_trace['fastjson'].keys())

# Running total of commit ids found across all timelines.
cont = 0

for bug in bug_report_stack_trace['fastjson'].keys():
    print(bug)
    file_name = "fastjson-" + bug
    proj = file_name.rpartition('-')[0].lower()
    # Path of the stored report, without extension (json_file_to_dict adds it).
    file = out + proj + "/" + file_name
    bug_report = json_file_to_dict(file)
    timeline_url = bug_report["timeline_url"]
    timeline = requests.get(
        timeline_url,
        headers={
            'Accept': 'application/vnd.github.v3+json',
            'Authorization': access_oauth_token
        }
    ).json()

    # A non-list response (e.g. an error object) has no events to inspect.
    if not isinstance(timeline, list):
        print('Timeline not retrieved, something might be wrong')
        timeline = []

    for action in timeline:
        # Only events that carry a non-null commit id are of interest.
        if isinstance(action, dict) and action.get("commit_id") is not None:
            print(action["commit_id"])
            cont += 1
    print("----")

print(cont)
    

dict_keys(['1645', '1367'])
dict_keys(['1987', '980', '918', '1699', '3309', '1945', '1569', '3057', '3796', '1482', '3637', '1036', '3119', '1962', '1651', '1240', '1407', '1112', '912', '1503', '1152', '1390', '2210', '1298', '3088', '1488', '3121', '1370', '1188', '1425', '802', '1645', '1583', '1306', '1941', '1367', '1593', '3280', '1177', '1834', '2388', '2306', '1603', '2351', '995', '1300', '1785'])
1987
cdd53c9907b5107313bfa6d6abc23c599d184c61
----
980
b6017818b3758d425119e9c0094f5862ae67bb43
----
918
1d18445f15f0b436c78f015c5357d37bb9667780
----
1699
f7d5b7dd4fd4e5f03240c7ed3d6830d6bd31e4d7
----
3309
13ae28e80ac17dea57cd299ae8b7a49b25ef35c9
----
1945
4fcc9c49b4327ba22de9d948bba43e12bb5c8fd2
f1ebf3eafd063e07c35ecee4f31fdc1cd58ea7e7
----
1569
0e9063e82267127f7464f6945dff85a9a8786127
b3c1cfa23666e592f6db5d955ba736458a8ac335
----
3057
----
3796
----
1482
92d94ecf7b70732e88b12ad427cb764e9f26ed55
----
3637
8d42fd87c00b9691d393c15998135b5281d895f5
----
1036
6b27030f1b98446273eb65a04