In [1]:
import csv
import json
import os

import requests

# Load the fix -> inducing commit pairs (one fixing commit maps to many
# inducing commits). `with` guarantees the file handle is closed.
with open("fix_introduce_pairs_v1.json", encoding="utf-8") as f:
    fix_induce_map = json.load(f)  # one to many

# Load the bug records and build a map from fixing-commit id to project name.
with open("sstubs.json", encoding="utf-8") as sstubs_f:
    sstubs = json.load(sstubs_f)

# If the same fixing commit appears in several records, the last record wins.
cmt_id_pj_name = {bug["fixCommitSHA1"]: bug["projectName"] for bug in sstubs}

# Build the bug-inducing commit dictionary.
# key: inducing commit id
# value: dict with
#   'bug_fixing_commit_ids': set of fixing-commit ids attributed to this inducing commit
#   'bug_num': number of distinct fixing commits recorded for this inducing commit
#   'projectName': project name, looked up through the first fixing commit seen

bug_inducing_commits = {}

for item in fix_induce_map:
    fixing_id = item["bug_fixing_commit_id"]  # one commit
    inducing_id_list = item["bug_inducing_commit_ids"]  # list of commits
    for commit_id in inducing_id_list:
        entry = bug_inducing_commits.get(commit_id)
        if entry is None:
            # new inducing commit
            bug_inducing_commits[commit_id] = {
                'bug_fixing_commit_ids': {fixing_id},
                'bug_num': 1,
                'projectName': cmt_id_pj_name[fixing_id]
            }
        elif fixing_id not in entry['bug_fixing_commit_ids']:
            # Known inducing commit, new fixing commit. The membership test
            # must look inside the id set (not the entry dict's keys), so a
            # repeated (inducing, fixing) pair is not counted twice.
            entry['bug_fixing_commit_ids'].add(fixing_id)
            entry['bug_num'] += 1

# Report how many inducing commits were collected.
inducing_total = len(bug_inducing_commits)
print("the size of bug inducing commit is {}".format(inducing_total))

# Show a single sample entry (nothing is printed when the dictionary is empty).
sample = next(iter(bug_inducing_commits.items()), None)
if sample is not None:
    sample_id, sample_info = sample
    print(sample_id)
    print(sample_info)

the size of bug inducing commit is 2199
e2110b5a6672c889b149b8d276f4374842709ba2
{'bug_fixing_commit_ids': {'92b7f309d809fbb4e74a0677f62b8fe39ff207db'}, 'bug_num': 1, 'projectName': 'Alluxio.alluxio'}


In [2]:
# GitHub repository URLs, split into five batches that can be combined
# selectively by later cells.
url_list1 = ['https://github.com/apache/camel']

url_list2 = [
    'https://github.com/hazelcast/hazelcast',
    'https://github.com/Alluxio/alluxio',
]

url_list3 = [
    'https://github.com/libgdx/libgdx',
    'https://github.com/wildfly/wildfly',
    'https://github.com/checkstyle/checkstyle',
]

url_list4 = [
    'https://github.com/google/closure-compiler',
    'https://github.com/spring-projects/spring-boot',
    'https://github.com/apache/flink',
    'https://github.com/facebook/presto',
]

url_list5 = [
    'https://github.com/netty/netty',
    'https://github.com/openhab/openhab',
    'https://github.com/apache/hbase',
    'https://github.com/Atmosphere/atmosphere',
    'https://github.com/openmrs/openmrs-core',
    'https://github.com/druid-io/druid',
    'https://github.com/gephi/gephi',
]

## Initialise the three lookup tables; later cells fill them in one by one.

# Project table.
#   key:   project name
#   value: project_info (dict)
projects = {}
project_features = [
    'created_at',
    'updated_at',
    'size',
    'stargazers_count',
    'watchers_count',
    'forks_count',
    'network_count',
    'subscribers_count',
]

# User table.
#   key:   user name
#   value: user_info (dict)
users = {}
user_features = ['followers', 'public_repos', 'created_at']
# Extra per-user field listed separately from user_features.
user_features_patch = ['contributions']

# Commit table.
#   key:   commit id
#   value: commit_info (dict)
commits = {}
commit_features = [
    'author_id',
    'LOC',
    'Add',
    'Delete',
    'Files',
    'bug_num',
]

In [13]:
# Build the authentication header and print the remaining API budget.
#
# The personal access token is read from the GITHUB_TOKEN environment variable
# so that it is never stored in the notebook.
# NOTE(review): the token that used to be hard-coded in this cell has been
# exposed and should be revoked.
token = os.environ.get("GITHUB_TOKEN")
if not token:
    raise RuntimeError(
        "Set the GITHUB_TOKEN environment variable to a GitHub personal access token"
    )
headers = {'Authorization': "Token " + token}
rate_limit_url = "https://api.github.com/rate_limit"
rate_limit_resp = requests.get(rate_limit_url, headers=headers, timeout=30)
# Fail with a clear HTTP error (e.g. bad credentials) instead of printing an error payload.
rate_limit_resp.raise_for_status()
req = rate_limit_resp.json()
print(req)

{'resources': {'core': {'limit': 5000, 'used': 2808, 'remaining': 2192, 'reset': 1617578116}, 'search': {'limit': 30, 'used': 0, 'remaining': 30, 'reset': 1617576023}, 'graphql': {'limit': 5000, 'used': 0, 'remaining': 5000, 'reset': 1617579563}, 'integration_manifest': {'limit': 5000, 'used': 0, 'remaining': 5000, 'reset': 1617579563}, 'source_import': {'limit': 100, 'used': 0, 'remaining': 100, 'reset': 1617576023}, 'code_scanning_upload': {'limit': 500, 'used': 0, 'remaining': 500, 'reset': 1617579563}}, 'rate': {'limit': 5000, 'used': 2808, 'remaining': 2192, 'reset': 1617578116}}


In [7]:
# Fetch repository metadata and contributor profiles from the GitHub API and
# store them in `projects` (key: "<owner>.<repo>") and `users` (key: login).
# url_list5 is defined in an earlier cell but is not part of this batch.
url_list = url_list1 + url_list2 + url_list3 + url_list4

for url in url_list:
    # URL shape: https://github.com/<owner>/<repo>
    owner, repo = url.rstrip("/").split("/")[-2:]

    project_api = 'https://api.github.com/repos/' + owner + '/' + repo
    contributor_api = project_api + '/' + "contributors"

    # raise_for_status() turns rate-limit / not-found responses into a clear
    # HTTP error instead of a confusing KeyError or TypeError further down.
    project_resp = requests.get(project_api, headers=headers, timeout=30)
    project_resp.raise_for_status()
    project = project_resp.json()

    # NOTE(review): only the first page of contributors is requested here;
    # the API paginates this listing, so authors beyond that page never
    # reach `users` - confirm this is intended.
    contributors_resp = requests.get(contributor_api, headers=headers, timeout=30)
    contributors_resp.raise_for_status()
    contributors = contributors_resp.json()

    # get project_info and save in a dictionary
    project_info = {feature_str: project[feature_str] for feature_str in project_features}
    projects[owner + '.' + repo] = project_info

    # get user_info and save in a dictionary
    for item in contributors:
        username = item['login']
        contributions = item['contributions']

        user_api = 'https://api.github.com/users/' + username
        user_resp = requests.get(user_api, headers=headers, timeout=30)
        user_resp.raise_for_status()
        user = user_resp.json()

        user_info = {feature_str: user[feature_str] for feature_str in user_features}
        user_info['contributions'] = contributions
        # `users` is keyed by login only, so a contributor who appears in
        # several repositories keeps the 'contributions' count of the last
        # repository processed.
        users[username] = user_info

print(projects.keys())
print("total number of users is {}".format(len(users.keys())))


#print("an item for project")
#for x, y in projects.items():
#    print(x)
#    print(y)
#    break
#print("an item for user")
#for x, y in users.items():
#    print(x)
#    print(y)
#    break


dict_keys(['apache.camel', 'hazelcast.hazelcast', 'Alluxio.alluxio', 'libgdx.libgdx', 'wildfly.wildfly', 'checkstyle.checkstyle', 'google.closure-compiler', 'spring-projects.spring-boot', 'apache.flink', 'facebook.presto'])
total number of users is 299


In [10]:
# For every bug-inducing commit, fetch its details from the GitHub API and
# store author, change size and bug count in `commits` (key: commit id).
for key, value in bug_inducing_commits.items():
    bug_num = value["bug_num"]
    projectName = value["projectName"]
    # projectName is "<owner>.<repo>"; split only on the first dot so that a
    # repository name containing dots stays intact.
    owner, pg_name = projectName.split('.', 1)
    commit_api = 'https://api.github.com/repos/{}/{}/commits/{}'.format(owner, pg_name, key)

    # raise_for_status() surfaces rate-limit / not-found responses as a clear
    # HTTP error instead of a KeyError on the error payload.
    commit_resp = requests.get(commit_api, headers=headers, timeout=30)
    commit_resp.raise_for_status()
    commit = commit_resp.json()

    # Commits with no linked GitHub account ('author' is null) are skipped.
    author = commit['author']
    if author is None:
        continue

    stats = commit['stats']
    # NOTE(review): 'Files' counts the entries of this response's 'files' list;
    # the API may truncate that list for very large commits - confirm.
    commits[key] = {
        'author_id': author['login'],
        'LOC': stats['total'],
        'Add': stats['additions'],
        'Delete': stats['deletions'],
        'Files': len(commit['files']),
        "bug_num": bug_num
    }

print("an item for commit")
for x, y in commits.items():
    print(x)
    print(y)
    break
print("total number of commit is {}".format(len(commits.keys())))

an item for commit
e2110b5a6672c889b149b8d276f4374842709ba2
{'author_id': 'madanadit', 'LOC': 11, 'Add': 6, 'Delete': 5, 'Files': 1, 'bug_num': 1}


In [12]:
# Report how many commits have been collected so far.
print("total number of commit is {}".format(len(commits)))

total number of commit is 1826


In [19]:
# Join the three tables into final_data.csv, one row per collected commit:
#   - `bug_inducing_commits` gives the commit's project name -> `projects`
#   - the commit's 'author_id' is the key into `users`
#   - `commits` supplies the commit features
# Row layout: project features, user features, commit features, commit id.
# NOTE(review): 'created_at' appears in both project_features and
# user_features, so the header contains that column name twice.

skipped_rows = 0

# newline='' is required by the csv module; without it extra blank rows are
# written on platforms that translate line endings.
with open('final_data.csv', 'w', newline='') as f:
    csv_write = csv.writer(f)
    csv_head = project_features + user_features + user_features_patch + commit_features + ['commit_id']
    csv_write.writerow(csv_head)

    for key, value in commits.items():
        commit_info = value

        project_name = bug_inducing_commits[key]['projectName']
        project_info = projects.get(project_name)
        user_info = users.get(commit_info['author_id'])

        # Rows need all three sources; commits whose project or author was
        # never fetched are skipped (and counted) instead of raising KeyError.
        if project_info is None or user_info is None:
            skipped_rows += 1
            continue

        data_row = (
            [project_info[name] for name in project_features]
            + [user_info[name] for name in user_features + user_features_patch]
            + [commit_info[name] for name in commit_features]
            + [key]
        )

        # write into csv
        csv_write.writerow(data_row)

print("skipped {} commits without project or user info".format(skipped_rows))