In [None]:
import sys
sys.path.append('../lib')
import extractor

## Read the jira files into the datasets

In [None]:
# Projects whose exported Jira issues are read from ./data-jira/<project>-issues/.
project_set = ['ace', 'activemq', 'aurora', 'beam', 'cassandra', 'couchdb', 'hbase', 'hive', 'incubator-systemml', 'maven', 'spark', 'zookeeper']

# datasets maps project name -> whatever generate_dataset returns for that
# project's directory; the code below treats it as a mapping of
# issue id -> record exposing a 'type' field.
datasets = {}
total = 0
issue_extractor = extractor.JiraIssueExtractor()
for project in project_set:
    filePath = './data-jira/' + project + '-issues/'
    # Get the useful information from a jira-files dir
    datasets[project] = issue_extractor.generate_dataset(filePath)
    issue_count = len(datasets[project])
    total += issue_count
    bug_count = sum(1 for dp in datasets[project].values() if dp['type'] == 'Bug')
    # Guard the division: a project with no issues would otherwise raise
    # ZeroDivisionError and abort the loop for every remaining project.
    # NOTE: the value is a fraction in [0, 1] (not scaled by 100), although
    # the label says "Percentage"; kept as-is so the output stays comparable.
    bug_ratio = 1.0 * bug_count / issue_count if issue_count else 0.0
    print('------ %s -------' % project)
    print('Number of patches:', issue_count)
    print('Percentage of bugs: %.2f' % bug_ratio)

print('Number of patches in total:', total)

## Remove incomplete Jira issues that have no title/description/comment/type/priority

In [None]:
# Filter the loaded datasets (see the section heading for the criteria). The
# call's return value is not used; the remaining count is read back from
# `datasets` afterwards.
issue_extractor.remove_uncomplete_issues(project_set, datasets)
patches_left = sum(len(datasets[name]) for name in project_set)
print('Number of patches left: ', patches_left)

## Dump the dataset into pickle

In [None]:
# Persist the per-project datasets to a single pickle file (path is relative
# to the notebook's working directory).
pickle_path = 'data-jira/jira-issues.pickle'
issue_extractor.dump_into_pickle(datasets, pickle_path)

## Analyze the distribution of priorities for each issue type

In [None]:
# import the labeler lib
import sys
sys.path.append('../lib')
import labeler
from labeler import apache_type

In [None]:
# For every project and every label category, print the share of issues at
# each priority level. Shares are fractions in [0, 1] (not scaled by 100),
# although the printed label says "Percentage".
jira_type = [labeler.FEATURE, labeler.MAINTENANCE, labeler.IMPROVEMENT, labeler.SKIP]
priority_levels = ['Blocker', 'Critical', 'Major', 'Minor', 'Trivial']
for project in project_set:
    print('---------------------------- %s --------------------------' % project)
    for jt in jira_type:
        print('---------- %s ---------' % jt)
        # apache_type is indexed by the issue's Jira type; an issue is kept
        # when the current label category is contained in that entry.
        typed_issues = [issue for issue in datasets[project].values() if jt in apache_type[issue['type']]]
        if not typed_issues:
            # Nothing in this category for this project; skip to avoid dividing by zero.
            continue
        for priority in priority_levels:
            matching = sum(1 for issue in typed_issues if issue['priority'] == priority)
            print('     Percentage of %s: %.2f' % (priority, 1.0 * matching / len(typed_issues)))

# Recompute the total from `datasets` rather than reusing the `total` counter
# accumulated while loading: that counter predates the incomplete-issue
# filtering step and may no longer match the data analysed above.
print('Number of patches in total:', sum(len(datasets[name]) for name in project_set))