## Phase 1: Select qualified commits

## Load data

In [None]:
import git
import os
import json

# Projects to process. Each name is used as the clone directory under
# ../repos/ and as the prefix of the project's cached commit-stat JSON file.
proj_lst = [
    'vue',
    'preact',
    'express',
    'element',
    'webpack',
    'prettier',
    'axios',
    'puppeteer',
    'mithril.js',
    'ember.js',
    'jquery',
    'atom',
    'three.js',
    'pdf.js',
    'serverless',
]

def retrieve_first_parent_commits(project_name):
    """
    Return per-commit statistics for the first-parent history of a project.

    The statistics are cached as JSON. When the cache file exists it is
    loaded and the git repository is not opened at all; otherwise the
    repository is walked and the cache file is written.

    Assumptions:
        project with name project_name is stored at ../repos/{project_name}
        first-parent commit info is stored at ../data/icse-js/commit-stat/{project_name}-commit-stat.json

    Args:
        project_name: name of the project (directory name under ../repos/).

    Returns:
        The list of commit records. On a cache miss each record is a dict
        with the keys 'hash', 'email' (lower-cased author email),
        'additions', 'deletions' and 'file_list'; on a cache hit the list is
        whatever the JSON file contains.
    """

    repo_path = '../repos/' + project_name
    commit_stat_path = '../data/icse-js/commit-stat/%s-commit-stat.json' % project_name

    if os.path.isfile(commit_stat_path):
        print('Loading json file for %s...' % project_name)
        with open(commit_stat_path, 'r') as f:
            commits = json.load(f)
    else:
        # Open the repository only on a cache miss, so cached statistics can
        # be loaded on a machine that does not have the clone.
        repo = git.Repo(repo_path)

        commits = []
        for c in repo.iter_commits(first_parent=True):
            file_stats = c.stats.files
            commits.append({
                'hash': c.hexsha,
                'email': c.author.email.lower(),
                'additions': sum(stat['insertions'] for stat in file_stats.values()),
                'deletions': sum(stat['deletions'] for stat in file_stats.values()),
                'file_list': list(file_stats)
            })

        print('Writing json file for %s...' % project_name)
        # Create the cache directory on first use instead of failing in open().
        os.makedirs(os.path.dirname(commit_stat_path), exist_ok=True)
        with open(commit_stat_path, 'w') as f:
            json.dump(commits, f)

    print(len(commits))
    return commits

# Map every project name to its list of first-parent commit records.
commit_info = {
    proj_name: retrieve_first_parent_commits(proj_name)
    for proj_name in proj_lst
}

## Select all qualified commits

In [None]:
import re
# A path is accepted only when it matches every pattern below: it must end in
# ".js" and must not start with one of the excluded top-level directories.
# Raw strings keep "\." a regex escape rather than an invalid string escape.
fname_regexes = [re.compile(r'.+\.js$'),
                 re.compile(r'^(?!dist/).+'),
                 re.compile(r'^(?!test(s)?/).+'),
                 re.compile(r'^(?!build/).+'),
                 re.compile(r'^(?!spec/).+'),
                 re.compile(r'^(?!bin/).+'),
                 re.compile(r'^(?!doc(s)?/).+')]

def fname_filter(fname, fname_regexes):
    """Return True when fname matches every pattern in fname_regexes.

    Each pattern is applied with `match`, i.e. anchored at the start of
    fname. An empty pattern list accepts every name.
    """
    return all(pattern.match(fname) is not None for pattern in fname_regexes)

def select_qualified_commits(commits):
    """Return the commits that pass the author and file-name filters.

    A commit is kept when its author email does not end with
    'noreply.github.com' and at least one path in its 'file_list' is
    accepted by fname_filter with the module-level fname_regexes.
    The returned list holds the same dict objects, in their original order.
    """
    return [
        entry for entry in commits
        if not entry['email'].endswith('noreply.github.com')
        and any(fname_filter(path, fname_regexes) for path in entry['file_list'])
    ]

# Filter every project's commits, then report how many qualified per project.
all_qualified_commits = {
    proj_name: select_qualified_commits(commit_info[proj_name])
    for proj_name in proj_lst
}
for proj_name in proj_lst:
    print('%s: %d' % (proj_name, len(all_qualified_commits[proj_name])))

## Add some random commits

In [None]:
import random

# params
ratio = 0.3  # random commits to draw, as a fraction of the qualified commits

def add_rand_commits(qualified_commits, commits):
    """Append randomly sampled non-qualified commits to qualified_commits.

    The sample is drawn from the commits whose hash is not already in
    qualified_commits; its size is int(len(qualified_commits) * ratio),
    capped at the number of such commits. The list is extended in place and
    nothing is returned.

    Commits whose email ends with 'noreply.github.com' are discarded after
    sampling, so fewer commits than the sample size may be added.
    """
    target = int(len(qualified_commits) * ratio)

    # commits that did not qualify
    known_hashes = {entry['hash'] for entry in qualified_commits}
    leftovers = [entry for entry in commits if entry['hash'] not in known_hashes]

    # draw the sample, then drop noreply authors
    sample_size = min(target, len(leftovers))
    picked = random.sample(leftovers, sample_size)

    qualified_commits.extend(
        entry for entry in picked
        if not entry['email'].endswith('noreply.github.com')
    )

# Extend every project's qualified list in place and print the new size.
for proj_name in proj_lst:
    proj_commits = all_qualified_commits[proj_name]
    add_rand_commits(proj_commits, commit_info[proj_name])
    print('%s: %d' % (proj_name, len(proj_commits)))

## Phase 2: Select commit pairs

In [None]:
def organize_by_author_email(qualified_commits):
    """Group commits by the value of their 'email' key.

    Returns a dict mapping each email to the list of its commits, with the
    commits in the order they appear in qualified_commits.
    """
    email2commits = {}
    for entry in qualified_commits:
        email2commits.setdefault(entry['email'], []).append(entry)

    # sanity check
    assert all(len(group) > 0 for group in email2commits.values())

    return email2commits

# Group each project's commits by author and report the number of authors.
print('-' * 10 + 'Number of Authors' + '-' * 10)
all_email2commits = {}
for proj_name in proj_lst:
    grouped = organize_by_author_email(all_qualified_commits[proj_name])
    all_email2commits[proj_name] = grouped
    print('%s: %d' % (proj_name, len(grouped)))

In [None]:
# params
max_per_author = 50
max_ratio = 10  # max ratio of commit sizes in a comparison

def select_commit_pairs(email2commits):
    """
    Build the pairs of commit hashes to compare, per author.

    For every author with more than one commit, at most max_per_author
    commits are sampled at random and sorted by hash. Each commit is then
    paired with its successor in that order. The pairing is circular: the
    first pair couples the last sampled commit with the first one.

    A pair is dropped when either commit changes no lines, or when the
    larger commit is more than max_ratio times the size of the smaller one
    (size = additions + deletions).

    Args:
        email2commits: dict mapping an author email to a list of commit
            dicts with the keys 'hash', 'email', 'additions', 'deletions'.

    Returns:
        dict mapping an author email to a non-empty list of (hash, hash)
        tuples. Authors without any valid pair are omitted.
    """
    commit_pairs = {}
    for email, indv_commits in email2commits.items():
        if len(indv_commits) < 2:
            continue

        selected = random.sample(indv_commits, min(max_per_author, len(indv_commits)))
        selected = sorted(selected, key=lambda x: x['hash'])

        pairs = []
        # Starting at -1 pairs selected[-1] with selected[0] (wrap-around).
        # NOTE(review): with exactly two sampled commits this yields the same
        # two commits twice, once in each order -- confirm this is intended.
        for i in range(-1, len(selected) - 1):
            c1 = selected[i]
            c2 = selected[i + 1]
            assert c1['email'].lower() == c2['email'].lower()

            n1 = c1['additions'] + c1['deletions']
            n2 = c2['additions'] + c2['deletions']
            if n1 == 0 or n2 == 0:
                continue
            # Exact integer comparison. Truncating the float quotient with
            # int() accepted every ratio below max_ratio + 1 (e.g. 10.9).
            if max(n1, n2) > max_ratio * min(n1, n2):
                continue
            pairs.append((c1['hash'], c2['hash']))

        # Only authors with at least one surviving pair are reported.
        if pairs:
            commit_pairs[email] = pairs

    return commit_pairs

# Select commit pairs per project and count the "core" contributors, i.e.
# the authors with at least core_threshold pairs.
core_threshold = 10
all_email2pairlst = {}
print('-' * 10 + 'Number of Core Contributors' + '-' * 10)
for proj_name in proj_lst:
    pairs_by_email = select_commit_pairs(all_email2commits[proj_name])
    all_email2pairlst[proj_name] = pairs_by_email
    core_cnt = sum(
        1 for pair_lst in pairs_by_email.values()
        if len(pair_lst) >= core_threshold
    )
    print('%s: %d' % (proj_name, core_cnt))

## Output all_email2pairlst as json

In [None]:
# Persist the selected commit pairs of every project. JSON has no tuple
# type, so each (hash, hash) pair is written as a two-element array.
OUTPUT_FILE = '../data/icse-js/all_email2pairlst.json'

with open(OUTPUT_FILE, 'w') as f:
    # Dump the project -> email -> pair-list mapping built above; the name
    # previously used here is not defined anywhere in the visible notebook.
    json.dump(all_email2pairlst, f)

In [None]:
# Count, over all projects, the developers with at least num_pair_threshold
# pairs and the total number of pairs they contribute. A developer who
# appears in several projects is counted once per project.
num_pair_threshold = 10

pairs_cnt = 0
dev_cnt = 0

for project_pairs in all_email2pairlst.values():
    for pair_list in project_pairs.values():
        if len(pair_list) >= num_pair_threshold:
            dev_cnt += 1
            pairs_cnt += len(pair_list)

# The test above is ">=", so the label says "at least", not "more than".
print('Number of developers with at least %s pairs: ' % str(num_pair_threshold), dev_cnt)
print(pairs_cnt)

## First Batch Emails
People whose number of pairs is greater than or equal to `num_pair_threshold`

In [None]:
# First batch: one "project,email" line per author whose number of commit
# pairs reaches num_pair_threshold. The threshold is applied to the pair
# lists (all_email2pairlst), not to the per-author commit lists.
with open('../data/icse-js/first-batch-emails.txt', 'w') as f:
    for project_name, project_pairs in all_email2pairlst.items():
        for email, pair_list in project_pairs.items():
            if len(pair_list) >= num_pair_threshold:
                f.write('%s,%s\n' % (project_name, email))

## Second Batch Emails
People whose number of pairs is smaller than `num_pair_threshold`

In [None]:
# Second batch: one "project,email" line per author whose number of commit
# pairs is below num_pair_threshold. Every author is considered, so authors
# for whom no pair was selected (absent from all_email2pairlst) count as
# having zero pairs and are included.
with open('../data/icse-js/second-batch-emails.txt', 'w') as f:
    for project_name, email2commits in all_email2commits.items():
        project_pairs = all_email2pairlst.get(project_name, {})
        for email in email2commits:
            if len(project_pairs.get(email, ())) < num_pair_threshold:
                f.write('%s,%s\n' % (project_name, email))

## Check if a developer contributes to multiple projects

In [None]:
import itertools

def check_developer(project_name, email):
    """Print how many commit pairs were selected for email in project_name."""
    pair_count = len(all_email2pairlst[project_name][email])
    print('%s %s: %d' % (project_name, email, pair_count))

# Collect the set of author emails per project, then warn about every pair
# of projects that shares at least one email.
email_set_list = [
    (project_name, set(project_pairs.keys()))
    for project_name, project_pairs in all_email2pairlst.items()
]

for (project_name1, s1), (project_name2, s2) in itertools.combinations(email_set_list, 2):
    inter = s1 & s2
    if not inter:
        continue
    print('-' * 50)
    print('WARNING: intersection exists between %s and %s' % (project_name1, project_name2))
    for email in inter:
        check_developer(project_name1, email)
        check_developer(project_name2, email)
    print('-' * 50)