## Phase 1: Select qualified commits

## Load data

In [None]:
import json

# Project whose data files are processed by the cells below.
project_name = 'vue'

# Per-project commit statistics, stored as JSON.
JSON_FILE = '../data/icse-js/commit-stat/%s.commits.json' % project_name

commits = None
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding.
with open(JSON_FILE, 'r', encoding='utf-8') as json_file:
    commits = json.load(json_file)

print(len(commits))

In [None]:
import pickle

# Pickled share data for the current project; later cells test commit
# hashes for membership in the loaded object.
PICKLE_FILE = '../data/js-share/%s-commit-share.pickle' % project_name

# NOTE: unpickling can execute arbitrary code, so only load files that
# were produced locally by a trusted step.
with open(PICKLE_FILE, 'rb') as pickle_file:
    commit_shares = pickle.load(pickle_file)

print(len(commit_shares))

In [None]:
import re
# Patterns a file path has to satisfy (all of them, see fname_filter):
# it must end in ".js" and must not start with any of the excluded
# directory prefixes. Raw strings keep the regex backslash intact without
# triggering Python's invalid-escape-sequence warning.
fname_regexes = [re.compile(r'.+\.js$'),
                 re.compile(r'^(?!dist/).+'),
                 re.compile(r'^(?!test(s)?/).+'),
                 re.compile(r'^(?!packages/).+'),
                 re.compile(r'^(?!spec/).+'),
                 re.compile(r'^(?!build/).+'),
                 re.compile(r'^(?!bin/).+'),
                 re.compile(r'^(?!doc(s)?/).+')]

def fname_filter(fname, fname_regexes):
    """Tell whether ``fname`` is accepted by every pattern.

    Args:
        fname: File path to check.
        fname_regexes: Iterable of compiled patterns; each one is applied
            with ``match``, i.e. anchored at the start of ``fname``.

    Returns:
        True if all patterns match (also when there are no patterns),
        False as soon as one pattern does not match.
    """
    return all(pattern.match(fname) for pattern in fname_regexes)

## Select all qualified commits

In [None]:
# Commits that pass the selection, plus their hashes for fast lookup.
qualified_commits = []
qualified_shas = set()
for commit in commits:
    # Skip commits without share data and commits whose author email ends
    # with 'noreply.github.com'.
    if commit['hash'] not in commit_shares or commit['email'].endswith('noreply.github.com'):
        continue
    # One file that passes every file-name pattern is enough to qualify.
    if any(fname_filter(path, fname_regexes) for path in commit['file_list']):
        qualified_commits.append(commit)
        qualified_shas.add(commit['hash'])
print(len(qualified_commits))

## Add some random commits

In [None]:
import random

# Number of extra commits to add, as a fraction of the commits already
# selected.
ratio = 0.2
num_rand_commits = int(len(qualified_commits) * ratio)

# Build the pool of commits that are still eligible: not selected yet,
# present in commit_shares, and not authored from a
# 'noreply.github.com' address. Sampling from an explicit pool (rather
# than retrying random.choice until enough hits accumulate) guarantees
# termination even when fewer eligible commits exist than requested.
seen_shas = set(qualified_shas)
candidates = []
for commit in commits:
    sha = commit['hash']
    if sha in seen_shas or sha not in commit_shares:
        continue
    if commit['email'].endswith('noreply.github.com'):
        continue
    seen_shas.add(sha)  # a hash listed twice must not be drawn twice
    candidates.append(commit)

# Draw without replacement, capped at the size of the pool.
for commit in random.sample(candidates, min(num_rand_commits, len(candidates))):
    qualified_shas.add(commit['hash'])
    qualified_commits.append(commit)

print(len(qualified_commits))

## Output to csv

In [None]:
# CSV file that receives the selected commits of the current project.
OUTPUT_FILE = '../data/icse-js/selected-commits/%s-selected-commits.csv' % project_name

def write_line(f, commit):
    """Write one commit to ``f`` as a single CSV row.

    The row holds the commit's ``email``, ``hash``, ``additions`` and
    ``deletions`` entries, in that order. Fields containing a comma, a
    quote or a line break are quoted by the ``csv`` module, so such an
    email cannot shift the columns; ordinary values are written exactly
    as a plain comma-joined line.

    Args:
        f: Writable text file object.
        commit: Mapping with the four keys listed above.
    """
    import csv  # local import keeps this definition self-contained

    # '\n' terminator matches the header line written by the caller.
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow([commit['email'], commit['hash'],
                     str(commit['additions']), str(commit['deletions'])])
    
# Write a header row followed by one row per selected commit.
with open(OUTPUT_FILE, 'w+') as csv_file:
    csv_file.write('email,hash,additions,deletions\n')
    for selected_commit in qualified_commits:
        write_line(csv_file, selected_commit)

## Phase 2: Select commit pairs

In [None]:
# Group the selected commits by author email; emails are lower-cased so
# that addresses differing only in case land in the same group.
email2commits = {}
for commit in qualified_commits:
    email2commits.setdefault(commit['email'].lower(), []).append(commit)
print('Number of authors: ', str(len(email2commits)))

In [None]:
# Upper bound on the number of commits sampled per author.
max_per_author = 50
# max ratio of commit sizes in a comparison
max_ratio = 10

# Maps author email -> list of (hash, hash) pairs of that author's commits.
commit_pairs = {}
for email, indv_commits in email2commits.items():
    # A single commit cannot be paired with anything.
    if len(indv_commits) == 1:
        continue
    commit_pairs[email] = []
    selected = random.sample(indv_commits, min(max_per_author, len(indv_commits)))
    selected = sorted(selected, key=lambda x: x['hash'])

    # Pair every commit with its predecessor in hash order; the first
    # commit is paired with the last one, so the pairs form a cycle.
    candidate_pairs = list(zip(selected[-1:] + selected[:-1], selected))
    if len(selected) == 2:
        # With exactly two commits the wraparound pair is the other pair
        # in reverse order; keep only one so the comparison is not
        # emitted twice.
        candidate_pairs = candidate_pairs[1:]

    for c1, c2 in candidate_pairs:
        assert c1['email'].lower() == c2['email'].lower()

        # Commit size = added + deleted lines.
        n1 = c1['additions'] + c1['deletions']
        n2 = c2['additions'] + c2['deletions']
        # Empty commits would make the size ratio undefined.
        if n1 == 0 or n2 == 0:
            continue
        # NOTE(review): int() truncates the ratio, so pairs with a ratio
        # below max_ratio + 1 are accepted -- confirm this is intended.
        if int(max(n1 / n2, n2 / n1)) > max_ratio:
            continue
        commit_pairs[email].append((c1['hash'], c2['hash']))

In [None]:
# Collect the number of pairs per author and print the authors that have
# more than ten pairs.
length_dist = []
for email, pair_list in commit_pairs.items():
    num_pairs = len(pair_list)
    length_dist.append(num_pairs)
    if num_pairs > 10:
        print(email)

In [None]:
# Reuse the mapping if an earlier run in this session already created it,
# so pairs of several projects accumulate; otherwise start with an empty one.
if 'all_project_pairs' not in globals():
    all_project_pairs = {}
all_project_pairs[project_name] = commit_pairs

In [None]:
# Display the project names that have pairs recorded so far.
all_project_pairs.keys()

## Output all_project_pairs as json

In [None]:
# JSON file holding the commit pairs of every project processed so far.
OUTPUT_FILE = '../data/icse-js/selected-commits/all_project_pairs.json'

with open(OUTPUT_FILE, 'w+') as pairs_file:
    pairs_file.write(json.dumps(all_project_pairs))

In [None]:
# Total number of pairs, over all projects, that belong to authors with
# more than ten pairs. Only the values are iterated: reusing
# `project_name` as a loop variable would overwrite the notebook-wide
# project setting.
cnt = 0
for project_pairs in all_project_pairs.values():
    for pair_list in project_pairs.values():
        if len(pair_list) > 10:
            cnt += len(pair_list)
print(cnt)