### Load data
JSON files in the `data/commits` folder are generated by `tools/repo_stats/stats_commit.py`.

In [None]:
import json

# Commit records to analyse; the content is JSON even though the file is named .txt.
JSON_FILE = '../data/commits/hotot.commits.txt'

commits = None
# Decode explicitly as UTF-8 (the standard JSON encoding) so that loading does
# not depend on the platform's default locale encoding.
with open(JSON_FILE, 'r', encoding='utf-8') as json_file:
    commits = json.load(json_file)

### Define selector

*Both stats and selection of commits are based on the selectors.*

A selector takes two commits `a` and `b`, and returns a predefined label.

In [None]:
# Registry of (display name, selector function) tuples. The position of an
# entry is the index used for the matching entry in stats and requirement lists.
selectors = []

# 0: Ratio of changed LOCs
def loc_ratio(a, b):
    """Label how much two commits differ in size.

    The size of a commit is its additions plus its deletions. The label is
    derived from the larger size divided by the smaller one:
    'small' below 2, 'moderate' below 5, 'large' otherwise. A pair in which
    either commit changed no lines is always 'large'.
    """
    sizes = [commit['additions'] + commit['deletions'] for commit in (a, b)]
    if not all(sizes):
        # A zero-sized commit has no meaningful ratio.
        return 'large'

    ratio = max(sizes) / min(sizes)
    for upper_bound, label in ((2, 'small'), (5, 'moderate')):
        if ratio < upper_bound:
            return label
    return 'large'

# Register as selector 0: stats[0] and requirement[0] refer to this selector's labels.
selectors.append(('Ratio of LOCs', loc_ratio))

# 1: Addition vs. deletion: which is dominant

def add_vs_del(a, b):
    """Label which kind of change dominates in both commits.

    Returns 'addition' when each commit has more additions than deletions,
    'deletion' when each commit has more deletions than additions, and 'both'
    in every other case (the commits disagree, or at least one of them has
    equally many additions and deletions).
    """
    if a['additions'] > a['deletions']:
        if b['additions'] > b['deletions']:
            return 'addition'
    elif a['additions'] < a['deletions']:
        if b['additions'] < b['deletions']:
            return 'deletion'
    return 'both'

# Register as selector 1: stats[1] and requirement[1] refer to this selector's labels.
selectors.append(('Addition vs. deletion', add_vs_del))

# 2: Share files
def file_shared(a, b):
    """Label whether two commits touch at least one common file.

    Compares the 'file_list' entries of both commits and returns 'shared'
    when they have an element in common, 'none' otherwise.
    """
    common = set(a['file_list']).intersection(b['file_list'])
    if common:
        return 'shared'
    return 'none'

# Register as selector 2: stats[2] and requirement[2] refer to this selector's labels.
selectors.append(('Sharing files', file_shared))

### Basic stats of commits
Randomly pick commit pairs and show their distributions under the above selectors.

In [None]:
import pprint
import random

def get_stats(pairs, selector_list=None):
    """Return, per selector, the fraction of pairs that received each label.

    Args:
        pairs: Sized collection of (a, b) commit tuples.
        selector_list: Optional list of (name, function) tuples to evaluate.
            Defaults to the module-level `selectors`.

    Returns:
        A list with one dict per selector, in selector order, mapping every
        label that occurred to its share of `pairs`. Labels that never
        occurred are absent; an empty `pairs` yields empty dicts.
    """
    if selector_list is None:
        selector_list = selectors

    total = len(pairs)
    tallies = [{} for _ in selector_list]
    for a, b in pairs:
        for tally, (_name, func) in zip(tallies, selector_list):
            label = func(a, b)
            tally[label] = tally.get(label, 0) + 1

    # Divide once at the end: repeatedly adding 1 / total accumulates float
    # rounding error (ten hits out of ten would give 0.9999999999999999).
    return [{label: count / total for label, count in tally.items()}
            for tally in tallies]

def print_stats(stats):
    """Print every selector's index and name, followed by its entry of `stats`."""
    for index, selector in enumerate(selectors):
        title, _ = selector
        print(index, title)
        pprint.pprint(stats[index])

# Baseline: draw as many random pairs as there are commits and show how the
# selector labels are distributed when no selection is applied.
pairs = []
if len(commits) >= 2:  # a pair needs two different commits
    for _ in range(len(commits)):
        # random.sample picks two different list entries, so a commit is never
        # paired with itself; a self-pair would skew the baseline.
        a, b = random.sample(commits, 2)
        pairs.append((a, b))
print_stats(get_stats(pairs))

### Select commit pairs according to required distribution
The requirement specifies the fraction of each label in one selector. Items of the requirement should be in the same order as the selectors.

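NOTE: If the requirement for a label is 0, that selector accepts any commit pair with that label, so the label's share is not controlled. Therefore, only set a label to 0 if it covers few commit pairs. Every label a selector can return must still be present as a key in the requirement.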

In [None]:
# Target share of each label: one dict per selector, in the same order as
# `selectors`. A share of 0 makes accept() skip that selector's check for
# pairs carrying the label.
requirement = [{'large': 0.4, 'moderate': 0.3, 'small': 0.3},
               {'addition': 0.5, 'both': 0.5, 'deletion': 0},
               {'shared': 0.7, 'none': 0.3}]

# Number of commit pairs to select.
NUM_PAIRS = 20
# File the selected pairs are written to.
# NOTE(review): the extension is .csv, but the writer at the end of this cell
# emits plain "url: summary" lines - confirm the intended format.
OUTPUT_FILE = 'survey.csv'

def accept(a, b, stats):
    """Decide whether the pair (a, b) may be added to the selection.

    For every selector, labels are ranked by count / required share, using the
    counts in `stats` and the shares in `requirement`. The pair is rejected as
    soon as some label with a non-zero share has a strictly lower ratio than
    the pair's own label. A selector whose share for the pair's label is 0
    imposes no constraint.

    Returns:
        True if no selector rejects the pair, False otherwise.
    """
    for index, (_name, classify) in enumerate(selectors):
        chosen = classify(a, b)
        shares = requirement[index]
        if not shares[chosen]:
            continue

        counts = stats[index]
        bar = counts[chosen] / shares[chosen]
        # Another label is further behind its target than the pair's label.
        lagging = any(
            counts[other] / shares[other] < bar
            for other in counts
            if shares[other]
        )
        if lagging:
            return False
    return True

def select_next(stats):
    """Draw random commit pairs until one passes accept() and return it.

    The two commits are drawn with random.sample, so they are always two
    different entries of `commits`; drawing them independently could pair a
    commit with itself.

    Args:
        stats: Per-selector label counts of the pairs selected so far.

    Returns:
        The accepted pair as a tuple (a, b).

    Raises:
        ValueError: If `commits` holds fewer than two entries.

    Note:
        The loop only ends when a drawn pair is accepted; it keeps running if
        no pair of commits can satisfy accept() for the given `stats`.
    """
    while True:
        a, b = random.sample(commits, 2)
        if accept(a, b, stats):
            return (a, b)

def init_stats(stats):
    """Set the counter of every label listed in `requirement` to 0, in place.

    `stats[i]` receives the labels of `requirement[i]`.
    """
    slots = (
        (index, label)
        for index, wanted in enumerate(requirement)
        for label in wanted
    )
    for index, label in slots:
        stats[index][label] = 0

def update_stats(a, b, stats):
    """Count the pair (a, b) in `stats`: one increment per selector, in place.

    The label's counter must already exist in `stats` (see init_stats).
    """
    for index, selector in enumerate(selectors):
        _, classify = selector
        outcome = classify(a, b)
        stats[index][outcome] += 1
        
# One counter dict per selector, pre-filled with 0 for every required label.
stats = [{} for _ in selectors]
init_stats(stats)

# Pick the pairs one at a time; the counters are updated after each pick so
# that the next call to select_next() sees the current distribution.
pairs = []
for _ in range(NUM_PAIRS):
    (a, b) = select_next(stats)
    pairs.append((a, b))
    update_stats(a, b, stats)
print_stats(get_stats(pairs))

# Commit summaries may contain non-ASCII text; write UTF-8 explicitly so the
# export does not fail or change with the platform's default encoding.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_file:
    for pair in pairs:
        # Two consecutive lines per pair: first commit, then second commit.
        print('%s: %s' % (pair[0]['url'], pair[0]['summary']), file=out_file)
        print('%s: %s' % (pair[1]['url'], pair[1]['summary']), file=out_file)