In [None]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 9999
// (presumably so long cell outputs are not collapsed into a scroll box — TODO confirm).
IPython.OutputArea.auto_scroll_threshold = 9999;

In [None]:
# import dependencies and set default alpha value
import math
import pickle
import pprint
import numpy as np
import matplotlib.pyplot as plt
from graphs.call_commit_graph import CallCommitGraph
from scipy.stats import spearmanr, kendalltau

# Alpha used for DevRank whenever a cell does not pick its own value.
default_alpha = 0.8
# Shared pretty-printer used when dumping results for inspection.
pp = pprint.PrettyPrinter(indent=4)

def write_hash_to_csv(hh, fname):
    """Dump a mapping to `fname`, one `key,value` line per entry.

    Keys and values are converted with `str()` and joined by a comma; nothing
    is quoted or escaped. An existing file at `fname` is overwritten.
    """
    with open(fname, "w+") as out:
        out.writelines(
            ",".join((str(key), str(value))) + "\n"
            for key, value in hh.items()
        )
            
def write_dots_to_csv(xs, ys, fname, header):
    """Write paired points to `fname` as `x,y` lines below a header.

    `header` is written verbatim (callers include the trailing newline).
    Points are paired with `zip`, so the longer of `xs`/`ys` is truncated.
    An existing file at `fname` is overwritten.
    """
    with open(fname, "w+") as out:
        out.write(header)
        points = zip(xs, ys)
        for point in points:
            out.write(str(point[0]) + ',' + str(point[1]) + '\n')

## Load Call-Commit Graph

In [None]:
# set your own pickle_path (relative to this notebook) and repo_name

# pickle_path = "../data/httpd-finished.pickle"
# repo_name = "httpd"

#pickle_path = "../data/call-commit-graphs/flink-1st-7100.pickle"
#repo_name = "flink"

#pickle_path = "../data/call-commit-graphs/kafka-finished.pickle"
#repo_name = "kafka"

pickle_path = "../data/call-commit-graphs/systemml-finished.pickle"
repo_name = "systemml"

# Local checkout of the analysed repository, looked up by name under ../repos/.
repo_path = "../repos/" + repo_name
# NOTE(review): pickle.load can execute arbitrary code — only load pickles you
# produced yourself or otherwise trust.
with open(pickle_path, "rb") as pf:
    # presumably a CallCommitGraph instance (see the import at the top) — TODO confirm
    ccg = pickle.load(pf)
# Point the loaded graph at the repository path chosen above.
ccg.set_repo_path(repo_path)

## DevRank Share

In [None]:
# Run DevRank on Call-Commit Graph and then map developer name to share
# WARNING: this code doesn't work in the case that two developers have same name
def get_name_to_share(alpha, sha_to_type={}, coefs=[1, 1, 1, 1]):
    """Run DevRank on the global `ccg` and return normalised shares per developer name.

    Args:
        alpha: forwarded unchanged to `ccg.devrank_developers`.
        sha_to_type: forwarded unchanged to `ccg.devrank_developers`.
        coefs: forwarded unchanged to `ccg.devrank_developers`.

    The default `sha_to_type` and `coefs` objects are shared between calls;
    this function only forwards them and never modifies them.

    Returns:
        (name_to_share, email_to_name): shares summed over all emails that map
        to the same name and then divided by their total, plus the
        email -> name mapping returned by `ccg.devrank_developers`.

    Raises:
        ZeroDivisionError: if the shares add up to zero.
    """
    ranked, email_to_name = ccg.devrank_developers(alpha, sha_to_type=sha_to_type, coefs=coefs)

    # Fold per-email shares into per-name totals (same-name developers are merged).
    per_name = {}
    for email, share in ranked:
        developer = email_to_name[email]
        if developer not in per_name:
            per_name[developer] = share
            continue
        per_name[developer] += share

    # Accumulate with an explicit loop so the rounding matches plain left-to-right addition.
    total = 0
    for developer in per_name:
        total += per_name[developer]

    name_to_share = {developer: share / total for developer, share in per_name.items()}
    return name_to_share, email_to_name

name_to_share, email_to_name = get_name_to_share(default_alpha)

# Uncomment the following line to see sorted results
# sorted(name_to_share.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Write DevRank result of different alphas to csv
# Each sweep result gets its own loop-local name: rebinding the module-level
# `name_to_share` here would leave later cells working on the last sweep value
# instead of the result computed with default_alpha.
for alpha in np.arange(0.1, 1.0, 0.1):
    alpha_name_to_share, _ = get_name_to_share(alpha)
    write_hash_to_csv(alpha_name_to_share, "../temp/%s-devrank-%.2f.csv" % (repo_name, alpha))

## LocRank

In [None]:
# Extract LOC info from ccg.history
email_to_loc = {}
for sha in ccg.history:
    email = ccg.repo.commit(sha).author.email
    # Every author gets an entry, even when the commit has no per-function records.
    email_to_loc.setdefault(email, 0)
    changes = ccg.history[sha]
    for func in changes:
        # presumably the number of changed lines recorded for `func` — TODO confirm
        email_to_loc[email] += changes[func]

# WARNING: developers who share a name are merged into one entry
name_to_loc = {}
for email, loc in email_to_loc.items():
    name = email_to_name[email]
    name_to_loc[name] = name_to_loc[name] + loc if name in name_to_loc else loc
        
# Uncomment the following line to see sorted results
# sorted(name_to_loc.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Write LocRank result (developer name -> summed LOC) to ../temp/<repo_name>-loc.csv
write_hash_to_csv(name_to_loc, "../temp/%s-loc.csv" % repo_name)

## CommitRank

In [None]:
# Extract number of commits info from ccg.history
email_to_noc = {}
for sha in ccg.history:
    email = ccg.repo.commit(sha).author.email
    # One more commit for this author email.
    email_to_noc[email] = email_to_noc.get(email, 0) + 1

# WARNING: developers who share a name are merged into one entry
name_to_noc = {}
for email, noc in email_to_noc.items():
    name = email_to_name[email]
    name_to_noc[name] = name_to_noc.get(name, 0) + noc
        
# Uncomment the following line to see sorted results
#sorted(name_to_noc.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Write CommitRank result (developer name -> number of commits) to ../temp/<repo_name>-noc.csv
write_hash_to_csv(name_to_noc, "../temp/%s-noc.csv" % repo_name)

## Calculate Gini Coefficient

In [None]:
import os, sys
pwd = os.path.abspath('../')
if pwd not in sys.path: sys.path.append(pwd)

import numpy as np
from tools.excel_charts.gini.gini import gini

def get_gini(name_to_values):
    """Apply the project's `gini` helper to all values of the mapping.

    Values are converted to `float` and packed into a NumPy array first;
    the keys are ignored.
    """
    values = np.array(list(map(float, name_to_values.values())))
    return gini(values)

def get_gini_top(name_to_values, percent):
    """Apply the project's `gini` helper to the largest values of the mapping.

    Args:
        name_to_values: mapping whose values are converted to `float`.
        percent: fraction of entries to keep (e.g. 0.2 keeps the top 20%);
            the count is truncated with `int()`, so it may be zero.
    """
    shares = [float(value) for value in name_to_values.values()]
    shares.sort(reverse=True)
    cutoff = int(len(shares) * percent)
    return gini(np.array(shares[:cutoff]))

# The CommitRank and LocRank values do not depend on alpha; they are computed
# once and repeated on every output row for side-by-side comparison.
noc_gini = get_gini(name_to_noc)
loc_gini = get_gini(name_to_loc)
# One comma-separated row per alpha:
# alpha, noc gini, loc gini, DevRank gini, gini over the top 20% of DevRank shares.
for a in np.arange(0.0, 1.0, 0.1):
    n2s, _ = get_name_to_share(a)
    print(a, noc_gini, loc_gini, get_gini(n2s), get_gini_top(n2s, 0.2), sep=',')


## Basic Validation of Analysis Result

In [None]:
# If a person gets no share, he/she must have made no contribution
ccg.update_shares(default_alpha)
# The check is per commit: a commit with zero share must have an empty history
# entry. Collect every offender and fail with an explicit error rather than
# dropping into an interactive debugger, which would block "Run All" waiting
# for input (and the bare `except` also trapped unrelated exceptions).
zero_share_with_changes = [
    sha for sha in ccg.history
    if ccg.share[sha] == 0 and len(ccg.history[sha]) != 0
]
if zero_share_with_changes:
    raise AssertionError(
        "%d commit(s) have zero share but a non-empty history entry, e.g. %s"
        % (len(zero_share_with_changes), zero_share_with_changes[:5])
    )

In [None]:
# Print the number of commits analyzed (len(ccg.visited)); compare it manually
# against the total number of commits shown on GitHub.
print("Number of commits analyzed: ", len(ccg.visited))

In [None]:
# Get all distinct author and committer emails/names
# (Number of distinct committer emails is also the upper bound for number of committers)
# Materialise the commit list once so it can be scanned several times.
all_commits = list(ccg.repo.iter_commits())

# Distinct identities as recorded in the commits themselves.
# Note: `committer_names` is rebound later from the Apache roster.
author_emails = {c.author.email for c in all_commits}
author_names = {c.author.name for c in all_commits}
committer_emails = {c.committer.email for c in all_commits}
committer_names = {c.committer.name for c in all_commits}

print("Number of distinct author emails: ", len(author_emails))
print("Number of distinct author names: ", len(author_names))
print("Number of distinct committer emails: ", len(committer_emails))
print("Number of distinct committer names: ", len(committer_names))

## Get Developer Role from Apache

In [None]:
# APR
# https://projects.apache.org/committee.html?apr
# Copy the roster as a string below
pmc_string = "Aaron Bannert,  Brian Havard, Bradley Nicholes,  Bojan Smojver,  Branko Čibej,  Colm MacCarthaigh,  Eric Covener,  Davi Arnaut,  Dirk-Willem van Gulik,  Brian Fitzpatrick,  Guenter Knauf,  Greg Ames,  Gregg Lewis Smith,  Greg Stein,  Christophe Jaillet,  Justin Erenkrantz,  Jean-Frederic Clere,  Jim Jagielski,  Joe Orton,  Cliff Woolley,  Karl Fogel,  Martin Kraemer,  Max Oliver Bowsher,  Graham Leggett,  Mladen Turk,  Nick Kew,  Paul Querna,  Rainer Jung,  Garrett Rooney,  Ruediger Pluem,  Sascha Schumann,  Stefan Fritsch,  Bill Stoddard,  Sander Striker,  Ben Collins-Sussman,  Thom May,  Jeff Trawick,  William A. Rowe Jr.,  Wilfredo Sanchez,  Yann Ylavic"
committer_string = "Aaron Bannert,  Allan K. Edwards,  Ben Laurie,  Brian Havard,  Bradley Nicholes,  Bojan Smojver,  Branko Čibej,  Brian Pane,  Chuck Murcko,  Jean-Jacques Clar,  Ken Coar,  Colm MacCarthaigh,  Eric Covener,  Davi Arnaut,  Dirk-Willem van Gulik,  Doug MacEachern,  David Reid,  Tony Finch,  Roy T. Fielding,  Brian Fitzpatrick,  Guenter Knauf,  Greg Ames,  Gregg Lewis Smith,  Greg Stein,  Henry Jen,  Hyrum Kurt Wright,  Ian Holsman,  Issac Goldstand,  Ivan Zhakov,  Christophe Jaillet,  Justin Erenkrantz,  Jean-Frederic Clere,  Jim Jagielski,  Joe Orton,  Cliff Woolley,  Karl Fogel,  Madhusudan Mathihalli,  Martin Kraemer,  Max Oliver Bowsher,  Graham Leggett,  Mladen Turk,  André Malo,  Neil Conway,  Nick Kew,  Victor J. Orlikowski,  Philip M. Gollucci,  Daniel Earl Poirier,  Paul Querna,  Ryan Bloom,  Paul J. Reder,  Rainer Jung,  Garrett Rooney,  Ruediger Pluem,  Ralf S. Engelschall,  Sascha Schumann,  Sander Temme,  Stefan Fritsch,  Bill Stoddard,  Sander Striker,  Ben Collins-Sussman,  Takashi Sato,  Thomas J. Donovan,  Jeff Trawick,  William A. Rowe Jr.,  Wilfredo Sanchez,  Yann Ylavic"

In [None]:
# httpd
# https://projects.apache.org/committee.html?httpd
# Copy the roster as a string below
pmc_string = "Brian Havard,  Bradley Nicholes,  Bojan Smojver,  Ben Reser,  Brian McCallister,  Chris Darroch,  Colm MacCarthaigh,  Eric Covener,  Dirk-Willem van Gulik,  Stephen Henson,  Daniel Ruggeri,  Luca Toscano,  Roy T. Fielding,  Guenter Knauf,  Graham Phillip Dumpleton,  Greg Ames,  Gregg Lewis Smith,  Greg Stein,  Daniel Gruno,  Stefan Eissing,  Igor Galić,  Issac Goldstand,  Christophe Jaillet,  Jacob Champion,  Justin Erenkrantz,  Jean-Frederic Clere,  Jim Jagielski,  Joe Schaefer,  Joe Orton,  Kaspar Brand,  Astrid Malo,  Lars Eilebrecht,  Lucien Gentis,  Graham Leggett,  Mark J. Cox,  André Malo,  Nick Kew,  Tony Stevenson,  Philip M. Gollucci,  Paul Querna,  Rich Bowen,  Rainer Jung,  Ruediger Pluem,  Sander Temme,  Stefan Fritsch,  Steffen Land,  Jeff Trawick,  William A. Rowe Jr.,  Yann Ylavic"
# Keep the roster on ONE physical line: a plain double-quoted literal cannot
# contain a raw line break (the entry "William A. Rowe Jr." was split in two).
committer_string = "Aaron Bannert,  Andrew William John Ford,  Allan K. Edwards,  Ask Bjørn Hansen,  Andreas Steinmetz,  Ben Laurie,  Jem Berkes,  Jesus Blanco Izquierdo,  Bradley Nicholes,  Bojan Smojver,  Ben Reser,  Brian McCallister,  Chris Darroch,  Chuck Murcko,  Jean-Jacques Clar,  Ken Coar,  Colm MacCarthaigh,  Eric Covener,  Daniel Ferradal,  David Harris,  Dirk-Willem van Gulik,  Doug MacEachern,  David Shane Holden,  Stephen Henson,  Daniel Ruggeri,  Edward Lu,  Luca Toscano,  Erik Abele,  Fabien Coelho,  Roy T. Fielding,  Guenter Knauf,  Guy Ferraiolo,  Geoffrey Young,  Philippe Chiasson,  Graham Phillip Dumpleton,  Greg Ames,  Gregory Trubetskoy,  Vincent Deffontaines,  Gregg Lewis Smith,  Greg Stein,  Daniel Gruno,  Ian Holsman,  Stefan Eissing,  Igor Galić,  Issac Goldstand,  Ivan Alexis Barrera Andrade,  Jacek Prucia,  Christophe Jaillet,  Jacob Champion,  Justin Erenkrantz,  Jean-Frederic Clere,  James Paul Gallacher,  Jim Jagielski,  Jim Winstead Jr.,  Jan Kaluža,  Joe Schaefer,  Joe Orton,  John Sachs,  Jason S. Lingohr,  Hiroaki Kawai,  Kaspar Brand,  Keith Wannamaker,  Astrid Malo,  Evgeny Kotkov,  Lars Eilebrecht,  Lucien Gentis,  Luis Gil,  Jeon Jeongho,  Madhusudan Mathihalli,  Mads Toftum,  Manoj Kasichainula,  Martin Kraemer,  Matt Sergeant,  Max Kellermann,  Maxime Petazzoni,  Matthieu Estrade,  Graham Leggett,  Mark J. Cox,  Mike Rumph,  Mladen Turk,  André Malo,  Niklas Edmundsson,  Nilgun Belma Buguner,  Nick Kew,  Nicolas Lehuen,  Vincent Bray,  Victor J. Orlikowski,  Parinkumar Shah,  Tony Stevenson,  Chris Pepper,  Philip M. Gollucci,  Daniel Earl Poirier,  Ryan Pan,  Paul Querna,  Rich Bowen,  Paul J. Reder,  Rian Hunter,  Rici Lake,  Rainer Jung,  Ruediger Pluem,  Ralf S. Engelschall,  Sascha Schumann,  Sander Temme,  Stefan Fritsch,  Joshua Slive,  Ilia Soldatenko,  Steffen Land,  Steve Hay,  Sander Striker,  Stefan Sperling,  Takashi Sato,  Thomas J. Donovan,  David Wheeler,  Frank Gingras,  Jeff Trawick,  William A. Rowe Jr.,  Wilfredo Sanchez,  Yann Ylavic,  Yoshiki Hayashi"

In [None]:
# flink
# https://projects.apache.org/committee.html?flink
# Copy the roster as a string below
pmc_string = "Aljoscha Krettek,  Chesnay Schepler,  Fabian Hueske,  Alan Gates,  Greg Hogan,  Gyula Fora,  Henry Saputra,  Kostas Tzoumas,  Márton Balassi,  Maximilian Michels,  Robert Metzger,  Stephan Ewen,  Sebastian Schelter,  Till Rohrmann,  Timo Walther,  Tzu-Li (Gordon) Tai,  Ufuk Celebi,  Vasiliki Kalavri,  Daniel Warneke"
committer_string = "Aljoscha Krettek,  Lungu Andra,  ChengXiang Li,  Chesnay Schepler,  Chiwan Park,  Dawid Wysakowicz,  Fabian Hueske,  Alan Gates,  Greg Hogan,  Gyula Fora,  Henry Saputra,  Jark Wu,  Jincheng Sun,  Kostas Kloudas,  Kostas Tzoumas,  Kurt Young,  Márton Balassi,  Matthias J. Sax,  Maximilian Michels,  Robert Metzger,  Paris Carbone,  Stephan Ewen,  Shaoxuan Wang,  Xiaogang Shi,  Stefan Richter,  Sebastian Schelter,  Till Rohrmann,  Theodore Vasiloudis,  Timo Walther,  Tzu-Li (Gordon) Tai,  Ufuk Celebi,  Vasiliki Kalavri,  Daniel Warneke"

In [None]:
# kafka
# https://projects.apache.org/committee.html?kafka
# Copy the roster as a string below
pmc_string = "Guozhang Wang,  Gwen Shapira,  Ismael Juma,  Jakob Homan,  Jason Gustafson,  Joel Jacob Koshy,  Jay Kreps,  Joe Stein,  Jun Rao,  Neha Narkhede,  Prashanth Menon"
committer_string = "Alan Cabrera,  Chris Burroughs,  Damian Guy,  David Arthur,  Ewen Cheslack-Postava,  Geir Magnusson Jr,  Grant Henke,  Guozhang Wang,  Gwen Shapira,  Henry Saputra,  Ismael Juma,  Jakob Homan,  Jason Gustafson,  Joel Jacob Koshy,  Jay Kreps,  Joe Stein,  Becket Qin,  Jun Rao,  Neha Narkhede,  Owen O'Malley,  Prashanth Menon,  Phillip Rhodes,  Rajini Sivaram,  Harsha,  Sriram"

In [None]:
# Systemml
# https://projects.apache.org/committee.html?systemml
# Copy the roster as a string below
pmc_string = "Arvind Surve,  Alexandre V. Evfimievski,  DB Tsai,  Jon Deron Eriksson,  Mike Dusenberry,  Faraz Makari,  Frederick Reiss,  Felix Schueler,  Glenn Weidner,  Holden Karau,  Henry Saputra,  Joseph Kurata Bradley,  Luciano Resende,  Matthias Boehm,  Xiangrui Meng,  Nakul Jindal,  Niketan Pansare,  Prithviraj Sen,  Patrick Wendell,  Rich Bowen,  Berthold Reinwald,  Reynold Xin,  Shirish Tatikonda"
committer_string = "Arvind Surve,  Alexandre V. Evfimievski,  DB Tsai,  Jon Deron Eriksson,  Mike Dusenberry,  Faraz Makari,  Frederick Reiss,  Felix Schueler,  Glenn Weidner,  Holden Karau,  Henry Saputra,  Joseph Kurata Bradley,  Luciano Resende,  Matthias Boehm,  Xiangrui Meng,  Nakul Jindal,  Niketan Pansare,  Prithviraj Sen,  Patrick Wendell,  Rich Bowen,  Berthold Reinwald,  Reynold Xin,  Shirish Tatikonda"

In [None]:
# Parse roster strings
# Split each comma-separated roster and trim the padding around every name.
pmc_names = {part.strip() for part in pmc_string.split(',')}
committer_names = {part.strip() for part in committer_string.split(',')}

# Basic statistic of the roster
in_both = pmc_names & committer_names
pmc_only_count = len(pmc_names - committer_names)
print("Number of PMC members: ", len(pmc_names))
print("Number of committers: ", len(committer_names))
print("Number of committers in PMC: ", len(in_both))
print("Number of committers not in PMC: ", len(committer_names) - len(in_both))
print("Number of PMC members not in committers: ", pmc_only_count)
print("Total number of people: ", len(committer_names) + pmc_only_count)
print("In PMC but not a committer:\n")
for n in pmc_names:
    if n in committer_names:
        continue
    print("\t" + n)

## Compare top k contributors given by DevRank and LocRank

In [None]:
# Plot out the tendency
def get_top_k(name_to_x, k):
    """Return the `k` keys with the largest values, highest first.

    Ties keep the mapping's iteration order. When `k` exceeds the number of
    entries, every key is returned.
    """
    ranked = sorted(name_to_x.items(), key=lambda item: item[1], reverse=True)
    return [name for name, _ in ranked[:k]]

# For every k from 1 to the number of developers, record which fraction of the
# top-k DevRank names also appears in the top-k LocRank names.
xs = range(1, len(name_to_share) + 1)
ys = []
for k in xs:
    dev_top = set(get_top_k(name_to_share, k))
    loc_top = set(get_top_k(name_to_loc, k))
    ys.append(len(dev_top & loc_top) / k)

plt.plot(xs, ys)
plt.show()

## Fig2 in the paper: Overlapping of LocRank and DevRank

In [None]:
def figure2(alpha):
    """Plot and return the overlap of the DevRank and LocRank top lists.

    This function needs name_to_loc, so make sure you run that cell first.

    For each fraction r in 0.05, 0.10, ..., 1.00 the top-k names of both
    rankings are compared, with k = floor(r * number of DevRank authors),
    but never less than 1.

    Args:
        alpha: value forwarded to get_name_to_share.

    Returns:
        (xs, ys): the fractions and, for each, the share of the k slots
        occupied by names that appear in both top-k lists.
    """
    xs = [0.05 * i for i in range(1, 21)]
    ys = []
    name_to_share, _ = get_name_to_share(alpha)
    num_authors = len(name_to_share)
    for r in xs:
        # With fewer than 20 authors floor(r * num_authors) is 0 for the small
        # fractions, which would divide by zero below; compare at least top-1.
        k = max(1, math.floor(r * num_authors))
        top_k_devrank = get_top_k(name_to_share, k)
        top_k_locrank = get_top_k(name_to_loc, k)
        perc = len(set(top_k_devrank).intersection(set(top_k_locrank))) / k
        ys.append(perc)

    plt.plot(xs, ys)
    # Only the 0.8-1.0 band is displayed; the returned data is not clipped.
    plt.ylim((0.8, 1))
    plt.show()

    return xs, ys

# Produce the figure for two alphas and store the plotted points as
# ../temp/<repo_name>-fig2-<alpha>.csv.
for alpha in [0.2, 0.8]:
    xs, ys = figure2(alpha)
    write_dots_to_csv(xs, ys, "../temp/%s-fig2-%.2f.csv" % (repo_name, alpha), "Percentage,Overlapping\n")

## Fig3 in the paper: Developers' relative share changes between LocRank and DevRank

In [None]:
def figure3(alpha):
    """Plot and return each developer's DevRank share relative to their LOC share.

    This function needs name_to_loc, so make sure you run that cell first.

    Developers are visited in descending LOC order. For each one the y value
    is name_to_share[n] * loc_sum / name_to_loc[n], i.e. the DevRank share
    divided by the developer's fraction of all LOC. Developers with zero LOC
    are skipped because that ratio is undefined for them.

    Args:
        alpha: value forwarded to get_name_to_share.

    Returns:
        (xs, ys): xs holds each plotted developer's 0-based position in the
        LOC ordering, ys the matching ratios.

    Raises:
        KeyError: if a name in name_to_loc is missing from the DevRank result.
    """
    name_to_share, _ = get_name_to_share(alpha)
    # sorted in descending order
    names_sorted_by_loc = [pair[0] for pair in sorted(name_to_loc.items(), key=lambda x: x[1], reverse=True)]
    loc_sum = 0
    for n in name_to_loc:
        loc_sum += name_to_loc[n]

    ys, xs = [], []
    for idx, n in enumerate(names_sorted_by_loc):
        loc = name_to_loc[n]
        if not loc:
            # An author whose commits carry no LOC would divide by zero here;
            # get_relative_share_lists skips zero denominators the same way.
            continue
        xs.append(idx)
        ys.append(name_to_share[n] * loc_sum / loc)

    plt.plot(xs, ys, 'o')
    # Only ratios between 0 and 2 are displayed; the returned data is not clipped.
    plt.ylim((0, 2))
    plt.show()

    return xs, ys

# Produce the figure for two alphas and store the plotted points as
# ../temp/<repo_name>-fig3-<alpha>.csv.
for alpha in [0.2, 0.8]:
    xs, ys = figure3(alpha)
    write_dots_to_csv(xs, ys, "../temp/%s-fig3-%.2f.csv" % (repo_name, alpha), "LocRank,DevRank\n")

## Fig5 in the paper: DevRank scaled by commit type

In [None]:
"""
0: bug
1: improvement
2: feature
3: maintenance
"""

# c2t / c2pt are indexed by repo_name further down and passed on as sha_to_type.
# presumably c2t holds labelled and c2pt predicted commit types (going by the
# file names) — TODO confirm.
c2t, c2pt = None, None
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('../data/commit2type.pickle', 'rb') as pf:
    c2t = pickle.load(pf)
with open('../data/predict_commit2type.pickle', 'rb') as pf:
    c2pt = pickle.load(pf)

In [None]:
def get_relative_share_lists(alpha, sha_to_type, weight_vectors = [[1, 1, 1, 1]]):
    """Compare type-weighted DevRank shares against the unweighted baseline.

    Args:
        alpha: value forwarded to get_name_to_share for every run.
        sha_to_type: forwarded to get_name_to_share for the weighted runs.
        weight_vectors: one `coefs` list per weighted run; the default list is
            shared between calls and is only read here.

    Returns:
        One list per weight vector holding weighted share / baseline share for
        each developer, ordered by descending baseline share. Developers whose
        baseline share is zero are left out.
    """
    baseline, _ = get_name_to_share(alpha)
    ranked = sorted(baseline.items(), key=lambda item: item[1], reverse=True)
    # Keep only developers with a non-zero baseline so the ratio is defined.
    kept = [name for name, share in ranked if share]

    results = []
    for coefs in weight_vectors:
        weighted, _ = get_name_to_share(alpha, sha_to_type=sha_to_type, coefs=coefs)
        results.append([weighted[name] / baseline[name] for name in kept])
    return results

# Alpha used for every DevRank run in this figure.
ALPHA = 0.8
# One coefs vector per run; per the inline note the first doubles index 0 (bug)
# and the second doubles index 2 (feature) of the commit-type legend above.
WEIGHTS = [[2, 1, 1, 1], [1, 1, 2, 1]] # 2x bug, 2x feature

# Uses the c2pt types with the default all-ones weights.
# NOTE(review): `pred` is not read by any cell visible here — confirm it is still needed.
pred = get_relative_share_lists(ALPHA, c2pt[repo_name])
# Uses the c2t types with the weight vectors above; plotted and exported below.
scaled = get_relative_share_lists(ALPHA, c2t[repo_name], WEIGHTS)

In [None]:
import math

def fig5_preview(headers, columns):
    """Scatter-plot every column against its element index and show the figure.

    Args:
        headers: legend labels, one per column.
        columns: sequences of y values; the x values are 0..len(column)-1.
    """
    for series in columns:
        positions = range(len(series))
        plt.plot(positions, series, 'o')
    # Only values between 0 and 3 are displayed.
    plt.ylim((0, 3))
    plt.legend(headers, loc='best')
    plt.show()

def fig5_csv(headers, columns, file_name):
    """Write the columns side by side to `file_name` as comma-terminated rows.

    The first line is `Developers,` followed by each header; every following
    line starts with the row index and then one value per column. Each field,
    including the last one on a line, is followed by a comma. The number of
    rows is taken from the first column.

    Raises:
        IndexError: if `columns` is empty or a column is shorter than the first.
    """
    with open(file_name, 'w') as out:
        out.write('Developers,')
        out.writelines(title + ',' for title in headers)
        out.write('\n')
        row_count = len(columns[0])
        for row in range(row_count):
            out.write(str(row) + ',')
            out.writelines(str(series[row]) + ',' for series in columns)
            out.write('\n')

# Preview the two weighted runs, then export them.
fig5_preview(['bug x 2', 'feature x 2'], scaled)
# The headers contain literal "\\times" text (presumably LaTeX for a downstream
# plotting tool — TODO confirm). Unlike the other exports, this file is written
# to the current directory rather than ../temp.
fig5_csv(['bug$\\\\times2$', 'feature$\\\\times2$'], scaled, 'dev-context-' + repo_name + '.csv')

## Accuracy of predicting committers

In [None]:
# Table 1 in the paper
# Committer prediction performance of CommitRank, LocRank, and DevRank

# Compare as many top-ranked names as there are committers on the roster.
k = len(committer_names)

name_to_share, _ = get_name_to_share(0.8)
top_k_devrank = get_top_k(name_to_share, k)
top_k_locrank = get_top_k(name_to_loc, k)
top_k_nocrank = get_top_k(name_to_noc, k)

# The top-k lists hold distinct names, so counting members equals the size of
# the intersection with the roster.
print("DevRank accuracy: ", sum(1 for name in top_k_devrank if name in committer_names) / k)
print("LocRank accuracy: ", sum(1 for name in top_k_locrank if name in committer_names) / k)
print("NocRank accuracy: ", sum(1 for name in top_k_nocrank if name in committer_names) / k)

In [None]:
# Relies on `k` (roster size) from the accuracy cell above.
# NOTE(review): this cell uses alpha 0.2 while the accuracy cell uses 0.8 — confirm intended.
name_to_share, _ = get_name_to_share(0.2)
top_2k_devrank = get_top_k(name_to_share, 2 * k)
top_2k_locrank = get_top_k(name_to_loc, 2 * k)
top_2k_nocrank = get_top_k(name_to_noc, 2 * k)

# Fraction of roster committers found among the top 2k names of each ranking.
print("DevRank recall: ", len(set(top_2k_devrank).intersection(committer_names)) / k)
print("LocRank recall: ", len(set(top_2k_locrank).intersection(committer_names)) / k)
print("NocRank recall: ", len(set(top_2k_nocrank).intersection(committer_names)) / k)

In [None]:
# For every k, count how many roster committers are among the top-k names of
# DevRank and of LocRank, and plot both counts against k.
# Note: the loop rebinds the global `k` set in the accuracy cell.
xs = range(1, len(name_to_share) + 1)
ys_loc = []
ys_dev = []
for k in xs:
    top_k_devrank = set(get_top_k(name_to_share, k))
    top_k_locrank = set(get_top_k(name_to_loc, k))
    ys_dev.append(len(top_k_devrank.intersection(committer_names)))
    ys_loc.append(len(top_k_locrank.intersection(committer_names)))
plt.plot(xs, ys_dev, color='b', label="DevRank")
plt.plot(xs, ys_loc, color='r', label="LocRank")
plt.legend(loc='best')
plt.show()

## Mismatch between Apache Roster and Analysis Result

In [None]:
# NOTE: Only run this cell if you're analyzing apr repo
roster_spelling = ["Branko Čibej", "William A. Rowe Jr.", "Gregg Lewis Smith", "André Malo"]
repo_spelling = ["Branko Cibej", "William A. Rowe Jr", "Gregg L. Smith", "Andre Malo"]

In [None]:
# NOTE: Only run this cell if you're analyzing httpd repo
roster_spelling = ["André Malo", "Ask Bjørn Hansen", "William A. Rowe Jr."]
repo_spelling = ["Andre Malo", "Ask Bjorn Hansen", "William A. Rowe Jr"]

In [None]:
# NOTE: Only run this cell if you're analyzing flink repo
roster_spelling = ['Ufuk Celebi']
repo_spelling = ['uce']

In [None]:
# Fix different spelling between repo and apache roster
# Swap each roster spelling for the spelling used in the repository.
# - The membership check makes the cell safe to re-run and safe when the
#   spelling lists belong to a different repo (set.remove would raise KeyError).
# - pmc_names is respelled as well; otherwise a PMC member whose name was
#   respelled in committer_names would no longer be found in pmc_names.
for roster_name, repo_spelled in zip(roster_spelling, repo_spelling):
    for roster in (committer_names, pmc_names):
        if roster_name in roster:
            roster.remove(roster_name)
            roster.add(repo_spelled)

In [None]:
# Compare names in analysis result with names in roster
# Rank developers by share (1 = largest) and compare the ranked names with the roster.
sorted_name_to_share = sorted(name_to_share.items(), key=lambda x: x[1], reverse=True)
name_to_devrank = {
    name: rank
    for rank, (name, _share) in enumerate(sorted_name_to_share, 1)
}

ranked_names = set(name_to_devrank)
apache_only = committer_names - ranked_names
repo_only = ranked_names - committer_names
print("Number of people present in Apache roster but absent from analysis result: ", len(apache_only))
#pp.pprint(apache_only)
print("Number of people present in analysis result but absent in Apache roster: ", len(repo_only))
#pp.pprint(repo_only)

## Correlation Coefficient and Significance Test

In [None]:
# Sweep alpha over 0.05, 0.10, ..., 0.95 and correlate roster role with DevRank share.
initial_alpha = 0.05
step = 0.05
for i in range(19):
    alpha = initial_alpha + step * i
    name_to_share, _ = get_name_to_share(alpha)

    # Role label per ranked committer: 2 = also a PMC member, 1 = committer only.
    # Names that are not on the committer roster are left out.
    truth = {
        n: (2 if n in pmc_names else 1)
        for n in name_to_share
        if n in committer_names
    }

    # Parallel lists, all in the iteration order of `truth`.
    truth_lst = list(truth.values())
    pred_lst = [name_to_share[n] for n in truth]
    loc_lst = [name_to_loc[n] for n in truth]

    print("Alpha: %.2f" % alpha, spearmanr(truth_lst, pred_lst))
    # print("Alpha: %.2f" % alpha, spearmanr(pred_lst, loc_lst))
    # print("Alpha: %.2f" % alpha, kendalltau(pred_lst, truth_lst))
    # print("Alpha: %.2f" % alpha, kendalltau(pred_lst, loc_lst))

## Gini Coefficients over Alpha

In [None]:
# Sweep alpha over 0.05, 0.10, ..., 0.95 and report, for each value, the value
# over all DevRank shares, the value over the ten largest shares, and those
# ten (name, share) pairs.
initial_alpha = 0.05
step = 0.05
for i in range(19):
    alpha = initial_alpha + step * i
    name_to_share, _ = get_name_to_share(alpha)

    # The number printed right after each label is the alpha of this iteration.
    print("Overall Gini: %.2f" % alpha, get_gini(name_to_share))
    top = sorted(name_to_share.items(), key=lambda x: x[1], reverse=True)[:10]
    # NumPy is only bound as `np` in this notebook; the bare name `numpy`
    # raised NameError here.
    print("Top Gini: %.2f" % alpha, gini(np.array([x[1] for x in top])))
    pp.pprint(top)
    print()