In [None]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 9999.
// NOTE(review): presumably this keeps long cell outputs fully expanded instead of
// collapsing them into a scroll box -- confirm against the notebook front-end in use.
IPython.OutputArea.auto_scroll_threshold = 9999;

## Load Call-Commit Graph
There are two options; run only one of the two cells below:
1. Analyze the target repo from scratch and save the result in a pickle file.
2. Load an existing analysis result from a pickle file.

In [None]:
# import dependencies and set default alpha value
import pickle
import pprint
from graphs.call_commit_graph import CallCommitGraph
from scipy.stats import spearmanr, kendalltau
# Default alpha handed to the DevRank calls below (devrank_developers / update_shares)
default_alpha = 0.85
# Shared pretty-printer used when dumping nested results
pp = pprint.PrettyPrinter(indent=4)

In [None]:
# Option 1: build the call-commit graph from the local httpd checkout.
# Per the original author's note, a pickle file is generated when the analysis finishes.
ccg = CallCommitGraph("repos/httpd")
ccg.process(
    from_beginning=True,
    into_branches=True,
    max_branch_length=200,
)

In [None]:
# Option 2: restore a previously saved analysis instead of recomputing it.
# The file below holds the Apache HTTP Server (httpd) result; per the original note,
# the httpd repo must exist for the pickle to load correctly.
# For Apache Portable Runtime, point the path at "data/apr-finished.pickle" instead.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
ccg = None
with open("data/httpd-finished.pickle", "rb") as saved_graph:
    ccg = pickle.load(saved_graph)

## DevRank Share

In [None]:
# Run DevRank on the Call-Commit Graph and then map developer name to share
# WARNING: developers who share the same name are merged into a single entry
def get_name_to_share(alpha, graph=None):
    """Run DevRank on a call-commit graph and total the shares per developer name.

    Args:
        alpha: value passed straight through to ``devrank_developers``.
        graph: object exposing ``devrank_developers(alpha)``. When omitted, the
            notebook-level ``ccg`` is used, so existing one-argument calls
            behave as before.

    Returns:
        A ``(name_to_share, email_to_name)`` tuple. ``name_to_share`` sums the
        share of every email that maps to the same name; ``email_to_name`` is
        returned exactly as ``devrank_developers`` produced it.
    """
    if graph is None:
        # Fall back to the graph loaded earlier in the notebook
        graph = ccg
    sorted_share, email_to_name = graph.devrank_developers(alpha)
    name_to_share = {}
    for email, share in sorted_share:
        name = email_to_name[email]
        # Several emails may belong to one name; accumulate their shares
        name_to_share[name] = name_to_share.get(name, 0) + share
    return name_to_share, email_to_name
# Compute shares at the default alpha, then display developers from largest to smallest share
name_to_share, email_to_name = get_name_to_share(default_alpha)
sorted(name_to_share.items(), key=lambda entry: entry[1], reverse=True)

## LOC

In [None]:
# Accumulate, per author email, the per-function values recorded in ccg.history
email_to_loc = {}
for commit_sha in ccg.history:
    author_email = ccg.repo.commit(commit_sha).author.email
    # Register the author even when the commit has no function entries
    email_to_loc.setdefault(author_email, 0)
    per_function = ccg.history[commit_sha]
    for function_key in per_function:
        email_to_loc[author_email] += per_function[function_key]

# Fold emails into names.
# WARNING: people who share the same name are merged into a single entry
name_to_loc = {}
for author_email, loc_total in email_to_loc.items():
    author_name = email_to_name[author_email]
    name_to_loc[author_name] = name_to_loc.get(author_name, 0) + loc_total
sorted(name_to_loc.items(), key=lambda entry: entry[1], reverse=True)

## Calculate Gini Coefficient

In [None]:
import numpy
from tools.excel_charts.gini.gini import gini

def get_gini(name_to_values):
    """Return the Gini coefficient of the values in a name -> value mapping.

    Every value is converted to ``float`` and packed into a numpy array before
    being handed to ``gini``.
    """
    as_floats = numpy.array(list(map(float, name_to_values.values())))
    return gini(as_floats)

# Gini of the DevRank shares first, then of the LOC totals
for per_name_totals in (name_to_share, name_to_loc):
    print(get_gini(per_name_totals))

## Basic Validation of Analysis Result

In [None]:
# A commit that receives no share must have an empty entry in ccg.history
# (the check is per commit sha, not per person)
ccg.update_shares(default_alpha)
for sha in ccg.history:
    if ccg.share[sha] == 0:
        # Name the offending commit so a failure can be investigated directly
        assert len(ccg.history[sha]) == 0, \
            "commit %s has zero share but a non-empty history entry" % sha

In [None]:
# Sanity check: compare this count by hand with the total number of commits shown on GitHub
analyzed_commit_count = len(ccg.visited)
print("Number of commits analyzed: ", analyzed_commit_count)

In [None]:
# Collect every distinct author and committer email/name in the repo.
# (The number of distinct committer emails is also an upper bound on the number of committers.)
all_commits = list(ccg.repo.iter_commits())
author_emails = {c.author.email for c in all_commits}
author_names = {c.author.name for c in all_commits}
committer_emails = {c.committer.email for c in all_commits}
committer_names = {c.committer.name for c in all_commits}
for heading, distinct_values in (
    ("Number of distinct author emails: ", author_emails),
    ("Number of distinct author names: ", author_names),
    ("Number of distinct committer emails: ", committer_emails),
    ("Number of distinct committer names: ", committer_names),
):
    print(heading, len(distinct_values))

## Get Developer Role from Apache

In [None]:
# NOTE: Only run this cell if you're analyzing the apr repo
# Roster source: https://projects.apache.org/committee.html?apr
# Copy the roster as a string below.
# Both strings are comma-separated lists of names; a later cell splits them on ','
# and strips the surrounding whitespace.
# PMC roster
pmc_string = "Aaron Bannert,  Brian Havard, Bradley Nicholes,  Bojan Smojver,  Branko Čibej,  Colm MacCarthaigh,  Eric Covener,  Davi Arnaut,  Dirk-Willem van Gulik,  Brian Fitzpatrick,  Guenter Knauf,  Greg Ames,  Gregg Lewis Smith,  Greg Stein,  Christophe Jaillet,  Justin Erenkrantz,  Jean-Frederic Clere,  Jim Jagielski,  Joe Orton,  Cliff Woolley,  Karl Fogel,  Martin Kraemer,  Max Oliver Bowsher,  Graham Leggett,  Mladen Turk,  Nick Kew,  Paul Querna,  Rainer Jung,  Garrett Rooney,  Ruediger Pluem,  Sascha Schumann,  Stefan Fritsch,  Bill Stoddard,  Sander Striker,  Ben Collins-Sussman,  Thom May,  Jeff Trawick,  William A. Rowe Jr.,  Wilfredo Sanchez,  Yann Ylavic"
# Committer roster
committer_string = "Aaron Bannert,  Allan K. Edwards,  Ben Laurie,  Brian Havard,  Bradley Nicholes,  Bojan Smojver,  Branko Čibej,  Brian Pane,  Chuck Murcko,  Jean-Jacques Clar,  Ken Coar,  Colm MacCarthaigh,  Eric Covener,  Davi Arnaut,  Dirk-Willem van Gulik,  Doug MacEachern,  David Reid,  Tony Finch,  Roy T. Fielding,  Brian Fitzpatrick,  Guenter Knauf,  Greg Ames,  Gregg Lewis Smith,  Greg Stein,  Henry Jen,  Hyrum Kurt Wright,  Ian Holsman,  Issac Goldstand,  Ivan Zhakov,  Christophe Jaillet,  Justin Erenkrantz,  Jean-Frederic Clere,  Jim Jagielski,  Joe Orton,  Cliff Woolley,  Karl Fogel,  Madhusudan Mathihalli,  Martin Kraemer,  Max Oliver Bowsher,  Graham Leggett,  Mladen Turk,  André Malo,  Neil Conway,  Nick Kew,  Victor J. Orlikowski,  Philip M. Gollucci,  Daniel Earl Poirier,  Paul Querna,  Ryan Bloom,  Paul J. Reder,  Rainer Jung,  Garrett Rooney,  Ruediger Pluem,  Ralf S. Engelschall,  Sascha Schumann,  Sander Temme,  Stefan Fritsch,  Bill Stoddard,  Sander Striker,  Ben Collins-Sussman,  Takashi Sato,  Thomas J. Donovan,  Jeff Trawick,  William A. Rowe Jr.,  Wilfredo Sanchez,  Yann Ylavic"

In [None]:
# NOTE: Only run this cell if you're analyzing the httpd repo
# Roster source: https://projects.apache.org/committee.html?httpd
# Copy the roster as a string below.
# Both strings are comma-separated lists of names; a later cell splits them on ','
# and strips the surrounding whitespace. Each literal must stay on a single line:
# a line break inside the quotes is a syntax error and would also corrupt a name.
# PMC roster
pmc_string = "Brian Havard,  Bradley Nicholes,  Bojan Smojver,  Ben Reser,  Brian McCallister,  Chris Darroch,  Colm MacCarthaigh,  Eric Covener,  Dirk-Willem van Gulik,  Stephen Henson,  Daniel Ruggeri,  Luca Toscano,  Roy T. Fielding,  Guenter Knauf,  Graham Phillip Dumpleton,  Greg Ames,  Gregg Lewis Smith,  Greg Stein,  Daniel Gruno,  Stefan Eissing,  Igor Galić,  Issac Goldstand,  Christophe Jaillet,  Jacob Champion,  Justin Erenkrantz,  Jean-Frederic Clere,  Jim Jagielski,  Joe Schaefer,  Joe Orton,  Kaspar Brand,  Astrid Malo,  Lars Eilebrecht,  Lucien Gentis,  Graham Leggett,  Mark J. Cox,  André Malo,  Nick Kew,  Tony Stevenson,  Philip M. Gollucci,  Paul Querna,  Rich Bowen,  Rainer Jung,  Ruediger Pluem,  Sander Temme,  Stefan Fritsch,  Steffen Land,  Jeff Trawick,  William A. Rowe Jr.,  Yann Ylavic"
# Committer roster
committer_string = "Aaron Bannert,  Andrew William John Ford,  Allan K. Edwards,  Ask Bjørn Hansen,  Andreas Steinmetz,  Ben Laurie,  Jem Berkes,  Jesus Blanco Izquierdo,  Bradley Nicholes,  Bojan Smojver,  Ben Reser,  Brian McCallister,  Chris Darroch,  Chuck Murcko,  Jean-Jacques Clar,  Ken Coar,  Colm MacCarthaigh,  Eric Covener,  Daniel Ferradal,  David Harris,  Dirk-Willem van Gulik,  Doug MacEachern,  David Shane Holden,  Stephen Henson,  Daniel Ruggeri,  Edward Lu,  Luca Toscano,  Erik Abele,  Fabien Coelho,  Roy T. Fielding,  Guenter Knauf,  Guy Ferraiolo,  Geoffrey Young,  Philippe Chiasson,  Graham Phillip Dumpleton,  Greg Ames,  Gregory Trubetskoy,  Vincent Deffontaines,  Gregg Lewis Smith,  Greg Stein,  Daniel Gruno,  Ian Holsman,  Stefan Eissing,  Igor Galić,  Issac Goldstand,  Ivan Alexis Barrera Andrade,  Jacek Prucia,  Christophe Jaillet,  Jacob Champion,  Justin Erenkrantz,  Jean-Frederic Clere,  James Paul Gallacher,  Jim Jagielski,  Jim Winstead Jr.,  Jan Kaluža,  Joe Schaefer,  Joe Orton,  John Sachs,  Jason S. Lingohr,  Hiroaki Kawai,  Kaspar Brand,  Keith Wannamaker,  Astrid Malo,  Evgeny Kotkov,  Lars Eilebrecht,  Lucien Gentis,  Luis Gil,  Jeon Jeongho,  Madhusudan Mathihalli,  Mads Toftum,  Manoj Kasichainula,  Martin Kraemer,  Matt Sergeant,  Max Kellermann,  Maxime Petazzoni,  Matthieu Estrade,  Graham Leggett,  Mark J. Cox,  Mike Rumph,  Mladen Turk,  André Malo,  Niklas Edmundsson,  Nilgun Belma Buguner,  Nick Kew,  Nicolas Lehuen,  Vincent Bray,  Victor J. Orlikowski,  Parinkumar Shah,  Tony Stevenson,  Chris Pepper,  Philip M. Gollucci,  Daniel Earl Poirier,  Ryan Pan,  Paul Querna,  Rich Bowen,  Paul J. Reder,  Rian Hunter,  Rici Lake,  Rainer Jung,  Ruediger Pluem,  Ralf S. Engelschall,  Sascha Schumann,  Sander Temme,  Stefan Fritsch,  Joshua Slive,  Ilia Soldatenko,  Steffen Land,  Steve Hay,  Sander Striker,  Stefan Sperling,  Takashi Sato,  Thomas J. Donovan,  David Wheeler,  Frank Gingras,  Jeff Trawick,  William A. Rowe Jr.,  Wilfredo Sanchez,  Yann Ylavic,  Yoshiki Hayashi"

In [None]:
# Split each comma-separated roster into a set of whitespace-trimmed names
pmc_names = {entry.strip() for entry in pmc_string.split(',')}
committer_names = {entry.strip() for entry in committer_string.split(',')}

In [None]:
# Basic statistics of the roster: sizes, overlap, and PMC members missing from the committer list
in_both_rosters = pmc_names & committer_names
pmc_only = pmc_names - committer_names
print("Number of PMC members: ", len(pmc_names))
print("Number of committers: ", len(committer_names))
print("Number of committers in PMC: ", len(in_both_rosters))
print("Number of committers not in PMC: ", len(committer_names) - len(in_both_rosters))
print("Number of PMC members not in committers: ", len(pmc_only))
print("Total number of people: ", len(committer_names) + len(pmc_only))
print("In PMC but not a committer:\n")
# Walk pmc_names itself so the listing follows that set's iteration order
for member in pmc_names:
    if member in committer_names:
        continue
    print("\t" + member)

## Mismatch between Apache Roster and Analysis Result

In [None]:
# NOTE: Only run this cell if you're analyzing the apr repo
# Parallel lists: roster_spelling[i] is a name as written in the Apache roster and
# repo_spelling[i] is the spelling that replaces it (the two lists are paired by index).
roster_spelling = ["Branko Čibej", "William A. Rowe Jr.", "Gregg Lewis Smith", "André Malo"]
repo_spelling = ["Branko Cibej", "William A. Rowe Jr", "Gregg L. Smith", "Andre Malo"]

In [None]:
# NOTE: Only run this cell if you're analyzing the httpd repo
# Parallel lists: roster_spelling[i] is a name as written in the Apache roster and
# repo_spelling[i] is the spelling that replaces it (the two lists are paired by index).
roster_spelling = ["André Malo", "Ask Bjørn Hansen", "William A. Rowe Jr."]
repo_spelling = ["Andre Malo", "Ask Bjorn Hansen", "William A. Rowe Jr"]

In [None]:
# Replace roster spellings with the spellings used in the repo.
# Both roster sets are normalized: the correlation cell tests repo-spelled names
# against pmc_names as well as committer_names, so fixing only committer_names
# would score the affected PMC members as plain committers.
# The membership guard makes this cell safe to re-run and tolerates a name that
# appears in only one of the two rosters.
for roster_name, repo_name in zip(roster_spelling, repo_spelling):
    for roster in (committer_names, pmc_names):
        if roster_name in roster:
            roster.remove(roster_name)
            roster.add(repo_name)

In [None]:
# Rank developers by share (1 = largest share), then compare the analyzed names with the roster
sorted_name_to_share = sorted(name_to_share.items(), key=lambda entry: entry[1], reverse=True)
name_to_devrank = {
    developer: rank
    for rank, (developer, _share) in enumerate(sorted_name_to_share, 1)
}

analyzed_names = set(name_to_devrank)
apache_only = committer_names - analyzed_names
repo_only = analyzed_names - committer_names
print("Number of people present in Apache roster but absent from analysis result: ", len(apache_only))
# To list them: pp.pprint(apache_only)
print("Number of people present in analysis result but absent in Apache roster: ", len(repo_only))
# To list them: pp.pprint(repo_only)

## Correlation Coefficient and Significance Test

In [None]:
# Sweep alpha from 0.05 to 0.95 in steps of 0.05 and print, for each value, the
# Spearman rank correlation between roster role and DevRank share.
initial_alpha = 0.05
step = 0.05
for i in range(19):
    alpha = initial_alpha + step * i
    # Keep the sweep result in its own name so the notebook-level name_to_share
    # (computed at default_alpha) is not silently replaced by the last alpha tried.
    shares_at_alpha = get_name_to_share(alpha)[0]

    # Role of every analyzed developer found on the roster:
    # 2 = PMC member, 1 = committer who is not in the PMC
    truth = {
        n: 2 if n in pmc_names else 1
        for n in shares_at_alpha
        if n in committer_names
    }

    truth_lst = []
    pred_lst = []
    loc_lst = []  # only consumed by the alternative metrics listed below
    for n, role in truth.items():
        truth_lst.append(role)
        pred_lst.append(shares_at_alpha[n])
        loc_lst.append(name_to_loc[n])

    print("Alpha: %.2f" % alpha, spearmanr(truth_lst, pred_lst))
    # Alternative metrics:
    # print("Alpha: %.2f" % alpha, spearmanr(pred_lst, loc_lst))
    # print("Alpha: %.2f" % alpha, kendalltau(pred_lst, truth_lst))
    # print("Alpha: %.2f" % alpha, kendalltau(pred_lst, loc_lst))

## Gini Coefficients over Alpha

In [None]:
# Sweep alpha from 0.05 to 0.95 in steps of 0.05 and print the Gini coefficient of
# all shares and of the ten largest shares. The number printed right after each
# label is the alpha value; the coefficient follows it.
initial_alpha = 0.05
step = 0.05
for i in range(19):
    alpha = initial_alpha + step * i
    # Keep the sweep result in its own name so the notebook-level name_to_share
    # (computed at default_alpha) is not silently replaced by the last alpha tried.
    shares_at_alpha = get_name_to_share(alpha)[0]

    print("Overall Gini: %.2f" % alpha, get_gini(shares_at_alpha))
    top = sorted(shares_at_alpha.items(), key=lambda x: x[1], reverse=True)[:10]
    # Go through get_gini so the top-10 values get the same float conversion as the overall figure
    print("Top Gini: %.2f" % alpha, get_gini(dict(top)))
    pp.pprint(top)
    print()