In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from __future__ import print_function, division

from matplotlib import pyplot as plt
import pandas
import src
import gensim
import os
import os.path
import csv
import functools
import itertools
import collections
import scipy
import scipy.stats
from operator import itemgetter
from pprint import pprint

def fake(*args, **kwargs):
    print('Fake called with', str(args), str(kwargs))
    sys.exit(1)

# fake out the create_model so we don't accidentally attempt to create data
src.common.create_model = fake

In [3]:
print(os.getcwd())
if os.getcwd().endswith('notebooks'):
    os.chdir('..')
print(os.getcwd())

/home/cscorley/git/triage/notebooks
/home/cscorley/git/triage


In [38]:
def wilcoxon(x, y):
    T, p = scipy.stats.wilcoxon(x.dropna(), y.dropna(), correction=True)

    nonzeros = sum(1 for a, b in zip(x, y) if (a - b) != 0)
    S = sum(range(1, nonzeros + 1))

    #assert any([item == 0 for item in x]), "x has 0"
    #assert any([item == 0 for item in y]), "y has 0"

    assert S >= T, "%f %f" % (S, T)

    Td = S - T
    rsp1 = Td / S
    rsp2 = T / S
    r = rsp1 - rsp2
    # From this information alone, the remaining rank sum can be computed, because
    # it is the total sum S minus T, or in this case 45 - 18 = 27. Next, the two
    # rank-sum proportions are 27/45 = 60% and 18/45 = 40%. Finally, the rank
    # correlation is the difference between the two proportions (.60 minus .40),
    # hence r = .20.
    return T, p, r

In [5]:
kwargs = dict(model="lda", level="file", source=["changeset"], force=False, rankpath='', config=dict())
default_model_config, model_config_string = src.main.get_default_model_config(kwargs)
default_changeset_config, changeset_config_string = src.main.get_default_changeset_config()

model_config = dict(default_model_config)

model_config.update({
    'alpha_base': 1,
    'eta_base': 1,
})

changeset_config = dict(default_changeset_config)

def get_config_string(config):
    return '-'.join([unicode(v) for k, v in sorted(config.items()) if not k.endswith("_base")])

alpha_bases = ['auto', 1, 2, 5]
eta_bases = ['auto', 1, 2, 5]
num_topics = [100, 200, 500]

def get_rank_name(kind, experiment, changeset_config, model_config):
    """
    kind = [changeset, release, temporal]
    experiment = [triage, feature_location]
    """
    cs_str = get_config_string(changeset_config)
    model_config = dict(model_config)
    m_str = get_config_string(model_config)
    return '-'.join([kind, experiment, 'lda', cs_str, m_str, 'file', 'ranks']).lower() + '.csv.gz'

model_sweep = list()
for a, e, K in itertools.product(alpha_bases, eta_bases, num_topics):
    m = dict(model_config)
    m['alpha_base'] = a
    m['eta_base'] = e

    if a != 'auto':
        a /= K
    if e != 'auto':
        e /= K
    m['alpha'] = a
    m['eta'] = e
    m['num_topics'] = K
    model_sweep.append(m)

corpus_sweep = list()
b = [True, False]
for a, c, m, r in itertools.product(b, repeat=4):
    conf = dict(changeset_config)
    conf['include_additions'] = a
    conf['include_context'] = c
    conf['include_message'] = m
    conf['include_removals'] = r
    if any(conf.values()):
        corpus_sweep.append(conf)

In [6]:


# All of the sweep files need renaming throughout.  The *do* use the seed, it's just not named correctly
model_config_string = model_config_string.replace("seed1-", "")

kwargs.update({'changeset_config': changeset_config,
               'changeset_config_string': changeset_config_string})

kwargs.update({'model_config': model_config,
               'model_config_string': model_config_string})

# load project info
projects = src.common.load_projects(kwargs)

#projects = src.common.load_projects(dict(model="lda", level="file", rankpath='', config=dict()), "../data")
projects

[Project(name='tika', printable_name='Tika v1.8', version='v1.8', ref='refs/tags/1.8', data_path='data/tika/', full_path='data/tika/v1.8/', src_path='data/tika/v1.8/src/', model_config={'passes': 1, 'num_topics': 500, 'algorithm': 'batch', 'decay': 0.5, 'eta_base': 1, 'alpha_base': 1, 'eta': 0.002, 'iterations': 1000, 'offset': 1.0, 'alpha': 0.002, 'max_bound_iterations': 1000}, source=['changeset'], model_config_string=u'batch-0.002-0.5-0.002-1000-1000-500-1.0-1', force=False, level='file', changeset_config_string=u'True-True-False-True', model='lda', config={}, changeset_config={'include_removals': True, 'include_message': False, 'include_additions': True, 'include_context': True}, rankpath=''),
 Project(name='pig', printable_name='Pig v0.14.0', version='v0.14.0', ref='refs/tags/release-0.14.0', data_path='data/pig/', full_path='data/pig/v0.14.0/', src_path='data/pig/v0.14.0/src/', model_config={'passes': 1, 'num_topics': 500, 'algorithm': 'batch', 'decay': 0.5, 'eta_base': 1, 'alpha

In [7]:
cs_dit = list()
for project, rankstuff in itertools.product(projects, [
        (c, get_rank_name('changeset', 'triage', c, model_config)) for c in corpus_sweep]):
    config, rankname = rankstuff
    rankpath = os.path.join(project.data_path, project.version, rankname)
    if os.path.exists(rankpath):
        #  cs_dit.append(project._replace(rankpath=rankpath, config=config))
        project = project._replace(changeset_config=config, rankpath=rankpath, config=config)
        cs_dit.append(
            project._replace(
                model_config_string=get_config_string(project.model_config),
                changeset_config_string=get_config_string(project.changeset_config)
            ))
    else:
        print(rankpath)

cs_flt = list()
for project, rankstuff in itertools.product(projects, [
        (c, get_rank_name('changeset', 'feature_location', c, model_config)) for c in corpus_sweep]):
    config, rankname = rankstuff
    rankpath = os.path.join(project.data_path, project.version,  rankname)
    if os.path.exists(rankpath):
        #cs_flt.append(project._replace(rankpath=rankpath, config=config))
        project = project._replace(changeset_config=config, rankpath=rankpath, config=config)
        cs_flt.append(
            project._replace(
                model_config_string=get_config_string(project.model_config),
                changeset_config_string=get_config_string(project.changeset_config)
            ))
    else:
        print(rankpath)

        
ms_dit = list()
for project, rankstuff in itertools.product(projects, [
        (c, get_rank_name('changeset', 'triage', changeset_config, c)) for c in model_sweep]):
    config, rankname = rankstuff
    rankpath = os.path.join(project.data_path, project.version,  rankname)
    if os.path.exists(rankpath):
        #ms_dit.append(project._replace(rankpath=rankpath, config=config))
        project = project._replace(model_config=config, rankpath=rankpath, config=config)
        ms_dit.append(
            project._replace(
                model_config_string=get_config_string(project.model_config),
                changeset_config_string=get_config_string(project.changeset_config)
            ))
    else:
        print(rankpath)
        
ms_flt = list()
for project, rankstuff in itertools.product(projects, [
        (c, get_rank_name('changeset', 'feature_location', changeset_config, c)) for c in model_sweep]):
    config, rankname = rankstuff
    rankpath = os.path.join(project.data_path, project.version,  rankname)
    if os.path.exists(rankpath):
        #ms_flt.append(project._replace(rankpath=rankpath, config=config))
        project = project._replace(model_config=config, rankpath=rankpath, config=config)
        ms_flt.append(
            project._replace(
                model_config_string=get_config_string(project.model_config),
                changeset_config_string=get_config_string(project.changeset_config)
            ))
    else:
        print(rankpath)

In [8]:
cs_dit[0]
src.triage.run_experiment(cs_dit[0])

{'changeset': [(4, '1269', u'Nick_Burch_<nick@apache.org>'),
  (6, '1483', u'Chris_Mattmann_<mattmann@apache.org>'),
  (7, '1548', u'Tim_Allison_<tallison@apache.org>'),
  (3, '1489', u'Tim_Allison_<tallison@apache.org>'),
  (9, '1547', u'Tyler_Palsulich_<tpalsulich@apache.org>'),
  (7, '1544', u'Tim_Allison_<tallison@apache.org>'),
  (1, '1589', u'Nick_Burch_<nick@apache.org>'),
  (7, '1542', u'Tim_Allison_<tallison@apache.org>'),
  (3, '1541', u'Chris_Mattmann_<mattmann@apache.org>'),
  (1, '1580', u'Chris_Mattmann_<mattmann@apache.org>'),
  (19, '1581', u'Hong-Thai_Nguyen_<thaichat04@apache.org>'),
  (1, '1587', u'grossws_<grossws@unknown>'),
  (12, '1584', u'Tim_Allison_<tallison@apache.org>'),
  (6, '1549', u'Chris_Mattmann_<mattmann@apache.org>'),
  (1, '1578', u'Ann_Bryant_Burgess_<aburgess@apache.org>'),
  (1, '1521', u'Nick_Burch_<nick@apache.org>'),
  (5, '1561', u'Chris_Mattmann_<mattmann@apache.org>'),
  (3, '1563', u'Nick_Burch_<nick@apache.org>'),
  (4, '936', u'Jukka_Lau

In [9]:
corpus_df = pandas.DataFrame(columns=["Subject", "Task", "Issue", "Rank", "Item", "Additions", "Removals", "Context", "Message"])
for item in cs_dit:
    df = pandas.DataFrame(columns=corpus_df.columns)
    result = src.triage.run_experiment(item)["changeset"]
    df["Rank"] = [x for x, y, z in result]
    df["Issue"] = [y for x, y, z in result]
    df["Item"] = [z for x, y, z in result]
    df["Subject"] = df.Subject.fillna(item.printable_name)
    df["Additions"] = df.Additions.fillna(item.config['include_additions'])
    df["Removals"] = df.Removals.fillna(item.config['include_removals'])
    df["Context"] = df.Context.fillna(item.config['include_context'])
    df["Message"] = df.Message.fillna(item.config['include_message'])
    df["Task"] = df.Task.fillna("DIT")
    corpus_df = corpus_df.append(df, ignore_index=True)
    
for item in cs_flt:
    df = pandas.DataFrame(columns=corpus_df.columns)
    result = src.feature_location.run_experiment(item)["changeset"]
    df["Rank"] = [x for x, y, z in result]
    df["Issue"] = [y for x, y, z in result]
    df["Item"] = [z for x, y, z in result]
    df["Subject"] = df.Subject.fillna(item.printable_name)
    df["Additions"] = df.Additions.fillna(item.config['include_additions'])
    df["Removals"] = df.Removals.fillna(item.config['include_removals'])
    df["Context"] = df.Context.fillna(item.config['include_context'])
    df["Message"] = df.Message.fillna(item.config['include_message'])
    df["Task"] = df.Task.fillna("FLT")
    corpus_df = corpus_df.append(df, ignore_index=True)

In [10]:
model_df = pandas.DataFrame(columns=["Subject", "Task", "Issue", "Rank", "Item", "alpha", "eta", "K"])
for item in ms_dit:
    df = pandas.DataFrame(columns=model_df.columns)
    result = src.triage.run_experiment(item)["changeset"]
    df["Rank"] = [x for x, y, z in result]
    df["Issue"] = [y for x, y, z in result]
    df["Item"] = [z for x, y, z in result]
    df["Subject"] = df.Subject.fillna(item.printable_name)
    df["alpha"] = df.alpha.fillna(item.config['alpha_base'])
    df["eta"] = df.eta.fillna(item.config['eta_base'])
    df["K"] = df.K.fillna(item.config['num_topics'])
    df["Task"] = df.Task.fillna("DIT")
    model_df = model_df.append(df, ignore_index=True)
    
for item in ms_flt:
    df = pandas.DataFrame(columns=model_df.columns)
    result = src.feature_location.run_experiment(item)["changeset"]
    df["Rank"] = [x for x, y, z in result]
    df["Issue"] = [y for x, y, z in result]
    df["Item"] = [z for x, y, z in result]
    df["Subject"] = df.Subject.fillna(item.printable_name)
    df["alpha"] = df.alpha.fillna(item.config['alpha_base'])
    df["eta"] = df.eta.fillna(item.config['eta_base'])
    df["K"] = df.K.fillna(item.config['num_topics'])
    df["Task"] = df.Task.fillna("FLT")
    model_df = model_df.append(df, ignore_index=True)

# Corpus analysis

In [11]:
len(corpus_sweep)

15

#### These are inequal because of the False, False, False, False configuration being invalid.

In [12]:
corpus_df.groupby(["Subject", "Task", "Additions"]).Issue.apply(len)

Subject            Task  Additions
BookKeeper v4.3.0  DIT   False        1148
                         True         1312
                   FLT   False        1001
                         True         1144
Mahout v0.10.0     DIT   False         931
                         True         1064
                   FLT   False         350
                         True          400
OpenJPA v2.3.0     DIT   False         959
                         True         1096
                   FLT   False         917
                         True         1048
Pig v0.14.0        DIT   False        1554
                         True         1776
                   FLT   False        1218
                         True         1392
Tika v1.8          DIT   False         280
                         True          320
                   FLT   False         252
                         True          288
ZooKeeper v3.5.0   DIT   False        2513
                         True         2872
                   

In [13]:
(corpus_df.groupby(["Task", "Additions"]).Rank.apply(src.utils.calculate_mrr),
 '********************************************',
 corpus_df.groupby(["Task", "Removals"]).Rank.apply(src.utils.calculate_mrr),
 '********************************************',
 corpus_df.groupby(["Task", "Context"]).Rank.apply(src.utils.calculate_mrr),
 '********************************************',
 corpus_df.groupby(["Task", "Message"]).Rank.apply(src.utils.calculate_mrr),
)

(Task  Additions
 DIT   False        0.355573
       True         0.364590
 FLT   False        0.361306
       True         0.418083
 Name: Rank, dtype: float64,
 '********************************************',
 Task  Removals
 DIT   False       0.373944
       True        0.348516
 FLT   False       0.398596
       True        0.385454
 Name: Rank, dtype: float64,
 '********************************************',
 Task  Context
 DIT   False      0.340561
       True       0.377725
 FLT   False      0.385554
       True       0.396867
 Name: Rank, dtype: float64,
 '********************************************',
 Task  Message
 DIT   False      0.358320
       True       0.362187
 FLT   False      0.376123
       True       0.405118
 Name: Rank, dtype: float64)

In [14]:
(corpus_df.groupby(["Subject", "Task", "Additions"]).Rank.apply(src.utils.calculate_mrr),
 '********************************************',
 corpus_df.groupby(["Subject", "Task", "Removals"]).Rank.apply(src.utils.calculate_mrr),
 '********************************************',
 corpus_df.groupby(["Subject", "Task", "Context"]).Rank.apply(src.utils.calculate_mrr),
 '********************************************',
 corpus_df.groupby(["Subject", "Task", "Message"]).Rank.apply(src.utils.calculate_mrr),
)

(Subject            Task  Additions
 BookKeeper v4.3.0  DIT   False        0.646312
                          True         0.633168
                    FLT   False        0.402220
                          True         0.500213
 Mahout v0.10.0     DIT   False        0.291689
                          True         0.319166
                    FLT   False        0.291606
                          True         0.288826
 OpenJPA v2.3.0     DIT   False        0.311652
                          True         0.372377
                    FLT   False        0.263736
                          True         0.311058
 Pig v0.14.0        DIT   False        0.214010
                          True         0.169551
                    FLT   False        0.321887
                          True         0.379533
 Tika v1.8          DIT   False        0.350845
                          True         0.393482
                    FLT   False        0.435476
                          True         0.493537
 Zoo

In [46]:
# what the fuck was I doing here?
res = pandas.DataFrame(columns=["Subject", "Task", "Config", "ExcludeMRR", "IncludeMRR", "p"])
for k in ["Additions", "Removals", "Context", "Message"]:
    for key, group in corpus_df.groupby(["Subject", "Task"]):
        subject, task = key
        sub = group.groupby(k).groups
        f = corpus_df.ix[sub[False]].Rank
        t = corpus_df.ix[sub[True]].Rank
        stat, p = scipy.stats.mannwhitneyu(f, t)
        # Must use Mann Whitney here instead of Wilcoxon because the FFFF config (all False) creates an offset in the total number of ranks!
        
        res = res.append(
            dict(
                zip(res.columns, 
                    [subject, task, k, src.utils.calculate_mrr(f), src.utils.calculate_mrr(t), p]))
            , ignore_index=True)
    sub = corpus_df.groupby(["Task", k]).groups
    f = corpus_df.ix[sub[("DIT", False)]].Rank
    t = corpus_df.ix[sub[("DIT", True)]].Rank
    stat, p = scipy.stats.mannwhitneyu(f, t)
    res = res.append(
        dict(
            zip(res.columns, 
                ["Overall", "DIT", k, src.utils.calculate_mrr(f), src.utils.calculate_mrr(t), p]))
        , ignore_index=True)
    f = corpus_df.ix[sub[("FLT", False)]].Rank
    t = corpus_df.ix[sub[("FLT", True)]].Rank
    stat, p = scipy.stats.mannwhitneyu(f, t)
    res = res.append(
        dict(
            zip(res.columns, 
                ["Overall", "FLT", k, src.utils.calculate_mrr(f), src.utils.calculate_mrr(t), p]))
        , ignore_index=True)        

In [47]:
res[(res.ExcludeMRR > res.IncludeMRR) & (res.p < 0.01)]

Unnamed: 0,Subject,Task,Config,ExcludeMRR,IncludeMRR,p
16,Mahout v0.10.0,DIT,Removals,0.327144,0.288143,0.0008426824
20,Pig v0.14.0,DIT,Removals,0.220149,0.16418,2.857885e-12
24,ZooKeeper v3.5.0,DIT,Removals,0.378937,0.348919,1.159353e-08
26,Overall,DIT,Removals,0.373944,0.348516,3.629592e-13
27,Overall,FLT,Removals,0.398596,0.385454,0.001726129
52,ZooKeeper v3.5.0,DIT,Message,0.372189,0.354824,0.0008063377


In [49]:
res[(res.ExcludeMRR <= res.IncludeMRR) & (res.p < 0.01)]

Unnamed: 0,Subject,Task,Config,ExcludeMRR,IncludeMRR,p
1,BookKeeper v4.3.0,FLT,Additions,0.40222,0.500213,1.223079e-09
2,Mahout v0.10.0,DIT,Additions,0.291689,0.319166,0.00504991
4,OpenJPA v2.3.0,DIT,Additions,0.311652,0.372377,3.54704e-06
7,Pig v0.14.0,FLT,Additions,0.321887,0.379533,7.796939e-05
10,ZooKeeper v3.5.0,DIT,Additions,0.351252,0.373143,1.506603e-09
11,ZooKeeper v3.5.0,FLT,Additions,0.421908,0.470905,8.444801e-05
12,Overall,DIT,Additions,0.355573,0.36459,0.0006921688
13,Overall,FLT,Additions,0.361306,0.418083,6.912758e-14
28,BookKeeper v4.3.0,DIT,Context,0.592102,0.680602,1.904592e-10
34,Pig v0.14.0,DIT,Context,0.171824,0.206464,9.967249999999999e-42


In [19]:
# for key, group in corpus_df.groupby(["Subject", "Task"]):
#     ranks = dict()
#     for subkey, subgroup in group.groupby(["Additions", "Removals", "Context", "Message"]):
#         ranks[subkey] = subgroup.Rank

#     print(key, scipy.stats.friedmanchisquare(*ranks.values()))
#     for x, y in itertools.combinations(corpus_df.groupby(["Additions", "Removals", "Context", "Message"]).groups.keys(), r=2):
#         stat, p = scipy.stats.wilcoxon(ranks[x], ranks[y])
#         if p < 0.01:
#             print(x, y, p, "******")
#         else:
#             print(x, y, p)
#     print()

In [93]:
friedman_df = pandas.DataFrame(columns=["Subject", "Task", "$\chi^2(15)$", "p-value"])

for key, group in corpus_df.groupby(["Task"]):
    ranks = dict()
    for subkey, subgroup in group.groupby(["Additions", "Removals", "Context", "Message"]):
        assert subkey not in ranks
        ranks[subkey] = subgroup.Rank

    stat, p = scipy.stats.friedmanchisquare(*ranks.values())
    friedman_df = friedman_df.append(
        dict(
            zip(friedman_df.columns, 
                ["all subject systems", key, stat, p]))
        , ignore_index=True)

for key, group in corpus_df.groupby(["Subject", "Task"]):
    ranks = dict()
    for subkey, subgroup in group.groupby(["Additions", "Removals", "Context", "Message"]):
        assert subkey not in ranks
        ranks[subkey] = subgroup.Rank

    stat, p = scipy.stats.friedmanchisquare(*ranks.values())
    friedman_df = friedman_df.append(
        dict(
            zip(friedman_df.columns, 
                [key[0], key[1], stat, p]))
        , ignore_index=True)
    
friedman_df

Unnamed: 0,Subject,Task,$\chi^2(15)$,p-value
0,all subject systems,DIT,1026.420224,3.351323e-210
1,all subject systems,FLT,204.925199,5.4081739999999996e-36
2,BookKeeper v4.3.0,DIT,269.159657,3.082638e-49
3,BookKeeper v4.3.0,FLT,120.17871,5.805074999999999e-19
4,Mahout v0.10.0,DIT,50.551436,4.935036e-06
5,Mahout v0.10.0,FLT,13.595833,0.4802349
6,OpenJPA v2.3.0,DIT,124.629893,7.768937e-20
7,OpenJPA v2.3.0,FLT,39.360167,0.0003207992
8,Pig v0.14.0,DIT,887.094,2.512999e-180
9,Pig v0.14.0,FLT,53.36931,1.645994e-06


In [110]:
FIG_TEX = """\\begin{table}
\\begin{spacing}{1.2}
\\centering
\\parbox{.45\\linewidth}{
\\centering
\\caption{Friedman test results for FLT}
\\label{table:combo-friedman-flt}
%s}
\\hfill
\\parbox{.45\\linewidth}{
\\centering
\\caption{Friedman test results for DIT}
\\label{table:combo-friedman-dit}
%s}
\\end{spacing}
\\end{table}
"""

dit_friedman = friedman_df[friedman_df.Task == "DIT"]
del dit_friedman["Task"]

flt_friedman = friedman_df[friedman_df.Task == "FLT"]
del flt_friedman["Task"]

formatters = {
    "p-value": lambda x: ("$p<%f" % bon).rstrip("0") + "$" if x < bon else "$%.4f$" % x ,
    "Subject": lambda x: x.replace("all", "\\midrule\nAll") if x.startswith("all") else x,

}

flt_tex = flt_friedman.sort("Subject").to_latex(index=False,
                                            escape=False, # needed so it doesn't screw up formatters
                                            formatters=formatters)

dit_tex = dit_friedman.sort("Subject").to_latex(index=False,
                                            escape=False, # needed so it doesn't screw up formatters
                                            formatters=formatters)

print(FIG_TEX % (flt_tex, dit_tex))

with open(os.path.expanduser("~/git/dissertation/tables/combo_friedman_results.tex"), "wt") as f:
    print(FIG_TEX % (flt_tex, dit_tex), file=f)

\begin{table}
\begin{spacing}{1.2}
\centering
\parbox{.45\linewidth}{
\centering
\caption{Friedman test results for FLT}
\label{table:combo-friedman-flt}
\begin{tabular}{lrr}
\toprule
                      Subject &  $\chi^2(15)$ &  p-value \\
\midrule
            BookKeeper v4.3.0 &    120.178710 & $p<0.01$ \\
               Mahout v0.10.0 &     13.595833 & $0.4802$ \\
               OpenJPA v2.3.0 &     39.360167 & $p<0.01$ \\
                  Pig v0.14.0 &     53.369310 & $p<0.01$ \\
                    Tika v1.8 &     39.141344 & $p<0.01$ \\
             ZooKeeper v3.5.0 &     71.613098 & $p<0.01$ \\
 \midrule
All subject systems &    204.925199 & $p<0.01$ \\
\bottomrule
\end{tabular}
}
\hfill
\parbox{.45\linewidth}{
\centering
\caption{Friedman test results for DIT}
\label{table:combo-friedman-dit}
\begin{tabular}{lrr}
\toprule
                      Subject &  $\chi^2(15)$ &  p-value \\
\midrule
            BookKeeper v4.3.0 &    269.159657 & $p<0.01$ \\
               Mahout v0.

In [76]:
cres = pandas.DataFrame(columns=["Subject", "Task", "Config", "Config2", "MRR", "MRR2", "T", "p", "r"])

for key, group in corpus_df.groupby(["Subject", "Task"]):
    for c1, c2 in itertools.combinations(corpus_sweep, r=2):
        a = group[
            (group.Additions == c1["include_additions"]) &
            (group.Removals == c1["include_removals"]) &
            (group.Context == c1["include_context"]) &
            (group.Message == c1["include_message"])
                 ].Rank
        b = group[
            (group.Additions == c2["include_additions"]) &
            (group.Removals == c2["include_removals"]) &
            (group.Context == c2["include_context"]) &
            (group.Message == c2["include_message"])
                 ].Rank
        stat, p, r = wilcoxon(a,b)
        cres = cres.append(
            dict(
                zip(cres.columns, 
                    [key[0], key[1],
                     get_config_string(c1),
                     get_config_string(c2),
                     src.utils.calculate_mrr(a), 
                     src.utils.calculate_mrr(b),
                     stat, p,r]))
            , ignore_index=True)

for key, group in corpus_df.groupby(["Task"]):
    for c1, c2 in itertools.combinations(corpus_sweep, r=2):
        a = group[
            (group.Additions == c1["include_additions"]) &
            (group.Removals == c1["include_removals"]) &
            (group.Context == c1["include_context"]) &
            (group.Message == c1["include_message"])
                 ].Rank
        b = group[
            (group.Additions == c2["include_additions"]) &
            (group.Removals == c2["include_removals"]) &
            (group.Context == c2["include_context"]) &
            (group.Message == c2["include_message"])
                 ].Rank
        stat, p, r = wilcoxon(a,b)
        cres = cres.append(
            dict(
                zip(cres.columns, 
                    ["All", key,
                     get_config_string(c1),
                     get_config_string(c2),
                     src.utils.calculate_mrr(a), 
                     src.utils.calculate_mrr(b),
                     stat, p,r]))
            , ignore_index=True)


In [78]:
len(list(itertools.combinations(range(15),2)))

105

In [77]:
bon=0.01/len(list(itertools.combinations(range(15),2)))
bon, len(cres[(cres.p < bon)]), len(cres[(cres.p < (0.01))]), len(cres)

(9.523809523809524e-05, 335, 515, 1470)

In [85]:
bon = 0.01

In [86]:
d = cres[(cres.Subject == "All") &
         (cres.Task == "DIT") &
         (cres.p < bon)]
len(d), d

(86,
      Subject Task                  Config                 Config2       MRR  \
 1260     All  DIT     True-True-True-True    True-True-True-False  0.378414   
 1263     All  DIT     True-True-True-True    True-False-True-True  0.378414   
 1264     All  DIT     True-True-True-True   True-False-True-False  0.378414   
 1265     All  DIT     True-True-True-True   True-False-False-True  0.378414   
 1266     All  DIT     True-True-True-True  True-False-False-False  0.378414   
 1267     All  DIT     True-True-True-True    False-True-True-True  0.378414   
 1268     All  DIT     True-True-True-True   False-True-True-False  0.378414   
 1269     All  DIT     True-True-True-True   False-True-False-True  0.378414   
 1270     All  DIT     True-True-True-True  False-True-False-False  0.378414   
 1271     All  DIT     True-True-True-True   False-False-True-True  0.378414   
 1272     All  DIT     True-True-True-True  False-False-True-False  0.378414   
 1273     All  DIT     True-True-Tr

In [87]:
f = cres[(cres.Subject == "All") &
         (cres.Task == "FLT") &
         (cres.p < bon)]
len(f), f

(49,
      Subject Task                  Config                 Config2       MRR  \
 1372     All  FLT     True-True-True-True    False-True-True-True  0.431519   
 1374     All  FLT     True-True-True-True   False-True-False-True  0.431519   
 1375     All  FLT     True-True-True-True  False-True-False-False  0.431519   
 1376     All  FLT     True-True-True-True   False-False-True-True  0.431519   
 1378     All  FLT     True-True-True-True  False-False-False-True  0.431519   
 1385     All  FLT    True-True-True-False    False-True-True-True  0.451749   
 1386     All  FLT    True-True-True-False   False-True-True-False  0.451749   
 1387     All  FLT    True-True-True-False   False-True-False-True  0.451749   
 1388     All  FLT    True-True-True-False  False-True-False-False  0.451749   
 1389     All  FLT    True-True-True-False   False-False-True-True  0.451749   
 1391     All  FLT    True-True-True-False  False-False-False-True  0.451749   
 1399     All  FLT    True-True-Fal

# Model analysis

In [21]:
model_df[:10]

Unnamed: 0,Subject,Task,Issue,Rank,Item,alpha,eta,K
0,Tika v1.8,DIT,1269,9,Tyler_Palsulich_<tpalsulich@apache.org>,auto,auto,100
1,Tika v1.8,DIT,1483,15,Chris_Mattmann_<mattmann@apache.org>,auto,auto,100
2,Tika v1.8,DIT,1548,5,Tim_Allison_<tallison@apache.org>,auto,auto,100
3,Tika v1.8,DIT,1489,4,Tim_Allison_<tallison@apache.org>,auto,auto,100
4,Tika v1.8,DIT,1547,4,Tyler_Palsulich_<tpalsulich@apache.org>,auto,auto,100
5,Tika v1.8,DIT,1544,3,Tim_Allison_<tallison@apache.org>,auto,auto,100
6,Tika v1.8,DIT,1589,1,Nick_Burch_<nick@apache.org>,auto,auto,100
7,Tika v1.8,DIT,1542,9,Tim_Allison_<tallison@apache.org>,auto,auto,100
8,Tika v1.8,DIT,1541,11,Chris_Mattmann_<mattmann@apache.org>,auto,auto,100
9,Tika v1.8,DIT,1580,1,Chris_Mattmann_<mattmann@apache.org>,auto,auto,100


In [22]:
# for key, group in model_df.groupby(["Subject", "Task"]):
#     ranks = dict()
#     for subkey, subgroup in group.groupby(["alpha", "eta", "K"]):
#         ranks[subkey] = subgroup.Rank

#     print(key, scipy.stats.friedmanchisquare(*ranks.values()))
#     for x, y in itertools.combinations(model_df.groupby(["alpha", "eta", "K"]).groups.keys(), r=2):
#         stat, p = scipy.stats.wilcoxon(ranks[x], ranks[y])
#         if p < 0.01:
#             print(x, y, p, "******")
#         else:
#             print(x, y, p)
#     print()

In [23]:
for key, group in model_df.groupby(["Task"]):
    ranks = dict()
    for subkey, subgroup in group.groupby(["alpha", "eta", "K"]):
        assert subkey not in ranks
        ranks[subkey] = subgroup.Rank

    print(key, scipy.stats.friedmanchisquare(*ranks.values()))

print('----')

for key, group in model_df.groupby(["Subject", "Task"]):
    ranks = dict()
    for subkey, subgroup in group.groupby(["alpha", "eta", "K"]):
        assert subkey not in ranks
        ranks[subkey] = subgroup.Rank

    print(key, scipy.stats.friedmanchisquare(*ranks.values()))

DIT (2765.6842770138119, 0.0)
FLT (1268.1662872188358, 9.0338652843896644e-235)
----
('BookKeeper v4.3.0', 'DIT') (914.49405131140975, 3.6823690924705265e-161)
('BookKeeper v4.3.0', 'FLT') (127.67039786455341, 2.2117762416697342e-09)
('Mahout v0.10.0', 'DIT') (384.55290252857844, 1.6150248713121185e-54)
('Mahout v0.10.0', 'FLT') (150.24132992327392, 1.0048201595768155e-12)
('OpenJPA v2.3.0', 'DIT') (184.24751696601817, 3.8012496245453142e-18)
('OpenJPA v2.3.0', 'FLT') (381.52881843862752, 6.1398023875961168e-54)
('Pig v0.14.0', 'DIT') (354.7128566365742, 8.0054132695147136e-49)
('Pig v0.14.0', 'FLT') (399.58170379169212, 2.0758617659852637e-57)
('Tika v1.8', 'DIT') (184.08385594624323, 4.0448423099902757e-18)
('Tika v1.8', 'FLT') (113.12362651907993, 2.2419304091526999e-07)
('ZooKeeper v3.5.0', 'DIT') (1938.788023570414, 0.0)
('ZooKeeper v3.5.0', 'FLT') (313.26915032741539, 4.9771759841601103e-41)


In [24]:
res = pandas.DataFrame(columns=["Subject", "Task", "Config", "Config2", "MRR", "MRR2", "p"])
for k in ["alpha", "eta", "K"]:
    for key, group in model_df.groupby(["Subject", "Task"]):
        ranks = dict()
        for subkey, subgroup in group.groupby(k):
            assert subkey not in ranks
            ranks[subkey] = subgroup.Rank
        
        for each in itertools.combinations(ranks.keys(), r=2):
            f, t = each
            stat, p = scipy.stats.wilcoxon(ranks[f], ranks[t], correction=True)
            res = res.append(
                dict(
                    zip(res.columns, 
                        [key[0], key[1], k + "=" + str(f), k + "=" + str(t), src.utils.calculate_mrr(ranks[f]), src.utils.calculate_mrr(ranks[t]), p]))
                , ignore_index=True)

ores = pandas.DataFrame(columns=["Subject", "Task", "Config", "Config2", "MRR", "MRR2", "p"])
for k in ["alpha", "eta", "K"]:
    for key, group in model_df.groupby(["Task"]):
        ranks = dict()
        for subkey, subgroup in group.groupby(k):
            assert subkey not in ranks
            ranks[subkey] = subgroup.Rank
        
        for each in itertools.combinations(ranks.keys(), r=2):
            f, t = each
            stat, p = scipy.stats.wilcoxon(ranks[f], ranks[t], correction=True)
            ores = ores.append(
                dict(
                    zip(ores.columns, 
                        ["Overall", key, k + "=" + str(f), k + "=" + str(t), src.utils.calculate_mrr(ranks[f]), src.utils.calculate_mrr(ranks[t]), p]))
                , ignore_index=True)

In [25]:
len(res[res.p < 0.01]), len(res[res.p >= 0.01])

(113, 67)

In [26]:
len(ores[ores.p < 0.01]), len(ores[ores.p >= 0.01])

(21, 9)

In [27]:
ores

Unnamed: 0,Subject,Task,Config,Config2,MRR,MRR2,p
0,Overall,DIT,alpha=1,alpha=2,0.334535,0.333357,0.1864625
1,Overall,DIT,alpha=1,alpha=5,0.334535,0.337464,0.1804564
2,Overall,DIT,alpha=1,alpha=auto,0.334535,0.334525,0.05330456
3,Overall,DIT,alpha=2,alpha=5,0.333357,0.337464,0.9951688
4,Overall,DIT,alpha=2,alpha=auto,0.333357,0.334525,0.01806949
5,Overall,DIT,alpha=5,alpha=auto,0.337464,0.334525,0.03322544
6,Overall,FLT,alpha=1,alpha=2,0.362646,0.362229,8.218030000000001e-17
7,Overall,FLT,alpha=1,alpha=5,0.362646,0.359742,1.432597e-32
8,Overall,FLT,alpha=1,alpha=auto,0.362646,0.362723,2.996998e-14
9,Overall,FLT,alpha=2,alpha=5,0.362229,0.359742,5.572488e-26


In [28]:
res[(res.MRR > res.MRR2) & (res.p < 0.01)]

Unnamed: 0,Subject,Task,Config,Config2,MRR,MRR2,p
5,BookKeeper v4.3.0,DIT,alpha=5,alpha=auto,0.570063,0.556589,2.943637e-08
6,BookKeeper v4.3.0,FLT,alpha=1,alpha=2,0.435806,0.433651,3.515726e-04
8,BookKeeper v4.3.0,FLT,alpha=1,alpha=auto,0.435806,0.434269,2.587723e-03
11,BookKeeper v4.3.0,FLT,alpha=5,alpha=auto,0.435948,0.434269,1.299403e-09
12,Mahout v0.10.0,DIT,alpha=1,alpha=2,0.294484,0.289548,1.208079e-03
13,Mahout v0.10.0,DIT,alpha=1,alpha=5,0.294484,0.291443,7.416468e-08
19,Mahout v0.10.0,FLT,alpha=1,alpha=5,0.277402,0.260541,2.482687e-09
21,Mahout v0.10.0,FLT,alpha=2,alpha=5,0.285476,0.260541,7.583579e-07
22,Mahout v0.10.0,FLT,alpha=2,alpha=auto,0.285476,0.276755,6.389131e-07
40,Pig v0.14.0,DIT,alpha=2,alpha=auto,0.186871,0.185495,3.795334e-18


In [29]:
res[(res.MRR < res.MRR2) & (res.p < 0.01)]

Unnamed: 0,Subject,Task,Config,Config2,MRR,MRR2,p
1,BookKeeper v4.3.0,DIT,alpha=1,alpha=5,0.555225,0.570063,1.479346e-08
3,BookKeeper v4.3.0,DIT,alpha=2,alpha=5,0.557563,0.570063,1.235308e-07
7,BookKeeper v4.3.0,FLT,alpha=1,alpha=5,0.435806,0.435948,4.77028e-11
9,BookKeeper v4.3.0,FLT,alpha=2,alpha=5,0.433651,0.435948,1.672122e-08
10,BookKeeper v4.3.0,FLT,alpha=2,alpha=auto,0.433651,0.434269,0.00167676
15,Mahout v0.10.0,DIT,alpha=2,alpha=5,0.289548,0.291443,0.0001057157
16,Mahout v0.10.0,DIT,alpha=2,alpha=auto,0.289548,0.29462,0.0008291831
17,Mahout v0.10.0,DIT,alpha=5,alpha=auto,0.291443,0.29462,5.302349e-08
18,Mahout v0.10.0,FLT,alpha=1,alpha=2,0.277402,0.285476,2.701432e-07
23,Mahout v0.10.0,FLT,alpha=5,alpha=auto,0.260541,0.276755,8.730452e-09


In [30]:
t = res[(res.Config == "alpha=1") | (res.Config2 == "alpha=1")]
t

Unnamed: 0,Subject,Task,Config,Config2,MRR,MRR2,p
0,BookKeeper v4.3.0,DIT,alpha=1,alpha=2,0.555225,0.557563,0.2051069
1,BookKeeper v4.3.0,DIT,alpha=1,alpha=5,0.555225,0.570063,1.479346e-08
2,BookKeeper v4.3.0,DIT,alpha=1,alpha=auto,0.555225,0.556589,0.4486721
6,BookKeeper v4.3.0,FLT,alpha=1,alpha=2,0.435806,0.433651,0.0003515726
7,BookKeeper v4.3.0,FLT,alpha=1,alpha=5,0.435806,0.435948,4.77028e-11
8,BookKeeper v4.3.0,FLT,alpha=1,alpha=auto,0.435806,0.434269,0.002587723
12,Mahout v0.10.0,DIT,alpha=1,alpha=2,0.294484,0.289548,0.001208079
13,Mahout v0.10.0,DIT,alpha=1,alpha=5,0.294484,0.291443,7.416468e-08
14,Mahout v0.10.0,DIT,alpha=1,alpha=auto,0.294484,0.29462,0.4254184
18,Mahout v0.10.0,FLT,alpha=1,alpha=2,0.277402,0.285476,2.701432e-07


In [31]:
len(t), len(t[t.MRR > t.MRR2]), len(t[t.p < 0.05])

(36, 18, 23)

# table building

In [32]:
model_df[model_df.Rank == 0]

Unnamed: 0,Subject,Task,Issue,Rank,Item,alpha,eta,K


In [33]:
model_all = model_df.groupby(["Task", "alpha", "eta", "K"]).Rank.apply(lambda x: numpy.mean(1/x))
model_all

Task  alpha  eta   K  
DIT   1      1     100    0.301700
                   200    0.350912
                   500    0.377034
             2     100    0.292285
                   200    0.333820
                   500    0.376133
             5     100    0.285501
                   200    0.310650
                   500    0.365735
             auto  100    0.303069
                   200    0.343411
                   500    0.374172
      2      1     100    0.300678
                   200    0.346168
                   500    0.371791
             2     100    0.291940
                   200    0.328208
                   500    0.380202
             5     100    0.284913
                   200    0.314315
                   500    0.369073
             auto  100    0.302577
                   200    0.338648
                   500    0.371773
      5      1     100    0.315005
                   200    0.345699
                   500    0.379527
             2     100    0.2914

In [34]:
corpus_all = corpus_df.groupby(["Task", "Additions", "Removals", "Context", "Message"]).Rank.apply(lambda x: numpy.mean(1/x))
corpus_all

Task  Additions  Removals  Context  Message
DIT   False      False     False    True       0.335894
                           True     False      0.414800
                                    True       0.416467
                 True      False    False      0.314670
                                    True       0.330810
                           True     False      0.337312
                                    True       0.339059
      True       False     False    False      0.337990
                                    True       0.353739
                           True     False      0.380211
                                    True       0.378506
                 True      False    False      0.346219
                                    True       0.364607
                           True     False      0.377034
                                    True       0.378414
FLT   False      False     False    True       0.383786
                           True     False      0.338609
    

In [35]:
names = {'model': {'score': 'score',
                   'model_base_alpha': 'alpha',
                   'model_base_eta': 'eta',
                   'num_topics': 'K'
                  },
         'corpus': {'score': 'score',
                    'changeset_include_additions': 'Additions',
                    'changeset_include_context': 'Context',
                    'changeset_include_message': 'Message',
                    'changeset_include_removals': 'Removals',
                    },
        }
exps = ['triage', 'feature_location']
table_headers = {
    'model': ['K', 'alpha', 'eta', 'FLT', 'DIT'],
    'corpus': ['Additions', 'Removals', 'Context', 'Message', 'FLT', 'DIT']
}
groups = {
    'model': ['K', 'alpha', 'eta'],
    'corpus': ['Additions', 'Removals', 'Context', 'Message']
}

full_tex = r"""
\begin{table}
\begin{spacing}{1.2}
\centering
\caption{MRR values of %s %s construction sweep}
\label{table:%s}
\vspace{0.2em}
%s
\end{spacing}
\end{table}
"""

tex_dir = os.path.expanduser("~/git/dissertation/tables")


def do_great_table(main_df, label_name, rq, caption):
    include_fmt = lambda x: "Included" if x else ""

    formatters = {
        'FLT': lambda x: r"$\bm{%.4f}$" % x if x == max(main_df["FLT"]) else "$%.4f$" % x,
        'alpha': lambda x: "$%s/K$" % x if x != 'auto' else x,
        'eta': lambda x: "$%s/K$" % x if x != 'auto' else x,
        'K': lambda x: "$%s$" % int(x),
        'Additions': include_fmt,
        'Removals': include_fmt,
        'Context': include_fmt,
        'Message': include_fmt,
        'DIT': lambda x:  r"$\bm{%.4f}$" % x if x == max(main_df["DIT"]) else "$%.4f$" % x,
    }

    # filter out uninteresting rows, like there was no corpus
    main_df = main_df[(main_df["FLT"] != 0) | (main_df["DIT"] != 0)]
    if rq == "model":
        main_df = main_df.sort(["K", "alpha", "eta"])
    else:
        main_df = main_df.sort(["Additions", "Removals", "Context", "Message"], ascending=False)

    label = "%s_%s_sweep" % (label_name, rq)
    op = os.path.join(tex_dir, label + ".tex")
    
            
    best_flt = main_df[main_df["FLT"] == main_df["FLT"].max()]
    best_dit = main_df[main_df["DIT"] == main_df["DIT"].max()]

    if len(main_df) > 24:
        tex = r"\parbox{.45\linewidth}{\centering %s} \hfill \parbox{.45\linewidth}{\centering %s}"
        mid = len(main_df)//2
        tex = tex % (main_df[:mid].to_latex(index=False,
                                            escape=False, # needed so it doesn't screw up formatters
                                            formatters=formatters,
                                            columns=table_headers[rq]),
                     main_df[mid:].to_latex(index=False,
                                            escape=False, # needed so it doesn't screw up formatters
                                            formatters=formatters,
                                            columns=table_headers[rq]))
    else:
        tex = main_df.to_latex(index=False,
                               escape=False, # needed so it doesn't screw up formatters
                               formatters=formatters,
                               columns=table_headers[rq],)

    # and now the lazy
    this_full_tex = full_tex % (caption, rq, label, tex)
    this_full_tex = this_full_tex.replace(" alpha ", r" $\alpha$ ")
    this_full_tex = this_full_tex.replace(" eta ", r" $\eta$ ")
    this_full_tex = this_full_tex.replace(r"\begin{tabular}{rllrr}", r"\begin{tabular}{rll|rr}")
    this_full_tex = this_full_tex.replace(r"\begin{tabular}{llllrr}", r"\begin{tabular}{llll|rr}")
    this_full_tex = this_full_tex.replace(r"$500$ &  $1/K$ &  $1/K$ &", r"\myrowcolor $500$ &  $1/K$ &  $1/K$ &")
    this_full_tex = this_full_tex.replace(r"Included &  Included &  Included &           &", r"\myrowcolor Included &  Included &  Included &           &")

    #print("Writing to: %s\n%s\n" % (op, this_full_tex))
    print("Writing to:", op)
    with open(op, 'wt') as f:
        f.write(this_full_tex)
        
    return best_dit, best_flt

best_dits = dict({"model": dict(), "corpus": dict()})
best_flts = dict({"model": dict(), "corpus": dict()})

for rq, main_df in [("model", model_all), ("corpus", corpus_all)]:
    names[rq]['score'] = 'score'
    main_df = main_df.unstack(0).reset_index()
    
    best_dit, best_flt = do_great_table(main_df, "all", rq, "all subject systems")
    best_dits[rq]["all subject systems"] = best_dit
    best_flts[rq]["all subject systems"] = best_flt
    
for rq, main_df in [("model", model_df), ("corpus", corpus_df)]:
    names[rq]['score'] = 'score'
    group = groups[rq]
    zz = main_df.groupby(["Subject", "Task"] + group).Rank.apply(lambda x: numpy.mean(1/x))

    for each in zz.index.levels[0]:
        each_df = zz[each].unstack(0).reset_index()
        bad_person = each.split()[0].lower()
        best_dit, best_flt = do_great_table(each_df, bad_person, rq, "\\" + bad_person)
        best_dits[rq][each] = best_dit
        best_flts[rq][each] = best_flt

  return np.sum(name == np.asarray(self.names)) > 1


Writing to: /home/cscorley/git/dissertation/tables/all_model_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/all_corpus_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/bookkeeper_model_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/mahout_model_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/openjpa_model_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/pig_model_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/tika_model_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/zookeeper_model_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/bookkeeper_corpus_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/mahout_corpus_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/openjpa_corpus_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/pig_corpus_sweep.tex
Writing to: /home/cscorley/git/dissertation/tables/tika_corpus_sweep.tex
Writing to: /home/cscorley/git/diss

In [36]:
best_dits['model']['all subject systems'], best_flts['model']['all subject systems']

(Task alpha   eta    K       DIT       FLT
 35       5  auto  500  0.381787  0.410651,
 Task alpha   eta    K       DIT       FLT
 23       2  auto  500  0.371773  0.416203)

In [37]:
FIG_TEX="""
%% new fig
\\begin{figure}
    \\centering
    \\begin{subfigure}{.4\\textwidth}
        \\centering
        \\includegraphics[height=0.4\\textheight]{%s}
        \\caption{Including outliers}\\label{fig:combo:%s_outlier}
    \\end{subfigure}%%
    \\begin{subfigure}{.4\\textwidth}
        \\centering
        \\includegraphics[height=0.4\\textheight]{%s_no_outlier}
        \\caption{Excluding outliers}\\label{fig:combo:%s_no_outlier}
    \\end{subfigure}
\\caption{%s effectiveness measures of optimal ($MRR=%.4f$) and alternative ($MRR=%.4f$) %s configurations for %s}
\\label{fig:combo:%s}
\\end{figure}
"""

def plot_dataframe(df, each, name, kind):
    order = ["Optimal", "Alternate"]
    
    kind = kind.lower()
    name = name.lower()
    
    size = (len(order)*1.6, 4.5)
    limitgrowth = 0.5
    fontsize = None
    widths = 0.3
    lower = 0
    kinds = {"flt": "Feature Location", "dit": "Developer Identification"}
    rqs = {"flt": {"rq1": "\\cone", "rq2": "\\ctwo", "all": "Overview"},
           "dit": {"rq1": "\\cone", "rq2": "\\ctwo", "all": "Overview"}}
    config_name = "model" if name == "rq1" else "corpus"
    
    result = df.plot(kind='box',
                     fontsize=fontsize,
                     figsize=size,
                     widths=widths,
                     y=order)
    
    limit = result.get_ylim()
    lower = limit[0] - limitgrowth
    if (lower < 0):
        lower = 0
        
    result.set_ylim(lower, limit[1] + limitgrowth)
    
    plt.tight_layout()
    short_each = each.lower().split(' ')[0]
    fig_name = 'figures/combo/%s_%s_%s' % (kind, name, short_each)
    path = os.path.expanduser('~/git/dissertation/') + fig_name
    plt.savefig(path + ".pdf", dpi=300)

    optimal_data = df["Optimal"].dropna()
    alternate_data = df["Alternate"].dropna()
    optimal_mrr = src.utils.calculate_mrr(optimal_data)
    alternative_mrr = src.utils.calculate_mrr(alternate_data)
    #T, p, r = wilcoxon(optimal_data, alternate_data)

    with open(path + ".tex", "wt") as f:
        figlabel = ":".join([x.lower() for x in [kind, name, short_each]])
        f.write(FIG_TEX % (fig_name, figlabel,
                           fig_name, figlabel,
                           kinds[kind], 
                           optimal_mrr, alternative_mrr,
                           config_name, each,
                           figlabel))
    plt.close()
    
    # no outliers              
    result = df.plot(kind='box',
         fontsize=fontsize,
         figsize=size,
         widths=widths,
         y=order,
         showfliers=False)
    limit = result.get_ylim()
    lower = limit[0] - limitgrowth
    if (lower < 0):
        lower = 0

    result.set_ylim(lower, limit[1] + limitgrowth)

    plt.tight_layout()

    fig_name = 'figures/combo/%s_%s_%s_no_outlier' % (kind, name, short_each)
    path = os.path.expanduser('~/git/dissertation/') + fig_name
    plt.savefig(path + ".pdf", dpi=300)
    plt.close()

In [39]:
corpus_wilcoxon_df = pandas.DataFrame(columns=["Subject", "Task", "Optimal MRR", "Alternate MRR", "T", "p", "effect"])
model_wilcoxon_df = pandas.DataFrame(columns=["Subject", "Task", "Optimal MRR", "Alternate MRR", "T", "p", "effect"])

for task, best_df, best_alt_df in [("FLT", best_flts, best_dits), ("DIT", best_dits, best_flts)]:
    for project, table in best_df['corpus'].items():
        print(project, task, 'corpus')
        #print(table.sample(1))
        optimal = corpus_df[
                        (corpus_df.Task == task) &
                        (corpus_df.Additions == table.Additions.values[0]) &
                        (corpus_df.Removals == table.Removals.values[0]) &
                        (corpus_df.Context == table.Context.values[0]) &
                        (corpus_df.Message == table.Message.values[0])]

        alt_table = best_alt_df['corpus'][project]
        #print(alt_table.sample(1))

        alternate = corpus_df[
                              (corpus_df.Task == task) &
                              (corpus_df.Additions == alt_table.Additions.values[0]) &
                              (corpus_df.Removals == alt_table.Removals.values[0]) &
                              (corpus_df.Context == alt_table.Context.values[0]) &
                              (corpus_df.Message == alt_table.Message.values[0])]

        if project != "all subject systems":
            optimal = optimal[optimal.Subject == project]
            alternate = alternate[alternate.Subject == project]

        merge_df = optimal.merge(alternate, on=["Subject", "Task", "Issue"])
        optalt_df = pandas.DataFrame()
        optalt_df["Optimal"] = merge_df.Rank_x
        optalt_df["Alternate"] = merge_df.Rank_y

        plot_dataframe(optalt_df, project,  "rq2", task)
        
        if((optalt_df.Optimal == optalt_df.Alternate).all()):
            corpus_wilcoxon_df = corpus_wilcoxon_df.append({
                    "Subject": project,
                    "Task": task,
                    "Optimal MRR": numpy.mean(1/optalt_df.Optimal), 
                    "Alternate MRR": numpy.mean(1/optalt_df.Alternate),
                    "T": numpy.NaN,
                    "p": numpy.NaN,
                    "effect": numpy.NaN
                    }, ignore_index=True)
        else:
            T, p, r = wilcoxon(optalt_df.Optimal, optalt_df.Alternate)
            corpus_wilcoxon_df = corpus_wilcoxon_df.append({
                    "Subject": project,
                    "Task": task,
                    "Optimal MRR": numpy.mean(1/optalt_df.Optimal), 
                    "Alternate MRR": numpy.mean(1/optalt_df.Alternate),
                    "T": T,
                    "p": p,
                    "effect": r                   
                    }, ignore_index=True)
                               
for task, best_df, best_alt_df in [("FLT", best_flts, best_dits), ("DIT", best_dits, best_flts)]:
    for project, table in best_df['model'].items():
        print(project, task, 'model')
        #print(table.sample(1))
        optimal = model_df[
                        (model_df.Task == task) &
                        (model_df.alpha == table.alpha.values[0]) &
                        (model_df.eta == table.eta.values[0]) &
                        (model_df.K == table.K.values[0])]

        alt_table = best_alt_df['model'][project]
        #print(alt_table.sample(1))

        alternate = model_df[
                        (model_df.Task == task) &
                        (model_df.alpha == alt_table.alpha.values[0]) &
                        (model_df.eta == alt_table.eta.values[0]) &
                        (model_df.K == alt_table.K.values[0])]

        if project != "all subject systems":
            optimal = optimal[optimal.Subject == project]
            alternate = alternate[alternate.Subject == project]

        merge_df = optimal.merge(alternate, on=["Subject", "Task", "Issue"])
        optalt_df = pandas.DataFrame()
        optalt_df["Optimal"] = merge_df.Rank_x
        optalt_df["Alternate"] = merge_df.Rank_y

        plot_dataframe(optalt_df, project, "rq1", task)
        if((optalt_df.Optimal == optalt_df.Alternate).all()):
            model_wilcoxon_df = model_wilcoxon_df.append({
                    "Subject": project,
                    "Task": task,
                    "Optimal MRR": numpy.mean(1/optalt_df.Optimal), 
                    "Alternate MRR": numpy.mean(1/optalt_df.Alternate),
                    "T": numpy.NaN,
                    "p": numpy.NaN,
                    "effect": numpy.NaN
                    }, ignore_index=True)
        else:
            T, p, r = wilcoxon(optalt_df.Optimal, optalt_df.Alternate)
            model_wilcoxon_df = model_wilcoxon_df.append({
                    "Subject": project,
                    "Task": task,
                    "Optimal MRR": numpy.mean(1/optalt_df.Optimal), 
                    "Alternate MRR": numpy.mean(1/optalt_df.Alternate),
                    "T": T,
                    "p": p,
                    "effect": r                   
                    }, ignore_index=True)


Tika v1.8 FLT corpus
OpenJPA v2.3.0 FLT corpus
ZooKeeper v3.5.0 FLT corpus
all subject systems FLT corpus
Mahout v0.10.0 FLT corpus
BookKeeper v4.3.0 FLT corpus
Pig v0.14.0 FLT corpus
Tika v1.8 DIT corpus
OpenJPA v2.3.0 DIT corpus
ZooKeeper v3.5.0 DIT corpus
all subject systems DIT corpus
Mahout v0.10.0 DIT corpus
BookKeeper v4.3.0 DIT corpus
Pig v0.14.0 DIT corpus
Tika v1.8 FLT model
OpenJPA v2.3.0 FLT model
ZooKeeper v3.5.0 FLT model
all subject systems FLT model
Mahout v0.10.0 FLT model
BookKeeper v4.3.0 FLT model
Pig v0.14.0 FLT model
Tika v1.8 DIT model
OpenJPA v2.3.0 DIT model
ZooKeeper v3.5.0 DIT model
all subject systems DIT model
Mahout v0.10.0 DIT model
BookKeeper v4.3.0 DIT model
Pig v0.14.0 DIT model


In [40]:
print(model_wilcoxon_df.sort(columns=["Subject"]).to_latex(index=False,
                               escape=False#, # needed so it doesn't screw up formatters
                               )#formatters=formatters,
                               #columns=table_headers[rq],))
      )

\begin{tabular}{llrrrrr}
\toprule
             Subject & Task &  Optimal MRR &  Alternate MRR &        T &         p &    effect \\
\midrule
   BookKeeper v4.3.0 &  FLT &     0.488400 &       0.488400 &      NaN &       NaN &       NaN \\
   BookKeeper v4.3.0 &  DIT &     0.664228 &       0.664228 &      NaN &       NaN &       NaN \\
      Mahout v0.10.0 &  FLT &     0.338952 &       0.280194 &    254.5 &  0.059513 &  0.347436 \\
      Mahout v0.10.0 &  DIT &     0.354404 &       0.350435 &   2500.0 &  0.932351 &  0.009901 \\
      OpenJPA v2.3.0 &  FLT &     0.308929 &       0.298263 &   1545.0 &  0.118198 &  0.192790 \\
      OpenJPA v2.3.0 &  DIT &     0.369520 &       0.346583 &    583.0 &  0.011180 &  0.362842 \\
         Pig v0.14.0 &  FLT &     0.396437 &       0.285928 &   5019.5 &  0.087269 &  0.158860 \\
         Pig v0.14.0 &  DIT &     0.217319 &       0.163108 &   7733.0 &  0.011396 &  0.206994 \\
           Tika v1.8 &  FLT &     0.483125 &       0.392207 &    189.0 &  0

In [41]:
j = model_wilcoxon_df[model_wilcoxon_df.Task == "FLT"]
del j["Task"]
j.sort(columns=["Subject"])

Unnamed: 0,Subject,Optimal MRR,Alternate MRR,T,p,effect
5,BookKeeper v4.3.0,0.4884,0.4884,,,
4,Mahout v0.10.0,0.338952,0.280194,254.5,0.059513,0.347436
1,OpenJPA v2.3.0,0.308929,0.298263,1545.0,0.118198,0.19279
6,Pig v0.14.0,0.396437,0.285928,5019.5,0.087269,0.15886
0,Tika v1.8,0.483125,0.392207,189.0,0.544492,0.131034
2,ZooKeeper v3.5.0,0.488183,0.466974,4555.5,0.06667,0.17368
3,all subject systems,0.416203,0.410651,37416.5,7.5e-05,0.21807


In [42]:
j = model_wilcoxon_df[model_wilcoxon_df.Task == "DIT"]
del j["Task"]
j.sort(columns=["Subject"])

Unnamed: 0,Subject,Optimal MRR,Alternate MRR,T,p,effect
12,BookKeeper v4.3.0,0.664228,0.664228,,,
11,Mahout v0.10.0,0.354404,0.350435,2500.0,0.932351,0.009901
8,OpenJPA v2.3.0,0.36952,0.346583,583.0,0.01118,0.362842
13,Pig v0.14.0,0.217319,0.163108,7733.0,0.011396,0.206994
7,Tika v1.8,0.377484,0.332759,151.0,0.020774,0.461676
9,ZooKeeper v3.5.0,0.421349,0.401081,10397.0,0.174748,0.104479
10,all subject systems,0.381787,0.371773,48376.5,0.226211,0.063242


In [43]:
j = corpus_wilcoxon_df[corpus_wilcoxon_df.Task == "FLT"]
del j["Task"]
j.sort(columns=["Subject"])

Unnamed: 0,Subject,Optimal MRR,Alternate MRR,T,p,effect
5,BookKeeper v4.3.0,0.532703,0.5246,2141.0,0.731075,0.040985
4,Mahout v0.10.0,0.365008,0.260056,369.0,0.304924,0.182724
1,OpenJPA v2.3.0,0.368668,0.286927,2713.0,0.190823,0.142541
6,Pig v0.14.0,0.435972,0.312365,4287.0,0.00703,0.252876
0,Tika v1.8,0.648192,0.557178,90.5,0.394314,0.21645
2,ZooKeeper v3.5.0,0.494966,0.393001,6121.5,0.005333,0.240037
3,all subject systems,0.451749,0.397141,82657.5,0.006227,0.127257


In [44]:
j = corpus_wilcoxon_df[corpus_wilcoxon_df.Task == "DIT"]
del j["Task"]
j.sort(columns=["Subject"])

Unnamed: 0,Subject,Optimal MRR,Alternate MRR,T,p,effect
12,BookKeeper v4.3.0,0.721646,0.697358,147.0,6.286573e-08,0.778281
11,Mahout v0.10.0,0.382731,0.341262,3255.5,0.8251844,0.023838
8,OpenJPA v2.3.0,0.409568,0.39347,2446.0,0.5454168,0.068723
13,Pig v0.14.0,0.303156,0.143687,1727.5,4.14198e-25,0.834768
7,Tika v1.8,0.452242,0.394299,137.0,0.4989525,0.156923
9,ZooKeeper v3.5.0,0.456471,0.366692,14329.0,6.505966e-09,0.381838
10,all subject systems,0.416467,0.378506,127321.5,1.041398e-10,0.258046
