In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
from __future__ import print_function, division

from matplotlib import pyplot as plt
import pandas
import src
import gensim
import os
import os.path
import csv
import functools
import itertools
import collections
import scipy
import scipy.stats
from operator import itemgetter

In [3]:
def fake(*args, **kwargs):
    """
    Stub that reports how it was called and then aborts the interpreter.

    Prints the positional and keyword arguments it received and exits with
    status 1 (raises ``SystemExit``); it never returns.
    """
    # Imported here because the notebook never imports ``sys`` explicitly;
    # without this the call only works if ``%pylab`` happens to leak the name.
    import sys

    print('Fake called with', str(args), str(kwargs))
    sys.exit(1)

# Replace the project's create_model with the aborting stub so that nothing in
# this notebook can accidentally attempt to create data.
src.common.create_model = fake

In [4]:
# Baseline model settings; the sweeps further down copy this dict and
# override individual entries.
model_config = dict(
    num_topics=500,
    alpha=1 / 500,
    eta=1 / 500,
    alpha_base=1,
    eta_base=1,
    decay=0.5,
    offset=1.0,
    iterations=1000,
    passes=1,
    max_bound_iterations=1000,  # special
    algorithm='batch',  # special
)

# Baseline choice of which changeset text sources are included.
changeset_config = dict(
    include_additions=True,
    include_context=True,
    include_message=False,
    include_removals=True,
)

def get_config_string(config):
    """
    Join the values of a config dict into one dash-separated string.

    config: mapping of setting name -> value.

    Returns the text form of every value, ordered by setting name and joined
    with '-'.  An empty mapping yields an empty string.
    """
    # ``unicode`` only exists on Python 2; fall back to ``str`` on Python 3 so
    # the same text is produced on either interpreter.
    try:
        to_text = unicode
    except NameError:
        to_text = str
    return '-'.join([to_text(config[key]) for key in sorted(config)])

# Grid for the model sweep.  When the sweep is built, numeric bases are
# divided by the topic count and 'auto' is passed through unchanged.
alpha_bases = ['auto', 1, 2, 5]
eta_bases = ['auto', 1, 2, 5]
# Topic counts (K) combined with every alpha/eta base.
num_topics = [100, 200, 500]

def get_rank_name(kind, experiment, changeset_config, model_config):
    """
    Build the name of the gzipped CSV holding the ranks for one configuration.

    kind: one of changeset, release, temporal
    experiment: one of triage, feature_location
    changeset_config, model_config: settings dicts whose values become part
        of the name; the caller's dicts are left untouched.

    The 'alpha_base' and 'eta_base' entries of model_config are not part of
    the name (KeyError if either is absent).  The name is lower-cased.
    """
    changeset_part = get_config_string(changeset_config)

    # work on a copy so the caller keeps its *_base entries
    trimmed = dict(model_config)
    for base_key in ('alpha_base', 'eta_base'):
        trimmed.pop(base_key)
    model_part = get_config_string(trimmed)

    pieces = [kind, experiment, 'lda', changeset_part, model_part, 'file', 'ranks']
    stem = '-'.join(pieces)
    return stem.lower() + '.csv.gz'

# One model configuration per (alpha base, eta base, topic count) combination.
model_sweep = []
for alpha_base, eta_base, K in itertools.product(alpha_bases, eta_bases, num_topics):
    swept = dict(model_config)
    swept.update({
        'alpha_base': alpha_base,
        'eta_base': eta_base,
        # numeric bases are scaled by the topic count; 'auto' stays as-is
        'alpha': alpha_base if alpha_base == 'auto' else alpha_base / K,
        'eta': eta_base if eta_base == 'auto' else eta_base / K,
        'num_topics': K,
    })
    model_sweep.append(swept)

# Every on/off combination of the four changeset text sources, except the one
# where all of them are switched off.
corpus_sweep = []
b = [True, False]
flag_names = ['include_additions', 'include_context', 'include_message', 'include_removals']
for flags in itertools.product(b, repeat=4):
    conf = dict(changeset_config)
    conf.update(zip(flag_names, flags))
    if not any(conf.values()):
        continue
    corpus_sweep.append(conf)

In [5]:
# Load the project records from ../data.  rankpath and config are left empty
# here; later cells fill them in per rank file via _replace.
projects = src.common.load_projects(dict(model="lda", level="file", rankpath='', config=dict()), "../data")
projects

[Project(name='tika', printable_name='Tika v1.8', version='v1.8', ref='refs/tags/1.8', data_path='../data/tika/', full_path='../data/tika/v1.8/', src_path='../data/tika/v1.8/src/', model='lda', rankpath='', config={}, level='file'),
 Project(name='pig', printable_name='Pig v0.14.0', version='v0.14.0', ref='refs/tags/release-0.14.0', data_path='../data/pig/', full_path='../data/pig/v0.14.0/', src_path='../data/pig/v0.14.0/src/', model='lda', rankpath='', config={}, level='file'),
 Project(name='bookkeeper', printable_name='BookKeeper v4.3.0', version='v4.3.0', ref='refs/tags/release-4.3.0', data_path='../data/bookkeeper/', full_path='../data/bookkeeper/v4.3.0/', src_path='../data/bookkeeper/v4.3.0/src/', model='lda', rankpath='', config={}, level='file'),
 Project(name='openjpa', printable_name='OpenJPA v2.3.0', version='v2.3.0', ref='refs/tags/2.3.0', data_path='../data/openjpa/', full_path='../data/openjpa/v2.3.0/', src_path='../data/openjpa/v2.3.0/src/', model='lda', rankpath='', con

In [6]:
def find_existing_ranks(configs, rank_name_for):
    """
    Collect one project record per (project, config) pair that has a rank file.

    configs: sequence of config dicts to try for every project.
    rank_name_for: callable mapping a config dict to its rank file name.

    Returns a list of project records, each copied via ``_replace`` with
    ``rankpath`` and ``config`` filled in.  Rank paths that do not exist on
    disk are printed and skipped.
    """
    named = [(config, rank_name_for(config)) for config in configs]
    found = []
    # projects vary slowest, configs fastest
    for project, (config, rankname) in itertools.product(projects, named):
        rankpath = os.path.join(project.data_path, project.version, rankname)
        if os.path.exists(rankpath):
            found.append(project._replace(rankpath=rankpath, config=config))
        else:
            print(rankpath)
    return found


# Corpus sweep: vary the changeset config, keep the baseline model config.
cs_dit = find_existing_ranks(
    corpus_sweep,
    lambda c: get_rank_name('changeset', 'triage', c, model_config))
cs_flt = find_existing_ranks(
    corpus_sweep,
    lambda c: get_rank_name('changeset', 'feature_location', c, model_config))

# Model sweep: vary the model config, keep the baseline changeset config.
ms_dit = find_existing_ranks(
    model_sweep,
    lambda c: get_rank_name('changeset', 'triage', changeset_config, c))
ms_flt = find_existing_ranks(
    model_sweep,
    lambda c: get_rank_name('changeset', 'feature_location', changeset_config, c))

In [7]:
CORPUS_COLUMNS = ["Subject", "Task", "Issue", "Rank", "Distance",
                  "Additions", "Removals", "Context", "Message"]
MODEL_COLUMNS = ["Subject", "Task", "Issue", "Rank", "Distance", "alpha", "eta", "K"]


def best_ranks(item, task, extra):
    """
    Read one rank file and keep the minimum rank and distance per id.

    item: project record providing ``rankpath`` and ``printable_name``.
    task: label stored in the "Task" column ("DIT" or "FLT").
    extra: mapping of additional column name -> constant value for this file.

    Returns a DataFrame with one row per distinct ``id`` in the file.
    """
    ranks = pandas.read_csv(item.rankpath)
    # rank and distance are minimised independently of each other
    best = ranks.groupby("id")[["rank", "distance"]].min()
    frame = pandas.DataFrame({
        "Issue": best.index.values,
        "Rank": best["rank"].values,
        "Distance": best["distance"].values,
    })
    frame["Subject"] = item.printable_name
    frame["Task"] = task
    for column, value in extra.items():
        frame[column] = value
    return frame


def collect_ranks(sweeps, columns, extra_for):
    """
    Stack the per-file results of several sweeps into a single DataFrame.

    sweeps: sequence of (task label, list of project records).
    columns: column order of the result.
    extra_for: callable mapping a record's config to its extra columns.

    The frames are concatenated once instead of appended one by one, and the
    columns keep their native dtypes.  With nothing to load, an empty frame
    carrying ``columns`` is returned.
    """
    frames = []
    for task, items in sweeps:
        for item in items:
            frames.append(best_ranks(item, task, extra_for(item.config)))
    if not frames:
        return pandas.DataFrame(columns=columns)
    return pandas.concat(frames, ignore_index=True)[columns]


corpus_df = collect_ranks(
    [("DIT", cs_dit), ("FLT", cs_flt)],
    CORPUS_COLUMNS,
    lambda config: {
        "Additions": config['include_additions'],
        "Removals": config['include_removals'],
        "Context": config['include_context'],
        "Message": config['include_message'],
    })

model_df = collect_ranks(
    [("DIT", ms_dit), ("FLT", ms_flt)],
    MODEL_COLUMNS,
    lambda config: {
        "alpha": config['alpha_base'],
        "eta": config['eta_base'],
        "K": config['num_topics'],
    })

# Corpus analysis

In [8]:
# Preview the first ten rows of the corpus-sweep results.
corpus_df[:10]

Unnamed: 0,Subject,Task,Issue,Rank,Distance,Additions,Removals,Context,Message
0,Tika v1.8,DIT,241,1,0.856419,True,True,True,True
1,Tika v1.8,DIT,936,4,0.89548,True,True,True,True
2,Tika v1.8,DIT,995,11,0.933785,True,True,True,True
3,Tika v1.8,DIT,1028,1,0.87314,True,True,True,True
4,Tika v1.8,DIT,1269,4,0.872447,True,True,True,True
5,Tika v1.8,DIT,1286,3,0.883931,True,True,True,True
6,Tika v1.8,DIT,1365,13,0.923789,True,True,True,True
7,Tika v1.8,DIT,1383,3,0.827191,True,True,True,True
8,Tika v1.8,DIT,1416,8,0.930014,True,True,True,True
9,Tika v1.8,DIT,1423,2,0.838308,True,True,True,True


In [9]:
def _mrr_split_by(flag):
    """MRR of the corpus_df ranks for every (Subject, Task, flag value) group."""
    grouped = corpus_df.groupby(["Subject", "Task", flag])
    return grouped.Rank.apply(src.utils.calculate_mrr)


_divider = '********************************************'

# One MRR table per text source, separated by a divider line in the output.
(
    _mrr_split_by("Additions"),
    _divider,
    _mrr_split_by("Removals"),
    _divider,
    _mrr_split_by("Context"),
    _divider,
    _mrr_split_by("Message"),
)

(Subject            Task  Additions
 BookKeeper v4.3.0  DIT   False        0.646312
                          True         0.633168
                    FLT   False        0.478765
                          True         0.564210
 Mahout v0.10.0     DIT   False        0.291689
                          True         0.319166
                    FLT   False        0.668975
                          True         0.667676
 OpenJPA v2.3.0     DIT   False        0.311652
                          True         0.372377
                    FLT   False        0.295981
                          True         0.341231
 Pig v0.14.0        DIT   False        0.214010
                          True         0.169551
                    FLT   False        0.440798
                          True         0.488335
 Tika v1.8          DIT   False        0.350845
                          True         0.393482
                    FLT   False        0.491929
                          True         0.544183
 Zoo

In [10]:
def inclusion_row(subject, task, source, frame):
    """
    Compare ranks with one text source excluded versus included.

    subject, task: labels copied into the returned row.
    source: name of the corpus column to split on ("Additions", "Removals",
        "Context" or "Message").
    frame: the rows of corpus_df to compare within.

    Returns [subject, task, source, MRR when excluded, MRR when included, p],
    where p comes from a Mann-Whitney U test on the two sets of ranks.
    Raises KeyError when either the False or the True group is missing.
    """
    by_flag = frame.groupby(source)
    excluded = by_flag.get_group(False).Rank
    included = by_flag.get_group(True).Rank
    # NOTE(review): whether p is one- or two-sided depends on the default of
    # the installed scipy version -- confirm before quoting thresholds.
    stat, p = scipy.stats.mannwhitneyu(excluded, included)
    return [subject, task, source,
            src.utils.calculate_mrr(excluded), src.utils.calculate_mrr(included), p]


# For every text source: one row per (subject, task), followed by the two
# "Overall" rows pooled over all subjects (DIT first, then FLT).
rows = []
for k in ["Additions", "Removals", "Context", "Message"]:
    for (subject, task), group in corpus_df.groupby(["Subject", "Task"]):
        rows.append(inclusion_row(subject, task, k, group))
    for task in ["DIT", "FLT"]:
        rows.append(inclusion_row("Overall", task, k, corpus_df[corpus_df.Task == task]))

# Build the table once instead of appending row by row.
res = pandas.DataFrame(
    rows, columns=["Subject", "Task", "Config", "ExcludeMRR", "IncludeMRR", "p"])

In [11]:
# Cases where excluding the text source gave the higher MRR, with p < 0.05.
res[(res.ExcludeMRR > res.IncludeMRR) & (res.p < 0.05)]

Unnamed: 0,Subject,Task,Config,ExcludeMRR,IncludeMRR,p
16,Mahout v0.10.0,DIT,Removals,0.327144,0.288143,0.0008426824
20,Pig v0.14.0,DIT,Removals,0.220149,0.16418,2.857885e-12
24,ZooKeeper v3.5.0,DIT,Removals,0.378937,0.348919,1.159353e-08
26,Overall,DIT,Removals,0.373944,0.348516,3.629592e-13
27,Overall,FLT,Removals,0.541252,0.531228,0.01870853
52,ZooKeeper v3.5.0,DIT,Message,0.372189,0.354824,0.0008063377


In [12]:
# Cases where including the text source gave the higher MRR, with p < 0.05.
res[(res.ExcludeMRR < res.IncludeMRR) & (res.p < 0.05)]

Unnamed: 0,Subject,Task,Config,ExcludeMRR,IncludeMRR,p
1,BookKeeper v4.3.0,FLT,Additions,0.478765,0.56421,2.95774e-08
2,Mahout v0.10.0,DIT,Additions,0.291689,0.319166,0.00504991
4,OpenJPA v2.3.0,DIT,Additions,0.311652,0.372377,3.54704e-06
5,OpenJPA v2.3.0,FLT,Additions,0.295981,0.341231,0.02421842
7,Pig v0.14.0,FLT,Additions,0.440798,0.488335,0.0005880241
10,ZooKeeper v3.5.0,DIT,Additions,0.351252,0.373143,1.506603e-09
11,ZooKeeper v3.5.0,FLT,Additions,0.609747,0.642824,0.002524532
12,Overall,DIT,Additions,0.355573,0.36459,0.0006921688
13,Overall,FLT,Additions,0.512807,0.556117,9.560251e-10
28,BookKeeper v4.3.0,DIT,Context,0.592102,0.680602,1.904592e-10


In [13]:
# for key, group in corpus_df.groupby(["Subject", "Task"]):
#     ranks = dict()
#     for subkey, subgroup in group.groupby(["Additions", "Removals", "Context", "Message"]):
#         ranks[subkey] = subgroup.Rank

#     print(key, scipy.stats.friedmanchisquare(*ranks.values()))
#     for x, y in itertools.combinations(corpus_df.groupby(["Additions", "Removals", "Context", "Message"]).groups.keys(), r=2):
#         stat, p = scipy.stats.wilcoxon(ranks[x], ranks[y])
#         if p < 0.01:
#             print(x, y, p, "******")
#         else:
#             print(x, y, p)
#     print()

In [14]:
def corpus_friedman(frame):
    """
    Friedman test across all corpus configurations present in ``frame``.

    Splits ``frame`` by (Additions, Removals, Context, Message) and passes the
    Rank values of every configuration to scipy.stats.friedmanchisquare.
    Returns that function's result unchanged.
    """
    by_config = frame.groupby(["Additions", "Removals", "Context", "Message"])
    samples = [subgroup.Rank for _, subgroup in by_config]
    return scipy.stats.friedmanchisquare(*samples)


# Group by the plain column name so the key is the task string itself rather
# than a one-element tuple on newer pandas.
for key, group in corpus_df.groupby("Task"):
    print(("Overall", key), corpus_friedman(group))

for key, group in corpus_df.groupby(["Subject", "Task"]):
    print(key, corpus_friedman(group))

('Overall', 'DIT') (1026.4202237626146, 3.3513230863942486e-210)
('Overall', 'FLT') (204.92519870085542, 5.4081738142670862e-36)
('BookKeeper v4.3.0', 'DIT') (269.15965685137962, 3.0826378883316631e-49)
('BookKeeper v4.3.0', 'FLT') (120.17870967741945, 5.8050748444341914e-19)
('Mahout v0.10.0', 'DIT') (50.551435851920118, 4.9350360355472855e-06)
('Mahout v0.10.0', 'FLT') (13.595833012993253, 0.48023490708348249)
('OpenJPA v2.3.0', 'DIT') (124.62989253393617, 7.7689368153807406e-20)
('OpenJPA v2.3.0', 'FLT') (39.360167260803436, 0.00032079920046937968)
('Pig v0.14.0', 'DIT') (887.09399971431264, 2.5129987045198387e-180)
('Pig v0.14.0', 'FLT') (53.369310278534641, 1.6459942163477199e-06)
('Tika v1.8', 'DIT') (23.153468323977627, 0.057817878117764045)
('Tika v1.8', 'FLT') (39.1413438076533, 0.0003468335835578398)
('ZooKeeper v3.5.0', 'DIT') (454.49564930784339, 3.9880227095427624e-88)
('ZooKeeper v3.5.0', 'FLT') (71.613097718625838, 9.8353877807882468e-10)


# Model analysis

In [15]:
# Preview the first ten rows of the model-sweep results.
model_df[:10]

Unnamed: 0,Subject,Task,Issue,Rank,Distance,alpha,eta,K
0,Tika v1.8,DIT,241,1,0.821746,auto,auto,100
1,Tika v1.8,DIT,936,7,0.855173,auto,auto,100
2,Tika v1.8,DIT,995,8,0.86905,auto,auto,100
3,Tika v1.8,DIT,1028,9,0.84206,auto,auto,100
4,Tika v1.8,DIT,1269,9,0.851506,auto,auto,100
5,Tika v1.8,DIT,1286,3,0.76362,auto,auto,100
6,Tika v1.8,DIT,1365,19,0.894476,auto,auto,100
7,Tika v1.8,DIT,1383,4,0.830088,auto,auto,100
8,Tika v1.8,DIT,1416,10,0.911869,auto,auto,100
9,Tika v1.8,DIT,1423,7,0.880329,auto,auto,100


In [16]:
# for key, group in model_df.groupby(["Subject", "Task"]):
#     ranks = dict()
#     for subkey, subgroup in group.groupby(["alpha", "eta", "K"]):
#         ranks[subkey] = subgroup.Rank

#     print(key, scipy.stats.friedmanchisquare(*ranks.values()))
#     for x, y in itertools.combinations(model_df.groupby(["alpha", "eta", "K"]).groups.keys(), r=2):
#         stat, p = scipy.stats.wilcoxon(ranks[x], ranks[y])
#         if p < 0.01:
#             print(x, y, p, "******")
#         else:
#             print(x, y, p)
#     print()

In [32]:
def model_friedman(frame):
    """
    Friedman test across all model configurations present in ``frame``.

    Splits ``frame`` by (alpha, eta, K) and passes the Rank values of every
    configuration to scipy.stats.friedmanchisquare.  Returns that function's
    result unchanged.
    """
    samples = [subgroup.Rank for _, subgroup in frame.groupby(["alpha", "eta", "K"])]
    return scipy.stats.friedmanchisquare(*samples)


# Group by the plain column name so the key is the task string itself rather
# than a one-element tuple on newer pandas.
for key, group in model_df.groupby("Task"):
    print(key, model_friedman(group))

print('----')

for key, group in model_df.groupby(["Subject", "Task"]):
    print(key, model_friedman(group))

DIT (2765.6842770138119, 0.0)
FLT (1268.166287218836, 9.0338652843886392e-235)
----
('BookKeeper v4.3.0', 'DIT') (914.49405131140975, 3.6823690924705265e-161)
('BookKeeper v4.3.0', 'FLT') (127.67039786455267, 2.2117762416702573e-09)
('Mahout v0.10.0', 'DIT') (384.55290252857844, 1.6150248713121185e-54)
('Mahout v0.10.0', 'FLT') (150.24132992327426, 1.004820159576684e-12)
('OpenJPA v2.3.0', 'DIT') (184.24751696601817, 3.8012496245453142e-18)
('OpenJPA v2.3.0', 'FLT') (381.52881843863065, 6.1398023875876853e-54)
('Pig v0.14.0', 'DIT') (354.7128566365742, 8.0054132695147136e-49)
('Pig v0.14.0', 'FLT') (399.58170379169297, 2.0758617659844917e-57)
('Tika v1.8', 'DIT') (184.08385594624323, 4.0448423099902757e-18)
('Tika v1.8', 'FLT') (113.12362651908087, 2.2419304091520334e-07)
('ZooKeeper v3.5.0', 'DIT') (1938.788023570414, 0.0)
('ZooKeeper v3.5.0', 'FLT') (313.269150327413, 4.977175984165248e-41)


In [51]:
PAIR_COLUMNS = ["Subject", "Task", "Config", "Config2", "MRR", "MRR2", "p"]
MODEL_SETTINGS = ["alpha", "eta", "K"]


def pairwise_wilcoxon(frame, setting, subject, task):
    """
    Wilcoxon signed-rank test for every pair of values of one model setting.

    frame: the rows of model_df to compare within.
    setting: column to split on ("alpha", "eta" or "K").
    subject, task: labels copied into every returned row.

    Returns a list of rows laid out as PAIR_COLUMNS.  Pairs are produced in
    groupby order, so the result no longer depends on dict iteration order.
    """
    # presumably the ranks of two settings line up issue-by-issue; the test
    # pairs them by position -- TODO confirm
    samples = [(value, subgroup.Rank) for value, subgroup in frame.groupby(setting)]
    rows = []
    for (first, first_ranks), (second, second_ranks) in itertools.combinations(samples, 2):
        stat, p = scipy.stats.wilcoxon(first_ranks, second_ranks)
        rows.append([
            subject,
            task,
            setting + "=" + str(first),
            setting + "=" + str(second),
            src.utils.calculate_mrr(first_ranks),
            src.utils.calculate_mrr(second_ranks),
            p,
        ])
    return rows


# Per subject and task.
rows = []
for k in MODEL_SETTINGS:
    for (subject, task), group in model_df.groupby(["Subject", "Task"]):
        rows.extend(pairwise_wilcoxon(group, k, subject, task))
res = pandas.DataFrame(rows, columns=PAIR_COLUMNS)

# Pooled over all subjects.  Grouping by the plain column name keeps the task
# label a string rather than a one-element tuple on newer pandas.
rows = []
for k in MODEL_SETTINGS:
    for task, group in model_df.groupby("Task"):
        rows.extend(pairwise_wilcoxon(group, k, "Overall", task))
ores = pandas.DataFrame(rows, columns=PAIR_COLUMNS)

In [52]:
# (comparisons with p < 0.01, comparisons with p >= 0.01) in the per-subject table
len(res[res.p < 0.01]), len(res[res.p >= 0.01])

(113, 67)

In [53]:
# (comparisons with p < 0.01, comparisons with p >= 0.01) in the overall table
len(ores[ores.p < 0.01]), len(ores[ores.p >= 0.01])

(21, 9)

In [56]:
# Show the pairwise comparisons pooled over all subjects.
ores

Unnamed: 0,Subject,Task,Config,Config2,MRR,MRR2,p
0,Overall,DIT,alpha=1,alpha=2,0.334535,0.333357,0.1864598
1,Overall,DIT,alpha=1,alpha=5,0.334535,0.337464,0.1804551
2,Overall,DIT,alpha=1,alpha=auto,0.334535,0.334525,0.05329261
3,Overall,DIT,alpha=2,alpha=5,0.333357,0.337464,0.9951651
4,Overall,DIT,alpha=2,alpha=auto,0.333357,0.334525,0.01806911
5,Overall,DIT,alpha=5,alpha=auto,0.337464,0.334525,0.0332251
6,Overall,FLT,alpha=1,alpha=2,0.513829,0.513511,8.217651000000001e-17
7,Overall,FLT,alpha=1,alpha=5,0.513829,0.511614,1.432529e-32
8,Overall,FLT,alpha=1,alpha=auto,0.513829,0.513888,2.996421e-14
9,Overall,FLT,alpha=2,alpha=5,0.513511,0.511614,5.572237999999999e-26


In [50]:
# Pairs where the first configuration has the higher MRR, with p < 0.01.
res[(res.MRR > res.MRR2) & (res.p < 0.01)]

Unnamed: 0,Subject,Task,Config,Config2,MRR,MRR2,p
5,BookKeeper v4.3.0,DIT,alpha=5,alpha=auto,0.570063,0.556589,2.936204e-08
6,BookKeeper v4.3.0,FLT,alpha=1,alpha=2,0.508050,0.506171,3.514500e-04
8,BookKeeper v4.3.0,FLT,alpha=1,alpha=auto,0.508050,0.506710,2.582741e-03
11,BookKeeper v4.3.0,FLT,alpha=5,alpha=auto,0.508175,0.506710,1.298867e-09
12,Mahout v0.10.0,DIT,alpha=1,alpha=2,0.294484,0.289548,1.207603e-03
13,Mahout v0.10.0,DIT,alpha=1,alpha=5,0.294484,0.291443,7.413960e-08
19,Mahout v0.10.0,FLT,alpha=1,alpha=5,0.662337,0.654459,2.479687e-09
21,Mahout v0.10.0,FLT,alpha=2,alpha=5,0.666110,0.654459,7.575810e-07
22,Mahout v0.10.0,FLT,alpha=2,alpha=auto,0.666110,0.662035,6.381415e-07
40,Pig v0.14.0,DIT,alpha=2,alpha=auto,0.186871,0.185495,3.793328e-18


In [43]:
# Pairs where the second configuration has the higher MRR, with p < 0.01.
res[(res.MRR < res.MRR2) & (res.p < 0.01)]

Unnamed: 0,Subject,Task,Config,Config2,MRR,MRR2,p
1,BookKeeper v4.3.0,DIT,alpha=1,alpha=5,0.555225,0.570063,1.475669e-08
3,BookKeeper v4.3.0,DIT,alpha=2,alpha=5,0.557563,0.570063,1.23125e-07
7,BookKeeper v4.3.0,FLT,alpha=1,alpha=5,0.50805,0.508175,4.768191e-11
9,BookKeeper v4.3.0,FLT,alpha=2,alpha=5,0.506171,0.508175,1.671411e-08
10,BookKeeper v4.3.0,FLT,alpha=2,alpha=auto,0.506171,0.50671,0.00167623
15,Mahout v0.10.0,DIT,alpha=2,alpha=5,0.289548,0.291443,0.000105686
16,Mahout v0.10.0,DIT,alpha=2,alpha=auto,0.289548,0.29462,0.0008288507
17,Mahout v0.10.0,DIT,alpha=5,alpha=auto,0.291443,0.29462,5.300537e-08
18,Mahout v0.10.0,FLT,alpha=1,alpha=2,0.662337,0.66611,2.697989e-07
23,Mahout v0.10.0,FLT,alpha=5,alpha=auto,0.654459,0.662035,8.720254e-09


In [22]:
# Every pairwise comparison that has alpha=1 on either side.
t = res[(res.Config == "alpha=1") | (res.Config2 == "alpha=1")]
t

Unnamed: 0,Subject,Task,Config,Config2,MRR,MRR2,p
0,BookKeeper v4.3.0,DIT,alpha=1,alpha=2,0.555225,0.557563,0.2046987
1,BookKeeper v4.3.0,DIT,alpha=1,alpha=5,0.555225,0.570063,1.475669e-08
2,BookKeeper v4.3.0,DIT,alpha=1,alpha=auto,0.555225,0.556589,0.4365058
6,BookKeeper v4.3.0,FLT,alpha=1,alpha=2,0.50805,0.506171,0.00035145
7,BookKeeper v4.3.0,FLT,alpha=1,alpha=5,0.50805,0.508175,4.768191e-11
8,BookKeeper v4.3.0,FLT,alpha=1,alpha=auto,0.50805,0.50671,0.002582741
12,Mahout v0.10.0,DIT,alpha=1,alpha=2,0.294484,0.289548,0.001207603
13,Mahout v0.10.0,DIT,alpha=1,alpha=5,0.294484,0.291443,7.41396e-08
14,Mahout v0.10.0,DIT,alpha=1,alpha=auto,0.294484,0.29462,0.4142162
18,Mahout v0.10.0,FLT,alpha=1,alpha=2,0.662337,0.66611,2.697989e-07


In [23]:
# (number of alpha=1 comparisons, how many have MRR > MRR2, how many have p < 0.05)
len(t), len(t[t.MRR > t.MRR2]), len(t[t.p < 0.05])

(36, 18, 23)

# Table building

In [24]:
# MRR per task for every (alpha, eta, K) combination, pooled over all subjects.
model_all = model_df.groupby(["Task", "alpha", "eta", "K"]).Rank.apply(src.utils.calculate_mrr)
model_all

Task  alpha  eta   K  
DIT   1      1     100    0.301700
                   200    0.350912
                   500    0.377034
             2     100    0.292285
                   200    0.333820
                   500    0.376133
             5     100    0.285501
                   200    0.310650
                   500    0.365735
             auto  100    0.303069
                   200    0.343411
                   500    0.374172
      2      1     100    0.300678
                   200    0.346168
                   500    0.371791
             2     100    0.291940
                   200    0.328208
                   500    0.380202
             5     100    0.284913
                   200    0.314315
                   500    0.369073
             auto  100    0.302577
                   200    0.338648
                   500    0.371773
      5      1     100    0.315005
                   200    0.345699
                   500    0.379527
             2     100    0.2914

In [25]:
# MRR per task for every combination of text sources, pooled over all subjects.
corpus_all = corpus_df.groupby(["Task", "Additions", "Removals", "Context", "Message"]).Rank.apply(src.utils.calculate_mrr)
corpus_all

Task  Additions  Removals  Context  Message
DIT   False      False     False    True       0.335894
                           True     False      0.414800
                                    True       0.416467
                 True      False    False      0.314670
                                    True       0.330810
                           True     False      0.337312
                                    True       0.339059
      True       False     False    False      0.337990
                                    True       0.353739
                           True     False      0.380211
                                    True       0.378506
                 True      False    False      0.346219
                                    True       0.364607
                           True     False      0.377034
                                    True       0.378414
FLT   False      False     False    True       0.529955
                           True     False      0.495494
    

In [57]:
names = {'model': {'score': 'score',
                   'model_base_alpha': 'alpha',
                   'model_base_eta': 'eta',
                   'num_topics': 'K'
                  },
         'corpus': {'score': 'score',
                    'changeset_include_additions': 'Additions',
                    'changeset_include_context': 'Context',
                    'changeset_include_message': 'Message',
                    'changeset_include_removals': 'Removals',
                    },
        }
exps = ['triage', 'feature_location']
# Column order of each generated table.
table_headers = {
    'model': ['K', 'alpha', 'eta', 'FLT', 'DIT'],
    'corpus': ['Additions', 'Removals', 'Context', 'Message', 'FLT', 'DIT']
}
# Per-column cell formatters.  The FLT/DIT entries look up ``main_df`` when
# they are called, i.e. the table currently being rendered by the loop below,
# and typeset that table's best score in bold.
formatters = {
    'FLT': lambda x: r"$\bm{%.4f}$" % x if x == max(main_df["FLT"]) else "$%.4f$" % x,
    'alpha': lambda x: "$%s/K$" % x if x != 'auto' else x,
    'eta': lambda x: "$%s/K$" % x if x != 'auto' else x,
    'K': lambda x: "$%s$" % int(x),
    'DIT': lambda x:  r"$\bm{%.4f}$" % x if x == max(main_df["DIT"]) else "$%.4f$" % x,
}

# Table wrapper; placeholders are: subject description, sweep kind, label, body.
full_tex = r"""
\begin{table}
\begin{spacing}{1.2}
\centering
\caption{MRR values of %s %s construction sweep}
\label{table:%s}
\vspace{0.2em}
%s
\end{spacing}
\end{table}
"""


def sort_rows(frame, by, ascending=True):
    """
    Sort ``frame`` by the given columns on both old and new pandas.

    Uses DataFrame.sort_values where it exists and falls back to the legacy
    DataFrame.sort (removed from later pandas releases) otherwise.
    """
    if hasattr(frame, "sort_values"):
        return frame.sort_values(by, ascending=ascending)
    return frame.sort(by, ascending=ascending)


def render_tabular(frame, rq):
    """Render ``frame`` as a LaTeX tabular using the columns and formatters for ``rq``."""
    return frame.to_latex(index=False,
                          escape=False, # needed so it doesn't screw up formatters
                          formatters=formatters,
                          columns=table_headers[rq])


tex_dir = os.path.expanduser("~/git/dissertation/tables")
for rq, main_df in [("model", model_all), ("corpus", corpus_all)]:
    names[rq]['score'] = 'score'
    # one row per configuration, with the tasks (FLT, DIT) as columns
    main_df = main_df.unstack(0).reset_index()

    # filter out uninteresting rows, like there was no corpus
    main_df = main_df[(main_df["FLT"] != 0) | (main_df["DIT"] != 0)]
    if rq == "model":
        main_df = sort_rows(main_df, ["K", "alpha", "eta"])
    else:
        main_df = sort_rows(main_df, ["Additions", "Removals", "Context", "Message"], ascending=False)

    label = "%s_%s_sweep" % ("all", rq)
    op = os.path.join(tex_dir, label + ".tex")

    if len(main_df) > 24:
        # long tables are split into two halves set side by side
        tex = r"\parbox{.45\linewidth}{\centering %s} \hfill \parbox{.45\linewidth}{\centering %s}"
        mid = len(main_df)//2
        tex = tex % (render_tabular(main_df[:mid], rq),
                     render_tabular(main_df[mid:], rq))
    else:
        tex = render_tabular(main_df, rq)

    # and now the lazy
    # (plain string replacements; they rely on the exact spacing to_latex emits)
    this_full_tex = full_tex % ("all subject systems", rq, label, tex)
    this_full_tex = this_full_tex.replace(" alpha ", r" $\alpha$ ")
    this_full_tex = this_full_tex.replace(" eta ", r" $\eta$ ")
    this_full_tex = this_full_tex.replace(r"\begin{tabular}{rllrr}", r"\begin{tabular}{rll|rr}")
    this_full_tex = this_full_tex.replace(r"\begin{tabular}{llllrr}", r"\begin{tabular}{llll|rr}")
    this_full_tex = this_full_tex.replace(r"$500$ &  $1/K$ &  $1/K$ &", r"\myrowcolor $500$ &  $1/K$ &  $1/K$ &")
    this_full_tex = this_full_tex.replace(r"True &     True &    True &   False &", r"\myrowcolor True &     True &    True &   False &")

    print("Writing to: %s\n%s\n" % (op, this_full_tex))
    with open(op, 'wt') as f:
        f.write(this_full_tex)

Writing to: /home/cscorley/git/dissertation/tables/all_model_sweep.tex

\begin{table}
\begin{spacing}{1.2}
\centering
\caption{MRR values of all subject systems model construction sweep}
\label{table:all_model_sweep}
\vspace{0.2em}
\parbox{.45\linewidth}{\centering \begin{tabular}{rll|rr}
\toprule
    K &  $\alpha$ &    $\eta$ &      FLT &      DIT \\
\midrule
$100$ &  $1/K$ &  $1/K$ & $0.4786$ & $0.3017$ \\
$100$ &  $1/K$ &  $2/K$ & $0.4749$ & $0.2923$ \\
$100$ &  $1/K$ &  $5/K$ & $0.4648$ & $0.2855$ \\
$100$ &  $1/K$ &   auto & $0.4772$ & $0.3031$ \\
$100$ &  $2/K$ &  $1/K$ & $0.4804$ & $0.3007$ \\
$100$ &  $2/K$ &  $2/K$ & $0.4747$ & $0.2919$ \\
$100$ &  $2/K$ &  $5/K$ & $0.4691$ & $0.2849$ \\
$100$ &  $2/K$ &   auto & $0.4775$ & $0.3026$ \\
$100$ &  $5/K$ &  $1/K$ & $0.4765$ & $0.3150$ \\
$100$ &  $5/K$ &  $2/K$ & $0.4782$ & $0.2915$ \\
$100$ &  $5/K$ &  $5/K$ & $0.4693$ & $0.2837$ \\
$100$ &  $5/K$ &   auto & $0.4764$ & $0.3139$ \\
$100$ &   auto &  $1/K$ & $0.4779$ & $0.3028$ \\
