In [1]:
%matplotlib inline

In [2]:
from __future__ import print_function, division

from matplotlib import pyplot as plt
import pandas
import src
import gensim
import os
import os.path
import csv
import functools
import itertools
import collections
import scipy
import scipy.stats
from operator import itemgetter

INFO:gensim.corpora.sharded_corpus:Could not import Theano, will use standard float for default ShardedCorpus dtype.
2015-12-18 23:08:25 twoism gensim.corpora.sharded_corpus[2491] INFO Could not import Theano, will use standard float for default ShardedCorpus dtype.
INFO:summa.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English
2015-12-18 23:08:25 twoism summa.preprocessing.cleaner[2491] INFO 'pattern' package not found; tag filters are not available for English


In [24]:
# Baseline model settings. The sweep further down copies this dict and
# overrides alpha, eta and num_topics; get_rank_name() renders its values
# into the ranks file name.
model_config = dict(
    num_topics=500,
    alpha=1/500,
    eta=1/500,
    decay=0.5,
    offset=1.0,
    iterations=1000,
    passes=1,
    max_bound_iterations=1000,  # special
    algorithm='batch',  # special
)

# Baseline changeset flags. The corpus sweep further down enumerates every
# True/False combination of these four keys.
changeset_config = dict(
    include_additions=True,
    include_context=True,
    include_message=False,
    include_removals=True,
)

def get_config_string(config):
    """
    Render the values of `config` as one dash-separated string.

    Only the values appear in the result; the keys are used solely to fix the
    order (ascending key order), so equal configs always give the same string.
    An empty config gives an empty string.
    """
    try:
        to_text = unicode  # Python 2: keep producing unicode text as before
    except NameError:
        to_text = str  # Python 3: `unicode` no longer exists; str is unicode text
    return '-'.join([to_text(value) for _, value in sorted(config.items())])

# Grid for the model sweep. 'auto' is passed through untouched; a numeric
# base is divided by the topic count when the sweep is built.
num_topics = [100, 200, 500]
alpha_bases = ['auto', 1, 2, 5]
eta_bases = list(alpha_bases)  # same candidates as alpha, but a separate list

def get_rank_name(kind, experiment, changeset_config, model_config):
    """
    Build the file name of the ranks file for one configuration.

    kind: one of changeset, release, temporal
    experiment: one of triage, feature_location
    changeset_config, model_config: dicts rendered with get_config_string()

    The dash-joined stem is lower-cased before '.csv.gz' is appended.
    """
    parts = [
        kind,
        experiment,
        'lda',
        get_config_string(changeset_config),
        get_config_string(model_config),
        'file',
        'ranks',
    ]
    stem = '-'.join(parts)
    return stem.lower() + '.csv.gz'

# One model config per (alpha base, eta base, topic count) combination.
# Each entry is a copy of model_config with alpha, eta and num_topics
# overridden; numeric bases are scaled by the topic count, 'auto' is kept.
model_sweep = []
for alpha_base, eta_base, topic_count in itertools.product(alpha_bases, eta_bases, num_topics):
    sweep_entry = dict(model_config)
    sweep_entry['alpha'] = alpha_base if alpha_base == 'auto' else alpha_base / topic_count
    sweep_entry['eta'] = eta_base if eta_base == 'auto' else eta_base / topic_count
    sweep_entry['num_topics'] = topic_count
    model_sweep.append(sweep_entry)

# Every True/False combination of the four changeset flags, skipping any
# combination whose resulting config has no truthy value at all.
corpus_sweep = []
flag_names = ('include_additions', 'include_context', 'include_message', 'include_removals')
for flag_values in itertools.product([True, False], repeat=4):
    candidate = dict(changeset_config)
    candidate.update(zip(flag_names, flag_values))
    if not any(candidate.values()):
        continue
    corpus_sweep.append(candidate)

In [25]:
# A project is any directory under ../data that contains a file named 'ref'.
# rankpath and config start out empty; later cells fill them via _replace().
Project = collections.namedtuple('Project', 'name version data rankpath config')
projects = []
for dirpath, _dirnames, filenames in os.walk("../data"):
    if 'ref' not in filenames:
        continue
    # dirpath must split into exactly four '/'-separated parts; the last two
    # become the project name and version, the first two are discarded.
    _, _, name, version = dirpath.split('/')
    projects.append(Project(name=name, version=version, data=dirpath, rankpath='', config=dict()))
projects

[Project(name='tika', version='v1.8', data='../data/tika/v1.8', rankpath='', config={}),
 Project(name='pig', version='v0.14.0', data='../data/pig/v0.14.0', rankpath='', config={}),
 Project(name='bookkeeper', version='v4.3.0', data='../data/bookkeeper/v4.3.0', rankpath='', config={}),
 Project(name='openjpa', version='v2.3.0', data='../data/openjpa/v2.3.0', rankpath='', config={}),
 Project(name='mahout', version='v0.10.0', data='../data/mahout/v0.10.0', rankpath='', config={}),
 Project(name='zookeeper', version='v3.5.0', data='../data/zookeeper/v3.5.0', rankpath='', config={})]

In [31]:
def _existing_rank_files(projects, named_configs):
    """
    Pair every project with every (config, rankname) entry and keep the pairs
    whose ranks file exists on disk.

    projects: iterable of Project tuples (their `data` field is the directory searched)
    named_configs: list of (config dict, ranks file name) tuples

    Returns a list of Project copies with `rankpath` and `config` filled in,
    ordered project-major, then in `named_configs` order.
    """
    found = []
    for project, (config, rankname) in itertools.product(projects, named_configs):
        rankpath = os.path.join(project.data, rankname)
        if os.path.exists(rankpath):
            found.append(project._replace(rankpath=rankpath, config=config))
    return found


# Corpus sweep: vary the changeset flags, keep the baseline model config.
cs_dit = _existing_rank_files(projects, [
    (c, get_rank_name('changeset', 'triage', c, model_config)) for c in corpus_sweep])

cs_flt = _existing_rank_files(projects, [
    (c, get_rank_name('changeset', 'feature_location', c, model_config)) for c in corpus_sweep])

# Model sweep: vary the model config, keep the baseline changeset flags.
ms_dit = _existing_rank_files(projects, [
    (c, get_rank_name('changeset', 'triage', changeset_config, c)) for c in model_sweep])

ms_flt = _existing_rank_files(projects, [
    (c, get_rank_name('changeset', 'feature_location', changeset_config, c)) for c in model_sweep])



In [34]:
def _subject_label(item):
    """Return the display label for a project, e.g. 'BookKeeper v4.3.0'."""
    pretty = item.name.title().replace("keeper", "Keeper").replace("Openjpa", "OpenJPA")
    return pretty + " " + item.version


def _best_ranks(item, task, columns, config_columns):
    """
    Load one ranks file and reduce it to one row per id.

    item: Project with `rankpath` (CSV readable by pandas) and `config` set
    task: label stored in the "Task" column
    columns: output column order
    config_columns: (output column, key in item.config) pairs copied onto every row

    Rank and Distance are each the per-id minimum, taken independently, so
    they need not come from the same row of the file.
    """
    subdf = pandas.read_csv(item.rankpath)
    best = subdf.groupby("id")[["rank", "distance"]].min()
    frame = pandas.DataFrame({
        "Issue": best.index.values,
        "Rank": best["rank"].values,
        "Distance": best["distance"].values,
    })
    frame["Subject"] = _subject_label(item)
    frame["Task"] = task
    for column, config_key in config_columns:
        frame[column] = item.config[config_key]
    return frame[list(columns)]


def _collect_ranks(items_by_task, columns, config_columns):
    """
    Build one frame from several (task label, list of Projects) groups.

    All per-file frames are concatenated in a single call instead of growing
    the result file by file. With no files at all, an empty frame that still
    has the expected columns is returned.
    """
    frames = [
        _best_ranks(item, task, columns, config_columns)
        for task, items in items_by_task
        for item in items
    ]
    if not frames:
        return pandas.DataFrame(columns=columns)
    return pandas.concat(frames, ignore_index=True)


corpus_df = _collect_ranks(
    [("DIT", cs_dit), ("FLT", cs_flt)],
    ["Subject", "Task", "Issue", "Rank", "Distance", "Additions", "Removals", "Context", "Message"],
    [
        ("Additions", "include_additions"),
        ("Removals", "include_removals"),
        ("Context", "include_context"),
        ("Message", "include_message"),
    ],
)

model_df = _collect_ranks(
    [("DIT", ms_dit), ("FLT", ms_flt)],
    ["Subject", "Task", "Issue", "Rank", "Distance", "alpha", "eta", "K"],
    [
        ("alpha", "alpha"),
        ("eta", "eta"),
        ("K", "num_topics"),
    ],
)

In [36]:
# src.utils.calculate_mrr applied to the ranks of each
# (Subject, Task, Additions) group; the result is displayed as the cell output.
mrr_by_additions = corpus_df.groupby(["Subject", "Task", "Additions"])["Rank"].apply(src.utils.calculate_mrr)
mrr_by_additions

Subject            Task  Additions
BookKeeper v4.3.0  DIT   False        0.617247
                         True         0.622891
                   FLT   False        0.478523
                         True         0.575344
Mahout v0.10.0     DIT   False        0.280230
                         True         0.311753
                   FLT   False        0.671349
                         True         0.659577
OpenJPA v2.3.0     DIT   False        0.315077
                         True         0.344113
                   FLT   False        0.316098
                         True         0.331543
Pig v0.14.0        DIT   False        0.200080
                         True         0.176639
                   FLT   False        0.446399
                         True         0.480212
Tika v1.8          DIT   False        0.355327
                         True         0.414457
                   FLT   False        0.457941
                         True         0.503806
ZooKeeper v3.5.0   DIT   

In [41]:
def _inclusion_row(label, option, ranks_without, ranks_with):
    """
    Compare the ranks obtained without and with one changeset option.

    label: value for the "Subject" column (a (Subject, Task) key or an overall label)
    option: name of the option column being compared
    ranks_without / ranks_with: ranks where the option is False / True

    Returns one row for `res`: calculate_mrr of each side plus the
    Mann-Whitney U p-value.
    """
    # NOTE(review): whether this p-value is one- or two-sided depends on the
    # installed SciPy version's default `alternative` -- confirm before
    # reading it against a significance threshold.
    _, p = scipy.stats.mannwhitneyu(ranks_without, ranks_with)
    return {
        "Subject": label,
        "Config": option,
        "NotIncl": src.utils.calculate_mrr(ranks_without),
        "Incl": src.utils.calculate_mrr(ranks_with),
        "p": p,
    }


# Rows are gathered in a list and turned into a frame once at the end.
rows = []
for k in ["Additions", "Removals", "Context", "Message"]:
    # Per (Subject, Task): the groups hold index labels of corpus_df, so
    # label-based .loc selects exactly those rows.
    for key, group in corpus_df.groupby(["Subject", "Task"]):
        sub = group.groupby(k).groups
        rows.append(_inclusion_row(
            key, k,
            corpus_df.loc[sub[False], "Rank"],
            corpus_df.loc[sub[True], "Rank"]))

    # Across all subjects, once per task.
    sub = corpus_df.groupby(["Task", k]).groups
    for task in ["DIT", "FLT"]:
        rows.append(_inclusion_row(
            "**Overall %s**" % task, k,
            corpus_df.loc[sub[(task, False)], "Rank"],
            corpus_df.loc[sub[(task, True)], "Rank"]))

res = pandas.DataFrame(rows, columns=["Subject", "Config", "NotIncl", "Incl", "p"])

In [42]:
# Comparisons where leaving the option out gave the higher calculate_mrr
# score and the p-value is below 0.05.
excluded_scores_higher = res["NotIncl"] > res["Incl"]
below_threshold = res["p"] < 0.05
res[excluded_scores_higher & below_threshold]

Unnamed: 0,Subject,Config,NotIncl,Incl,p
15,"(BookKeeper v4.3.0, FLT)",Removals,0.541638,0.520118,0.03835963
16,"(Mahout v0.10.0, DIT)",Removals,0.31024,0.285494,0.0003012957
20,"(Pig v0.14.0, DIT)",Removals,0.212382,0.165875,7.141461e-13
24,"(ZooKeeper v3.5.0, DIT)",Removals,0.381525,0.344173,1.853858e-11
26,**Overall DIT**,Removals,0.365892,0.343557,2.612734e-14
35,"(Pig v0.14.0, FLT)",Context,0.478371,0.452237,0.02513671
44,"(Mahout v0.10.0, DIT)",Message,0.307949,0.287499,0.03774864


In [43]:
# Comparisons where including the option gave the higher calculate_mrr
# score and the p-value is below 0.05.
included_scores_higher = res["NotIncl"] < res["Incl"]
below_threshold = res["p"] < 0.05
res[included_scores_higher & below_threshold]

Unnamed: 0,Subject,Config,NotIncl,Incl,p
1,"(BookKeeper v4.3.0, FLT)",Additions,0.478523,0.575344,1.090972e-09
2,"(Mahout v0.10.0, DIT)",Additions,0.28023,0.311753,0.0006962983
4,"(OpenJPA v2.3.0, DIT)",Additions,0.315077,0.344113,0.001438463
7,"(Pig v0.14.0, FLT)",Additions,0.446399,0.480212,0.007726905
10,"(ZooKeeper v3.5.0, DIT)",Additions,0.357364,0.365314,0.0003700157
12,**Overall DIT**,Additions,0.349374,0.35801,0.000226208
13,**Overall FLT**,Additions,0.51856,0.54997,7.531264e-06
28,"(BookKeeper v4.3.0, DIT)",Context,0.56957,0.664609,1.53474e-11
34,"(Pig v0.14.0, DIT)",Context,0.172989,0.200344,9.659076000000001e-33
38,"(ZooKeeper v3.5.0, DIT)",Context,0.349857,0.371883,9.960133e-07


In [46]:
# Pairs of corpus configurations to compare. They come from the whole of
# corpus_df and do not change inside the loop, so they are built once here
# rather than once per (Subject, Task).
config_columns = ["Additions", "Removals", "Context", "Message"]
config_pairs = list(itertools.combinations(corpus_df.groupby(config_columns).groups.keys(), r=2))

for key, group in corpus_df.groupby(["Subject", "Task"]):
    # Ranks of this subject/task, split by configuration.
    ranks = dict()
    for subkey, subgroup in group.groupby(config_columns):
        ranks[subkey] = subgroup.Rank

    # Test across all configurations, then pairwise follow-ups.
    print(key, scipy.stats.friedmanchisquare(*ranks.values()))
    # A configuration present in corpus_df but missing for this subject/task
    # raises KeyError on the lookup below.
    # NOTE(review): assumes ranks[x] and ranks[y] list the same issues in the
    # same order -- confirm.
    for x, y in config_pairs:
        stat, p = scipy.stats.wilcoxon(ranks[x], ranks[y])
        if p < 0.01:
            print(x, y, p, "******")
        else:
            print(x, y, p)
    print()

('BookKeeper v4.3.0', 'DIT') (237.24182745176435, 1.2404160564716079e-42)
(True, False, True, True) (True, True, False, True) 0.00948094768239 ******
(True, False, True, True) (False, True, False, True) 0.0546324628204
(True, False, True, True) (True, True, True, False) 0.000690377537119 ******
(True, False, True, True) (True, False, True, False) 0.00844649235618 ******
(True, False, True, True) (False, True, True, True) 0.0895844008762
(True, False, True, True) (False, False, True, True) 0.299801402527
(True, False, True, True) (True, True, True, True) 3.76534149136e-06 ******
(True, False, True, True) (False, False, True, False) 0.00365825474391 ******
(True, False, True, True) (True, False, False, True) 0.0108529046923
(True, False, True, True) (False, True, True, False) 0.000176434456611 ******
(True, False, True, True) (False, True, False, False) 0.0627442166216
(True, False, True, True) (True, True, False, False) 5.6527130921e-11 ******
(True, False, True, True) (True, False, Fal

In [50]:
# Display the model-sweep frame: one row per (Subject, Task, Issue) for each
# (alpha, eta, K) configuration whose ranks file was found.
model_df

Unnamed: 0,Subject,Task,Issue,Rank,Distance,alpha,eta,K
0,Tika v1.8,DIT,241,14,0.926001,auto,auto,100
1,Tika v1.8,DIT,936,3,0.802066,auto,auto,100
2,Tika v1.8,DIT,995,5,0.819010,auto,auto,100
3,Tika v1.8,DIT,1028,10,0.835602,auto,auto,100
4,Tika v1.8,DIT,1269,6,0.880518,auto,auto,100
5,Tika v1.8,DIT,1286,2,0.808931,auto,auto,100
6,Tika v1.8,DIT,1365,12,0.853499,auto,auto,100
7,Tika v1.8,DIT,1383,2,0.732601,auto,auto,100
8,Tika v1.8,DIT,1416,7,0.911575,auto,auto,100
9,Tika v1.8,DIT,1423,2,0.790068,auto,auto,100


In [51]:
# Pairs of model configurations to compare. They come from the whole of
# model_df and do not change inside the loop, so they are built once here
# rather than once per (Subject, Task).
model_columns = ["alpha", "eta", "K"]
model_pairs = list(itertools.combinations(model_df.groupby(model_columns).groups.keys(), r=2))

for key, group in model_df.groupby(["Subject", "Task"]):
    # Ranks of this subject/task, split by model configuration.
    ranks = dict()
    for subkey, subgroup in group.groupby(model_columns):
        ranks[subkey] = subgroup.Rank

    # Test across all configurations, then pairwise follow-ups.
    print(key, scipy.stats.friedmanchisquare(*ranks.values()))
    # A configuration present in model_df but missing for this subject/task
    # raises KeyError on the lookup below.
    # NOTE(review): assumes ranks[x] and ranks[y] list the same issues in the
    # same order -- confirm.
    for x, y in model_pairs:
        stat, p = scipy.stats.wilcoxon(ranks[x], ranks[y])
        if p < 0.01:
            print(x, y, p, "******")
        else:
            print(x, y, p)
    print()

('BookKeeper v4.3.0', 'DIT') (699.43501411184604, 4.4954912006458822e-117)
(0.002, 0.01, 500.0) (0.01, 'auto', 200.0) 0.000554023956416 ******
(0.002, 0.01, 500.0) (0.01, 0.02, 100.0) 0.018316758109
(0.002, 0.01, 500.0) (0.01, 'auto', 500.0) 0.738796732663
(0.002, 0.01, 500.0) (0.002, 0.004, 500.0) 0.000102868986574 ******
(0.002, 0.01, 500.0) ('auto', 0.025, 200.0) 0.0532813810404
(0.002, 0.01, 500.0) ('auto', 0.05, 100.0) 1.09810267306e-11 ******
(0.002, 0.01, 500.0) (0.01, 0.05, 100.0) 2.77420704614e-08 ******
(0.002, 0.01, 500.0) (0.01, 0.005, 200.0) 0.376903323884
(0.002, 0.01, 500.0) (0.05, 0.05, 100.0) 2.3584339386e-10 ******
(0.002, 0.01, 500.0) (0.002, 0.002, 500.0) 0.87196376737
(0.002, 0.01, 500.0) ('auto', 0.002, 500.0) 0.0114105878642
(0.002, 0.01, 500.0) (0.004, 0.01, 500.0) 1.65457999906e-07 ******
(0.002, 0.01, 500.0) (0.01, 0.01, 500.0) 3.45540693176e-06 ******
(0.002, 0.01, 500.0) (0.025, 0.025, 200.0) 0.000501002615128 ******
(0.002, 0.01, 500.0) (0.05, 'auto', 100.0

In [54]:
# Friedman test per task over all model configurations, pooling every
# subject. Only this test is reported here; no pairwise follow-up is run.
# Grouping by the plain column name keeps `key` a scalar label.
for key, group in model_df.groupby("Task"):
    ranks = dict()
    for subkey, subgroup in group.groupby(["alpha", "eta", "K"]):
        ranks[subkey] = subgroup.Rank

    print(key, scipy.stats.friedmanchisquare(*ranks.values()))

DIT (2451.7193649520473, 0.0)
FLT (836.58592171787814, 4.1273171489425937e-145)
