Grid Search Analysis
===

This integrated notebook analyses all grid search results in one shot.

Compares the results of the grid search per dataset. And spits out the best one...


In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from matplotlib import cm
from scipy import stats
from scipy.stats import ttest_ind



import json
import codecs
import os
from os import path

In [None]:
# set to false if the full csv already exists
# NOTE(review): this flag is not read anywhere in the visible part of the notebook —
# confirm whether it is still needed.
CONVERT_FROM_RAW=True

In [None]:
basepath = os.path.normpath("C:\\Users\\hatieke\\temp")

In [None]:
basepath = os.path.normpath("C:\\Users\\hatieke\\.ukpsummarizer\\results\\grid_search")

In [None]:
# NOTE: this is the last of three consecutive basepath assignments, so on "Run All"
# it is the one that takes effect and overrides the two cells before it.
basepath = os.path.normpath("C:\\Users\\hatieke\\.ukpsummarizer\\results\\scores_grid_2017-09-15")

In [None]:
def new_parser(dir):
    """Yield the parsed content of every non-empty ``result-*.json`` file below ``dir``.

    The directory tree is walked recursively with ``os.walk``. Every file whose
    name starts with ``result-`` and ends with ``.json`` is loaded with
    ``json.load``; zero-byte files are skipped. The directory in which the file
    was found is stored under the key ``"parent_directory"`` of the parsed
    object before it is yielded.

    :param dir: root directory to search (the name shadows the builtin, but is
                kept so that keyword callers continue to work)
    :return: generator of the parsed JSON objects
    """
    for root, _dirs, files in os.walk(dir):
        for name in files:
            if not (name.startswith("result-") and name.endswith(".json")):
                continue
            fn = os.path.join(root, name)
            if os.path.getsize(fn) <= 0:
                # empty result file: nothing to parse
                continue
            with open(fn) as fp:
                data = json.load(fp)
            # Yield only after the file has been closed, so that no handle stays
            # open while the consumer works on the record or abandons the generator.
            data["parent_directory"] = root
            yield data
                        

In [None]:
def parse_single_result_per_iteration_into_rows(obj):
    """Flatten one parsed result object into one row (dict) per recorded ROUGE score.

    ``obj`` is the dictionary loaded from a ``result-*.json`` file. For every
    entry of ``obj["result_rougescores"]`` one flat dictionary is yielded that
    combines the scores of that entry, the feedback counted up to that
    iteration, and the run-wide configuration values.

    :param obj: parsed result. ``config_feedbackstore`` and ``result_rougescores``
                are always read; ``log_feedbacks`` and ``models`` are read as soon
                as there is at least one score entry; all other keys are optional.
    :return: generator of dictionaries, suitable as rows for ``pd.DataFrame``
    """
    config = obj[u'config_feedbackstore']
    results = list(obj["result_rougescores"])
    if not results:
        return

    # --- values that are identical for every row of this result object ---
    cfg_json = json.dumps(config)
    # the threshold is stored under two different spellings, depending on the class
    cutoff = config.get(u"cut_off_threshold", config.get("cutoff_threshold"))

    parse_type = obj.get("config_parse_type")
    # A missing parse type and the literal text "None" both mean plain n-grams.
    # Strings have to be compared with ==; an identity check ("is") between two
    # freshly lower-cased strings never matches, so the fallback was never applied.
    if parse_type is None or parse_type.lower() == "none":
        parse_type = "ngrams"

    # upper-bound scores; -1 marks "not available"
    ub = obj.get("model_rougescores", {})
    ub_r1 = ub.get("ROUGE-1 R score", -1)
    ub_r2 = ub.get("ROUGE-2 R score", -1)
    ub_r4 = ub.get("ROUGE-SU* R score", -1)

    ref_summary = str([item["name"] for item in obj[u'models']])
    feedbacks = obj[u'log_feedbacks']
    max_iteration_count = len(results) - 1

    for res in results:
        # The initial summary is stored as iteration 0 without any accepted or
        # rejected concepts; it is moved to iteration -1 so that it can be told
        # apart from the first real feedback round.
        is_initial = res["iteration"] == 0 and len(res[u'accepted']) == 0 and len(res["rejected"]) == 0
        iteration = res["iteration"] - 1 if is_initial else res["iteration"]

        accept_count = res["accept_count"]
        reject_count = res["reject_count"]

        # cumulative feedback given strictly before this iteration;
        # every feedback value other than 'accept' counts as a reject
        total_accept = sum(1 for fb in feedbacks if fb["value"] == 'accept' and fb["iteration"] < iteration)
        total_reject = sum(1 for fb in feedbacks if fb["value"] != 'accept' and fb["iteration"] < iteration)

        yield {
            "accept": total_accept,
            "reject": total_reject,
            "accept_new": accept_count,
            "reject_new": reject_count,
            "feedback_new": accept_count + reject_count,
            "total_feedback": total_accept + total_reject,
            "ref_summary": ref_summary,
            "cfg": cfg_json,
            "num_iterations": iteration,
            "r1": res[u'ROUGE-1 R score'],
            "r2": res[u'ROUGE-2 R score'],
            "r4": res[u'ROUGE-SU* R score'],
            "oracle": obj.get("config_oracle_type", "accept"),
            "classtype": config.get(u'type'),
            "iterations_accept": config.get(u'iterations_accept'),
            "iterations_reject": config.get(u'iterations_reject'),
            "propagation_abort_threshold": config.get(u'propagation_abort_threshold'),
            "mass_accept": config.get(u'mass_accept'),
            "mass_reject": config.get(u'mass_reject'),
            "window_size": config.get(u'N'),
            "multiplier_reject": config.get(u"multiplier_reject"),
            "multiplier_accept": config.get(u"multiplier_accept"),
            "cutoff_threshold": cutoff,
            "run_id": obj.get("config_run_id"),
            "ub_r1": ub_r1,
            "ub_r2": ub_r2,
            "ub_r4": ub_r4,
            "phrase_type": parse_type,
            "embeddings": obj.get("config_wordembeddings"),
            # hard-coded constant in every row
            "size": 100,
            "max_iteration_count": max_iteration_count,
            "parent": obj.get("parent_directory", "unknown/parent/directory")
        }


In [None]:
# Build the raw frame: every result file found below basepath is expanded into one
# row per recorded ROUGE score. The generator expression keeps the helper names
# (i, r) out of the notebook namespace.
odf = pd.DataFrame((r for i in new_parser(basepath) for r in parse_single_result_per_iteration_into_rows(i)))

In [None]:
list(odf.columns)

In [None]:
# De-duplicate on every extracted column except "parent": a result that was stored
# in more than one directory is kept only once (the first occurrence wins).
unidf = odf.drop_duplicates(subset=[
    "accept", "reject", "accept_new", "reject_new", "feedback_new", "total_feedback",
    "ref_summary", "cfg", "num_iterations",
    "r1", "r2", "r4",
    "oracle", "classtype",
    "iterations_accept", "iterations_reject", "propagation_abort_threshold",
    "mass_accept", "mass_reject", "window_size",
    "multiplier_reject", "multiplier_accept", "cutoff_threshold",
    "run_id",
    "ub_r1", "ub_r2", "ub_r4",
    "phrase_type", "embeddings", "size", "max_iteration_count",
])

In [None]:
unidf.to_csv(path.join(basepath, "grid_search_dataframe_unique_rows.csv"))

In [None]:
# convert pojo into string
#df["cfg"] = df.apply(lambda x: json.dumps(x["cfg"]), axis=1)

In [None]:
unidf.info()

In [None]:
# Work on an independent copy: pd.DataFrame(unidf) does not copy the underlying
# data by default, so the column assignments below could write through to unidf.
cdf = unidf.copy()

# Configuration columns. Even where their values are numeric they are treated as
# categories, because they enumerate grid points rather than measurements.
categorical_columns = [
    "classtype", "cfg", "cutoff_threshold", "embeddings",
    "mass_accept", "mass_reject", "max_iteration_count",
    "multiplier_accept", "multiplier_reject",
    "oracle", "parent", "phrase_type",
    "propagation_abort_threshold", "size", "window_size",
]
for column in categorical_columns:
    cdf[column] = cdf[column].astype("category")


In [None]:
odf = cdf

In [None]:
# Keep only rows in which all three ROUGE recall scores are strictly positive;
# anything else is considered an implausible result.
ddf = odf.loc[(odf.r2 > 0) & (odf.r1 > 0) & (odf.r4 > 0)]

In [None]:
# only do this to exclude the initial value (-1)
ddf = ddf.loc[odf.num_iterations >= 0]

In [None]:
ddf.info()

In [None]:
# Measurement and identifier columns: counting rows per distinct value is not
# meaningful for these, so they are left out of the overview below.
excluded_cols = set([
    'run_id', 'cfg', 'parent', 'max_iteration_count',
    'r1', 'r2', 'r4', 'ub_r1', 'ub_r2', 'ub_r4',
    'total_feedback', 'feedback_new',
    'accept', 'accept_new', 'reject', 'reject_new',
])
# Keep the frame's own column order so that the output is the same on every run
# (a set difference has no defined order).
histogram_cols = [col for col in cdf.columns if col not in excluded_cols]
for col in histogram_cols:
    # number of rows with a non-null run_id per distinct value of this column
    print(cdf.loc[:, [col, "run_id"]].groupby([col]).count()["run_id"])
    

In [None]:
cdf.hist(figsize=(16,9))
plt.show()

Construction of a dataframe that fits the analysis needs
===

unabhängige Variablen, also Steuervariablen kann ich festlegen, die den Ausgang der Messung beeinflussen (in unbekannter Art). Im Dataframe sind das folgende Spalten:

* classtype
* cutoff_threshold
* iterations_accept
* iterations_reject
* mass_accept
* mass_reject
* multiplier_accept
* multiplier_reject
* propagation_abort_threshold
* ref_summary
* window_size
* num_iterations    
* oracle

Und folgende sind die abhängigen Variablen, also die Messvariablen, die quasi die Manifestation der Effekte sind.

* accept
* num_iterations
* r1
* r2
* r4
* reject
* total_feedback

Es soll untersucht werden, welche Kombinationen unabhängiger Variablen die besten sind, also die baseline am weitesten übertreffen.

Dazu muss in jeder ref_summary-gruppe der wert der "baseline r2" als extra feld hinzugefügt werden und auch die differenz

Aus Rouge-2 baseline neue Spalten ableiten: als Delta und als Ratio 
---

In [None]:
ddf.loc[ddf.classtype.str.lower().str.contains("baseline")].loc[ddf.r2 == ddf.ub_r2].head(1).transpose()

In [None]:
baselines = ddf.classtype.str.contains("Baseline")


In [None]:
ddf.loc[baselines].loc[ddf.oracle.str.contains("active_learning")].loc[ddf.phrase_type.str.contains("None")].groupby(["parent", "num_iterations","phrase_type", "oracle"]).count()

In [None]:
ddf.groupby("ref_summary").count().transpose()

In [None]:
# Boolean mask over ddf: rows whose feedback-store class name contains "Baseline".
baselines = ddf.classtype.str.contains("Baseline")

# Baseline rows at iteration 0, reduced to the columns needed for the later merge.
baseline_scores = ddf.loc[baselines].loc[
    ddf.num_iterations == 0,
    ["ref_summary", "accept", "reject", "r1", "r2", "r4", "phrase_type"],
]
print(baseline_scores.count())



In [None]:
baseline_scores.head(1).transpose()

In [None]:
# Give the baseline columns a "bl" suffix so that they do not clash with the
# per-run columns of the same name after merging. rename() returns a new frame;
# the in-place variant is avoided because baseline_scores is a selection taken
# from ddf, and mutating such a selection in place can raise SettingWithCopyWarning.
baseline_scores = baseline_scores.rename(
    columns={"r2": "r2bl", "r1": "r1bl", "r4": "r4bl", "accept": "acceptbl", "reject": "rejectbl"}
)
# Keep exactly one baseline row per (reference summary, phrase type);
# the first row encountered wins.
baseline_scores = baseline_scores.drop_duplicates(["ref_summary", "phrase_type"])

In [None]:
baseline_scores.groupby(["ref_summary", "phrase_type"]).max()

In [None]:
baseline_scores

In [None]:
baseline_scores.groupby(["phrase_type","ref_summary"]).agg([np.min, np.max])

In [None]:
# merge df and baseline_scores, so that the r2bl score is now available in every run.
# pd.merge performs an inner join by default, so rows of ddf whose
# (ref_summary, phrase_type) pair has no baseline row are dropped here.
df= pd.merge(ddf, baseline_scores, on=["ref_summary", "phrase_type"])

In [None]:
df.head(1).transpose()

In [None]:
# add the delta of baseline and actual r2 as separate column
df["delta_r2bl_r2"] = df["r2bl"] - df["r2"]
df["ratio_r2_vs_r2bl"] = df["r2"] / df["r2bl"]

In [None]:
# add the delta and ratio of the upper bound r2 as separate columns:
df["delta_r2ub"] = df["ub_r2"] - df["r2"]
df["ratio_r2ub"] = df["r2"] / df["ub_r2"]
# NOTE(review): total_feedback is presumably 0 for rows recorded before any feedback
# was given; the division then yields inf — confirm that later statistics cope with it.
df["ratio_r2ub_vs_feedback"] = df["ratio_r2ub"] / df["total_feedback"]

In [None]:
# Denominator and numerator of the scaled score that is computed as term / divisor:
# 0 corresponds to the baseline r2, 1 to the upper-bound r2.
# NOTE(review): the row parser stores ub_r2 = -1 when a result has no
# "model_rougescores"; presumably such rows should be excluded first — confirm.
divisor = df["ub_r2"] - df["r2bl"]
term = df["r2"] - df["r2bl"]


In [None]:
df["r2bl"].describe()

In [None]:
scaled_r2 = term / divisor

In [None]:
scaled_r2.describe()

In [None]:
df["scaled_r2"] = scaled_r2

In [None]:
df.to_csv(path.join(basepath, "grid_search_df_with_r2ub_and_r2bl.csv"))

In [None]:
df.head().loc[:,["num_iterations","r2bl","ub_r2","r2"]].transpose()

In [None]:
# Restrict to topic D31043. The id is matched literally (regex=False) so that the
# dots in it are not interpreted as "any character" regex wildcards.
D31043 = df.loc[df.ref_summary.str.contains("D31043.M.100.T", regex=False)]

In [None]:
# Restrict to topic D30044. The id is matched literally (regex=False) so that the
# dots in it are not interpreted as "any character" regex wildcards.
D30044 = df.loc[df.ref_summary.str.contains("D30044.M.100.T", regex=False)]

Overall information
===

In [None]:
df.groupby(["phrase_type", "ref_summary"]).count()

In [None]:
df.groupby("ref_summary").count()

What classtypes are there? And how many of each?

In [None]:
df.groupby("classtype").count()

In [None]:
df.groupby("oracle").count()

In [None]:
len(df.groupby("cfg").count())

In [None]:
len(df.groupby(["parent", "cfg"]).count())

Methods
===

In [None]:
def scatterplot(data, locs, x='ratio_r2_vs_r2bl',y='total_feedback'):
    """Draw several row selections of ``data`` into one colour-coded scatter plot.

    :param data: DataFrame containing the columns named by ``x`` and ``y``
    :param locs: sequence of row selectors, each usable as ``data.loc[selector]``;
                 every selector gets its own colour. An empty sequence draws
                 nothing.
    :param x: name of the column plotted on the x axis
    :param y: name of the column plotted on the y axis
    """
    # NOTE(review): newer matplotlib releases call this colormap "tab10" —
    # confirm against the installed version.
    palette = plt.get_cmap("Vega10").colors
    ax = None
    for i in range(len(locs)):
        # Wrap around the palette so that more selections than colours do not
        # raise an IndexError.
        colour = palette[i % len(palette)]
        # ax is None for the first selection, which lets pandas create the axes;
        # all following selections are drawn onto those same axes.
        ax = data.loc[locs[i]].plot.scatter(x=x, y=y, color=colour, ax=ax)
    plt.show()

In [None]:
def myplot(df, indep_var, dep_var, style = None, groupby = None, classtype = None, save=False, prefix=None, **kwargs):
    """Plot ``dep_var`` against ``indep_var``, optionally per group, and show the figure.

    :param df: DataFrame containing ``dep_var``, ``indep_var`` and, if given,
               the ``groupby`` column(s)
    :param indep_var: column whose distinct values form the boxes / the x axis
    :param dep_var: column holding the measured values
    :param style: ``"whisker"`` (default) or ``"groupby"`` for a box plot by
                  ``indep_var``; ``"line"`` for a line plot of min, mean, median
                  and max. Any other value draws nothing.
    :param groupby: optional column name (or list of names); when given, the
                    data is grouped by it before plotting
    :param classtype: label that is only used in the file name (default ``"all"``)
    :param save: when true, the figure is written as PDF into ``basepath``
    :param prefix: optional file-name prefix, joined with ``--``
    :param kwargs: passed through to the pandas plotting call
    """
    style = style or "whisker"
    classtype = classtype or "all"

    # Build the selection from the function's own arguments. Reading the notebook
    # globals dep / indep / grouping_var instead made the result depend on
    # whatever cell happened to run last.
    columns = [dep_var, indep_var]
    if groupby is not None:
        group_columns = list(groupby) if isinstance(groupby, (list, tuple)) else [groupby]
        # avoid selecting a column twice when it is also the (in)dependent variable
        columns.extend(c for c in group_columns if c not in columns)

    data = df.loc[:, columns]
    if groupby is not None:
        data = data.groupby(groupby)

    # Strings are compared with ==/in; "is" only worked by accident of interning.
    if style in ("groupby", "whisker"):
        # both styles draw the same box plot
        data.boxplot(by=indep_var, showmeans=True, meanline=True, rot=90, fontsize="8", sym="+", vert=True, **kwargs)
    elif style == "line":
        data.agg([np.min, np.mean, np.median, np.max])\
            .plot(marker="o", **kwargs)

    if save:
        filename =  "%s-%s-%s-%s-%s.pdf" % (classtype, groupby, dep_var, indep_var, style)
        if prefix:
            filename = prefix +"--" + filename
        plt.savefig(path.join(basepath, filename))
    plt.show()
    plt.close()

def bp(data, col, dep_var, grouping = None, classtype=""):
    """Plot ``dep_var`` per value of ``col``, save the figure as PDF and show it.

    Plotting is best effort: if anything goes wrong, the problem is printed
    (including the exception) and the notebook continues.

    :param data: DataFrame containing ``col``, ``dep_var`` and ``ref_summary``
    :param col: independent variable
    :param dep_var: column holding the measured values
    :param grouping: ``None`` for a single box plot, ``"groupby"`` for one box-plot
                     panel per ``ref_summary``, ``"line"`` for a line plot of
                     min, mean, median and max per value of ``col``
    :param classtype: label that is only used in the file name
    """
    columns = [col, dep_var, "ref_summary"]
    try:
        # Strings are compared with ==; "is" only worked by accident of interning.
        if grouping == "groupby":
            data.loc[:, columns].groupby("ref_summary").boxplot(by=col, showmeans=True, meanline=True, sym="+", vert=True)
        elif grouping is None:
            data.loc[:, columns].boxplot(by=col, figsize=(10,4), showmeans=True, meanline=True, sym="+", vert=True)
        elif grouping == "line":
            data.loc[:, columns].groupby(col).agg([np.min, np.mean, np.median, np.max]).plot(marker="o")

        plt.savefig(path.join(basepath, "%s-%s-%s-per-summary-%s.pdf" % (grouping, classtype, dep_var,col)))
        plt.show()
    except Exception as exc:
        # Exception instead of a bare except, so that KeyboardInterrupt still
        # stops the notebook; the cause is reported instead of being hidden.
        print("problem while plotting %s %s: %r" % (col, dep_var, exc))
    finally:
        plt.close()


User study analysis
===

Analysing only the configurations that are part of the user study:

* concept-type: parse
* summarizer-type: propagation
* oracle: active_learning2
* GB:   `{"mass_reject": 0.0, "iterations_accept": 128, "mass_accept": 4.0, "cutoff_threshold": 0.6, "type": "WordEmbeddingGaussianFeedbackGraph", "iterations_reject": 16}`
* RW: `{"mass_reject": -1.0, "iterations_accept": 1024, "propagation_abort_threshold": 0.25, "mass_accept": 1.0, "cut_off_threshold": 0.6, "type": "WordEmbeddingRandomWalkDiffusionFeedbackGraph", "iterations_reject": 200}`
  





Option 1: Random Walk
----

In [None]:
# Random-walk configuration of the user study, restricted to topic D31043.
# All masks are computed on df and are aligned to D31043 by row label.
rw = D31043.loc[df.classtype.str.contains("WordEmbeddingRandomWalkDiffusionFeedbackGraph")].loc[
    (df.mass_accept == 1)
    & (df.mass_reject == -1)
    & (df.iterations_accept == 1024)
    & (df.iterations_reject == 200)
    & (df.cutoff_threshold == 0.6)
    & (df.propagation_abort_threshold == 0.25)
    & (df.phrase_type == "parse")
]

# Keep only the highest iteration of each run: sort descending by iteration, then
# keep the first row per (cfg, classtype, ref_summary, oracle, embeddings, phrase_type).
rw = rw.sort_values(by="num_iterations", ascending=False)
rw = rw.drop_duplicates(["cfg", "classtype", "ref_summary","oracle", "embeddings", "phrase_type"])

print(rw.groupby("oracle")["r2"].describe())
print(rw.groupby("oracle")["num_iterations"].describe())

In [None]:
print rw.to_csv()

Option 2: Gaussian Blur Feedback
---

In [None]:
# Gaussian-feedback configuration of the user study, restricted to topic D31043.
# All masks are computed on df and are aligned to D31043 by row label.
gb = D31043.loc[df.classtype.str.contains("WordEmbeddingGaussianFeedbackGraph")].loc[
    (df.mass_accept == 4)
    & (df.mass_reject == 0)
    & (df.iterations_accept == 128)
    & (df.iterations_reject == 16)
    & (df.cutoff_threshold == 0.6)
    & (df.phrase_type == "parse")
]

# Keep only the highest iteration of each run: sort descending by iteration, then
# keep the first row per (cfg, classtype, ref_summary, oracle, embeddings, phrase_type).
gb = gb.sort_values(by="num_iterations", ascending=False)
gb = gb.drop_duplicates(["cfg", "classtype", "ref_summary","oracle", "embeddings", "phrase_type"])

print("gaussian")
print(gb.groupby("oracle")["r2"].describe())
print(gb.groupby("oracle")["num_iterations"].describe())

In [None]:
print gb.to_csv()

Option 3: baseline
---

In [None]:
# Baseline runs of topic D31043 with phrase type "parse", reduced to the highest
# iteration of each run (sort descending, keep the first row per key).
bl = (
    D31043
    .loc[df.classtype.str.contains("BaselineFeedbackStore")]
    .loc[df.phrase_type == "parse"]
    .sort_values(by="num_iterations", ascending=False)
    .drop_duplicates(["cfg", "classtype", "ref_summary","oracle", "embeddings", "phrase_type"])
)

print(bl.groupby("oracle")["r2"].describe())
print(bl.groupby("oracle")["num_iterations"].describe())

Option 1-3 united
---

In [None]:
# Stack the three selections row-wise; the rows keep their original index labels.
# pd.concat replaces the chained DataFrame.append calls, which are deprecated and
# no longer available in pandas 2.0.
joined = pd.concat([rw, gb, bl])


In [None]:
print joined.to_csv()

In [None]:
print joined.groupby("oracle")["r2"].describe()
print joined.groupby("oracle")["num_iterations"].describe()

Defining the dependent variables of interest
===

The dependent variables are the variables that measure the impact. 

* `ratio_r2ub` is the relative comparison to the Upper Bound
* `ratio_r2_vs_r2bl` is the relative comparison against the baseline. If r2 > r2bl, then we are better than the baseline!


In [None]:
dep_vars = ["ratio_r2ub", "r2", "ratio_r2_vs_r2bl"]

In [None]:
# the variable we want to analyse
dep_var = "ratio_r2ub"

In [None]:
# the variable we want to analyse
dep_var = "ratio_r2ub"

In [None]:
# the variable we want to analyse
# NOTE: this cell runs after the two cells that set dep_var = "ratio_r2ub", so on
# "Run All" this is the value in effect until dep_var is assigned again.
dep_var = "ratio_r2ub_vs_feedback"

Is phrase better than ngram?
---

In [None]:
indep_vars = ["phrase_type"]
grouping_var = None

dep_vars = ["r2", 'ratio_r2ub',  'ratio_r2_vs_r2bl']

# Columns needed for the plots. The list must start empty: an empty-string entry
# is not a column of df and makes the selection below fail (or yields an
# all-NaN column, depending on the pandas version).
cols = []
cols.extend(indep_vars)
cols.extend(dep_vars)

if "ref_summary" not in cols:
    cols.append("ref_summary")


topic1 = df.loc[:,cols]

# one un-grouped figure per (dependent, independent) pair
for dep in dep_vars:
    for indep in indep_vars:
        print(dep)
        print(indep)
        myplot(topic1, indep, dep, save=True, prefix="phrase-vs-ngram")

Is this difference related to the type of propagation?
---

In [None]:
indep_vars = ["phrase_type"]
grouping_var = "classtype"
dep_vars = ["r2", 'ratio_r2ub',  'ratio_r2_vs_r2bl']

# Plot input: independent variable(s), grouping column and dependent variables,
# plus the reference summary unless it is already part of the list.
cols = indep_vars + [grouping_var] + dep_vars
if "ref_summary" not in cols:
    cols.append("ref_summary")

topic1 = df.loc[:, cols]

# One figure per (dependent, independent) pair, grouped by grouping_var.
for dep in dep_vars:
    for indep in indep_vars:
        print(dep)
        print(indep)
        myplot(topic1, indep, dep, groupby=grouping_var, save=True, layout=(1,4), figsize=(16,4), prefix="concept-type-vs-classtype")

How do configurations develop over time?
===

* grouping by cfg and runid
* num_iterations on x, 
* r2, accept_count, reject_count on y. 

In [None]:
#data = df.sample(20000)

In [None]:
data = df

In [None]:
indep_vars = ["num_iterations"]
grouping_var = "classtype"
dep_vars = ['accept_new', "reject_new", "accept", "reject"]

# "parent" is carried along in addition to the plotted columns; the reference
# summary is added unless it is already part of the list.
cols = ["parent"] + indep_vars + [grouping_var] + dep_vars
if "ref_summary" not in cols:
    cols.append("ref_summary")

topic1 = data.loc[:, cols]

# One figure per (dependent, independent) pair, grouped by grouping_var.
for dep in dep_vars:
    for indep in indep_vars:
        print(dep)
        print(indep)
        myplot(topic1, indep, dep, groupby=grouping_var, save=True, layout=(1,4), figsize=(16,4), prefix="feeback-over-time")

In [None]:
data = df

In [None]:
indep_vars = ["num_iterations"]
grouping_var = "phrase_type"
dep_vars = ['accept_new', "reject_new", "accept", "reject"]

# Plot input: independent variable(s), grouping column and dependent variables,
# plus the reference summary unless it is already part of the list.
cols = indep_vars + [grouping_var] + dep_vars
if "ref_summary" not in cols:
    cols.append("ref_summary")

topic1 = data.loc[:, cols]

# One figure per (dependent, independent) pair, grouped by grouping_var.
for dep in dep_vars:
    for indep in indep_vars:
        print(dep)
        print(indep)
        myplot(topic1, indep, dep, groupby=grouping_var, save=True, layout=(1,4), figsize=(16,4), prefix="feeback-over-time-by-concept-type")

In [None]:
topic1.columns

In [None]:
[1,2,3] + [4] + [2,3,4]

In [None]:
# NOTE(review): same variables, same myplot arguments and same file-name prefix as
# the preceding "feeback-over-time-by-concept-type" cell; only the way cols is built
# differs (a set, so the column order is arbitrary). With save=True the PDFs written
# by that cell are overwritten — confirm whether this repetition is still needed.
indep_vars = ["num_iterations"] 
grouping_var = "phrase_type"

dep_vars = [ 'accept_new', "reject_new", "accept", "reject"]

cols = list(set(indep_vars +  [grouping_var] + dep_vars + ["ref_summary"]))

topic1 = data.loc[:,cols]

for dep in dep_vars:
    for indep in indep_vars:
        print dep
        print indep
        myplot(topic1, indep, dep, groupby=grouping_var, save=True, layout=(1,4), figsize=(16,4), prefix="feeback-over-time-by-concept-type")

In [None]:
data=df

In [None]:
#data = df.sample(20000)

In [None]:
data.groupby("cfg").count()

In [None]:
data.groupby("run_id").count()

In [None]:
key = lambda x: x.num_iterations

In [None]:
d = data.loc[:,["run_id","accept","num_iterations"]].groupby("num_iterations")
d = d.head(20)

In [None]:
plt.close()
d.plot()
plt.show()

In [None]:
ax = None
# Named "palette" so that the `cm` module imported from matplotlib at the top of
# the notebook is not overwritten by a colormap object.
palette = plt.get_cmap("Vega10").colors
for idx, (name, group) in enumerate(data.groupby("num_iterations")):
    print("%s %s" % (idx, name))
    if idx > 20:
        # only the first 21 iteration groups are drawn
        break
    # all groups share one axes; colours repeat once the palette is exhausted
    ax = group.plot.scatter(x="num_iterations", y="ratio_r2_vs_r2bl", color=palette[idx % len(palette)], ax=ax)

plt.show()

In [None]:
42 %3

Comparing the different classtypes against each other
===

In [None]:
# Select the four columns of interest before aggregating, so that len/min/mean/max
# are computed for these columns only instead of for every column of df
# (cheaper, and mean() is never attempted on the string columns).
description = df.groupby(["ref_summary", "classtype"])[["r2", "ub_r2", "accept", "ratio_r2_vs_r2bl"]].agg([len, np.min, np.mean, np.max])

In [None]:
print description.to_latex(float_format='{0:.3f}'.format)

In [None]:
# Same summary with more columns. The baseline ratio column is named
# "ratio_r2_vs_r2bl"; "ratio_r2_vs_bl" does not exist in df. The columns are
# selected before aggregating so that only they are aggregated.
description = df.groupby(["ref_summary", "classtype"])[["num_iterations", "accept", "reject", "ub_r2", "ratio_r2ub", "r2", "ratio_r2_vs_r2bl"]].agg([len, np.min, np.mean, np.max])

In [None]:
print description.to_csv()

Best config, first try
====

best configuration as per agreement between all summaries of same config

We group by configuration and then use the average `ratio_r2ub` score as the performance measure.

In [None]:
myf = df

In [None]:
myf.info()

In [None]:
# use the highest iteration only
myf = myf.sort_values(by="num_iterations", ascending= False).drop_duplicates(["cfg", "classtype", "ref_summary","oracle","embeddings", "phrase_type"])

In [None]:
len(myf)

In [None]:
 # Mean scores of the Gaussian-feedback runs per (phrase_type, oracle, cfg), best
 # upper-bound ratio first. The upper-bound column is "ub_r2" (no leading space).
 myf.loc[myf.classtype.str.contains("WordEmbeddingGaussianFeedbackGraph")].groupby(["phrase_type", "oracle","cfg"]).mean().loc[:,["r2", "ratio_r2ub", "r2bl", "ub_r2", "ratio_r2_vs_r2bl", "num_iterations"]].sort_values(by="ratio_r2ub", ascending=False)

In [None]:
myf.loc[myf.cfg.str.contains('"mass_reject": -1.0, "iterations_accept": 1024, "propagation_abort_threshold": 0.25, "mass_accept": 1.0, "cut_off_threshold": 0.6, "type": "WordEmbeddingRandomWalkDiffusionFeedbackGraph", "iterations_reject": 200')].groupby("ref_summary").mean().sort_values("ratio_r2_vs_r2bl", ascending=False)

In [None]:
myf.loc[myf.cfg.str.contains('"mass_reject": 0.0, "iterations_accept": 128, "mass_accept": 4.0, "cutoff_threshold": 0.6, "type": "WordEmbeddingGaussianFeedbackGraph", "iterations_reject": 16')].groupby("ref_summary").mean().sort_values("ratio_r2_vs_r2bl", ascending=False)

In [None]:
df.iloc[170774]

In [None]:
myf = df.loc[:,["cfg", "ref_summary","num_iterations"]].drop_duplicates(["cfg", "ref_summary","num_iterations"])


In [None]:
# Gaussian-feedback runs of topic D31043 with a cut-off threshold of at least 0.998
# and an active-learning oracle.
# cutoff_threshold was converted to an (unordered) categorical column earlier in the
# notebook, and unordered categoricals do not support ">="; compare its float values.
# NOTE(review): the "u'" prefix in the topic pattern presumably only matches when
# ref_summary was built under Python 2 (repr of unicode strings) — confirm.
myf = (
    df.loc[df.classtype.str.contains("WordEmbeddingGaussianFeedbackGraph")]
    .loc[df.cutoff_threshold.astype(float) >= 0.998]
    .loc[df.ref_summary.str.contains("u'D31043.M.100.T")]
    .loc[df.oracle.str.contains("active_learning")]
)

In [None]:
len(myf)

In [None]:
group_merge = myf.groupby(["cfg", "ref_summary"]).num_iterations.apply(np.max).reset_index().merge(myf, on=["cfg","ref_summary", "num_iterations"])
%timeit myf.groupby(["cfg", "ref_summary"]).num_iterations.apply(np.max).reset_index().merge(myf, on=["cfg","ref_summary", "num_iterations"])

In [None]:
sort_drop=myf.sort_values(by=["cfg", "ref_summary","num_iterations"], ascending= [True, True, False]).drop_duplicates(["cfg", "ref_summary"])
%timeit myf.sort_values(by=["cfg", "ref_summary","num_iterations"], ascending= [True, True, False]).drop_duplicates(["cfg", "ref_summary"])

In [None]:
sort_drop2=myf.sort_values(by=["num_iterations"], ascending= [ False]).drop_duplicates(["cfg", "ref_summary"])
%timeit myf.sort_values(by=["num_iterations"], ascending= [False]).drop_duplicates(["cfg", "ref_summary"])

In [None]:
group_merge.info()

In [None]:
sort_drop2.info()

In [None]:
x = pd.merge(sort_drop2,sort_drop, how="inner", on=["cfg", "ref_summary","num_iterations"])

In [None]:
x.info()

In [None]:
x.dropna(inplace=True)

In [None]:
x.info()

In [None]:
x.drop_duplicates(["cfg", "ref_summary"])

In [None]:
# NOTE(review): iteration_indexes is not defined in any visible cell of this notebook;
# unless it is defined in a part not shown here, this cell raises NameError on a
# fresh kernel.
f2 = iteration_indexes.sample(2000)

In [None]:
iteration_indexes

In [None]:
f2.info()

In [None]:
myf.iloc[6845]

In [None]:
myf.loc[myf.cfg ]

In [None]:
# Random sample of the merged frame (at most 2000 rows); seeded so that a re-run
# of the notebook works on the same rows.
myf = df.sample(min(2000, len(df)), random_state=42)

# The `baselines` mask created earlier is indexed like ddf, whereas df received a
# new index from pd.merge; applying it here would select unrelated rows.
# Derive the mask from the sample itself instead.
is_baseline = myf.classtype.str.contains("Baseline")

# throw away uninteresting columns and give the baseline columns a "bl" suffix
baseline_scores = myf.loc[is_baseline, ["ref_summary","num_iterations","accept", "reject","r2"]].rename(
    columns={"r2":"r2bl", "accept": "acceptbl", "reject":"rejectbl"}
)
# make sure, we have everything in there only once.
baseline_scores = baseline_scores.drop_duplicates(["ref_summary","num_iterations"])
# keep only baselines that received both accepts and rejects
baseline_scores = baseline_scores.loc[(baseline_scores.acceptbl > 0) & (baseline_scores.rejectbl > 0)]

In [None]:
dep_var = "ratio_r2ub"

In [None]:
df.groupby(["ref_summary", "classtype"]).max().loc[:,["r2", "ratio_r2ub","ratio_r2_vs_r2bl", "accept"]]

In [None]:
top = df.loc[df["ratio_r2_vs_r2bl"]>1.0].groupby(["cfg","ref_summary"]).max().sort_values("ratio_r2_vs_r2bl", ascending=False).loc[:,["r2", "ratio_r2ub","ratio_r2_vs_r2bl", "accept"]]

In [None]:
top

In [None]:
dep_vars.append("cfg")

In [None]:
dep_vars = dep_vars[:-1]

In [None]:
dep_var

In [None]:
top.head(20)

In [None]:
# Configuration (JSON text) of the first, i.e. best-scoring, row of `top`.
cfg_of_winner = top.reset_index().iloc[0].cfg
print(cfg_of_winner)
# The cfg value is JSON text containing regex metacharacters ({ } .), so it is
# matched literally instead of being interpreted as a regular expression.
data = df.loc[df.cfg.str.contains(cfg_of_winner, regex=False)]

indep_vars = ["num_iterations","iterations_accept", "propagation_abort_threshold"]
grouping_vars = ["num_iterations"]
dep_vars = [ 'accept_new', "reject_new", "accept", "reject","ratio_r2ub", "ratio_r2_vs_r2bl","r2"]
default = ["classtype","ratio_r2ub", "ratio_r2_vs_r2bl", "r2", "ref_summary", "accept", "reject", "total_feedback","num_iterations","run_id"]
# union of all columns needed below, without repetitions
cols = list(set(indep_vars +  grouping_vars + dep_vars + default))
print(cols)

topic1 = data.loc[:,cols]

In [None]:
topic1

In [None]:
for dep in dep_vars:
    for indep in indep_vars:
        for group in grouping_vars:
            # a plot needs three different columns; skip any combination in
            # which two of the roles name the same column
            if len(set([dep, indep, group])) < 3:
                continue
            print("%s %s %s" % (dep, indep, group))
            myplot(topic1, indep, dep, groupby=group, save=False, figsize=(16,4), prefix="feeback-over-time-by-concept-type")

In [None]:
# Rows belonging to the configuration of the second row of `top`.
# The JSON cfg text is matched literally, not as a regular expression.
cfg_of_winner = top.reset_index().iloc[1].cfg
print(cfg_of_winner)
df.loc[df.cfg.str.contains(cfg_of_winner, regex=False), cols]

In [None]:
# Rows belonging to the configuration of the third row of `top`.
# The JSON cfg text is matched literally, not as a regular expression.
cfg_of_winner = top.reset_index().iloc[2].cfg
print(cfg_of_winner)
df.loc[df.cfg.str.contains(cfg_of_winner, regex=False), cols]

In [None]:
# Rows belonging to the configuration of the fourth row of `top`.
# The JSON cfg text is matched literally, not as a regular expression.
cfg_of_winner = top.reset_index().iloc[3].cfg
print(cfg_of_winner)
df.loc[df.cfg.str.contains(cfg_of_winner, regex=False), cols]

In [None]:
# Rows belonging to the configuration of the fifth row of `top`.
# The JSON cfg text is matched literally, not as a regular expression.
cfg_of_winner = top.reset_index().iloc[4].cfg
print(cfg_of_winner)
df.loc[df.cfg.str.contains(cfg_of_winner, regex=False), cols]

Configs with 8 results
====

best configuration as per agreement between all summaries of same config

We group by configuration and then use the average `ratio_r2ub` score as the performance measure.

In [None]:
top = df.groupby("cfg").count().sort_values(dep_var, ascending=False).reset_index().loc[:,["cfg", "accept"]]

In [None]:
top=top.rename(columns={"accept": "num_results"})

In [None]:
pd.merge(df, top, on="cfg")["num_results"]

Block 1: BaselineFeedbackStore
---

This feedback store has no hyperparameters, so the grid search yields only 4 to 8 results (depending on whether one or two topics are used).

In [None]:
selected_classtype = "BaselineFeedbackStore"

In [None]:
data = df.loc[df.classtype.str.contains(selected_classtype)]

In [None]:
#indep_var = [ "multiplier_accept","multiplier_reject", "window_size"]

No independent variables, therefore no configurations to compare against each other.
There should be `4` resp. `8` rows in the dataset

In [None]:
data = df.loc[df.classtype.str.contains(selected_classtype)]

In [None]:
len(data)

In [None]:
data.columns

In [None]:
print(data.loc[:, [u'ref_summary',"classtype","phrase_type","r1","r2","r4",]].to_latex(index=False,float_format='{0:.3f}'.format))

In [None]:
print(data.loc[:, [u'ref_summary',"classtype","phrase_type","ub_r1", 'ub_r2', 'ub_r4',]].to_latex(index=False,float_format='{0:.3f}'.format))

In [None]:
# BaselineFeedbackStore is something special, as it doesnt have parameters:

In [None]:
# Baseline rows reduced to the dependent variable and the reference summary.
delta2 = data.loc[:,[ dep_var, "ref_summary"]]

# overall distribution of the dependent variable
delta2.boxplot(figsize=(10,4), showmeans=True, meanline=True, sym="+", vert=True)

# one box-plot panel per reference summary
delta2.groupby("ref_summary").boxplot(figsize=(16,4), layout=(2,4), fontsize="8", showmeans=True, meanline=True, sym="+", vert=True)
# NOTE(review): plt.savefig writes the current figure only — presumably the grouped
# plot created last, so the overall box plot above is shown but not saved — confirm.
plt.savefig(path.join(basepath, "%s-%s-%s-per-summary-%s.pdf" % ("boxplot", selected_classtype, dep_var, "ref_summary")))

plt.show()
plt.close()
#indep_var = ["ref_summary"]

Block 2: SimpleNgramFeedbackGraph
---

In [None]:
selected_classtype = "SimpleNgramFeedbackGraph"

This feedback store has three parameters.
The following combinations are tested:

In [None]:
indep_var = [ "multiplier_accept","multiplier_reject", "window_size"]

                window_size = [2, 3, 4, 5]
                factor_rejects = [1, 0, 0.05, 0.25, 0.5, 2, 4,8]
                factor_accepts = [1, 0, 0.05, 0.25, 0.5, 2, 4,8]

As a result, with the values listed above there are `4*8*8 = 256` combinations per topic. In total, there should be `256 * 4` resp. `256 * 8` rows in the dataset

In [None]:
data = df.loc[df.classtype.str.contains(selected_classtype)]

In [None]:
len(data)

In [None]:
selected_classtype

In [None]:
# Plot dep_var against every hyper-parameter in three passes: first with bp's
# default grouping, then with grouping="groupby", then with grouping="line".
for grouping_kwargs in ({}, {"grouping": "groupby"}, {"grouping": "line"}):
    for col in indep_var:
        bp(data, col, dep_var, classtype=selected_classtype, **grouping_kwargs)



Block 3: WordEmbeddingGaussianFeedbackGraph
---

In [None]:
selected_classtype = "WordEmbeddingGaussianFeedbackGraph"

In [None]:
# indep_var = [ "mass_accept","mass_reject","type","iterations_accept","iterations_reject", "cutoff_threshold"]
# due to a bug, the cutoff_threshold is NOT stored in the result file, we therefore have to exclude it.
indep_var = [ "mass_accept","mass_reject","type","iterations_accept","iterations_reject"]

                mass_reject = [4.0, 1.0, 0.0, -1.0, -4.0]
                mass_accept = [4.0, 1.0, 0.0, -1.0, -4.0]
                iterations_accept = [16, 128, 1024]
                iterations_reject = [2, 4, 8, 16, 64]
                cut_off_threshold = [0.998, 0.98, 0.9, 0.6, 0.4]

Results in 1875 combinations per topic.

In [None]:
data = df.loc[df.classtype.str.contains(selected_classtype)]

In [None]:
len(data)

In [None]:
# Plot dep_var against every hyper-parameter in three passes: first with bp's
# default grouping, then with grouping="groupby", then with grouping="line".
for grouping_kwargs in ({}, {"grouping": "groupby"}, {"grouping": "line"}):
    for col in indep_var:
        bp(data, col, dep_var, classtype=selected_classtype, **grouping_kwargs)



Block 4: WordEmbeddingRandomWalkDiffusionFeedbackGraph
---

In [None]:
selected_classtype = "WordEmbeddingRandomWalkDiffusionFeedbackGraph"

This feedback store has 6 parameters.
The following combinations are tested:

In [None]:
# Hyper-parameters of the random-walk diffusion store that are varied in the grid search.
indep_var = [
    "mass_reject",
    "mass_accept",
    "iterations_accept",
    "iterations_reject",
    "propagation_abort_threshold",
    "cutoff_threshold",
]

                mass_reject = [4.0, 1.0, 0.0, -1.0, -4.0]
                mass_accept = [4.0, 1.0, 0.0, -1.0, -4.0]
                iterations_accept = [128, 1024, 10000]
                iterations_reject = [64, 200, 5000]
                cut_off_threshold = [0.998, 0.98, 0.9, 0.6, 0.4]
                propagation_abort_threshold = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9]


As a result, there are `5*5*3*3*5*6 = 6750` combinations per topic. In total, there should be `6750 * 4` resp. `6750 * 8` rows in the dataset

In [None]:
data = df.loc[df.classtype.str.contains(selected_classtype)]

In [None]:
len(data)

In [None]:
# Plot dep_var against every hyper-parameter in three passes: first with bp's
# default grouping, then with grouping="groupby", then with grouping="line".
for grouping_kwargs in ({}, {"grouping": "groupby"}, {"grouping": "line"}):
    for col in indep_var:
        bp(data, col, dep_var, classtype=selected_classtype, **grouping_kwargs)



Some overall basic correlation tests
===

In [None]:
df.corr(method="spearman")


In [None]:
df.corr(method="pearson").to_csv(path.join(basepath, "correlation-analysis-pearson-overall.csv"))


In [None]:
df.groupby("classtype").corr(method="pearson").to_csv(path.join(basepath, "correlation-analysis-pearson-per-classtype.csv"))


In [None]:
cols = list(df.columns)

In [None]:
# Remove the score and feedback-count columns from the correlation input.
# Filtering the existing list (instead of taking a set difference) keeps the
# columns in their original order, so the exported table has the same layout
# on every run.
excluded_cols = set(['accept', 'num_iterations', 'r1', 'r2', 'r4', 'reject', 'total_feedback', 'ub_r1', 'ub_r2', 'ub_r4', 'acceptbl', 'rejectbl', 'r2bl', 'delta_r2bl_r2', 'ratio_r2bl_r2', 'delta_r2ub'])
cols = [name for name in cols if name not in excluded_cols]

In [None]:
# Pearson correlation matrix of the remaining columns, computed separately for
# every (classtype, mass_accept) group, written to CSV and displayed.
per_group = df.loc[:, cols].groupby(["classtype", "mass_accept"])
r = per_group.corr(method="pearson")

csv_target = path.join(basepath, "correlation-analysis-pearson-per-classtype-and-mass_accept.csv")
r.to_csv(csv_target)
r

Top result analysis
===

From the best performing summaries, are there certain configurations that appear more often than others?

in the first run, we use the `WordEmbeddingRandomWalkDiffusionFeedbackGraph`

In [None]:
selected_classtype = "WordEmbeddingRandomWalkDiffusionFeedbackGraph"

In [None]:
data = df.loc[df.classtype.str.contains("WordEmbeddingRandomWalkDiffusionFeedbackGraph")]

In [None]:
# Rows of `data` whose ratio_r2ub exceeds 1.0, reduced to the hyper-parameter
# columns plus the score. The mask is built from `data` itself (not from the
# unfiltered `df`), so it has exactly the index of the frame being filtered
# and does not depend on implicit index alignment.
d = data.loc[data.ratio_r2ub > 1.0, ["classtype","cutoff_threshold","iterations_accept","iterations_reject","mass_accept","mass_reject","propagation_abort_threshold", "ref_summary","ratio_r2ub"]]

In [None]:
g = d.groupby("ref_summary")
for grp in g.groups:
    print grp, g.get_group(grp).count()
    
    g.get_group(grp)["propagation_abort_threshold"].hist()
    plt.show()


In [None]:
# Plot dep_var against every other column of the top-result selection.
for col in d.columns:
    # Compare by value: `is not` tests object identity, and whether two equal
    # strings are the same object is an interpreter detail, so dep_var could
    # end up being plotted against itself.
    if col != dep_var:
        bp(d, col, dep_var, grouping="line")

In [None]:
for col in indep_var:
    bp(data, col, dep_var, grouping="line")
    

In [None]:
len(data)

In [None]:
# One boolean row mask per reference summary, passed to scatterplot as `locs`.
summary_ids = ("D30044.M.100.T.B", "D30044.M.100.T.C", "D30044.M.100.T.D", "D30044.M.100.T.G")
scatterplot(data, x="r2", y="accept", locs=[data.ref_summary.str.contains(sid) for sid in summary_ids])

In [None]:
# One boolean row mask per feedback-store classtype.
# NOTE(review): the original listed the RandomWalk mask twice; the Gaussian
# store is assumed to be the intended third group - confirm.
# NOTE(review): `data` was restricted to a single classtype in an earlier cell,
# so the masks for the other classtypes may select nothing; `df` may have been
# intended here - confirm.
classtype_masks = [
    data.classtype.str.contains("SimpleNgramFeedbackGraph"),
    data.classtype.str.contains("WordEmbeddingRandomWalkDiffusionFeedbackGraph"),
    data.classtype.str.contains("WordEmbeddingGaussianFeedbackGraph"),
]
scatterplot(data, x="total_feedback", y="ratio_r2ub", locs=classtype_masks)

In [None]:
df.ref_summary.str.contains("D30044.M.100.T.B")

In [None]:
type(df.ref_summary.str.contains("D30044.M.100.T.D"))


In [None]:
len(df)

In [None]:
data.classtype.str.contains("BaselineFeedbackStore")

In [None]:
data.loc[data.classtype.str.contains("SimpleNgramFeedbackGraph")].describe()

Upper Bound and Baseline
===

In [None]:
last_baseline_iteration_score=baseline_scores.drop_duplicates(["ref_summary","num_iterations", "phrase_type"])

In [None]:
last_baseline_iteration_score.sort_values("num_iterations", ascending=False).drop_duplicates(["ref_summary","phrase_type"]).sort_values("ref_summary")

In [None]:
print last_baseline_iteration_score.sort_values("num_iterations", ascending=False).drop_duplicates(["ref_summary","phrase_type"]).sort_values("ref_summary").to_csv()

In [None]:
ub = df.drop_duplicates(["ref_summary", "phrase_type", "num_iterations"])

In [None]:
ub = ub.sort_values("num_iterations", ascending=False).drop_duplicates(["ref_summary", "phrase_type"])

In [None]:
ub.loc[:, ["ref_summary", "phrase_type",u'ub_r1',"ub_r2","ub_r4",]].sort_values(["ref_summary", "phrase_type"])

In [None]:
ub.loc[:,["ref_summary","r2bl", "r1bl", "r4bl", u'ub_r1',"ub_r2","ub_r4","phrase_type"]].sort_values("ref_summary")

In [None]:
print ub.loc[:,["r2bl", u'ub_r1',"ub_r2","ub_r4","phrase_type","ref_summary"]].sort_values("ref_summary").to_latex()

Independence tests between the different classtypes (i.e. finding p-values)
===

In [None]:
# Order the rows so that the highest num_iterations comes first, then keep the
# first row of every (cfg, classtype, ref_summary, oracle, embeddings,
# phrase_type) combination.
run_key = ["cfg", "classtype", "ref_summary", "oracle", "embeddings", "phrase_type"]
by_iterations = df.sort_values(by="num_iterations", ascending=False)
iid = by_iterations.drop_duplicates(run_key)


In [None]:
iid.classtype.unique()

In [None]:
column = "r2"

In [None]:
# create our samples: one sub-frame of iid per feedback-store classtype
# (all columns are kept; later cells pick the column they need)
bl = iid[iid["classtype"].str.contains(u'BaselineFeedbackStore')]
snfg = iid[iid["classtype"].str.contains("SimpleNgramFeedbackGraph")]
gf = iid[iid["classtype"].str.contains("WordEmbeddingGaussianFeedbackGraph")]
rw = iid[iid["classtype"].str.contains("WordEmbeddingRandomWalkDiffusionFeedbackGraph")]


In [None]:
bl.loc[:,column].count()

In [None]:
iid.loc[:, ["classtype", "scaled_r2","r2"]].groupby("classtype")[column].agg([stats.normaltest, stats.skewtest, stats.kurtosistest])

In [None]:
# Mean of the selected score column per feedback store. `bl`, `snfg`, `gf` and
# `rw` are whole DataFrames, so a bare .mean() would report every column
# instead of the one under test.
print("bl mean:   %s" % (bl[column].mean(),))
print("snfg mean: %s" % (snfg[column].mean(),))
print("gf mean:   %s" % (gf[column].mean(),))
print("rw mean:   %s" % (rw[column].mean(),))


In [None]:
# fitting the distributions into normals
# The test is applied to the selected score column only: the samples are whole
# DataFrames that also hold string columns such as classtype and ref_summary.
print(stats.normaltest(bl[column]))
print(stats.normaltest(snfg[column]))
print(stats.normaltest(gf[column]))
print(stats.normaltest(rw[column]))


BL vs SNFG
---

In [None]:
# Compare the selected score column only: `bl` and `snfg` are whole DataFrames
# that also hold string columns, which the t-test cannot work on.
bl_sample = bl[column]
vs_sample = snfg[column]
print("bl mean: %s" % (bl_sample.mean(),))
print("vs mean: %s" % (vs_sample.mean(),))

print("diff snfg mean: %s" % (vs_sample.mean() - bl_sample.mean(),))
print("welch %s" % (ttest_ind(bl_sample, vs_sample, equal_var=False),))
# Welch-Satterthwaite degrees of freedom, matching equal_var=False above
# (n1 + n2 - 1 is the df of neither the Welch nor the pooled t-test).
se2_bl = bl_sample.var() / bl_sample.count()
se2_vs = vs_sample.var() / vs_sample.count()
print("df %s" % ((se2_bl + se2_vs) ** 2 / (se2_bl ** 2 / (bl_sample.count() - 1) + se2_vs ** 2 / (vs_sample.count() - 1)),))


BL vs WEGFG 
---

In [None]:
# Compare the selected score column only: `bl` and `gf` are whole DataFrames
# that also hold string columns, which the t-test cannot work on.
bl_sample = bl[column]
vs_sample = gf[column]
print("bl mean: %s" % (bl_sample.mean(),))
print("vs mean: %s" % (vs_sample.mean(),))
# Difference reported as store minus baseline, the same sign convention as the
# SNFG comparison and the summary of differences further down.
print("diff gf mean: %s" % (vs_sample.mean() - bl_sample.mean(),))
print("welch %s" % (ttest_ind(bl_sample, vs_sample, equal_var=False),))
# Welch-Satterthwaite degrees of freedom, matching equal_var=False above
# (n1 + n2 - 1 is the df of neither the Welch nor the pooled t-test).
se2_bl = bl_sample.var() / bl_sample.count()
se2_vs = vs_sample.var() / vs_sample.count()
print("df %s" % ((se2_bl + se2_vs) ** 2 / (se2_bl ** 2 / (bl_sample.count() - 1) + se2_vs ** 2 / (vs_sample.count() - 1)),))


BL vs WERWFG 
---

In [None]:
# Compare the selected score column only: `bl` and `rw` are whole DataFrames
# that also hold string columns, which the t-test cannot work on.
bl_sample = bl[column]
vs_sample = rw[column]
print("bl mean: %s" % (bl_sample.mean(),))
print("vs mean: %s" % (vs_sample.mean(),))

# Label corrected (was "snfg"); difference reported as store minus baseline,
# the same sign convention as the SNFG comparison and the summary further down.
print("diff rw mean: %s" % (vs_sample.mean() - bl_sample.mean(),))
print("welch %s" % (ttest_ind(bl_sample, vs_sample, equal_var=False),))
# Welch-Satterthwaite degrees of freedom, matching equal_var=False above
# (n1 + n2 - 1 is the df of neither the Welch nor the pooled t-test).
se2_bl = bl_sample.var() / bl_sample.count()
se2_vs = vs_sample.var() / vs_sample.count()
print("df %s" % ((se2_bl + se2_vs) ** 2 / (se2_bl ** 2 / (bl_sample.count() - 1) + se2_vs ** 2 / (vs_sample.count() - 1)),))


Different RW cutoff-thresholds vs BL
---

In [None]:
def inditest(a, b):
    """Print a Welch two-sample comparison of the samples ``a`` and ``b``.

    Prints, one per line: the mean of ``a``, the mean of ``b``, their
    difference (``a - b``), the degrees of freedom of the Welch test and the
    result of ``ttest_ind(a, b, equal_var=False)``.

    Parameters
    ----------
    a, b : objects offering ``mean()``, ``var()`` and ``count()``
        (the notebook passes pandas Series).

    Returns
    -------
    None; the results are only printed.
    """
    mean_a, mean_b = a.mean(), b.mean()
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("mean a %s" % (mean_a,))
    print("mean b %s" % (mean_b,))
    print("a - b %s" % (mean_a - mean_b,))

    # Welch-Satterthwaite degrees of freedom, the ones that belong to the
    # unequal-variance test below. The former n_a + n_b - 1 is the df of
    # neither the Welch nor the pooled t-test.
    n_a, n_b = a.count(), b.count()
    se2_a = a.var() / float(n_a)
    se2_b = b.var() / float(n_b)
    welch_df = (se2_a + se2_b) ** 2 / (se2_a ** 2 / (n_a - 1) + se2_b ** 2 / (n_b - 1))
    print("df  %s" % (welch_df,))

    # Wrapped in a 1-tuple so the result object is formatted as a whole.
    print("welch %s" % (ttest_ind(a, b, equal_var=False),))

In [None]:
rw.cutoff_threshold.unique()

In [None]:
rw.phrase_type.unique()

In [None]:
rw.query('cutoff_threshold == 0.4 & phrase_type == "None"').groupby("cfg").mean()
#rw.loc[rw.cutoff_threshold ==0.4  & rw.phrase_type == u'None', column]


In [None]:
# For phrase_type "parse" and each cutoff_threshold: average the rows of every
# cfg and keep the selected score column.
parse_query = 'cutoff_threshold == %s & phrase_type == "parse"'
co04, co0998, co09, co06 = [
    rw.query(parse_query % threshold).groupby("cfg").mean().loc[:, column]
    for threshold in ("0.4", "0.998", "0.9", "0.6")
]



In [None]:
inditest(bl.loc[:,column], co04)

In [None]:
inditest(bl.loc[:,column], co06)

In [None]:
inditest(bl.loc[:,column], co09)

In [None]:
inditest(bl.loc[:,column], co0998)

BL vs WERWFG (phrase_type None)
===

In [None]:
# Score column of the Baseline rows (a) and the RandomWalk rows (b) whose
# phrase_type contains "None". Both conditions are combined into one mask on
# `df`, instead of applying a second `df`-indexed mask to an already filtered
# frame and relying on implicit index alignment.
phrase_none = df.phrase_type.str.contains("None")
a = df.loc[df.classtype.str.contains("Baseline") & phrase_none, column]
b = df.loc[df.classtype.str.contains("RandomWalk") & phrase_none, column]

In [None]:
print "a mean:", a.mean()
print "b mean:", b.mean()

In [None]:
# Label corrected: the samples compared here are `a` and `b`, not snfg.
print("diff a - b mean: %s" % (a.mean() - b.mean(),))
# Welch-Satterthwaite degrees of freedom, matching equal_var=False below
# (n1 + n2 - 1 is the df of neither the Welch nor the pooled t-test).
se2_a = a.var() / a.count()
se2_b = b.var() / b.count()
print("df %s" % ((se2_a + se2_b) ** 2 / (se2_a ** 2 / (a.count() - 1) + se2_b ** 2 / (b.count() - 1)),))
print("welch %s" % (ttest_ind(a, b, equal_var=False),))

In [None]:
# Baseline mean of the selected score column, followed by each feedback
# store's mean minus that baseline mean. The samples are whole DataFrames, so
# the column has to be picked explicitly.
bl_mean = bl[column].mean()
print("diff bl mean:   %s" % (bl_mean,))
print("diff snfg mean: %s" % (snfg[column].mean() - bl_mean,))
print("diff gf mean:   %s" % (gf[column].mean() - bl_mean,))
print("diff rw mean:   %s" % (rw[column].mean() - bl_mean,))

In [None]:
# Variance of the selected score column per feedback store (the samples are
# whole DataFrames, so the column is picked explicitly).
print("bl var:   %s" % (bl[column].var(),))
print("snfg var: %s" % (snfg[column].var(),))
print("gf var:   %s" % (gf[column].var(),))
print("rw var:   %s" % (rw[column].var(),))


In [None]:
# Standard deviation of the selected score column per feedback store (the
# samples are whole DataFrames, so the column is picked explicitly).
print("bl std:   %s" % (bl[column].std(),))
print("snfg std: %s" % (snfg[column].std(),))
print("gf std:   %s" % (gf[column].std(),))
print("rw std:   %s" % (rw[column].std(),))


In [None]:
# testing skewness
# Applied to the selected score column only: the samples are whole DataFrames
# that also hold string columns such as classtype and ref_summary.
print(stats.skewtest(bl[column]))
print(stats.skewtest(snfg[column]))
print(stats.skewtest(gf[column]))
print(stats.skewtest(rw[column]))


In [None]:
# testing kurtosistest
# Applied to the selected score column only: the samples are whole DataFrames
# that also hold string columns such as classtype and ref_summary.
print(stats.kurtosistest(bl[column]))
print(stats.kurtosistest(snfg[column]))
print(stats.kurtosistest(gf[column]))
print(stats.kurtosistest(rw[column]))


In [None]:
target_class = "BaselineFeedbackStore"
# Sanity check: baseline against itself. All three tests are applied to the
# selected score column, not to the whole DataFrame with its string columns.
print(ttest_ind(bl[column], bl[column], equal_var=False))
print(stats.f_oneway(bl[column], bl[column]))
print(stats.kruskal(bl[column], bl[column]))

In [None]:
target_class = "SimpleNgramFeedbackGraph"
# Welch t-test, one-way ANOVA and Kruskal-Wallis on the selected score column
# (the samples themselves are whole DataFrames that include string columns).
print(ttest_ind(bl[column], snfg[column], equal_var=False))
print(stats.f_oneway(bl[column], snfg[column]))
print(stats.kruskal(bl[column], snfg[column]))

In [None]:
target_class = "WordEmbeddingRandomWalkDiffusionFeedbackGraph"
# Welch t-test, one-way ANOVA and Kruskal-Wallis on the selected score column
# (the samples themselves are whole DataFrames that include string columns).
print(ttest_ind(bl[column], rw[column], equal_var=False))
print(stats.f_oneway(bl[column], rw[column]))
print(stats.kruskal(bl[column], rw[column]))

In [None]:
target_class = "WordEmbeddingGaussianFeedbackGraph"
# Welch t-test, one-way ANOVA and Kruskal-Wallis on the selected score column
# (the samples themselves are whole DataFrames that include string columns).
print(ttest_ind(bl[column], gf[column], equal_var=False))
print(stats.f_oneway(bl[column], gf[column]))
print(stats.kruskal(bl[column], gf[column]))

In [None]:
stats.f_oneway(iid.loc[iid.classtype.str.contains(u'BaselineFeedbackStore')].loc[:,column], iid.loc[iid.classtype.str.contains(target_class)].loc[:,column])

In [None]:
iid.loc[:,["classtype", "ref_summary", "r2"]].groupby("ref_summary").hist(by=["classtype"], figsize=(10,2))

In [None]:
iid.loc[:,["classtype", "ref_summary", "r2"]].groupby(["classtype"])["r2"].plot.kde(legend =True)

plt.show()

In [None]:
# For every feedback store: density estimate of r2, one curve per reference summary.
for store in ["WordEmbeddingGaussianFeedbackGraph","BaselineFeedbackStore","SimpleNgramFeedbackGraph","WordEmbeddingRandomWalkDiffusionFeedbackGraph"]:
    print(store)
    rows_of_store = iid.loc[iid.classtype.str.contains(store), ["classtype", "ref_summary", "r2"]]
    rows_of_store.groupby(["ref_summary"])["r2"].plot.kde(legend=True)
    plt.show()

In [None]:
# For every reference summary: density estimate of r2, one curve per classtype.
# (The loop variable keeps the name `store` although it holds a summary id.)
for store in ["D31043.M.100.T.D","D31043.M.100.T.F","D31043.M.100.T.G","D31043.M.100.T.H"]:
    print(store)
    rows_of_summary = iid.loc[iid.ref_summary.str.contains(store), ["classtype", "ref_summary", "r2"]]
    rows_of_summary.groupby(["classtype"])["r2"].plot.kde(legend=True, alpha=0.5)
    plt.show()

Baseline summary score:
===

In [None]:
# Number of non-null r2bl values per reference summary. The original line
# opened `df.loc[` without ever closing it, which made the cell a SyntaxError.
df.groupby("ref_summary")["r2bl"].count()

In [None]:
xy = df.query('phrase_type == "parse"').groupby(["ref_summary", "oracle", "phrase_type"])

In [None]:
xy.mean().loc[:,["r2bl"]]

In [None]:
df.query('phrase_type == "parse"').loc[:,["r2bl"]]

In [None]:
xy.count()

In [None]:
xy = df.query('phrase_type == "parse"')

In [None]:
len(xy)

In [None]:
xy.r2bl.unique()


In [None]:
xy.groupby("r2bl").count()

In [None]:
baselines

In [None]:
baseline_scores.loc[baseline_scores.phrase_type.str.contains("None")]