Grid Search Analysis
===

Compares the grid search results for each dataset and reports the best-scoring configuration.


In [20]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from matplotlib import cm


import json
import codecs
import os

In [21]:
# Root directory whose sub-directories contain the result-*.json files.
# Raw string: in a normal literal "\U" starts a unicode escape (a SyntaxError
# on Python 3) and the remaining backslashes only survive because they do not
# happen to form a recognised escape sequence.
basepath = os.path.normpath(r"C:\Users\hatieke\.ukpsummarizer\scores_grid")

In [22]:
# Names of the immediate sub-directories of basepath.  os.listdir() returns
# entries in arbitrary order, so sort them to keep positional picks such as
# dirs[1] reproducible between runs and machines.
dirs = sorted(f for f in os.listdir(basepath) if os.path.isdir(os.path.join(basepath, f)))

In [23]:
# Show the sub-directory names found under basepath.
dirs

['5701303a1d18a8bc2b9fa4abe9bb2c6f5a65105053e1febf6e759e6c',
 '9623ef172d38f5006cb0633b0fcbbbdcbc233ec98b38fbad9db02eef',
 'c09db1f934b431bde5e263688426ae2e8ff075319cb228fd7dab0c4f']

In [24]:
# Choose the sub-directory to analyse by its position in dirs.
selected = dirs[1]

In [25]:
def parse_dir(dir):
    """Load every non-empty ``result-*.json`` file of one result directory.

    Args:
        dir: Name of a directory below the module-level ``basepath``.  (The
            name shadows the ``dir`` builtin; it is kept so that keyword
            callers keep working.)

    Returns:
        A list with the decoded JSON content of each matching file, ordered
        by file name.  Zero-byte files are skipped.
    """
    p = os.path.join(basepath, dir)
    result_jsons = []
    # os.listdir() order is arbitrary; sorting keeps the order of the returned
    # results (and any positional lookup built on it) stable between runs.
    result_files = sorted(
        f for f in os.listdir(p)
        if f.startswith("result-") and f.endswith(".json")
    )
    for f in result_files:
        fn = os.path.join(p, f)
        # Zero-byte files cannot be parsed as JSON, so leave them out.
        if os.path.getsize(fn) == 0:
            continue
        # Read bytes and decode explicitly: a text-mode open() would fall back
        # to the platform's locale encoding instead of UTF-8.
        with open(fn, "rb") as fp:
            result_jsons.append(json.loads(fp.read().decode("utf-8")))
    return result_jsons

In [38]:
def parse_single_result_into_dataframe(obj, iteration=11):
    """Flatten one parsed result object into a flat record (one table row).

    Args:
        obj: Decoded result dictionary.  The keys read here are
            ``config_feedbackstore``, ``result_rougescores``,
            ``log_feedbacks``, ``models`` and, for the error message only,
            ``config_run_id``.
        iteration: Iteration bound.  Score entries with
            ``entry["iteration"] <= iteration`` are candidates; feedback
            entries are counted when ``entry["iteration"] < iteration``.

    Returns:
        A dict with the feedback counts, the selected ROUGE ``R score``
        values, the raw config (``cfg``) and the individual config settings.
        Settings missing from the config are reported as ``None``.

    Raises:
        ValueError: If no score entry can be selected for ``iteration``.
            (Previously a bare ``BaseException``; ``ValueError`` is still
            caught by handlers written for that.)
    """
    config = obj[u'config_feedbackstore']
    try:
        candidates = [i for i in obj["result_rougescores"] if i["iteration"] <= iteration]
        # NOTE(review): ``[:-1][0]`` drops the last candidate and returns the
        # FIRST remaining one, so at least two candidates are required.  If
        # the intent was "latest score up to ``iteration``" this should
        # probably be ``[-1]`` -- confirm the ordering of result_rougescores
        # before changing it; the selection is deliberately left as it was.
        res = candidates[:-1][0]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures mean "no usable score entry"; anything else
        # (e.g. KeyboardInterrupt) must propagate untouched.
        raise ValueError("unknown iteration %s" % (obj["config_run_id"]))

    # Feedback strictly before the iteration bound, split into accepted and
    # everything else.
    total_accept = sum(1 for i in obj[u'log_feedbacks']
                       if i["value"] == 'accept' and i["iteration"] < iteration)
    total_reject = sum(1 for i in obj[u'log_feedbacks']
                       if i["value"] != 'accept' and i["iteration"] < iteration)

    return {
        "accept": total_accept,
        "reject": total_reject,
        "total_feedback": total_accept + total_reject,
        "ref_summary": str([item["name"] for item in obj[u'models']]),
        "cfg": config,
        "num_iterations": res["iteration"],
        "r1": res[u'ROUGE-1 R score'],
        "r2": res[u'ROUGE-2 R score'],
        # Stored under "r4" although it is read from the ROUGE-SU* entry.
        "r4": res[u'ROUGE-SU* R score'],
        "classtype": config.get(u'type'),
        "iterations_accept": config.get(u'iterations_accept'),
        "iterations_reject": config.get(u'iterations_reject'),
        "propagation_abort_threshold": config.get(u'propagation_abort_threshold'),
        "mass_accept": config.get(u'mass_accept'),
        "mass_reject": config.get(u'mass_reject'),
        "window_size": config.get(u'N'),
        "multiplier_reject": config.get(u"multiplier_reject"),
        "multiplier_accept": config.get(u"multiplier_accept"),
        "cutoff_threshold": config.get(u"cut_off_threshold"),
    }


In [27]:
#parse_single_result_into_dataframe(first, iteration=10)

In [None]:
# Turn every parsed result of the selected directory into one flat record.
items = []
for raw_result in parse_dir(selected):
    items.append(parse_single_result_into_dataframe(raw_result, iteration=11))

In [None]:
# Number of records built from the selected directory.
len(items)

In [None]:
#items[0]

In [None]:
# One row per record in items, one column per record key.
df = pd.DataFrame(items)

In [None]:
# Export the table as CSV.
# NOTE(review): absolute, machine-specific output path.
df.to_csv("C:\\Users\\hatieke\\.ukpsummarizer\\tmp\\grid.csv")

In [None]:
# Lowest r2 value over all rows of df.
df["r2"].min()

In [None]:
# Display top_score.
# NOTE(review): top_score is not assigned in any cell visible in this
# notebook, so this raises NameError on a fresh run.  Presumably it is a
# selection of the best-scoring rows of df -- the defining cell needs to be
# restored.
top_score

In [None]:
# Display top_score again (same expression as the preceding cell).
top_score

In [None]:
# Keep only the "cfg" entry of top_score and display it.
v = top_score["cfg"]
v

In [None]:
# Pull the "mass_reject" setting out of each config.  The cfg values are the
# plain dicts produced by json.load(), so they need key access; attribute
# access (x.mass_reject) raises AttributeError on a dict.  .get() yields None
# when the setting is absent.
v.apply(lambda x: x.get("mass_reject"))

In [None]:
# Inspect the single row at position 64.
# NOTE(review): the position is hard-coded; which result it refers to depends
# on the row order of df.
df.iloc[64]

In [None]:
# Column-wise maximum for each distinct ref_summary value.
# NOTE(review): every column is maximised independently, so a resulting row
# can combine values that come from different original rows.
df.groupby("ref_summary").max()

In [None]:
# Summary statistics of r2 for every (ref_summary, classtype) combination.
df.groupby(["ref_summary","classtype"])["r2"].describe()