In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from matplotlib import cm


import json
import codecs
import os

In [2]:
# Directory holding the per-run "result-*.json" score files.
# Raw string: the backslashes are path separators, not escapes. Without the
# r-prefix, "\U" is an invalid unicode escape and a SyntaxError on Python 3.
basepath = os.path.normpath(r"C:\Users\hatieke\.ukpsummarizer\scores_new")
# Sorted because os.listdir() returns entries in arbitrary order; sorting keeps
# the load order (and therefore result_jsons[0]) stable between runs.
result_files = sorted(
    f for f in os.listdir(basepath)
    if f.startswith("result-") and f.endswith(".json")
)

In [3]:
# Parse every non-empty result file into a Python object, keeping file order.
result_jsons = []
for f in result_files:
    fn = os.path.join(basepath, f)
    # Zero-byte files are skipped: an empty document is not valid JSON and
    # json.load() would raise on it.
    if os.path.getsize(fn) == 0:
        continue
    # Binary mode lets the json module do the decoding itself instead of
    # relying on the platform's locale encoding that a text-mode open() applies.
    with open(fn, "rb") as fp:
        result_jsons.append(json.load(fp))

In [4]:
# Inspect the first parsed result to see what a result file contains.
result_jsons[0]

{u'config_concept_recommendation': {},
 u'config_feedback_interpretation': {},
 u'config_feedbackstore': {u'iterations_accept': 1000,
  u'iterations_reject': 100,
  u'mass_accept': -1.0,
  u'mass_reject': 1.0,
  u'propagation_mode': u'ABSORBING_RANDOM_WALK',
  u'type': u'WordEmbeddingFeedbackGraph',
 u'config_oracle_type': u'active_learning',
 u'config_parse_type': u'None',
 u'config_run_id': u'014fc5cb0dd4d40bcefdce36e775169897f2aeb9e0e9aad12f2cb291',
 u'config_summarizer_type': u'PROPAGATION',
 u'config_wordembeddings': u'glove.6B.300d',
 u'dataset': u'DUC2004',
 u'log_feedbacks': [{u'concept': u'the hous',
   u'iteration': 0,
   u'value': u'accept'},
  {u'concept': u'to follow', u'iteration': 0, u'value': u'accept'},
  {u'concept': u'follow his', u'iteration': 0, u'value': u'accept'},
  {u'concept': u'his exampl', u'iteration': 0, u'value': u'accept'},
  {u'concept': u'congress in', u'iteration': 0, u'value': u'reject'},
  {u'concept': u'year but', u'iteration': 0, u'value': u'rejec

In [5]:
def parse_single_result_into_dataframe(obj):
    """Build a DataFrame of ROUGE scores from one parsed result object.

    The rows are taken from ``obj['result_rougescores']``. Three delta columns
    are appended; each is the matching value from ``obj['model_rougescores']``
    minus the row's own score:

    * ``r1-delta`` from ``ROUGE-1 R score``
    * ``r2-delta`` from ``ROUGE-2 R score``
    * ``r4-delta`` from ``ROUGE-SU* R score``

    Finally ``topic``, ``dataset`` and ``run_id`` (read from
    ``obj['config_run_id']``) are added as constant columns.

    :param obj: mapping with the keys ``result_rougescores``,
        ``model_rougescores``, ``topic``, ``dataset`` and ``config_run_id``.
    :return: the assembled ``pandas.DataFrame``.
    """
    scores = pd.DataFrame(obj[u'result_rougescores'])
    # Wrapped in a list so the model scores form a one-row frame.
    reference = pd.DataFrame([obj[u'model_rougescores']])

    # (new delta column, score column it is derived from)
    delta_sources = (
        ("r1-delta", "ROUGE-1 R score"),
        ("r2-delta", "ROUGE-2 R score"),
        ("r4-delta", "ROUGE-SU* R score"),
    )
    for delta_name, score_name in delta_sources:
        scores[delta_name] = max(reference[score_name]) - scores[score_name]

    # (new column, key in obj) -- the same value is repeated on every row.
    constant_sources = (
        ("topic", u'topic'),
        ("dataset", u'dataset'),
        ("run_id", u'config_run_id'),
    )
    for column_name, key in constant_sources:
        scores[column_name] = obj[key]

    return scores

In [6]:
# One DataFrame per parsed result, in the same order as result_jsons.
chunks = list(map(parse_single_result_into_dataframe, result_jsons))

In [7]:
# Stack the per-result frames into one table; ignore_index=True discards the
# per-frame row labels and renumbers the rows consecutively.
df = pd.concat(chunks, ignore_index=True)

In [8]:
# Show which columns the combined frame ended up with.
df.columns

Index([  u'ROUGE-1 R score',   u'ROUGE-2 R score', u'ROUGE-SU* R score',
            u'accept_count',          u'accepted',         u'iteration',
            u'reject_count',          u'rejected',           u'summary',
                u'r1-delta',          u'r2-delta',          u'r4-delta',
                   u'topic',           u'dataset',            u'run_id'],
      dtype='object')

In [9]:
# Number of rows in the combined frame.
len(df)

2927

In [None]:
# Keep the score/count columns plus the identifiers; the free-text columns are dropped.
data = df.loc[:,['ROUGE-1 R score','ROUGE-2 R score', 'ROUGE-SU* R score','accept_count', 'iteration','reject_count','r1-delta','r2-delta','r4-delta','topic','dataset','run_id']]
# pivot() raises on repeated (iteration, run_id) pairs, so when a run has
# several rows for the same iteration only the last one is kept.
filtered = data.drop_duplicates(['iteration','run_id'], keep="last")

# Wide layout: one row per iteration, one column per run.
piv = filtered.pivot(index="iteration", columns="run_id", values="r2-delta")

print(piv.head(2))
# Titled like the r4-delta figure further down so the plot explains itself.
piv.plot(figsize=(20,6), title="upper bound r2 - iteration r2")

plt.show()

run_id     014fc5cb0dd4d40bcefdce36e775169897f2aeb9e0e9aad12f2cb291  \
iteration                                                             
0                                                       0.21          
1                                                       0.23          

run_id     025c19c387e405310dce9421549d84d8a7fe02fc59e224a1f8dc733e  \
iteration                                                             
0                                                    0.10784          
1                                                    0.10784          

run_id     04fc6e13dfde2454789e87b3024070543eaef11d28ee52451eba2942  \
iteration                                                             
0                                                    0.10680          
1                                                    0.11651          

run_id     06c8a17924572744dc2f5818cc134eaf1359d794c27e2b34bd0f77d8  \
iteration                                                             
0  

No. of accept feedbacks on the x-axis
===

In [None]:
# Preview the first three rows of the combined frame.
df.head(3)

Absolute scores
===

In [None]:
# Degenerate "gradient": both endpoints are the same black with alpha 0.1, so
# every line drawn with this colormap gets that single translucent colour.
faint_black = (0, 0, 0, 0.1)
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("meh", [faint_black, faint_black])

In [None]:
column = "ROUGE-1 R score"

# pivot() needs unique (iteration, run_id) pairs, so keep the last row of each pair.
c = df[[column, "iteration", "run_id"]].drop_duplicates(
    subset=["iteration", "run_id"], keep="last"
)
# One row per iteration, one column (= one plotted line) per run.
p = c.pivot(index="iteration", columns="run_id", values=column)

# All lines share one colour and there is no legend, so the title is the only
# thing that identifies which score this figure shows.
p.plot(figsize=(20,6), colormap=cmap, legend=False, title=column)

plt.show()

In [None]:
column = "ROUGE-2 R score"

# pivot() needs unique (iteration, run_id) pairs, so keep the last row of each pair.
c = df[[column, "iteration", "run_id"]].drop_duplicates(
    subset=["iteration", "run_id"], keep="last"
)
# One row per iteration, one column (= one plotted line) per run.
p = c.pivot(index="iteration", columns="run_id", values=column)

# All lines share one colour and there is no legend, so the title is the only
# thing that identifies which score this figure shows.
p.plot(figsize=(20,6), colormap=cmap, legend=False, title=column)
plt.show()

In [None]:
column = "ROUGE-SU* R score"

# pivot() needs unique (iteration, run_id) pairs, so keep the last row of each pair.
c = df[[column, "iteration", "run_id"]].drop_duplicates(
    subset=["iteration", "run_id"], keep="last"
)
# One row per iteration, one column (= one plotted line) per run.
p = c.pivot(index="iteration", columns="run_id", values=column)

# All lines share one colour and there is no legend, so the title is the only
# thing that identifies which score this figure shows.
p.plot(figsize=(20,6), colormap=cmap, legend=False, title=column)
plt.show()

Delta scores
===
The actual ROUGE score has been subtracted from the upper-bound ROUGE score, so low values are better than high values. Values below 0.0 are especially good: they mean the summary scores better than the actual upper-bound summary.

In [None]:
# pivot() needs unique (iteration, run_id) pairs, so keep the last row of each pair.
c = df[["r1-delta", "iteration", "run_id"]].drop_duplicates(
    subset=["iteration", "run_id"], keep="last"
)
# One row per iteration, one column (= one plotted line) per run.
p = c.pivot(index="iteration", columns="run_id", values="r1-delta")

# Titled in the same way as the r4-delta figure; without a legend the title is
# the only label this figure has.
p.plot(figsize=(20,6), colormap=cmap, legend=False, title="upper bound r1 - iteration r1")

plt.show()

In [None]:
# pivot() needs unique (iteration, run_id) pairs, so keep the last row of each pair.
c = df[["r2-delta", "iteration", "run_id"]].drop_duplicates(
    subset=["iteration", "run_id"], keep="last"
)
# One row per iteration, one column (= one plotted line) per run.
p = c.pivot(index="iteration", columns="run_id", values="r2-delta")

# Titled in the same way as the r4-delta figure; without a legend the title is
# the only label this figure has.
p.plot(figsize=(20,6), colormap=cmap, legend=False, title="upper bound r2 - iteration r2")
plt.show()

In [None]:
# Deduplicate first: pivot() rejects repeated (iteration, run_id) pairs, so only
# the last row of each pair is kept.
c = df[["r4-delta", "iteration", "run_id"]].drop_duplicates(
    subset=["iteration", "run_id"], keep="last"
)
# Wide layout: iterations down the rows, one column per run.
p = c.pivot(index="iteration", columns="run_id", values="r4-delta")

p.plot(
    figsize=(20,6),
    colormap=cmap,
    legend=False,
    title="upper bound r4 - iteration r4",
)
plt.show()

accept counts
===

In [None]:
# pivot() needs unique (iteration, run_id) pairs, so keep the last row of each pair.
c = df[["accept_count", "iteration", "run_id"]].drop_duplicates(
    subset=["iteration", "run_id"], keep="last"
)
# One row per iteration, one column (= one plotted line) per run.
p = c.pivot(index="iteration", columns="run_id", values="accept_count")

# expanding().sum() turns the per-iteration values into a running total per run.
# The title is needed because the figure has no legend and a single line colour.
p.expanding().sum().plot(figsize=(20,6), colormap=cmap, legend=False, title="cumulative accept_count")
plt.show()

In [None]:
# pivot() needs unique (iteration, run_id) pairs, so keep the last row of each pair.
c = df[["reject_count", "iteration", "run_id"]].drop_duplicates(
    subset=["iteration", "run_id"], keep="last"
)
# One row per iteration, one column (= one plotted line) per run.
p = c.pivot(index="iteration", columns="run_id", values="reject_count")

# expanding().sum() turns the per-iteration values into a running total per run.
# The title is needed because the figure has no legend and a single line colour.
p.expanding().sum().plot(figsize=(20,6), colormap=cmap, legend=False, title="cumulative reject_count")
plt.show()

In [None]:
# pivot() needs unique (iteration, run_id) pairs, so keep the last row of each pair.
c = df[["r1-delta", "iteration", "run_id"]].drop_duplicates(
    subset=["iteration", "run_id"], keep="last"
)
# One row per iteration, one column (= one plotted line) per run.
p = c.pivot(index="iteration", columns="run_id", values="r1-delta")

# expanding().sum() turns the per-iteration deltas into a running total per run.
# The title is needed because the figure has no legend and a single line colour.
p.expanding().sum().plot(figsize=(20,6), colormap=cmap, legend=False, title="cumulative r1-delta")
plt.show()

In [None]:
# Number of pivot columns, i.e. how many distinct run_ids are in p.
len(p.columns)

In [None]:
# The run_ids that form the columns of p.
p.columns