In [1]:
from evaluate import *

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
def compare_wrapper(fp1, fp2, unique=True, write=True, union_gold=False):
    """
    Compare two prediction files and optionally save the comparison metrics.

    Both files are loaded with `read_results` and passed to
    `compare_predictions`. When `write` is true, the resulting metrics are
    handed to `write_eval_results` and the output path is printed.

    Args:
        fp1: Path of the first prediction file.
        fp2: Path of the second prediction file.
        unique: Forwarded unchanged to `compare_predictions`.
        write: If true, write the metrics to disk.
        union_gold: Forwarded unchanged to `compare_predictions`.

    The output path is
    `<fp1 without trailing ".json">_<file name of fp2 without trailing ".json">_eval.json`,
    e.g. `a.json` and `b.json` give `a_b_eval.json`. It is written next to `fp1`.
    """
    import os  # local import: keeps this cell independent of other cells' imports

    def strip_json(path):
        # Drop only a trailing ".json"; any other path is returned unchanged.
        return path[: -len(".json")] if path.endswith(".json") else path

    # Read data from disk.
    # Use the function arguments here, not the notebook-level FILE1/FILE2
    # globals: relying on those fails on a fresh kernel and silently compares
    # stale files when the globals and the arguments disagree.
    print(f"Reading predictions from {fp1}")
    results1 = read_results(fp1)
    print(f"Reading predictions from {fp2}")
    results2 = read_results(fp2)

    # Only the file name of fp2 is used, so a directory component in fp2
    # (e.g. "./") cannot end up in the middle of the output path. The fixed
    # "_eval.json" suffix guarantees the output never equals an input path.
    output_path = f"{strip_json(fp1)}_{strip_json(os.path.basename(fp2))}_eval.json"

    # Compute metrics
    eval_results = compare_predictions(results1, results2, unique=unique, union_gold=union_gold)

    if write:
        # Write results to disk
        write_eval_results(output_path, eval_results)
        print(f"\nSaved results at {output_path}\n")

In [3]:
# Read prediction files and run evaluation
import glob

MODE = "single"  # single / multi

# File-name prefix of the prediction files for each mode.
PATH_PREFIX = {
    "single": "singlehop",
    "multi": "predictions",
}

# glob.glob returns matches in arbitrary order, so sort them to evaluate (and
# log) the runs in a stable order. Only ".json" files are matched: for any
# other name, fp.replace(".json", "_eval.json") would return fp itself and the
# evaluation output would overwrite the prediction file.
for fp in sorted(glob.glob(f'./{PATH_PREFIX[MODE]}_*.json')):
    # Skip run metadata and previously written evaluation files.
    if "meta" in fp or "eval" in fp:
        continue

    # Read data from disk
    print(f"Reading predictions from {fp}")
    results = read_results(fp)
    OUTPUT_PATH = fp.replace(".json", "_eval.json")

    # Compute metrics
    eval_results = compute_metrics(results)

    # Write results to disk
    write_eval_results(OUTPUT_PATH, eval_results)
    print(f"\nSaved results at {OUTPUT_PATH}\n")

Reading predictions from ./singlehop_1648033166.json
{
  "n_questions": 5918,
  "em": 17.27,
  "substr_gp": 22.05,
  "substr_pg": 25.18,
  "substr2": 29.4,
  "overlap": 32.45
}

Saved results at ./singlehop_1648033166_eval.json

Reading predictions from ./singlehop_1648036353.json
{
  "n_questions": 5918,
  "em": 20.95,
  "substr_gp": 25.8,
  "substr_pg": 30.36,
  "substr2": 34.52,
  "overlap": 38.33
}

Saved results at ./singlehop_1648036353_eval.json

Reading predictions from ./singlehop_1648041365.json
{
  "n_questions": 5918,
  "em": 30.52,
  "substr_gp": 35.96,
  "substr_pg": 45.2,
  "substr2": 49.48,
  "overlap": 53.44
}

Saved results at ./singlehop_1648041365_eval.json



In [9]:
# Print every path in the working directory that starts with "predictions_".
prediction_paths = glob.glob('./predictions_*')
for fp in prediction_paths:
    print(fp)

./predictions_1647092281.json
./predictions_1647092281_eval.json
./predictions_1647092281_meta.json
./predictions_1647094432.json
./predictions_1647094432_eval.json
./predictions_1647094432_meta.json
./predictions_1647099031.json
./predictions_1647099031_eval.json
./predictions_1647099031_meta.json
./predictions_1647099776.json
./predictions_1647099776_eval.json
./predictions_1647099776_meta.json
./predictions_1647107689.json
./predictions_1647107689_eval.json
./predictions_1647107689_meta.json
./predictions_1647111196.json
./predictions_1647111196_eval.json
./predictions_1647111196_meta.json
./predictions_1647144281.json
./predictions_1647144281_eval.json
./predictions_1647144281_meta.json


In [10]:
# Baseline (base index, no question-mark/question-word stripping, phrase retrieval):
# "substr_gp": 16.76, "overlap": 27.94

!cat predictions_1647092281_meta.json

{
    "top_k": 5,
    "use_large_index": false,
    "strip_qmark1": false,
    "strip_qmark2": false,
    "strip_qword1": false,
    "strip_qword2": false,
    "prepend_hop_phrase": false,
    "retrieval_unit": "phrase"
}

In [13]:
# Question word stripped at the first position (strip_qword1=true, base index):
# "substr_gp": 17.2, "overlap": 28.3

!cat predictions_1647094432_meta.json

{
    "top_k": 5,
    "use_large_index": false,
    "strip_qmark": false,
    "strip_qword1": true,
    "strip_qword2": false,
    "prepend_hop_phrase": false,
    "retrieval_unit": "phrase"
}

In [33]:
# Best performing with base index (use_large_index=false): strip question word
# over the *entire* string (strip_qword1=true, strip_qword_mode="all"):
# "substr_gp": 17.86, "overlap": 29.21

!cat predictions_1647178995_meta.json

{
    "top_k": 5,
    "use_large_index": false,
    "strip_qmark": false,
    "strip_qword1": true,
    "strip_qword2": false,
    "strip_qword_mode": "all",
    "prepend_hop_phrase": false,
    "retrieval_unit": "phrase",
    "single_hop": false
}

In [15]:
# Sentence-level retrieval (retrieval_unit="sentence"): improves on the baseline in the
# overlap metric only (27.94 -> 28.14); substr_gp drops (16.76 -> 16.48).
# "substr_gp": 16.48, "overlap": 28.14

!cat predictions_1647107689_meta.json

{
    "top_k": 5,
    "use_large_index": false,
    "strip_qmark": false,
    "strip_qword1": false,
    "strip_qword2": false,
    "prepend_hop_phrase": false,
    "retrieval_unit": "sentence"
}

In [16]:
# Large index (use_large_index=true) gives ~5 point improvements over the baseline:
# "substr_gp": 21.53, "overlap": 32.45

!cat predictions_1647144281_meta.json

{
    "top_k": 5,
    "use_large_index": true,
    "strip_qmark": false,
    "strip_qword1": false,
    "strip_qword2": false,
    "prepend_hop_phrase": false,
    "retrieval_unit": "phrase"
}

In [21]:
# Params based on previous results, sentence version
# (large index, strip_qword1=true, retrieval_unit="sentence"):
# "substr_gp": 21.29, "overlap": 32.72

!cat predictions_1647148438_meta.json

{
    "top_k": 5,
    "use_large_index": true,
    "strip_qmark": false,
    "strip_qword1": true,
    "strip_qword2": false,
    "prepend_hop_phrase": false,
    "retrieval_unit": "sentence"
}

In [29]:
# strip_qword_mode="first": "substr_gp": 22.41, "overlap": 33.52
# (large index, strip_qword1=true, phrase retrieval; note that the meta file printed
# below has no "strip_qword_mode" key)

!cat predictions_1647150037_meta.json

{
    "top_k": 5,
    "use_large_index": true,
    "strip_qmark": false,
    "strip_qword1": true,
    "strip_qword2": false,
    "prepend_hop_phrase": false,
    "retrieval_unit": "phrase"
}

In [37]:
# Best params so far: "em": 18.82, "overlap": 34.79, "overall_avg": 27.308
# (large index, strip_qword1=true with strip_qword_mode="all", phrase retrieval)

!cat predictions_1647181298_meta.json

{
    "top_k": 5,
    "use_large_index": true,
    "strip_qmark": false,
    "strip_qword1": true,
    "strip_qword2": false,
    "strip_qword_mode": "all",
    "prepend_hop_phrase": false,
    "retrieval_unit": "phrase",
    "single_hop": false
}

In [5]:
# Single hop performance: "em": 19.67, "substr_gp": 24.16, "overlap": 36.5
# (single_hop=true, large index, no question-word stripping, phrase retrieval)

!cat singlehop_1647185573_meta.json

{
    "top_k": 5,
    "use_large_index": true,
    "strip_qmark": false,
    "strip_qword1": false,
    "strip_qword2": false,
    "strip_qword_mode": "all",
    "prepend_hop_phrase": false,
    "retrieval_unit": "phrase",
    "single_hop": true
}

In [1]:
# Show the saved evaluation metrics of the single-hop run singlehop_1647185573.
!cat singlehop_1647185573_eval.json

{
    "n_questions": 5918,
    "em": 19.67,
    "substr_gp": 24.16,
    "substr_pg": 28.35,
    "substr2": 32.49,
    "overlap": 36.5,
    "overall_avg": 28.234
}

In [16]:
# Single-hop run vs. the multi-hop run with the best params so far, using union_gold=True.
FILE1, FILE2 = "singlehop_1647185573.json", "predictions_1647181298.json"
compare_wrapper(fp1=FILE1, fp2=FILE2, union_gold=True)

Reading predictions from singlehop_1647185573.json
Reading predictions from predictions_1647181298.json
{
  "n_questions": 5918,
  "em": 23.72,
  "substr_gp": 28.76,
  "substr_pg": 34.49,
  "substr2": 38.83,
  "overlap": 42.46,
  "overall_avg": 33.652
}

Saved results at singlehop_1647185573_predictions_1647181298_eval.json



In [8]:
# Large-index runs: strip_qword_mode="first" run vs. strip_qword_mode="all" run.
FILE1, FILE2 = "predictions_1647150037.json", "predictions_1647181298.json"
compare_wrapper(fp1=FILE1, fp2=FILE2)

Reading predictions from predictions_1647150037.json
Reading predictions from predictions_1647181298.json
{
  "n_questions": 5918,
  "em": 79.91,
  "substr_gp": 81.14,
  "substr_pg": 81.45,
  "substr2": 82.63,
  "overlap": 83.02,
  "overall_avg": 81.63
}

Saved results at predictions_1647150037_predictions_1647181298_eval.json



In [10]:
# Base-index runs: question word stripped at the first position vs. over the entire string.
FILE1, FILE2 = "predictions_1647094432.json", "predictions_1647178995.json"
compare_wrapper(fp1=FILE1, fp2=FILE2)

Reading predictions from predictions_1647094432.json
Reading predictions from predictions_1647178995.json
{
  "n_questions": 5918,
  "em": 80.13,
  "substr_gp": 81.31,
  "substr_pg": 81.33,
  "substr2": 82.47,
  "overlap": 82.74,
  "overall_avg": 81.596
}

Saved results at predictions_1647094432_predictions_1647178995_eval.json

