In [1]:
from evaluate import *

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
def compare_wrapper(fp1, fp2, unique=True, write=True, union_gold=False):
    """
    Compare two prediction files and optionally save the comparison metrics.

    Args:
        fp1: Path of the first predictions JSON file.
        fp2: Path of the second predictions JSON file.
        unique: Forwarded unchanged to `compare_predictions`.
        write: When True, the metrics are saved with `write_eval_results`.
        union_gold: Forwarded unchanged to `compare_predictions`.

    Returns:
        None. Metrics are only written to disk (when `write` is True).

    The output file sits next to `fp1` and is named
    `<fp1 without .json>_<file name of fp2 without .json>_eval.json`.
    NOTE(review): the output name does not encode `unique` / `union_gold`,
    so runs on the same pair of files with different flags save to the
    same path and overwrite each other.
    """
    import os

    # Read data from disk. The paths must come from the arguments; reading
    # notebook-level globals here would silently compare the wrong files
    # (or raise NameError on a fresh kernel).
    print(f"Reading predictions from {fp1}")
    results1 = read_results(fp1)
    print(f"Reading predictions from {fp2}")
    results2 = read_results(fp2)

    # Only the file name of fp2 is embedded in the output name; embedding a
    # directory part (e.g. "./x.json") would produce a path into a directory
    # that does not exist.
    fp2_suffix = os.path.basename(fp2).replace(".json", "_eval.json")
    OUTPUT_PATH = fp1.replace(".json", f"_{fp2_suffix}")

    # Compute metrics
    eval_results = compare_predictions(results1, results2, unique=unique, union_gold=union_gold)

    if write:
        # Write results to disk
        write_eval_results(OUTPUT_PATH, eval_results)
        print(f"\nSaved results at {OUTPUT_PATH}\n")

In [8]:
# Read prediction files and run evaluation
import glob

MODE = "oracle"  # one of: single / multi / oracle

# Maps each mode to the file-name prefix of its prediction files
PATH_PREFIX = {
    "single": "singlehop",
    "multi": "predictions",
    "oracle": "oracle"
}

# Match only .json files: for any other match the ".json" -> "_eval.json"
# replacement below is a no-op, so the results would be written over the
# input path itself. sorted() keeps the processing order deterministic,
# because glob's order depends on the file system.
for fp in sorted(glob.glob(f'./{PATH_PREFIX[MODE]}_*.json')):
    # Skip paths containing "meta" or "eval" (e.g. the *_meta.json files and
    # the *_eval.json outputs of earlier runs)
    if "meta" in fp or "eval" in fp:
        continue

    # Read data from disk
    print(f"Reading predictions from {fp}")
    results = read_results(fp)
    OUTPUT_PATH = fp.replace(".json", "_eval.json")

    # Compute metrics
    eval_results = compute_metrics(results)

    # Write results to disk
    write_eval_results(OUTPUT_PATH, eval_results)
    print(f"\nSaved results at {OUTPUT_PATH}\n")

Reading predictions from ./oracle_all_sent_preds_1648226462.json
{
  "n_questions": 5918,
  "em": 27.91,
  "substr_gp": 33.12,
  "substr_pg": 38.85,
  "substr2": 43.33,
  "overlap": 45.93,
  "overall_avg": 37.83
}

Saved results at ./oracle_all_sent_preds_1648226462_eval.json

Reading predictions from ./oracle_sent_preds_1648199883.json
{
  "n_questions": 5918,
  "em": 28.37,
  "substr_gp": 33.73,
  "substr_pg": 39.57,
  "substr2": 44.17,
  "overlap": 46.71,
  "overall_avg": 38.51
}

Saved results at ./oracle_sent_preds_1648199883_eval.json

Reading predictions from ./oracle_title_preds_1648201162.json
{
  "n_questions": 5918,
  "em": 22.32,
  "substr_gp": 27.21,
  "substr_pg": 32.33,
  "substr2": 36.52,
  "overlap": 40.25,
  "overall_avg": 31.73
}

Saved results at ./oracle_title_preds_1648201162_eval.json

Reading predictions from ./oracle_title_sent_preds_1648202141.json
{
  "n_questions": 5918,
  "em": 28.17,
  "substr_gp": 33.58,
  "substr_pg": 39.61,
  "substr2": 44.14,
  "overla

In [19]:
# List every JSON file in the working directory. sorted() makes the listing
# order deterministic; glob's own order depends on the file system.
for fp in sorted(glob.glob('./*.json')):
    print(fp)

./predictions_1648115963.json
./predictions_1648115963_eval.json
./predictions_1648115963_meta.json
./singlehop_1648033166.json
./singlehop_1648033166_eval.json
./singlehop_1648033166_meta.json
./singlehop_1648036353.json
./singlehop_1648036353_eval.json
./singlehop_1648036353_meta.json
./singlehop_1648036353_predictions_1648115963_eval.json
./singlehop_1648041365.json
./singlehop_1648041365_eval.json
./singlehop_1648041365_meta.json


In [21]:
# Compare a "singlehop" prediction file with a "predictions" file using the
# default comparison settings; the metrics are saved to disk.
FILE1, FILE2 = "singlehop_1648036353.json", "predictions_1648127590.json"
compare_wrapper(fp1=FILE1, fp2=FILE2)

Reading predictions from singlehop_1648036353.json
Reading predictions from predictions_1648127590.json
{
  "n_questions": 5918,
  "em": 41.33,
  "substr_gp": 45.5,
  "substr_pg": 47.26,
  "substr2": 51.07,
  "overlap": 52.05,
  "overall_avg": 47.44199999999999
}

Saved results at singlehop_1648036353_predictions_1648127590_eval.json



In [22]:
# Same pair of files as the default comparison, this time with
# union_gold=True.
# NOTE(review): the "Saved results at" line shows the same output path as the
# default run, so this run replaces that run's saved metrics.
FILE1, FILE2 = "singlehop_1648036353.json", "predictions_1648127590.json"
compare_wrapper(fp1=FILE1, fp2=FILE2, union_gold=True)

Reading predictions from singlehop_1648036353.json
Reading predictions from predictions_1648127590.json
{
  "n_questions": 5918,
  "em": 25.35,
  "substr_gp": 30.75,
  "substr_pg": 37.01,
  "substr2": 41.26,
  "overlap": 44.61,
  "overall_avg": 35.79600000000001
}

Saved results at singlehop_1648036353_predictions_1648127590_eval.json

