In [1]:
from evaluate import *

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [15]:
def compare_wrapper(fp1, fp2, unique=True, write=True, union_gold=False):
    """
    Compare 2 prediction files and optionally save the comparison metrics.

    Args:
        fp1: Path of the first prediction file.
        fp2: Path of the second prediction file.
        unique: Forwarded unchanged to ``compare_predictions``.
        write: When true, the computed metrics are written to disk.
        union_gold: Forwarded unchanged to ``compare_predictions``.

    Returns:
        None. When ``write`` is true the metrics are saved at
        ``<fp1 without extension>_<file name of fp2 without extension>_eval.json``.
    """
    import os  # local import: keeps the cell runnable on its own

    # Read data from disk. Use the function arguments here, not notebook
    # globals, so the call works on a fresh kernel and with any pair of files.
    print(f"Reading predictions from {fp1}")
    results1 = read_results(fp1)
    print(f"Reading predictions from {fp2}")
    results2 = read_results(fp2)

    # Build the output name from extension-less stems. Only the file name of
    # fp2 is used so a directory component in fp2 cannot leak into the name,
    # and the result always ends in "_eval.json" so it never collides with fp1.
    stem1 = os.path.splitext(fp1)[0]
    stem2 = os.path.splitext(os.path.basename(fp2))[0]
    output_path = f"{stem1}_{stem2}_eval.json"

    # Compute metrics
    eval_results = compare_predictions(results1, results2, unique=unique, union_gold=union_gold)

    if write:
        # Write results to disk.
        # NOTE: the file name does not encode `unique` / `union_gold`, so runs
        # on the same pair of files with different flags overwrite each other.
        write_eval_results(output_path, eval_results)
        print(f"\nSaved results at {output_path}\n")

In [24]:
# Read prediction files and run evaluation
import glob

MODE = "single"  # single / multi

# File-name prefix of the prediction files for each mode
PATH_PREFIX = {
    "single": "singlehop",
    "multi": "predictions",
}

JSON_SUFFIX = ".json"

# Match only JSON files: the output name is derived from the ".json" suffix,
# so any other match would map onto itself and be overwritten by the results.
# glob returns matches in arbitrary order, so sort for reproducible output.
for fp in sorted(glob.glob(f'./{PATH_PREFIX[MODE]}_*{JSON_SUFFIX}')):
    # Skip side-car files (metadata and previously written evaluations)
    if "meta" in fp or "eval" in fp:
        continue

    # Read data from disk
    print(f"Reading predictions from {fp}")
    results = read_results(fp)
    # Rewrite only the trailing suffix (every match ends with JSON_SUFFIX)
    OUTPUT_PATH = fp[:-len(JSON_SUFFIX)] + "_eval.json"

    # Compute metrics
    eval_results = compute_metrics(results)

    # Write results to disk
    write_eval_results(OUTPUT_PATH, eval_results)
    print(f"\nSaved results at {OUTPUT_PATH}\n")

Reading predictions from ./singlehop_1648033166.json
{
  "n_questions": 5918,
  "em": 17.27,
  "substr_gp": 22.05,
  "substr_pg": 25.18,
  "substr2": 29.4,
  "overlap": 32.45,
  "overall_avg": 25.27
}

Saved results at ./singlehop_1648033166_eval.json

Reading predictions from ./singlehop_1648036353.json
{
  "n_questions": 5918,
  "em": 20.95,
  "substr_gp": 25.8,
  "substr_pg": 30.36,
  "substr2": 34.52,
  "overlap": 38.33,
  "overall_avg": 29.99
}

Saved results at ./singlehop_1648036353_eval.json

Reading predictions from ./singlehop_1648041365.json
{
  "n_questions": 5918,
  "em": 30.52,
  "substr_gp": 35.96,
  "substr_pg": 45.2,
  "substr2": 49.48,
  "overlap": 53.44,
  "overall_avg": 42.92
}

Saved results at ./singlehop_1648041365_eval.json

Reading predictions from ./singlehop_1648136621.json
{
  "n_questions": 5918,
  "em": 17.3,
  "substr_gp": 22.02,
  "substr_pg": 25.14,
  "substr2": 29.27,
  "overlap": 32.4,
  "overall_avg": 25.23
}

Saved results at ./singlehop_1648136621_

In [19]:
# Show every JSON file in the current directory, one path per line
json_paths = glob.glob('./*.json')
for json_path in json_paths:
    print(json_path)

./predictions_1648115963.json
./predictions_1648115963_eval.json
./predictions_1648115963_meta.json
./singlehop_1648033166.json
./singlehop_1648033166_eval.json
./singlehop_1648033166_meta.json
./singlehop_1648036353.json
./singlehop_1648036353_eval.json
./singlehop_1648036353_meta.json
./singlehop_1648036353_predictions_1648115963_eval.json
./singlehop_1648041365.json
./singlehop_1648041365_eval.json
./singlehop_1648041365_meta.json


In [21]:
# Compare two prediction files with compare_wrapper's default settings
# (unique=True, write=True, union_gold=False); the metrics are saved to disk.
FILE1 = "singlehop_1648036353.json"
FILE2 = "predictions_1648127590.json"
compare_wrapper(FILE1, FILE2)

Reading predictions from singlehop_1648036353.json
Reading predictions from predictions_1648127590.json
{
  "n_questions": 5918,
  "em": 41.33,
  "substr_gp": 45.5,
  "substr_pg": 47.26,
  "substr2": 51.07,
  "overlap": 52.05,
  "overall_avg": 47.44199999999999
}

Saved results at singlehop_1648036353_predictions_1648127590_eval.json



In [22]:
# Compare the same two prediction files, this time with union_gold=True.
# NOTE: compare_wrapper derives its output path from the two file names only,
# so this run saves to the same file as a run with the default union_gold.
FILE1 = "singlehop_1648036353.json"
FILE2 = "predictions_1648127590.json"
compare_wrapper(FILE1, FILE2, union_gold=True)

Reading predictions from singlehop_1648036353.json
Reading predictions from predictions_1648127590.json
{
  "n_questions": 5918,
  "em": 25.35,
  "substr_gp": 30.75,
  "substr_pg": 37.01,
  "substr2": 41.26,
  "overlap": 44.61,
  "overall_avg": 35.79600000000001
}

Saved results at singlehop_1648036353_predictions_1648127590_eval.json

