In [14]:
import json
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Directory containing the MOCHA JSON files (relative path, resolved against
# the notebook's working directory).
MOCHA_DIR_PATH = "../../datasets/mocha"
!ls {MOCHA_DIR_PATH}

dev.json	    minimal_pairs.json.sha1   train.json
dev.json.sha1	    test_no_labels.json       train.json.sha1
minimal_pairs.json  test_no_labels.json.sha1


In [15]:
def write_to_file(data: dict, filename, output_dir):
    """Serialize ``data`` as UTF-8 JSON to ``<output_dir>/<filename>.json``.

    Parameters
    ----------
    data : dict
        JSON-serializable mapping to dump.
    filename : str
        Name of the output file, without the ``.json`` extension.
    output_dir : str or path-like
        Destination directory. It is created (including parents) when it
        does not exist yet.
    """
    # Create the destination up front so that a missing directory does not
    # make ``open`` fail after the data has already been computed.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    output_filepath = f"{output_dir}/{filename}.json"
    with open(output_filepath, 'w', encoding='utf-8') as f:
        # ensure_ascii=False writes non-ASCII characters verbatim instead of
        # as \uXXXX escapes.
        json.dump(data, f, ensure_ascii=False)

In [16]:
def flatten_ref_candidate(example: dict) -> tuple:
    """Turn one example into two flat records: reference and candidate.

    Both records carry the same ``question`` and ``context``; they differ in
    the ``answer`` text and in the correctness fields.

    Returns
    -------
    2-dim tuple
        ``(reference, candidate)`` dictionaries.
    """
    # Fields common to both records.
    shared = {
        "question": example["question"],
        "context": example["context"],
    }

    # The reference answer is assumed to be fully correct (score 5) with no
    # disagreement; a more realistic std could depend on how many wrong
    # reference answers are found in the data.
    reference = {
        **shared,
        "answer": example["reference"],
        "is_reference": True,
        "correctness": 5,
        "correctness_std": 0,
        "_metadata": str({}),
    }

    # The candidate keeps its score and the spread of the scores listed in
    # its metadata.
    candidate = {
        **shared,
        "answer": example["candidate"],
        "is_reference": False,
        "correctness": example["score"],
        "correctness_std": np.std(example["metadata"]["scores"]),
        "_metadata": str(example["metadata"]),
    }

    return reference, candidate


def extend(examples: dict) -> tuple:
    """Build the reference+candidate and candidate-only views of ``examples``.

    Returns
    -------
    2-dim tuple
        First, a dict with two entries per example, keyed ``<id>__orig``
        (reference) and ``<id>__cand`` (candidate). Second, a dict with only
        the candidate record, keyed by the example id.
    """
    with_references = {}
    candidates_only = {}

    for key, raw_example in examples.items():
        ref_record, cand_record = flatten_ref_candidate(raw_example)

        with_references.update({
            f"{key}__orig": ref_record,
            f"{key}__cand": cand_record,
        })
        candidates_only[f"{key}"] = cand_record

    # Every example must have contributed exactly two entries.
    assert len(with_references) == len(candidates_only) * 2

    return with_references, candidates_only


# --------------------------------------------------------------
# Processing data
# --------------------------------------------------------------
def preprocess_data(filepath, out_name, output_dir):
    """Flatten every dataset in a JSON file and dump both resulting views.

    The input is read as a mapping ``dataset -> {example_id: example}``.
    Each dataset is passed through ``extend`` and the number of examples per
    dataset is printed.

    Parameters
    ----------
    filepath : str
        Path to the input JSON file.
    out_name : str
        Prefix for the output files: ``<out_name>_all.json`` and
        ``<out_name>_candidates_only.json``.
    output_dir : str
        Directory where both files are written.

    Returns
    -------
    2-dim tuple
        ``(new_data, new_data_candidates_only)``, both keyed by dataset name.
    """
    # Context manager closes the handle deterministically; explicit UTF-8
    # matches the encoding used by ``write_to_file``.
    with open(filepath, encoding='utf-8') as f:
        data = json.load(f)

    new_data = {}
    new_data_candidates_only = {}
    for dataset, examples in data.items():
        print(len(examples))
        ref_and_cand_examples, cand_examples = extend(examples)

        new_data[dataset] = ref_and_cand_examples
        new_data_candidates_only[dataset] = cand_examples

    write_to_file(new_data, f"{out_name}_all", output_dir=output_dir)
    write_to_file(new_data_candidates_only, f"{out_name}_candidates_only", output_dir=output_dir)

    return new_data, new_data_candidates_only

## Dump file

In [17]:
# Directory that receives every JSON file produced by this notebook.
# NOTE(review): absolute, machine-specific path — adjust it before running
# elsewhere, or derive it from a configurable base directory.
OUTPUT_DIR = "/home/kat/Projects/PhD/qasper-experiments/eqqa/data/mocha_eqqa_data"

In [18]:
# Flatten the training file. It holds 31069 examples in total across all
# datasets; the per-dataset counts are printed below.
new_data, new_data_candidates_only = preprocess_data(
    filepath=f'{MOCHA_DIR_PATH}/train.json',
    out_name="original__train",
    output_dir=OUTPUT_DIR,
);

5033
687
7210
7471
3259
7409


In [19]:
# Flatten the development file. It holds 4009 examples in total across all
# datasets; the per-dataset counts are printed below.
preprocess_data(
    filepath=f'{MOCHA_DIR_PATH}/dev.json',
    out_name="original__dev",
    output_dir=OUTPUT_DIR,
);

683
97
978
890
344
1017


In [20]:
# Inspect the score and annotation metadata of every cosmosqa training example.
# The file is opened in a context manager so the handle is closed right away,
# and read explicitly as UTF-8.
with open(f'{MOCHA_DIR_PATH}/train.json', encoding='utf-8') as f:
    data = json.load(f)
for ex in data["cosmosqa"].values():
    print(ex["score"], ex["metadata"])

1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
3 {'scores': [3], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'backtranslation'}
5 {'scores': [5], 'source': 'gpt2'}
2 {'scores': [2], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
4 {'scores': [4], 'source': 'backtranslation'}
4 {'scores': [4], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
5 {'scores': [5], 'source': 'backtranslation'}
4 {'scores': [4], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
3 {'scores': [3], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
4 {'scores': [4], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
5 {'scores': [5], 'source': 'backtranslation'}
1 {'scores': [1], 'source'

3 {'scores': [3], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
3 {'scores': [3], 'source': 'backtranslation'}
3 {'scores': [3], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
5 {'scores': [5], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'backtranslation'}
5 {'scores': [5], 'source': 'backtranslation'}
3 {'scores': [3], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'backtranslation'}
5 {'scores': [5], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
2 {'scores': [2], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
3 {'scores': [3], 'source': 'backtranslation'}
1 {'scores': [1

1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
3 {'scores': [3], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
3 {'scores': [3], 'source': 'gpt2'}
5 {'scores': [5], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
5 {'scores': [5], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
5 {'scores': [5], 'source': 'backtranslation'}
4 {'scores': [4], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'backtranslation'}
3 {'scores': [3], 'source': 'gpt2'}
4 {'scores': [4], 'source': 'backtranslation'}
2 {'scores': [2], 'source': 'backtranslation'}
1 {'scores': [1], 'source': 'gpt2'}
3 {'scores': [3], 'source': 'gpt2'}
1 {'scores': [1], 'source': 'gpt2'}
2 {'scores': [2], 'source': 'backtranslation'}
1 {'scores': [1], 'source'

 Since the test set does not have correctness labels,
 we will use the development set as the test set and
 randomly sample part of the training set to serve as the new development set.


In [21]:
def split_examples(examples, fraction, seed, replace=False):
    """Randomly partition ``examples`` into two dictionaries.

    Parameters
    ----------
    examples : dict
        Mapping ``example_id -> example`` to partition.
    fraction : float
        Share of the examples to sample into the first split; the number of
        draws is ``int(len(examples) * fraction)``.
    seed : int
        Seed for the NumPy random generator, making the split reproducible.
    replace : bool, default False
        Whether positions are drawn with replacement. Repeated draws select
        the same example only once, so the first split may then be smaller
        than the number of draws.

    Returns
    -------
    2-dim tuple
        ``(split1, split2)``: the sampled examples and the remaining ones,
        each keeping the iteration order of ``examples``.
    """
    np_rand = np.random.default_rng(seed)

    # Number of examples to be part of split1
    n_samples = int(len(examples) * fraction)

    # Positions (in iteration order) of the examples that go into split1.
    sampled_positions = np_rand.choice(
        np.arange(len(examples)), n_samples, replace=replace)
    # A set gives O(1) membership tests; checking ``i in <array>`` for every
    # example scanned the whole array each time (quadratic overall).
    selected = set(sampled_positions.tolist())

    split1 = {}
    split2 = {}

    for i, (example_id, example) in enumerate(examples.items()):
        if i in selected:
            split1[example_id] = example
        else:
            split2[example_id] = example

    return split1, split2


def split_data(filepath, eval_fraction, outnames, output_dir, seed=129831):
    """Split every dataset of a JSON file into train and dev parts and dump them.

    The input is read as a mapping ``dataset -> {example_id: example}``.
    Each dataset is split independently with ``split_examples``, using a
    per-dataset seed drawn from a generator seeded with ``seed``.

    Parameters
    ----------
    filepath : str
        Path to the input JSON file.
    eval_fraction : float
        Share of each dataset's examples that goes to the dev split.
    outnames : sequence of str
        ``outnames[0]`` names the train output file and ``outnames[1]`` the
        dev output file (both without the ``.json`` extension).
    output_dir : str
        Directory where both files are written.
    seed : int, default 129831
        Seed of the generator used to derive the per-dataset seeds.

    Returns
    -------
    2-dim tuple
        ``(train, dev)``, both keyed by dataset name.
    """
    np_rand = np.random.default_rng(seed)
    # Context manager closes the handle deterministically; explicit UTF-8
    # matches the encoding used by ``write_to_file``.
    with open(filepath, encoding='utf-8') as f:
        data = json.load(f)

    train = {}
    dev = {}

    for dataset, examples in data.items():
        # The sampled part becomes the dev split, the remainder stays in train.
        dataset_dev_set, dataset_train_set = split_examples(
            examples, fraction=eval_fraction, seed=np_rand.integers(10**6), replace=False)

        train[dataset] = dataset_train_set
        dev[dataset] = dataset_dev_set

    write_to_file(train, outnames[0], output_dir=output_dir)
    write_to_file(dev, outnames[1], output_dir=output_dir)

    return train, dev

In [22]:
# Hold out 20% of the reference+candidate training file as a dev split.
# NOTE(review): this file stores each example as separate `__orig` and
# `__cand` entries and entries are sampled independently, so the two records
# of one example can end up in different splits — confirm this is intended.
split_data(
    filepath=f'{OUTPUT_DIR}/original__train_all.json',
    eval_fraction=0.2,
    outnames=("split__train_all","split__dev_all"),
    output_dir=OUTPUT_DIR,
);

In [23]:
# Hold out 20% of the candidates-only training file as a dev split.
split_data(
    filepath=f'{OUTPUT_DIR}/original__train_candidates_only.json',
    eval_fraction=0.2,
    outnames=("split__train_candidates_only","split__dev_candidates_only"),
    output_dir=OUTPUT_DIR,
);

In [2]:
import json