In [9]:
import math
import random
from pathlib import Path
from typing import Any

import pandas as pd
from datasets import load_dataset

In [2]:
def white_space_fix(text: Any) -> Any:
    """Collapse every run of whitespace in ``text`` into one space and trim both ends."""
    # split() without a separator already ignores leading/trailing whitespace and never
    # yields empty pieces, so joining the pieces needs no extra trimming.
    pieces = text.split()
    return " ".join(pieces)

In [4]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/saeednajafi/.cache/huggingface/token
Login successful


In [5]:
def read_narrative_dataset() -> Any:
    """Load NarrativeQA and reshape each split to ``context`` / ``question`` / ``answers``.

    For every row the story summary becomes the ``context``, the question text becomes the
    ``question`` and the reference answers become a de-duplicated list of strings. All three
    fields are whitespace-normalized with ``white_space_fix``. The raw ``document`` column is
    dropped from the mapped datasets.

    Returns:
        A ``(train, validation, test)`` tuple of the mapped datasets.
    """

    def process_narrative_row(row: Any) -> Any:
        """Build the normalized fields for a single NarrativeQA row."""
        # dict.fromkeys removes duplicates while keeping first-seen order. A set would make
        # the answer order depend on per-process string hashing, i.e. differ between runs.
        all_answers = list(
            dict.fromkeys(white_space_fix(answer["text"]) for answer in row["answers"])
        )
        return {
            # The summary is the passage to read; the question text is the query about it.
            "context": white_space_fix(row["document"]["summary"]["text"]),
            "question": white_space_fix(row["question"]["text"]),
            "answers": all_answers,
        }

    processed_splits = []
    for split_name in ("train", "validation", "test"):
        raw_split = load_dataset("deepmind/narrativeqa", split=split_name)
        processed_splits.append(
            raw_split.map(
                process_narrative_row,
                remove_columns=["document"],
            )
        )

    train_dataset, dev_dataset, test_dataset = processed_splits
    return train_dataset, dev_dataset, test_dataset

In [6]:
def read_race_dataset() -> Any:
    """Load RACE (``all`` configuration) and reshape each split to a common QA layout.

    For every row the article becomes the ``context``, the question stays the ``question``
    and the option selected by the ``answer`` letter becomes the single entry of ``answers``.
    All three fields are whitespace-normalized with ``white_space_fix``. The ``options``,
    ``example_id``, ``article`` and ``answer`` columns are dropped from the mapped datasets.

    Returns:
        A ``(train, validation, test)`` tuple of the mapped datasets.

    Raises:
        ValueError: if a row carries an answer label other than ``A``, ``B``, ``C`` or ``D``.
    """
    option_index_by_letter = {"A": 0, "B": 1, "C": 2, "D": 3}
    dropped_columns = ["options", "example_id", "article", "answer"]

    def process_race_row(row: Any) -> Any:
        """Build the normalized fields for a single RACE row."""
        option_code = row["answer"]
        if option_code not in option_index_by_letter:
            # Fail with a clear message instead of an unbound-variable error further down.
            raise ValueError(f"Unexpected RACE answer label: {option_code!r}")
        option_idx = option_index_by_letter[option_code]

        # Normalize the answer the same way as context/question so all fields are consistent.
        answers = [white_space_fix(row["options"][option_idx])]
        return {
            "context": white_space_fix(row["article"]),
            "question": white_space_fix(row["question"]),
            "answers": answers,
        }

    processed_splits = []
    for split_name in ("train", "validation", "test"):
        raw_split = load_dataset("ehovy/race", "all", split=split_name)
        processed_splits.append(
            raw_split.map(
                process_race_row,
                remove_columns=dropped_columns,
            )
        )

    train_dataset, dev_dataset, test_dataset = processed_splits
    return train_dataset, dev_dataset, test_dataset

In [7]:
def read_squad_dataset() -> Any:
    """Load SQuAD v2 and reshape each split to ``context`` / ``question`` / ``answers``.

    ``answers`` becomes a de-duplicated list of whitespace-normalized answer strings; rows
    without any answer text get the single placeholder ``"<no_answer>"``. The ``id`` and
    ``title`` columns are dropped from the mapped datasets.

    Returns:
        A ``(train, validation, validation)`` tuple. Only the train and validation splits
        are loaded here, so the validation split is returned a second time in the position
        where the other readers return their test split.
    """

    def process_squad_row(row: Any) -> Any:
        """Build the normalized fields for a single SQuAD row."""
        answer_texts = row["answers"]["text"]
        if answer_texts:
            # dict.fromkeys removes duplicates while keeping first-seen order. A set would
            # make the answer order depend on per-process string hashing.
            all_answers = list(dict.fromkeys(white_space_fix(answer) for answer in answer_texts))
        else:
            all_answers = ["<no_answer>"]
        return {
            "context": white_space_fix(row["context"]),
            "question": white_space_fix(row["question"]),
            "answers": all_answers,
        }

    train_dataset = load_dataset("rajpurkar/squad_v2", split="train").map(
        process_squad_row,
        remove_columns=["id", "title"],
    )
    dev_dataset = load_dataset("rajpurkar/squad_v2", split="validation").map(
        process_squad_row,
        remove_columns=["id", "title"],
    )
    return train_dataset, dev_dataset, dev_dataset

In [8]:
# Load and normalize the three corpora. Each reader returns a 3-tuple of mapped splits that
# is unpacked into train / dev / test names used by the cells below.
# Note: for SQuAD the "test" name is bound to the validation split again, because
# read_squad_dataset returns its validation split twice.
rc_train_dataset, rc_dev_dataset, rc_test_dataset = read_race_dataset()
nq_train_dataset, nq_dev_dataset, nq_test_dataset = read_narrative_dataset()
sq_train_dataset, sq_dev_dataset, sq_test_dataset = read_squad_dataset()

In [10]:
# One shuffled train/dev pair is written per seed by write_train_dev.
seeds = [42, 100, 13, 87, 21]
# Split settings. Integer entries are the number of rows written to each of the train and
# dev files. The 0.1 entry is treated as a fraction by write_train_dev: the first 10% of the
# shuffled rows go to dev and the remainder to train. Each entry also names an output
# directory "<size>-shot-datasets".
few_shot_split_sizes = [16, 128, 1024, 8192, 0.1]
# Not referenced by any other cell visible in this notebook.
dataset_names = ["squad", "race", "narrativeqa"]


def write_eval(eval_dataset: Any, dataset_name: str, test_name: str) -> None:
    """Copy one evaluation split, as a TSV file, into every few-shot output directory.

    The same rows are written once per entry of ``few_shot_split_sizes`` to
    ``./<size>-shot-datasets/<dataset_name>/<test_name>.tsv``; the row count is printed
    on every iteration.

    Args:
        eval_dataset: iterable of row dictionaries to export.
        dataset_name: sub-directory name for the dataset.
        test_name: file name (without extension) of the exported split.
    """
    frame = pd.DataFrame(list(eval_dataset))
    for shot_size in few_shot_split_sizes:
        print(len(frame))
        target_dir = f"./{shot_size}-shot-datasets/{dataset_name}"
        Path(target_dir).mkdir(parents=True, exist_ok=True)
        frame.to_csv(f"{target_dir}/{test_name}.tsv", sep="\t", index=False, header=True)


# SQuAD: sq_test_dataset is the validation split (read_squad_dataset returns it in the
# test position), hence the "original_validation" file name.
write_eval(sq_test_dataset, "squad", "original_validation")

# Test splits of RACE and NarrativeQA.
write_eval(rc_test_dataset, "race", "original_test")
write_eval(nq_test_dataset, "narrativeqa", "original_test")

# Validation splits of RACE and NarrativeQA.
write_eval(rc_dev_dataset, "race", "original_validation")
write_eval(nq_dev_dataset, "narrativeqa", "original_validation")


def write_train_dev(
    train_dataset: Any,
    dataset_name: str,
    split_seeds: Any = None,
    split_sizes: Any = None,
) -> None:
    """Write seeded few-shot train/dev TSV files sampled from a training split.

    For every seed the rows are shuffled (from the original order, independently of the
    other seeds) and, for every split size, two files are written to
    ``./<size>-shot-datasets/<dataset_name>/``: ``<size>-<seed>-train.tsv`` and
    ``<size>-<seed>-dev.tsv``.

    * A float size strictly between 0 and 1 is a held-out fraction: the first
      ``ceil(n * size)`` shuffled rows become dev and all remaining rows become train.
    * Any other size ``k`` is a row count: the first ``k`` shuffled rows become train and
      the next ``k`` rows become dev (fewer if the data runs out).

    Args:
        train_dataset: iterable of row dictionaries to sample from; it is not modified.
        dataset_name: sub-directory name for the dataset.
        split_seeds: seeds to generate splits for; defaults to the notebook-level ``seeds``.
        split_sizes: split sizes to generate; defaults to the notebook-level
            ``few_shot_split_sizes``.
    """
    if split_seeds is None:
        split_seeds = seeds
    if split_sizes is None:
        split_sizes = few_shot_split_sizes

    all_rows = list(train_dataset)
    for seed in split_seeds:
        # Shuffle a fresh copy with a dedicated generator so the split for a seed does not
        # depend on which seeds were processed before it, and the global RNG is untouched.
        shuffled_rows = list(all_rows)
        random.Random(seed).shuffle(shuffled_rows)

        for split_size in split_sizes:
            if isinstance(split_size, float) and 0 < split_size < 1:
                holdout_size = math.ceil(len(shuffled_rows) * split_size)
                fewshot_val_rows = shuffled_rows[:holdout_size]
                fewshot_train_rows = shuffled_rows[holdout_size:]
            else:
                shot_count = int(split_size)
                fewshot_train_rows = shuffled_rows[:shot_count]
                fewshot_val_rows = shuffled_rows[shot_count : shot_count * 2]

            output_dir = f"./{split_size}-shot-datasets/{dataset_name}"
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            for part_name, part_rows in (("train", fewshot_train_rows), ("dev", fewshot_val_rows)):
                pd.DataFrame(part_rows).to_csv(
                    f"{output_dir}/{split_size}-{seed}-{part_name}.tsv",
                    header=True,
                    index=False,
                    sep="\t",
                )


# Generate the per-seed, per-size train/dev TSV files from each corpus's training split.
write_train_dev(sq_train_dataset, "squad")
write_train_dev(rc_train_dataset, "race")
write_train_dev(nq_train_dataset, "narrativeqa")

11873
11873
11873
11873
11873
4934
4934
4934
4934
4934
10557
10557
10557
10557
10557
4887
4887
4887
4887
4887
3461
3461
3461
3461
3461


In [11]:
# Show the column names and row count of the processed RACE test split.
print(rc_test_dataset)

Dataset({
    features: ['question', 'context', 'answers'],
    num_rows: 4934
})


In [12]:
# Show the column names and row count of the processed NarrativeQA test split.
print(nq_test_dataset)

Dataset({
    features: ['question', 'answers', 'context'],
    num_rows: 10557
})


In [13]:
# Show the column names and row count of the SQuAD split bound to the "test" name
# (the validation split, as returned by read_squad_dataset).
print(sq_test_dataset)

Dataset({
    features: ['context', 'question', 'answers'],
    num_rows: 11873
})
