In [1]:
import random
from pathlib import Path
from typing import Any

import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def white_space_fix(text: Any) -> Any:
    return " ".join(text.strip().split()).strip()

In [3]:
def read_narrative_dataset() -> Any:
    """Read the narrative qa dataset."""

    def process_narrative_row(row: Any) -> Any:
        """Helper functions for NarrativeQA Dataset."""
        all_answers = list(set([white_space_fix(answer["text"]) for answer in row["answers"]]))
        return {
            "context": white_space_fix(row["question"]["text"]),
            "question": white_space_fix(row["document"]["summary"]["text"]),
            "answers": all_answers,
        }

    train_dataset = load_dataset("narrativeqa", split="train", download_mode="force_redownload", verification_mode="no_checks")
    dev_dataset = load_dataset(
        "narrativeqa", split="validation", download_mode="force_redownload", verification_mode="no_checks"
    )
    test_dataset = load_dataset("narrativeqa", split="test", download_mode="force_redownload", verification_mode="no_checks")

    train_dataset = train_dataset.map(
        process_narrative_row,
        remove_columns=["document"],
    )

    dev_dataset = dev_dataset.map(
        process_narrative_row,
        remove_columns=["document"],
    )

    test_dataset = test_dataset.map(
        process_narrative_row,
        remove_columns=["document"],
    )
    return train_dataset, dev_dataset, test_dataset

In [4]:
def read_race_dataset() -> Any:
    """Function to create the race dataset."""

    def process_race_row(row: Any) -> Any:
        """Helper function."""
        option_code = row["answer"]
        if option_code == "A":
            option_idx = 0
        elif option_code == "B":
            option_idx = 1
        elif option_code == "C":
            option_idx = 2
        elif option_code == "D":
            option_idx = 3

        answers = [row["options"][option_idx]]
        return {
            "context": white_space_fix(row["article"]),
            "question": white_space_fix(row["question"]),
            "answers": answers,
        }

    train_dataset = load_dataset("race", "all", split="train", download_mode="force_redownload", verification_mode="no_checks")
    train_dataset = train_dataset.map(
        process_race_row,
        remove_columns=["options", "example_id", "article"],
    )
    dev_dataset = load_dataset(
        "race", "all", split="validation", download_mode="force_redownload", verification_mode="no_checks"
    )
    dev_dataset = dev_dataset.map(
        process_race_row,
        remove_columns=["options", "example_id", "article"],
    )
    test_dataset = load_dataset("race", "all", split="test", download_mode="force_redownload", verification_mode="no_checks")
    test_dataset = test_dataset.map(
        process_race_row,
        remove_columns=["options", "example_id", "article"],
    )
    return train_dataset, dev_dataset, test_dataset

In [5]:
def read_squad_dataset() -> Any:
    def process_squad_row(row: Any) -> Any:
        if row["answers"]["text"]:
            all_answers = list(set([white_space_fix(answer) for answer in row["answers"]["text"]]))
        else:
            all_answers = ["<no_answer>"]
        return {
            "context": white_space_fix(row["context"]),
            "question": white_space_fix(row["question"]),
            "answers": all_answers,
        }

    train_dataset = load_dataset("squad_v2", split="train", download_mode="force_redownload", verification_mode="no_checks")
    train_dataset = train_dataset.map(
        process_squad_row,
        remove_columns=["id", "title"],
    )
    dev_dataset = load_dataset("squad_v2", split="validation", download_mode="force_redownload", verification_mode="no_checks")
    dev_dataset = dev_dataset.map(
        process_squad_row,
        remove_columns=["id", "title"],
    )
    return train_dataset, dev_dataset, dev_dataset

In [7]:
rc_train_dataset, rc_dev_dataset, rc_test_dataset = read_race_dataset()
nq_train_dataset, nq_dev_dataset, nq_test_dataset = read_narrative_dataset()
sq_train_dataset, sq_dev_dataset, sq_test_dataset = read_squad_dataset()

Downloading readme: 100%|██████████| 11.0k/11.0k [00:00<00:00, 26.0MB/s]
Downloading data: 100%|██████████| 2.08M/2.08M [00:00<00:00, 12.5MB/s]
Downloading data: 100%|██████████| 37.4M/37.4M [00:00<00:00, 71.9MB/s]
Downloading data: 100%|██████████| 2.05M/2.05M [00:00<00:00, 9.84MB/s]
Generating test split: 100%|██████████| 4934/4934 [00:00<00:00, 127085.29 examples/s]
Generating train split: 100%|██████████| 87866/87866 [00:00<00:00, 203199.32 examples/s]
Generating validation split: 100%|██████████| 4887/4887 [00:00<00:00, 202421.08 examples/s]
Map: 100%|██████████| 87866/87866 [00:13<00:00, 6624.91 examples/s]
Downloading readme: 100%|██████████| 11.0k/11.0k [00:00<00:00, 25.6MB/s]
Downloading data: 100%|██████████| 2.08M/2.08M [00:00<00:00, 19.5MB/s]
Downloading data: 100%|██████████| 37.4M/37.4M [00:00<00:00, 121MB/s] 
Downloading data: 100%|██████████| 2.05M/2.05M [00:00<00:00, 17.3MB/s]
Generating test split: 100%|██████████| 4934/4934 [00:00<00:00, 133399.70 examples/s]
Generat

DatasetGenerationError: An error occurred while generating the dataset

In [27]:
seeds = [42, 100, 13, 87, 21]
few_shot_split_sizes = [16, 128, 1024]
dataset_names = ["squad", "race", "narrativeqa"]


def write_eval(eval_dataset: Any, dataset_name: str) -> None:
    eval_df = pd.DataFrame([row for row in eval_dataset])
    for few_shot_split_size in few_shot_split_sizes:
        print(len(eval_df))
        Path(f"./{few_shot_split_size}-shot-datasets/{dataset_name}").mkdir(parents=True, exist_ok=True)
        eval_df.to_csv(
            f"./{few_shot_split_size}-shot-datasets/{dataset_name}/test.tsv",
            header=True,
            index=False,
            sep="\t",
        )


write_eval(sq_test_dataset, "squad")
write_eval(rc_test_dataset, "race")
write_eval(nq_test_dataset, "narrativeqa")


def write_train_dev(train_dataset: Any, dataset_name: str) -> None:
    train_rows = [row for row in train_dataset]
    for seed in seeds:
        random.seed(seed)
        random.shuffle(train_rows)
        for few_shot_split_size in few_shot_split_sizes:
            fewshot_train_rows = train_rows[0:few_shot_split_size]
            fewshot_val_rows = train_rows[few_shot_split_size : few_shot_split_size * 2]

            train_df = pd.DataFrame(fewshot_train_rows)
            csv_file = f"./{few_shot_split_size}-shot-datasets"
            Path(f"{csv_file}/{dataset_name}").mkdir(parents=True, exist_ok=True)
            csv_file = f"{csv_file}/{dataset_name}/{few_shot_split_size}-{seed}-train.tsv"
            train_df.to_csv(
                csv_file,
                header=True,
                index=False,
                sep="\t",
            )
            val_df = pd.DataFrame(fewshot_val_rows)
            csv_file = f"./{few_shot_split_size}-shot-datasets"
            Path(f"{csv_file}/{dataset_name}").mkdir(parents=True, exist_ok=True)
            csv_file = f"{csv_file}/{dataset_name}/{few_shot_split_size}-{seed}-dev.tsv"
            val_df.to_csv(
                csv_file,
                header=True,
                index=False,
                sep="\t",
            )


write_train_dev(sq_train_dataset, "squad")
write_train_dev(rc_train_dataset, "race")
write_train_dev(nq_train_dataset, "narrativeqa")

11873
11873
11873
4934
4934
4934
10557
10557
10557


In [28]:
print(rc_test_dataset)

Dataset({
    features: ['article', 'answer'],
    num_rows: 4934
})


In [29]:
print(nq_test_dataset)

Dataset({
    features: ['article', 'answer'],
    num_rows: 10557
})


In [30]:
print(sq_test_dataset)

Dataset({
    features: ['article', 'answer'],
    num_rows: 11873
})
