In [1]:
import random
from typing import Any

import pandas as pd
from datasets import load_dataset

In [2]:
def white_space_fix(text: Any) -> Any:
    return " ".join(text.split())

In [3]:
def read_narrative_dataset() -> Any:
    """Read the narrative qa dataset."""

    def process_narrative_row(row: Any) -> Any:
        """Helper functions for NarrativeQA Dataset."""
        answer = "[<@>]".join([answer["text"] for answer in row["answers"]])

        question = row["question"]["text"]

        article = row["document"]["summary"]["text"]

        context = "Question: " + question + " Context: " + article + " </s>"

        return {
            "article": white_space_fix(context),
            "answer": white_space_fix(answer + " </s>"),
        }

    train_dataset = load_dataset("narrativeqa", split="train", download_mode="force_redownload", ignore_verifications=True)
    dev_dataset = load_dataset("narrativeqa", split="validation", download_mode="force_redownload", ignore_verifications=True)
    test_dataset = load_dataset("narrativeqa", split="test", download_mode="force_redownload", ignore_verifications=True)

    train_dataset = train_dataset.map(
        process_narrative_row,
        remove_columns=["document", "answers", "question"],
    )

    dev_dataset = dev_dataset.map(
        process_narrative_row,
        remove_columns=["document", "answers", "question"],
    )

    test_dataset = test_dataset.map(
        process_narrative_row,
        remove_columns=["document", "answers", "question"],
    )
    return train_dataset, dev_dataset, test_dataset

In [4]:
def read_race_dataset() -> Any:
    """Function to create the race dataset."""

    def process_race_row(row: Any) -> Any:
        """Helper function."""
        option_code = row["answer"]
        if option_code == "A":
            option_idx = 0
        elif option_code == "B":
            option_idx = 1
        elif option_code == "C":
            option_idx = 2
        elif option_code == "D":
            option_idx = 3

        answer = row["options"][option_idx]
        question = row["question"]
        article = row["article"]
        return {
            "article": white_space_fix("Question: " + question + " Context: " + article + " </s>"),
            "answer": white_space_fix(answer + " </s>"),
        }

    train_dataset = load_dataset("race", "all", split="train", download_mode="force_redownload", ignore_verifications=True)
    train_dataset = train_dataset.map(
        process_race_row,
        remove_columns=["options", "example_id", "question"],
    )
    dev_dataset = load_dataset("race", "all", split="validation", download_mode="force_redownload", ignore_verifications=True)
    dev_dataset = dev_dataset.map(
        process_race_row,
        remove_columns=["options", "example_id", "question"],
    )
    test_dataset = load_dataset("race", "all", split="test", download_mode="force_redownload", ignore_verifications=True)
    test_dataset = test_dataset.map(
        process_race_row,
        remove_columns=["options", "example_id", "question"],
    )
    return train_dataset, dev_dataset, test_dataset

In [5]:
def read_squad_dataset() -> Any:
    def process_squad_row(row: Any) -> Any:
        context = row["context"]
        question = row["question"]
        if row["answers"]["text"]:
            answ = "[<@>]".join([answer["text"] for answer in row["answers"]])
        else:
            answ = "no_answer"
        return {
            "article": white_space_fix("Question: " + question + " Context: " + context + " </s>"),
            "answer": white_space_fix(answ + " </s>"),
        }

    train_dataset = load_dataset("squad_v2", split="train", download_mode="force_redownload", ignore_verifications=True)
    train_dataset = train_dataset.map(
        process_squad_row,
        remove_columns=["id", "title", "context", "question", "answers"],
    )
    dev_dataset = load_dataset("squad_v2", split="validation", download_mode="force_redownload", ignore_verifications=True)
    dev_dataset = dev_dataset.map(
        process_squad_row,
        remove_columns=["id", "title", "context", "question", "answers"],
    )
    return train_dataset, dev_dataset, dev_dataset

In [6]:
rc_train_dataset, rc_dev_dataset, rc_test_dataset = read_race_dataset()
nq_train_dataset, nq_dev_dataset, nq_test_dataset = read_narrative_dataset()
sq_train_dataset, sq_dev_dataset, sq_test_dataset = read_squad_dataset()



Downloading readme:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4934 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/87866 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4887 [00:00<?, ? examples/s]

Map:   0%|          | 0/87866 [00:00<?, ? examples/s]



Downloading readme:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4934 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/87866 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4887 [00:00<?, ? examples/s]

Map:   0%|          | 0/4887 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4934 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/87866 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4887 [00:00<?, ? examples/s]

Map:   0%|          | 0/4934 [00:00<?, ? examples/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/187M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32747 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10557 [00:00<?, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [28]:
seeds = [42, 100, 13, 87, 21]
few_shot_split_sizes = [16, 128, 1024]
dataset_names = ["squad", "race", "narrativeqa"]


def write_eval(eval_dataset: Any, dataset_name: str) -> None:
    eval_df = pd.DataFrame([row for row in eval_dataset])
    for few_shot_split_size in few_shot_split_sizes:
        print(len(eval_df))
        eval_df.to_csv(
            f"../{few_shot_split_size}-shot-datasets/{dataset_name}/test.tsv",
            header=True,
            index=False,
            sep="\t",
        )


write_eval(sq_test_dataset, "squad")
write_eval(rc_test_dataset, "race")
write_eval(nq_test_dataset, "narrativeqa")


def write_train_dev(train_dataset: Any, dataset_name: str) -> None:
    train_rows = [row for row in train_dataset]
    for seed in seeds:
        random.seed(seed)
        random.shuffle(train_rows)
        for few_shot_split_size in few_shot_split_sizes:
            fewshot_train_rows = train_rows[0:few_shot_split_size]
            fewshot_val_rows = train_rows[few_shot_split_size : few_shot_split_size * 2]

            train_df = pd.DataFrame(fewshot_train_rows)
            csv_file = f"../{few_shot_split_size}-shot-datasets"
            csv_file = f"{csv_file}/{dataset_name}/{few_shot_split_size}-{seed}-train.tsv"
            train_df.to_csv(
                csv_file,
                header=True,
                index=False,
                sep="\t",
            )
            val_df = pd.DataFrame(fewshot_val_rows)
            csv_file = f"./{few_shot_split_size}-shot-datasets"
            csv_file = f"{csv_file}/{dataset_name}/{few_shot_split_size}-{seed}-dev.tsv"
            val_df.to_csv(
                csv_file,
                header=True,
                index=False,
                sep="\t",
            )


write_train_dev(sq_train_dataset, "squad")
write_train_dev(rc_train_dataset, "race")
write_train_dev(nq_train_dataset, "narrativeqa")

11873
11873
4934
4934
10557
10557


In [19]:
print(rc_test_dataset)

Dataset({
    features: ['article', 'answer'],
    num_rows: 4934
})


In [20]:
print(nq_test_dataset)

Dataset({
    features: ['article', 'answer'],
    num_rows: 10557
})


In [22]:
print(sq_test_dataset)

Dataset({
    features: ['article', 'answer'],
    num_rows: 11873
})
