In [None]:
# Interchangeable refusal sentences. The dataset-building cells pick one at
# random as the answer for samples that are given an empty context, and the
# statistics cell uses membership in this list to recognise those samples.
# NOTE(review): the first entry has no trailing period, unlike the other
# four -- confirm whether that is intentional before changing it.
idk_list = [
    "I'm sorry, I don't have that information in the course documents",
    "I apologize, but that information isn't included in the course materials.",
    "Sorry, that information is not contained in the course materials.",
    "That information isn't included in the course materials, I'm sorry.",
    "I'm sorry, but I could not find that information in the course documents.",
]

In [None]:
from datasets import load_dataset, Dataset
import random

# Build the "recall" fine-tuning set from the SQuAD v2 train split.
squad_dataset = load_dataset("squad_v2", split="train")

# Dedicated seeded generator: the refusal sentence chosen for each sample is
# the same on every run and does not depend on the global `random` state.
squad_rng = random.Random(42)

questions, contexts, answers = [], [], []
# `total` counts every example seen (kept or skipped). It is left as a
# notebook-level name because other cells may read it.
total = 0

for example in squad_dataset:
    total += 1
    if total % 20 == 0:
        # Every 20th example becomes a refusal sample: the question is kept,
        # the context is blanked, and the answer is one of the idk sentences.
        questions.append(example['question'])
        contexts.append("")
        answers.append(squad_rng.choice(idk_list))
    elif example['answers']['text']:
        # The question has at least one reference answer: keep its passage
        # and use the first listed answer text.
        questions.append(example['question'])
        contexts.append(example['context'])
        answers.append(example['answers']['text'][0])
    # Examples with an empty answer list are dropped entirely.

# Assemble the column dict once, after the loop has finished filling the
# lists (it does not need to be rebuilt per example).
transformed_data = {
    "question": questions,
    "context": contexts,
    "answer": answers
}

transformed_squad_dataset = Dataset.from_dict(transformed_data)

# Hold out 10% as test data for the final evaluation. The split is seeded so
# that train/test membership is identical across runs; otherwise files
# exported from different runs could mix held-out rows into training data.
train_and_test_dataset = transformed_squad_dataset.train_test_split(
    test_size=0.1, seed=42)

train_dataset = train_and_test_dataset["train"]
shuffled_dataset = train_dataset.shuffle(seed=42)
# Keep at most 10000 training rows; never ask for more rows than exist.
subset_size = min(10000, len(shuffled_dataset))
subset_train_dataset = shuffled_dataset.select(range(subset_size))
subset_train_dataset.to_json("recall_train.jsonl")

In [None]:
from datasets import load_dataset

# Build the "closed" (True/False) fine-tuning set from the BoolQ train split.
boolq_dataset = load_dataset("google/boolq")

# Dedicated seeded generator so the refusal sentences picked below are
# reproducible and independent of the global `random` state.
boolq_rng = random.Random(42)

boolq_questions, boolq_contexts, boolq_answers = [], [], []

# Count positions locally (1-based) instead of continuing a counter left over
# from another cell: which rows become refusal samples must not depend on
# what ran earlier in the kernel or on how many times this cell is re-run.
for position, example in enumerate(boolq_dataset["train"], start=1):
    if position % 20 == 0:
        # Every 20th example becomes a refusal sample: the question is kept,
        # the context is blanked, and the answer is one of the idk sentences.
        boolq_questions.append(example['question'])
        boolq_contexts.append("")
        boolq_answers.append(boolq_rng.choice(idk_list))
    else:
        # Regular sample: question, its passage, and the boolean label
        # stored as the string "True" or "False".
        boolq_questions.append(example['question'])
        boolq_contexts.append(example['passage'])
        boolq_answers.append(str(example['answer']))

# Assemble the column dict once, after the loop has finished filling the lists.
transformed_data = {
    "question": boolq_questions,
    "context": boolq_contexts,
    "answer": boolq_answers
}

transformed_boolq_dataset = Dataset.from_dict(transformed_data)

# Hold out 10% as test data; seeded so train/test membership is identical
# across runs.
train_and_test_boolq_dataset = transformed_boolq_dataset.train_test_split(
    test_size=0.1, seed=42)

# Dumping the training data to a local file to be used for training.
train_and_test_boolq_dataset["train"].to_json("closed_train.jsonl")

In [None]:
from datasets import concatenate_datasets, DatasetDict

# Merge the SQuAD-derived and BoolQ-derived data, split by split.
train_parts = [subset_train_dataset, train_and_test_boolq_dataset["train"]]
test_parts = [train_and_test_dataset["test"], train_and_test_boolq_dataset["test"]]

combined_train_dataset = concatenate_datasets(train_parts)
combined_test_dataset = concatenate_datasets(test_parts)

# Bundle both merged splits under their usual keys.
combined_datasets = DatasetDict({
    "train": combined_train_dataset,
    "test": combined_test_dataset,
})

# Shuffle each split with a fixed seed so rows from the two sources are
# interleaved in a repeatable order.
shuffled_train_dataset = combined_datasets["train"].shuffle(seed=42)
shuffled_test_dataset = combined_datasets["test"].shuffle(seed=42)

# Write the mixed training split to disk.
shuffled_train_dataset.to_json("general_dataset.jsonl")

In [None]:
# Tally the shuffled training rows by the kind of answer they carry.
tally = {
    "answered": 0,       # any answer that is neither a refusal nor "True"/"False"
    "idk_with_text": 0,  # refusal sentence, non-empty context
    "idk_no_text": 0,    # refusal sentence, empty context
    "true": 0,           # answer is the string "True"
    "false": 0,          # answer is the string "False"
}

for row in shuffled_train_dataset:
    reply = row['answer']
    if reply in idk_list:
        # Refusal sample: distinguish by whether a context was supplied.
        bucket = "idk_no_text" if row['context'] == '' else "idk_with_text"
    elif reply == "True":
        bucket = "true"
    elif reply == "False":
        bucket = "false"
    else:
        bucket = "answered"
    tally[bucket] += 1

# Expose the totals under the notebook-level names used for plotting.
answer_num = tally["answered"]
non_answer_num_text = tally["idk_with_text"]
non_answer_num_notext = tally["idk_no_text"]
boolq_true_num = tally["true"]
boolq_false_num = tally["false"]

import matplotlib.pyplot as plt

# Bar heights keyed by label; both refusal counters are merged into a single
# 'Non-answerable' bar.
bar_heights = {
    'Answerable': answer_num,
    'Non-answerable': non_answer_num_text + non_answer_num_notext,
    'True': boolq_true_num,
    'False': boolq_false_num,
}
categories = list(bar_heights.keys())
counts = list(bar_heights.values())

# Bar chart comparing how many training questions fall into each category.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(categories, counts, color=['blue', 'red', 'yellow', 'green'])
ax.set_title('Comparison of different types of Questions in fine-tuning dataset')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Questions')
plt.show()

In [None]:
# Write the held-out 10% of each source to its own JSON Lines file so the
# two tasks can be evaluated separately.
closed_holdout = train_and_test_boolq_dataset["test"]
recall_holdout = train_and_test_dataset["test"]

closed_holdout.to_json("closed_validation_dataset.jsonl")
recall_holdout.to_json("recall_validation_dataset.jsonl")