In [3]:
import os
from datasets import load_dataset, DatasetDict
import json

# Extract the leading 0.5% of the SQuAD v2 train/validation splits and save it.
#
# Written under `dataset_dir`:
#   - train.json / test.json : one file per split (Dataset.to_json)
#   - the DatasetDict itself (DatasetDict.save_to_disk)
#   - metadata.json          : original/extracted sizes and extraction settings
#
# Names left in the notebook namespace: dataset_dir, dataset, train_size,
# test_size, dataset_05percent, metadata.

# Single source of truth for the subset size. The split sizes, the printed
# labels and the metadata are all derived from this value so they cannot
# drift apart when the fraction is edited.
SUBSET_FRACTION = 0.005
subset_percent = round(SUBSET_FRACTION * 100, 6)  # 0.5 (rounded to hide float noise)
subset_label = f"{subset_percent:g}%"             # "0.5%"

# Create directory for storing the dataset
dataset_dir = "squad_v2_05percent"
os.makedirs(dataset_dir, exist_ok=True)

# Load the SQuAD v2 dataset using the Hugging Face datasets library
print("Loading SQuAD v2 dataset...")
dataset = load_dataset("squad_v2")
original_train_size = len(dataset["train"])
original_validation_size = len(dataset["validation"])
print("Number of examples in the dataset:", original_train_size)
print("First example in the dataset:", dataset["train"][0])

# Extract the configured fraction of each split
print(f"\nExtracting {subset_label} of each dataset split...")

# int() truncates, so a subset never exceeds the requested fraction.
train_size = int(original_train_size * SUBSET_FRACTION)
test_size = int(original_validation_size * SUBSET_FRACTION)  # validation is used as test

print(f"Original train size: {original_train_size}")
print(f"{subset_label} train size: {train_size}")
print(f"Original validation size: {original_validation_size}")
print(f"{subset_label} test size: {test_size}")

# Keep only the first `train_size` / `test_size` rows of each split.
# NOTE(review): this is the head of each split, not a random sample. If rows
# are grouped by article, the subset will cover only a few titles - confirm
# that this is intended.
dataset_05percent = DatasetDict({
    "train": dataset["train"].select(range(train_size)),
    "test": dataset["validation"].select(range(test_size)),
})

# Print final sizes
print("\nFinal dataset sizes:")
print(f"Train: {len(dataset_05percent['train'])}")
print(f"Test: {len(dataset_05percent['test'])}")

# Save each split to separate files
print(f"\nSaving dataset to {dataset_dir}/...")

# NOTE(review): Dataset.to_json appears to write JSON Lines (one record per
# line) by default despite the .json extension - confirm before reading these
# files back with json.load.
for split_name, split_data in dataset_05percent.items():
    filepath = os.path.join(dataset_dir, f"{split_name}.json")
    split_data.to_json(filepath)
    print(f"Saved {split_name} split to {filepath}")

# Also save using Hugging Face datasets format (recommended)
dataset_05percent.save_to_disk(dataset_dir)
print(f"Saved complete dataset to {dataset_dir}")

# Save metadata. `sampling_method` records that the subset is the head of
# each split rather than a random draw.
metadata = {
    "original_train_size": original_train_size,
    "original_validation_size": original_validation_size,
    "extracted_train_size": len(dataset_05percent["train"]),
    "extracted_test_size": len(dataset_05percent["test"]),
    "extraction_percentage": subset_percent,
    "sampling_method": "first_n",
}

# Explicit encoding so the file does not depend on the platform default.
with open(os.path.join(dataset_dir, "metadata.json"), "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

# Show one example from each split; skipped for an empty split so that a tiny
# fraction does not end the cell with an IndexError after everything is saved.
if train_size > 0:
    print(f"\nTrain example: {dataset_05percent['train'][0]['question']}")
if test_size > 0:
    print(f"Test example: {dataset_05percent['test'][0]['question']}")

print("\nDataset extraction and saving completed!")

Loading SQuAD v2 dataset...
Number of examples in the dataset: 130319
First example in the dataset: {'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

Extracting 0.5% of each dataset split...

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved train split to squad_v2_05percent\train.json


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved test split to squad_v2_05percent\test.json


Saving the dataset (0/1 shards):   0%|          | 0/651 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/59 [00:00<?, ? examples/s]

Saved complete dataset to squad_v2_05percent

Train example: When did Beyonce start becoming popular?
Test example: In what country is Normandy located?

Dataset extraction and saving completed!


In [4]:
import os
from datasets import load_dataset, DatasetDict
import json
import random

# Draw a seeded random 0.1% sample of the SQuAD v2 train/validation splits,
# report how many sampled questions are answerable, and save everything.
#
# Written under `dataset_dir`:
#   - train.json / test.json : one file per split (Dataset.to_json)
#   - the DatasetDict itself (DatasetDict.save_to_disk)
#   - metadata.json          : sizes, sampling settings and answerable counts
#
# Names left in the notebook namespace: dataset_dir, dataset, train_size,
# test_size, train_indices, test_indices, dataset_01percent, train_answerable,
# test_answerable, metadata.

# Single source of truth for the sampling settings. The split sizes, printed
# labels and metadata are all derived from these two values.
SEED = 42
SUBSET_FRACTION = 0.001
subset_percent = round(SUBSET_FRACTION * 100, 6)  # 0.1 (rounded to hide float noise)
subset_label = f"{subset_percent:g}%"             # "0.1%"


def count_answerable(split):
    """Return how many examples in `split` have a non-empty `answers['text']` list."""
    return sum(1 for example in split if example["answers"]["text"])


def percentage(part, whole):
    """Return `part / whole * 100`, or 0.0 when `whole` is 0 (empty subset)."""
    return part / whole * 100 if whole else 0.0


# Dedicated generator instead of random.seed(): the global RNG is left
# untouched and the draw cannot be shifted by other code using the `random`
# module. Random(SEED) produces the same sequence as seeding the global
# generator with SEED.
rng = random.Random(SEED)

# Create directory for storing the dataset
dataset_dir = "squad_v2_01percent"
os.makedirs(dataset_dir, exist_ok=True)

# Load the SQuAD v2 dataset using the Hugging Face datasets library
print("Loading SQuAD v2 dataset...")
dataset = load_dataset("squad_v2")
original_train_size = len(dataset["train"])
original_validation_size = len(dataset["validation"])
print("Number of examples in the dataset:", original_train_size)
print("First example in the dataset:", dataset["train"][0])

# Extract the configured fraction of each split
print(f"\nExtracting {subset_label} of each dataset split...")

# int() truncates, so a subset never exceeds the requested fraction.
train_size = int(original_train_size * SUBSET_FRACTION)
test_size = int(original_validation_size * SUBSET_FRACTION)  # validation is used as test

print(f"Original train size: {original_train_size}")
print(f"{subset_label} train size: {train_size}")
print(f"Original validation size: {original_validation_size}")
print(f"{subset_label} test size: {test_size}")

# Sample row indices without replacement; train is drawn before test, so the
# order of these two calls is part of what makes the result reproducible.
print("Creating random subsets...")
train_indices = rng.sample(range(original_train_size), train_size)
test_indices = rng.sample(range(original_validation_size), test_size)

dataset_01percent = DatasetDict({
    "train": dataset["train"].select(train_indices),
    "test": dataset["validation"].select(test_indices),
})

# Print final sizes and statistics
extracted_train_size = len(dataset_01percent["train"])
extracted_test_size = len(dataset_01percent["test"])
print("\nFinal dataset sizes:")
print(f"Train: {extracted_train_size}")
print(f"Test: {extracted_test_size}")

# Answerable vs unanswerable questions: an example is counted as answerable
# when its answers['text'] list is non-empty.
train_answerable = count_answerable(dataset_01percent["train"])
test_answerable = count_answerable(dataset_01percent["test"])
train_answerable_pct = percentage(train_answerable, extracted_train_size)
test_answerable_pct = percentage(test_answerable, extracted_test_size)

print("\nAnswerable questions:")
print(f"Train: {train_answerable}/{extracted_train_size} ({train_answerable_pct:.1f}%)")
print(f"Test: {test_answerable}/{extracted_test_size} ({test_answerable_pct:.1f}%)")

# Save each split to separate files
print(f"\nSaving dataset to {dataset_dir}/...")

# NOTE(review): Dataset.to_json appears to write JSON Lines (one record per
# line) by default despite the .json extension - confirm before reading these
# files back with json.load.
for split_name, split_data in dataset_01percent.items():
    filepath = os.path.join(dataset_dir, f"{split_name}.json")
    split_data.to_json(filepath)
    print(f"Saved {split_name} split to {filepath}")

# Also save using Hugging Face datasets format (recommended)
dataset_01percent.save_to_disk(dataset_dir)
print(f"Saved complete dataset to {dataset_dir}")

# Save enhanced metadata (values come from the configuration above, so the
# recorded seed/percentage always match what was actually used).
metadata = {
    "original_train_size": original_train_size,
    "original_validation_size": original_validation_size,
    "extracted_train_size": extracted_train_size,
    "extracted_test_size": extracted_test_size,
    "extraction_percentage": subset_percent,
    "sampling_method": "random",
    "seed": SEED,
    "train_answerable": train_answerable,
    "test_answerable": test_answerable,
    "train_answerable_percentage": train_answerable_pct,
    "test_answerable_percentage": test_answerable_pct,
    "dataset_format": "squad_v2",
    "splits": ["train", "test"],
}

# Explicit encoding so the file does not depend on the platform default.
with open(os.path.join(dataset_dir, "metadata.json"), "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

# Show one example from each split. Each row is fetched once, and an empty
# split is skipped instead of raising IndexError after everything is saved.
print("\nExample data:")
if extracted_train_size > 0:
    train_example = dataset_01percent["train"][0]
    print(f"Train question: {train_example['question']}")
    print(f"Train context: {train_example['context'][:100]}...")
    print(f"Train answer: {train_example['answers']}")
if extracted_test_size > 0:
    test_example = dataset_01percent["test"][0]
    print(f"\nTest question: {test_example['question']}")
    print(f"Test context: {test_example['context'][:100]}...")
    print(f"Test answer: {test_example['answers']}")

print("\n✅ Dataset extraction and saving completed!")
print(f"📁 Dataset saved to: {os.path.abspath(dataset_dir)}")
print(f"📊 Total examples: {extracted_train_size + extracted_test_size}")

Loading SQuAD v2 dataset...
Number of examples in the dataset: 130319
First example in the dataset: {'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

Extracting 0.1% of each dataset split...

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved train split to squad_v2_01percent\train.json


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved test split to squad_v2_01percent\test.json


Saving the dataset (0/1 shards):   0%|          | 0/130 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11 [00:00<?, ? examples/s]

Saved complete dataset to squad_v2_01percent

Example data:
Train question: What century did Nasser rule in?
Train context: Nasser remains an iconic figure in the Arab world, particularly for his strides towards social justi...
Train answer: {'text': ['20th'], 'answer_start': [667]}

Test question: How many State of California University campuses are there?
Test context: The Tech Coast is a moniker that has gained use as a descriptor for the region's diversified technol...
Test answer: {'text': [], 'answer_start': []}

✅ Dataset extraction and saving completed!
📁 Dataset saved to: c:\Users\manua\Desktop\BigData\Project\SelfPlagAI\squad_v2_01percent
📊 Total examples: 141
