In [None]:
############################################
# Team : RAGrats
# Team Members : Ali Asgar Padaria, Param Patel, Meet Zalavadiya
# 
# Code Description : This code extracts a class-balanced subset of the original pqa_artificial data, with an equal number of 'yes' and 'no' labels.
#                    The subset is split into training and validation sets, which are stored in the files/ folder for access by the models.
#                    
#
# System : GCP Server L4 GPU
#############################################

In [1]:
from datasets import load_dataset, concatenate_datasets
from collections import Counter
import os

# Build class-balanced train/validation subsets of PubMedQA (pqa_artificial)
# and persist them to disk for the downstream models.

# Sampling configuration. Every split boundary below is derived from these
# values, so changing one number cannot leave the splits inconsistent.
seed = 42             # shared seed so every shuffle is reproducible
total_yes = 10000     # examples drawn from the 'yes' class
total_no = 10000      # examples drawn from the 'no' class
val_per_class = 1000  # examples per class held out for validation

# Load full train split
dataset = load_dataset("pubmed_qa", "pqa_artificial")["train"]

# Filter only 'yes' and 'no' labels
yes_data = dataset.filter(lambda x: x['final_decision'] == 'yes')
no_data = dataset.filter(lambda x: x['final_decision'] == 'no')

# Fail early with a readable message instead of an opaque IndexError from
# `select` when a class cannot supply the requested number of examples.
for label, subset, needed in (("yes", yes_data, total_yes), ("no", no_data, total_no)):
    if not 0 < val_per_class < needed:
        raise ValueError(
            f"val_per_class ({val_per_class}) must be between 1 and {needed - 1} "
            f"for class '{label}'"
        )
    if len(subset) < needed:
        raise ValueError(
            f"Class '{label}' has only {len(subset)} examples; {needed} requested"
        )

# Shuffle and sample
yes_sample = yes_data.shuffle(seed=seed).select(range(total_yes))
no_sample = no_data.shuffle(seed=seed).select(range(total_no))

# Split each class: the leading (total - val_per_class) examples go to train,
# the trailing val_per_class examples go to validation.
yes_train = yes_sample.select(range(total_yes - val_per_class))
yes_val = yes_sample.select(range(total_yes - val_per_class, total_yes))

no_train = no_sample.select(range(total_no - val_per_class))
no_val = no_sample.select(range(total_no - val_per_class, total_no))

# Concatenate to create full train and validation sets; reshuffle so the two
# classes are interleaved rather than stored as two contiguous runs.
train_dataset = concatenate_datasets([yes_train, no_train]).shuffle(seed=seed)
val_dataset = concatenate_datasets([yes_val, no_val]).shuffle(seed=seed)

# Sanity check: report the label distribution of each split
print("Train label counts:", Counter(train_dataset['final_decision']))
print("Validation label counts:", Counter(val_dataset['final_decision']))

# Save paths
base_path = "/home/apadaria/NLP_Project/source/files"
train_path = os.path.join(base_path, "train_dataset")
val_path = os.path.join(base_path, "val_dataset")

# Create directories
os.makedirs(train_path, exist_ok=True)
os.makedirs(val_path, exist_ok=True)

# Save datasets
train_dataset.save_to_disk(train_path)
val_dataset.save_to_disk(val_path)

print(f"Train dataset saved to: {train_path}")
print(f"Validation dataset saved to: {val_path}")


Train label counts: Counter({'no': 9000, 'yes': 9000})
Validation label counts: Counter({'no': 1000, 'yes': 1000})


Saving the dataset (0/1 shards):   0%|          | 0/18000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Train dataset saved to: /home/apadaria/NLP_Project/source/files/train_dataset
Validation dataset saved to: /home/apadaria/NLP_Project/source/files/val_dataset
