In [17]:
from datasets import load_dataset, concatenate_datasets
import random

# Locations of the two source JSONL files that this notebook loads.
# NOTE(review): these are absolute, machine-specific paths — the notebook will
# presumably only run where this directory exists; confirm before sharing.
sciq_path = "/scratch/izar/aloureir/project-m3-2024-mynicelongpenguin/model/datasets/Filtered_SciQ_Dataset_with_explanation.jsonl"
deepmind_path = "/scratch/izar/aloureir/project-m3-2024-mynicelongpenguin/model/datasets/Transformed_DeepMind_Algebra_QA_with_explanation.jsonl"

# Shared seed handed to the dataset `shuffle(seed=...)` calls in this notebook.
seed = 42

In [18]:
# Read each JSONL file as a single "train" split and shuffle it right away
# with the shared seed.
sciq = load_dataset("json", data_files=sciq_path, split="train").shuffle(seed=seed)
deepmind = load_dataset("json", data_files=deepmind_path, split="train").shuffle(seed=seed)

In [19]:
# Print the summary (features and row count) of each loaded dataset.
for loaded in (sciq, deepmind):
    print(loaded)

Dataset({
    features: ['question', 'answer', 'explanation', 'related'],
    num_rows: 9997
})
Dataset({
    features: ['question', 'answer', 'explanation'],
    num_rows: 97467
})


In [20]:
# Preview one randomly chosen example from each dataset.
# random.randrange(n) draws from 0..n-1. The previous random.randint(0, n) is
# inclusive on both ends, so it could return n — one past the last valid row —
# and make the indexing below fail.
idx = random.randrange(len(sciq))
print(sciq[idx])

print()
idx = random.randrange(len(deepmind))

print(deepmind[idx])

{'question': 'Question: What is the process in which one cell divides to form two new cells called?\n\nOptions:\nA. Cell Formation\nB. cell contribution\nC. cell direction\nD. cell division\n\nAnswer:', 'answer': 'D', 'explanation': 'Cell division is part of the life cycle of virtually all cells. Cell division is the process in which one cell divides to form two new cells.', 'related': True}

{'question': 'Question: A, B and C rents a pasture for Rs.950. A put in 12 horses for 8 months, B 16 horses for 9 months and 18 horses for 6 months. How much should C pay?\n\nOptions:\nA)295\nB)199\nC)676\nD)156\nE)122\n\nAnswer:', 'answer': 'A', 'explanation': '12*8 :16*9 = 18*6\n8: 12: 9\n9/29 * 950= 295\nAnswer: A'}


### Create test dataset
Uses 20% of each dataset as its test split; the two test splits are then concatenated into a single test set.

In [21]:
# Hold out 20% of each dataset for testing. The shared seed is passed so the
# split — and therefore the written test file — is the same on every run.
# Note: `sciq` and `deepmind` are rebound here to the split result, which is
# indexed by "train" / "test".
sciq = sciq.train_test_split(test_size=0.2, seed=seed)
sciq_train, sciq_test = sciq["train"], sciq["test"]

print(f"{len(sciq_train) = } {len(sciq_test) = }")

deepmind = deepmind.train_test_split(test_size=0.2, seed=seed)
deepmind_train, deepmind_test = deepmind["train"], deepmind["test"]

print(f"{len(deepmind_train) = } {len(deepmind_test) = }")

# The combined test set is the SciQ test rows followed by the DeepMind test rows.
test_dataset = concatenate_datasets([sciq_test, deepmind_test])

test_dataset.to_json("test_dataset.jsonl", lines=True)

print(f"{len(test_dataset) = }")

len(sciq_train) = 7997 len(sciq_test) = 2000
len(deepmind_train) = 77973 len(deepmind_test) = 19494


Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Creating json from Arrow format: 100%|██████████| 22/22 [00:00<00:00, 79.25ba/s]

len(test_dataset) = 21494





### Create train datasets

In [22]:
def create_combined_datasets(datasetA, datasetB, sizeA, sizeB):
    """Build one dataset from a sample of ``datasetA`` and a sample of ``datasetB``.

    Each input is shuffled with the notebook-level ``seed`` and its first
    ``sizeA`` / ``sizeB`` rows are kept; the two samples are then concatenated.

    Args:
        datasetA: First source dataset.
        datasetB: Second source dataset.
        sizeA: Number of rows to take from ``datasetA`` after shuffling.
        sizeB: Number of rows to take from ``datasetB`` after shuffling.

    Returns:
        The concatenation of the ``datasetA`` sample followed by the
        ``datasetB`` sample.
    """
    samples = [
        source.shuffle(seed=seed).select(range(count))
        for source, count in ((datasetA, sizeA), (datasetB, sizeB))
    ]
    return concatenate_datasets(samples)

In [23]:
TOTAL_SIZE = 50000
len_sciq = len(sciq_train)

# One (sciq rows, deepmind rows) pair per mix: 100%, 75%, 50% and 25% of the
# SciQ training split, with the second entry filling the rest up to TOTAL_SIZE.
sizes = [
    (n_sciq, TOTAL_SIZE - n_sciq)
    for n_sciq in (int(fraction * len_sciq) for fraction in (1, 0.75, 0.5, 0.25))
]

In [24]:
# Write one training file per mix; the file name carries the number of SciQ
# rows that the mix contains.
for size_sciq, size_deepmind in sizes:
    out_path = "train_dataset_{}.jsonl".format(size_sciq)
    dataset = create_combined_datasets(
        sciq_train,
        deepmind_train,
        size_sciq,
        size_deepmind,
    )
    dataset.to_json(out_path, lines=True)

Creating json from Arrow format: 100%|██████████| 50/50 [00:00<00:00, 94.21ba/s]
Creating json from Arrow format: 100%|██████████| 50/50 [00:00<00:00, 96.29ba/s] 
Creating json from Arrow format: 100%|██████████| 50/50 [00:00<00:00, 92.51ba/s]
Creating json from Arrow format: 100%|██████████| 50/50 [00:00<00:00, 99.26ba/s] 
