In [1]:
import os
import json
from datasets import load_dataset

# Export the "train" split of each synthetic dataset on the Hugging Face Hub
# to a local JSON Lines file (one JSON object per row).

# Domain names and dataset types that are combined into repository ids.
domains = ["math", "code", "instruction_following"]
types = ["ig", "qe", "mri", "ei"]
# The "ig" type uses these names in place of the regular domain names.
alternate_domains_ig = ["gsm8k", "mbpp", "lima"]
output_dir = "data/synthetic"
os.makedirs(output_dir, exist_ok=True)

for dataset_type in types:
    domain_list = alternate_domains_ig if dataset_type == "ig" else domains

    for domain in domain_list:
        dataset_name = f"SidhaarthMurali/synthetic_{domain}_{dataset_type}_mini"
        output_path = os.path.join(output_dir, f"{domain}_{dataset_type}.jsonl")
        # Rows are written here first so that a failure part-way through never
        # leaves an empty/truncated file at output_path, nor clobbers a good
        # export produced by an earlier run.
        tmp_path = output_path + ".tmp"

        try:
            # Load the dataset
            print(f"Loading dataset: {dataset_name}")
            # Only the "train" split is exported. Look it up before opening any
            # file so a missing split fails without touching the filesystem.
            train_rows = load_dataset(dataset_name)["train"]

            # Convert to JSONL and save. json.dumps escapes non-ASCII by
            # default, so pinning UTF-8 keeps the bytes platform-independent.
            with open(tmp_path, "w", encoding="utf-8") as f:
                for row in train_rows:
                    f.write(json.dumps(row) + "\n")

            # Publish the finished file in a single step.
            os.replace(tmp_path, output_path)
            print(f"Saved dataset to {output_path}")
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next
            # dataset instead of aborting the whole export.
            print(f"Failed to process {dataset_name}: {e}")
            if os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError as cleanup_error:
                    print(f"Could not remove {tmp_path}: {cleanup_error}")


Loading dataset: SidhaarthMurali/synthetic_gsm8k_ig


Downloading data: 100%|██████████| 9.50M/9.50M [00:02<00:00, 4.33MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Saved dataset to data/synthetic/gsm8k_ig.jsonl
Loading dataset: SidhaarthMurali/synthetic_mbpp_ig


Downloading data: 100%|██████████| 647k/647k [00:01<00:00, 583kB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Saved dataset to data/synthetic/mbpp_ig.jsonl
Loading dataset: SidhaarthMurali/synthetic_lima_ig


Downloading readme:   0%|          | 0.00/347 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 581k/581k [00:01<00:00, 446kB/s]


Generating train split:   0%|          | 0/1011 [00:00<?, ? examples/s]

Saved dataset to data/synthetic/lima_ig.jsonl
Loading dataset: SidhaarthMurali/synthetic_math_qe


Downloading readme:   0%|          | 0.00/370 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 984k/984k [00:00<00:00, 1.19MB/s]


Generating train split:   0%|          | 0/1081 [00:00<?, ? examples/s]

Saved dataset to data/synthetic/math_qe.jsonl
Loading dataset: SidhaarthMurali/synthetic_code_qe


Downloading readme:   0%|          | 0.00/409 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 689k/689k [00:00<00:00, 768kB/s]


Generating train split:   0%|          | 0/2089 [00:00<?, ? examples/s]

Saved dataset to data/synthetic/code_qe.jsonl
Loading dataset: SidhaarthMurali/synthetic_instruction_following_qe


Downloading readme:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 1.06M/1.06M [00:02<00:00, 454kB/s]


Generating train split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Saved dataset to data/synthetic/instruction_following_qe.jsonl
Loading dataset: SidhaarthMurali/synthetic_math_mri


Downloading readme:   0%|          | 0.00/519 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 3.81M/3.81M [00:01<00:00, 2.40MB/s]


Generating train split:   0%|          | 0/3117 [00:00<?, ? examples/s]

Saved dataset to data/synthetic/math_mri.jsonl
Loading dataset: SidhaarthMurali/synthetic_code_mri


Downloading readme:   0%|          | 0.00/518 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 7.95M/7.95M [00:01<00:00, 6.02MB/s]


Generating train split:   0%|          | 0/4614 [00:00<?, ? examples/s]

Saved dataset to data/synthetic/code_mri.jsonl
Loading dataset: SidhaarthMurali/synthetic_instruction_following_mri


Downloading readme:   0%|          | 0.00/519 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 13.0M/13.0M [00:01<00:00, 8.77MB/s]


Generating train split:   0%|          | 0/9324 [00:00<?, ? examples/s]

Saved dataset to data/synthetic/instruction_following_mri.jsonl
Loading dataset: SidhaarthMurali/synthetic_math_ei


Downloading readme:   0%|          | 0.00/519 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 13.6M/13.6M [00:01<00:00, 8.60MB/s]


Generating train split:   0%|          | 0/9459 [00:00<?, ? examples/s]

Saved dataset to data/synthetic/math_ei.jsonl
Loading dataset: SidhaarthMurali/synthetic_code_ei


Downloading readme:   0%|          | 0.00/518 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 8.98M/8.98M [00:01<00:00, 6.34MB/s]


Generating train split:   0%|          | 0/4520 [00:00<?, ? examples/s]

Saved dataset to data/synthetic/code_ei.jsonl
Loading dataset: SidhaarthMurali/synthetic_instruction_following_ei


Downloading readme:   0%|          | 0.00/519 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 18.4M/18.4M [00:01<00:00, 11.7MB/s]


Generating train split:   0%|          | 0/9203 [00:00<?, ? examples/s]

Saved dataset to data/synthetic/instruction_following_ei.jsonl
