In [1]:
import json
import os
import random
from random import shuffle

from datasets import load_dataset
from dotenv import load_dotenv
from tqdm import tqdm

# Load variables from a local .env file (if one exists) so the token lookup
# below can find them in the process environment.
load_dotenv()

# Token forwarded to every load_dataset() call in this notebook.
# A missing HF_TOKEN variable raises KeyError right here.
HF_TOKEN = os.environ["HF_TOKEN"]

# Directory the JSONL exports are written into; a no-op if it already exists.
os.makedirs("raw_data", exist_ok=True)

# Accumulator that the dataset cells below append their records to.
all_rows = []

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# NOTE(review): trust_remote_code=True permits running loading code shipped in
# the dataset repository -- confirm it is actually required for this dataset.
ds = load_dataset("Open-Orca/1million-gpt-4", trust_remote_code=True, token=HF_TOKEN)
train_split = ds["train"]

# Used when a row has no (or only a blank) system prompt.
default_sys_prompt = "You are a helpful AI assistant. Answer the following question, as per the given instructions."

# `ds` maps split names to splits, so len(ds) is the number of splits rather
# than the number of rows; the progress bar total must come from the split.
for row in tqdm(train_split, total=len(train_split)):
    # `or` also covers a whitespace-only prompt, which would otherwise be
    # stripped down to an empty system message.
    sys_prompt = (row["system_prompt"] or "").strip() or default_sys_prompt
    user_prompt = row["question"].strip()
    assistant = row["response"].strip()

    # Appends to the shared accumulator: re-running this cell without
    # resetting `all_rows` adds the same rows a second time.
    all_rows.append({"messages": [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": assistant}
    ]})

Downloading data: 100%|██████████| 1.01G/1.01G [00:14<00:00, 69.1MB/s]
Generating train split: 100%|██████████| 994896/994896 [00:05<00:00, 174758.20 examples/s]
994896it [00:30, 32706.50it/s]       


In [3]:
# NOTE(review): trust_remote_code=True permits running loading code shipped in
# the dataset repository -- confirm it is actually required for this dataset.
ds = load_dataset("PawanKrd/math-gpt-4o-200k", trust_remote_code=True, token=HF_TOKEN)
train_split = ds["train"]

# Every row of this dataset gets the same system prompt, so build it once
# instead of on each iteration.
sys_prompt = "You are a helpful AI math assistant. Answer the following question, and provide a step-by-step logical solution."

# `ds` maps split names to splits, so len(ds) is the number of splits rather
# than the number of rows; the progress bar total must come from the split.
for row in tqdm(train_split, total=len(train_split)):
    user_prompt = row["prompt"].strip()
    assistant = row["response"].strip()

    # Appends to the shared accumulator: re-running this cell without
    # resetting `all_rows` adds the same rows a second time.
    all_rows.append({"messages": [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": assistant}
    ]})

Downloading readme: 100%|██████████| 321/321 [00:00<00:00, 2.55MB/s]
Downloading data: 100%|██████████| 115M/115M [00:01<00:00, 70.5MB/s] 
Generating train split: 100%|██████████| 200035/200035 [00:00<00:00, 245737.30 examples/s]
200035it [00:03, 52929.25it/s]       


In [4]:
# NOTE(review): trust_remote_code=True permits running loading code shipped in
# the dataset repository -- confirm it is actually required for this dataset.
ds = load_dataset("PawanKrd/gpt-4o-200k", trust_remote_code=True, token=HF_TOKEN)
train_split = ds["train"]

# Used when a row has no (or only a blank) system prompt.
default_sys_prompt = "You are a helpful AI assistant. Answer the following question, as per the given instructions."

# `ds` maps split names to splits, so len(ds) is the number of splits rather
# than the number of rows; the progress bar total must come from the split.
for row in tqdm(train_split, total=len(train_split)):
    # `or` also covers a whitespace-only prompt, which would otherwise be
    # stripped down to an empty system message.
    sys_prompt = (row["system"] or "").strip() or default_sys_prompt
    user_prompt = row["prompt"].strip()
    assistant = row["response"].strip()

    # Appends to the shared accumulator: re-running this cell without
    # resetting `all_rows` adds the same rows a second time.
    all_rows.append({"messages": [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": assistant}
    ]})

Downloading readme: 100%|██████████| 356/356 [00:00<00:00, 3.54MB/s]
Downloading data: 100%|██████████| 239M/239M [00:15<00:00, 15.6MB/s] 
Generating train split: 100%|██████████| 200023/200023 [00:01<00:00, 192992.09 examples/s]
200023it [00:05, 35909.66it/s]       


In [5]:
# NOTE(review): trust_remote_code=True permits running loading code shipped in
# the dataset repository -- confirm it is actually required for this dataset.
ds = load_dataset("llm-wizard/alpaca-gpt4-data", trust_remote_code=True, token=HF_TOKEN)
train_split = ds["train"]

# Every row of this dataset gets the same system prompt, so build it once
# instead of on each iteration.
sys_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# `ds` maps split names to splits, so len(ds) is the number of splits rather
# than the number of rows; the progress bar total must come from the split.
for row in tqdm(train_split, total=len(train_split)):
    instruction = row["instruction"].strip()
    # Strip before testing: a whitespace-only input would otherwise leave the
    # user prompt ending in a dangling blank line.
    extra_input = (row["input"] or "").strip()
    user_prompt = f"{instruction}\n\n{extra_input}" if extra_input else instruction
    assistant = row["output"].strip()

    # Appends to the shared accumulator: re-running this cell without
    # resetting `all_rows` adds the same rows a second time.
    all_rows.append({"messages": [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": assistant}
    ]})

Downloading readme: 100%|██████████| 1.39k/1.39k [00:00<00:00, 9.25MB/s]
Downloading data: 100%|██████████| 43.4M/43.4M [00:01<00:00, 33.9MB/s]
Generating train split: 100%|██████████| 52002/52002 [00:00<00:00, 174420.13 examples/s]
52002it [00:01, 42963.70it/s]        


In [6]:
# Shuffle with a fixed seed so that Restart & Run All reproduces the same row
# order; an unseeded shuffle writes a different file on every run.
SFT_SHUFFLE_SEED = 42
random.Random(SFT_SHUFFLE_SEED).shuffle(all_rows)

# JSONL: one JSON object per line. ensure_ascii=False writes non-ASCII text
# verbatim instead of as \u escapes, hence the explicit UTF-8 encoding.
with open("raw_data/sft_data.jsonl", "w", encoding="utf-8") as f:
    for row in all_rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print(f"Wrote {len(all_rows)} rows to sft_data.jsonl")

Wrote 1446956 rows to sft_data.jsonl


In [7]:
# Start a fresh accumulator: from here on `all_rows` holds preference pairs
# ("chosen"/"rejected") rather than the "messages" records built earlier.
all_rows = []

# NOTE(review): trust_remote_code=True permits running loading code shipped in
# the dataset repository -- confirm it is actually required for this dataset.
ds = load_dataset("PawanKrd/dpo-gpt-4o", trust_remote_code=True, token=HF_TOKEN)
train_split = ds["train"]

# Used when a row has no (or only a blank) system prompt.
default_sys_prompt = "You are a helpful AI assistant. Answer the following question, as per the given instructions."

# `ds` maps split names to splits, so len(ds) is the number of splits rather
# than the number of rows; the progress bar total must come from the split.
for row in tqdm(train_split, total=len(train_split)):
    # `or` also covers a whitespace-only prompt, which would otherwise be
    # stripped down to an empty system message.
    sys_prompt = (row["system"] or "").strip() or default_sys_prompt
    user_prompt = row["prompt"].strip()
    chosen = row["chosen"].strip()
    rejected = row["rejected"].strip()

    # Both conversations share the same system and user turns and differ only
    # in the final assistant message.
    all_rows.append(
        {
            "chosen": [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": chosen},
            ],
            "rejected": [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": rejected},
            ],
        }
    )

Downloading readme: 100%|██████████| 387/387 [00:00<00:00, 1.70MB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:00<00:00, 20.7MB/s]
Generating train split: 100%|██████████| 12359/12359 [00:00<00:00, 146039.37 examples/s]
12359it [00:00, 31618.45it/s]        


In [8]:
# Shuffle with a fixed seed so that Restart & Run All reproduces the same row
# order; an unseeded shuffle writes a different file on every run.
DPO_SHUFFLE_SEED = 42
random.Random(DPO_SHUFFLE_SEED).shuffle(all_rows)

# JSONL: one JSON object per line. ensure_ascii=False writes non-ASCII text
# verbatim instead of as \u escapes, hence the explicit UTF-8 encoding.
with open("raw_data/dpo_data.jsonl", "w", encoding="utf-8") as f:
    for row in all_rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print(f"Wrote {len(all_rows)} rows to dpo_data.jsonl")

Wrote 12359 rows to dpo_data.jsonl
