In [3]:
import json
import random
from datasets import load_dataset

import warnings
# Install a global "ignore" filter so no warnings are shown while this runs.
warnings.filterwarnings("ignore")

# Dataset identifier and split handed to load_dataset() in main().
DATASET_ID = "sahil2801/CodeAlpaca-20k"
SPLIT = "train"
# JSONL file the formatted samples are appended to by write_chunk().
OUTPUT_PATH = "pretrain_code_alpaca20k.jsonl"

# main() stops once this many samples have been accepted.
MAX_SAMPLES = 20000
# Prompts whose character length is >= this value are skipped by main().
MAX_CHARS = 512
# Number of accepted samples buffered in memory before they are flushed to disk.
CHUNK_SIZE = 2000

# Markers that build_prompt() places at the start and end of each chat turn.
BOS = "<|im_start|>"
EOS = "<|im_end|>"

# ================================
# BUILD PROMPT
# ================================
def build_prompt(sample):
    """Format one dataset record as a two-turn chat transcript.

    Args:
        sample: Mapping that may contain the keys "instruction", "input"
            and "output". A key that is missing or whose value is None is
            treated as an empty string.

    Returns:
        A string of the exact form (BOS/EOS are the module-level markers)::

            {BOS}user\\n{instruction[\\ninput]}{EOS}\\n{BOS}assistant\\n{output}{EOS}

        Each field is stripped of surrounding whitespace first, and the
        input is joined to the instruction with a newline only when it is
        non-empty.
    """
    # `sample.get(key, "")` still returns None when the key is present with
    # a null value, which would make `.strip()` raise; `or ""` covers both
    # the missing and the None case.
    instr = (sample.get("instruction") or "").strip()
    inp = (sample.get("input") or "").strip()
    out = (sample.get("output") or "").strip()

    # Merge instruction + optional input into a single user message.
    user_msg = f"{instr}\n{inp}" if inp else instr

    # Note: EOS follows the message text directly (no newline before it),
    # and the two turns are separated by exactly one newline.
    return (
        f"{BOS}user\n{user_msg}{EOS}\n"
        f"{BOS}assistant\n{out}{EOS}"
    )

# ================================
# CHUNK WRITER
# ================================
def write_chunk(buffer, file_path):
    """Append the rows of ``buffer`` to ``file_path`` as JSON Lines.

    The rows are written in a random order (drawn from the global
    ``random`` state), one JSON object per line, UTF-8 encoded with
    non-ASCII characters kept as-is.

    Args:
        buffer: Sequence of JSON-serializable rows. It is not modified.
        file_path: Destination file; opened in append mode, so existing
            content is preserved and the file is created if missing.
    """
    # Shuffle a copy so the caller's list keeps its original order; the
    # permutation is the same one an in-place shuffle would have produced.
    rows = list(buffer)
    random.shuffle(rows)
    with open(file_path, "a", encoding="utf-8") as f:
        f.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in rows
        )

# ================================
# MAIN
# ================================
def main():
    """Convert the dataset into a JSONL file of chat-formatted samples.

    Loads ``DATASET_ID``/``SPLIT``, formats every record with
    ``build_prompt``, drops prompts of ``MAX_CHARS`` characters or more,
    and writes the rest to ``OUTPUT_PATH`` as ``{"text": ...}`` rows in
    chunks of ``CHUNK_SIZE`` (each chunk is shuffled by ``write_chunk``).
    Stops after ``MAX_SAMPLES`` accepted samples and prints a summary,
    including a rough token estimate of one token per four characters.
    """
    print(f"üöÄ Loading dataset: {DATASET_ID}")
    dataset = load_dataset(DATASET_ID, split=SPLIT, streaming=False)

    # write_chunk() only ever appends, so start from an empty file;
    # otherwise every re-run would stack duplicate rows onto the previous
    # output while the summary below reports just this run. Done after the
    # load so a failed download leaves an existing output file untouched.
    with open(OUTPUT_PATH, "w", encoding="utf-8"):
        pass

    buffer = []
    written = 0
    chunk_id = 1
    total_chars = 0

    for sample in dataset:
        if written >= MAX_SAMPLES:
            break

        prompt = build_prompt(sample)

        # Length filtering: skip prompts at or above the character limit.
        if len(prompt) >= MAX_CHARS:
            continue

        total_chars += len(prompt)
        buffer.append({"text": prompt})
        written += 1

        # Flush a full chunk to disk and start a fresh buffer.
        if len(buffer) >= CHUNK_SIZE:
            print(f"üìù Writing chunk {chunk_id}‚Ä¶ total {written} samples")
            write_chunk(buffer, OUTPUT_PATH)
            buffer = []
            chunk_id += 1

    # Final write: whatever is left over after the last full chunk.
    if buffer:
        write_chunk(buffer, OUTPUT_PATH)

    # ================================
    # SUMMARY
    # ================================
    print("\nüéâ DONE")
    print(f"üì¶ Total samples written: {written}")
    print(f"üî£ Total characters: {total_chars:,}")
    # Heuristic estimate: assume roughly four characters per token.
    est_tokens = total_chars / 4
    print(f"üî¢ Estimated tokens: {est_tokens:,.0f}")
    print(f"üî¢ Tokens (millions): {est_tokens / 1e6:.3f}M")
    print(f"üìÑ Saved to: {OUTPUT_PATH}")


# Run the export only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

üöÄ Loading dataset: sahil2801/CodeAlpaca-20k


code_alpaca_20k.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/20022 [00:00<?, ? examples/s]

üìù Writing chunk 1‚Ä¶ total 2000 samples
üìù Writing chunk 2‚Ä¶ total 4000 samples
üìù Writing chunk 3‚Ä¶ total 6000 samples
üìù Writing chunk 4‚Ä¶ total 8000 samples
üìù Writing chunk 5‚Ä¶ total 10000 samples
üìù Writing chunk 6‚Ä¶ total 12000 samples
üìù Writing chunk 7‚Ä¶ total 14000 samples
üìù Writing chunk 8‚Ä¶ total 16000 samples

üéâ DONE
üì¶ Total samples written: 16746
üî£ Total characters: 4,720,438
üî¢ Estimated tokens: 1,180,110
üî¢ Tokens (millions): 1.180M
üìÑ Saved to: pretrain_code_alpaca20k.jsonl
