In [1]:
import os
import json
import random

# Directory that merge_and_shuffle() scans (non-recursively, via os.listdir)
# for input files whose names end in ".jsonl".
INPUT_FOLDER = "final_dataset/"
# Path of the merged output file. An existing file at this path is deleted
# at the start of every merge_and_shuffle() run.
OUTPUT_FILE = "pretrain_en.jsonl"

# Number of parsed entries held in memory before they are shuffled and written
# out. Shuffling happens only within one buffer, so a larger value mixes
# entries from more files together at the cost of more memory.
BUFFER_SIZE = 200_000   # controls shuffle quality vs memory use


def _flush_buffer(buffer, out, rng):
    """Shuffle ``buffer`` in place, write it to ``out`` as JSONL, then empty it.

    Returns the number of entries that were written.
    """
    rng.shuffle(buffer)
    out.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in buffer)
    flushed = len(buffer)
    buffer.clear()
    return flushed


def merge_and_shuffle(input_folder=None, output_file=None, buffer_size=None, seed=None):
    """Merge every ``*.jsonl`` file in a folder into one buffer-shuffled JSONL file.

    Files are read in sorted name order. Parsed entries accumulate in an
    in-memory buffer; whenever the buffer reaches ``buffer_size`` entries it is
    shuffled and written out, and whatever remains at the end is shuffled and
    written too. The shuffle is therefore local to each buffer: entries from
    different files only mix when they land in the same buffer, so a fully
    global shuffle requires ``buffer_size`` >= the total number of entries.

    Args:
        input_folder: Directory to scan (non-recursively). Defaults to
            ``INPUT_FOLDER``.
        output_file: Path of the merged file. Any existing file at this path
            is removed first. Defaults to ``OUTPUT_FILE``.
        buffer_size: Entries to hold in memory before a shuffle-and-flush.
            Defaults to ``BUFFER_SIZE``.
        seed: Optional seed for a private ``random.Random`` so the output
            order is reproducible. When ``None`` the module-level ``random``
            state is used, exactly as before.

    Returns:
        int: Total number of entries written to ``output_file``.

    Raises:
        OSError: If ``input_folder`` cannot be listed or a file cannot be
            opened (e.g. ``FileNotFoundError``).
    """
    # Resolve defaults at call time (not in the signature) so that editing the
    # module-level constants and re-running the call picks up the new values.
    if input_folder is None:
        input_folder = INPUT_FOLDER
    if output_file is None:
        output_file = OUTPUT_FILE
    if buffer_size is None:
        buffer_size = BUFFER_SIZE

    # The ``random`` module exposes shuffle() itself, so it can stand in for a
    # Random instance when no seed is requested.
    rng = random if seed is None else random.Random(seed)

    buffer = []
    total_written = 0
    skipped_lines = 0

    # Remove old output before listing, so a stale output file that lives
    # inside input_folder is never picked up as an input.
    if os.path.exists(output_file):
        os.remove(output_file)

    print("üîç Scanning folder:", input_folder)

    # Sorted for a deterministic read order; isfile() keeps out directories
    # that merely happen to be named "*.jsonl".
    jsonl_files = sorted(
        name
        for name in os.listdir(input_folder)
        if name.endswith(".jsonl") and os.path.isfile(os.path.join(input_folder, name))
    )

    print(f"üìÅ Found {len(jsonl_files)} JSONL files.\n")

    # Open the output once for the whole run. This also guarantees the file
    # exists (empty) when there is nothing to write.
    with open(output_file, "w", encoding="utf-8") as out:
        for file_count, filename in enumerate(jsonl_files, start=1):
            path = os.path.join(input_folder, filename)

            print(f"üìÑ Processing file {file_count}/{len(jsonl_files)} ‚Üí {filename}")

            file_entries = 0   # count entries for THIS file

            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue  # blank lines are not data and not errors
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError:
                        skipped_lines += 1  # still best-effort, but counted
                        continue

                    buffer.append(entry)
                    file_entries += 1

                    # flush buffer if too large
                    if len(buffer) >= buffer_size:
                        flushed = _flush_buffer(buffer, out, rng)
                        total_written += flushed
                        print(f"   üìù Flushed {flushed} entries (running total: {total_written})")

            print(f"   üìå Entries in {filename}: {file_entries}\n")

        # flush remaining buffer
        if buffer:
            total_written += _flush_buffer(buffer, out, rng)

    print("üéâ MERGE COMPLETE!")
    print(f"üì¶ Total entries written: {total_written}")
    print(f"üìÑ Output saved to: {output_file}")
    if skipped_lines:
        # Only reported when something was dropped, so clean runs log as before.
        print(f"Skipped {skipped_lines} line(s) that were not valid JSON")

    return total_written


# Run the merge only when this code is executed directly (as the main module),
# not when it is imported from elsewhere.
if __name__ == "__main__":
    merge_and_shuffle()

üîç Scanning folder: final_dataset/
üìÅ Found 6 JSONL files.

üìÑ Processing file 1/6 ‚Üí pretrain_alpaca_gpt4_en.jsonl
   üìå Entries in pretrain_alpaca_gpt4_en.jsonl: 23093

üìÑ Processing file 2/6 ‚Üí pretrain_c4.jsonl
   üìù Flushed 200000 entries (running total: 200000)
   üìù Flushed 200000 entries (running total: 400000)
   üìå Entries in pretrain_c4.jsonl: 500000

üìÑ Processing file 3/6 ‚Üí pretrain_code_alpaca20k.jsonl
   üìå Entries in pretrain_code_alpaca20k.jsonl: 16746

üìÑ Processing file 4/6 ‚Üí pretrain_smollm.jsonl
   üìù Flushed 200000 entries (running total: 600000)
   üìù Flushed 200000 entries (running total: 800000)
   üìù Flushed 200000 entries (running total: 1000000)
   üìù Flushed 200000 entries (running total: 1200000)
   üìù Flushed 200000 entries (running total: 1400000)
   üìå Entries in pretrain_smollm.jsonl: 999978

üìÑ Processing file 5/6 ‚Üí pretrain_stack_smol_code.jsonl
   üìå Entries in pretrain_stack_smol_code.jsonl: 1428

üìÑ 