In [None]:
# first install the required packages in your Google Colab environment
!pip install datasets tiktoken tqdm
# then run the script below

In [None]:
# Prerequisite: run the install cell above first, i.e.
#   !pip install datasets tiktoken tqdm
# This cell then tokenizes the dataset and writes the shards to Google Drive.

import os
import multiprocessing as mp
import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm
from google.colab import drive

# --- CONFIGURATION ---

# 1. Mount Google Drive so the shards can be written to it.
drive.mount('/content/drive')

# 2. Folder in your Google Drive that receives the tokenized files.
#    It is created below if it does not exist yet.
drive_path = "/content/drive/My Drive/FineWeb-Edu-Tokens"
os.makedirs(drive_path, exist_ok=True)

remote_name = "sample-10BT"  # dataset configuration name passed to load_dataset
shard_size = int(1e8)  # 100M tokens per shard

# --- SCRIPT LOGIC ---

# Use streaming=True to avoid downloading the entire dataset to the Colab disk.
# The data is downloaded and processed on the fly instead.
print("Loading dataset in streaming mode...")
fw = load_dataset("HuggingFaceFW/fineweb-edu", name=remote_name, split="train", streaming=True)

# Init the tokenizer
enc = tiktoken.get_encoding("gpt2")
# End-of-text token id, used to delimit documents. Read it through the public
# `eot_token` property rather than the private `_special_tokens` mapping.
eot = enc.eot_token
def tokenize(doc):
    """Tokenize a single document and return a numpy array of uint16 tokens.

    Args:
        doc: Mapping with a "text" entry holding the document string.

    Returns:
        A 1-D ``np.uint16`` array: the end-of-text delimiter followed by the
        tokens of ``doc["text"]``.

    Raises:
        ValueError: If any token id falls outside the uint16 range.
    """
    tokens = [eot]  # the end-of-text token delimits documents
    tokens.extend(enc.encode_ordinary(doc["text"]))
    tokens_np = np.array(tokens)
    # Explicit check instead of `assert`: asserts are stripped under `python -O`,
    # and the uint16 cast below would then silently wrap out-of-range ids.
    # `tokens_np` is never empty because it always starts with the delimiter.
    if tokens_np.min() < 0 or tokens_np.max() >= 2**16:
        raise ValueError("Token dictionary too large for uint16")
    return tokens_np.astype(np.uint16)

def write_datafile(filename, tokens_np):
    """Persist ``tokens_np`` to ``filename`` in NumPy's binary ``.npy`` format.

    Args:
        filename: Destination path handed to ``np.save``.
        tokens_np: Array of tokens to store.
    """
    np.save(file=filename, arr=tokens_np)

# Set the number of processes for parallel tokenization.
# os.cpu_count() can return None when the CPU count cannot be determined,
# so fall back to 1 instead of failing on `None // 2`.
nprocs = max(1, (os.cpu_count() or 1) // 2)
print(f"Using {nprocs} processes for tokenization...")


def _shard_filename(index):
    """Return the output path of shard `index` (shard 0 is "val", the rest "train")."""
    split = "val" if index == 0 else "train"
    return os.path.join(drive_path, f"edufineweb_{split}_{index:06d}.npy")


shard_index = 0
# Preallocated buffer holding the shard that is currently being filled.
all_tokens_np = np.empty((shard_size,), dtype=np.uint16)
token_count = 0  # number of valid tokens currently in the buffer
progress_bar = None

with mp.Pool(nprocs) as pool:
    # Use pool.imap for lazy iteration, which works well with streaming
    for tokens in pool.imap(tokenize, fw, chunksize=16):
        # The document fills the current shard. Loop rather than branch once so
        # that a document longer than a whole shard spills into as many shards
        # as it needs instead of overflowing the buffer.
        while token_count + len(tokens) >= shard_size:
            remainder = shard_size - token_count
            all_tokens_np[token_count : token_count + remainder] = tokens[:remainder]

            if progress_bar is not None:
                progress_bar.update(remainder)
                progress_bar.close()
                progress_bar = None  # Reset for the next shard

            filename = _shard_filename(shard_index)
            print(f"\nWriting shard {shard_index} to {filename}...")
            write_datafile(filename, all_tokens_np)
            shard_index += 1

            # Whatever did not fit is carried over into the next shard
            tokens = tokens[remainder:]
            token_count = 0

        if len(tokens) == 0:
            continue

        if progress_bar is None:
            # `initial` accounts for tokens already in the buffer so the bar
            # reflects the real fill level of the shard.
            progress_bar = tqdm(
                total=shard_size,
                initial=token_count,
                unit="tokens",
                desc=f"Shard {shard_index}",
            )
        all_tokens_np[token_count : token_count + len(tokens)] = tokens
        token_count += len(tokens)
        progress_bar.update(len(tokens))

# The last shard is usually only partially filled; close its progress bar so
# the final messages are not interleaved with it.
if progress_bar is not None:
    progress_bar.close()

# Write the final shard
if token_count != 0:
    filename = _shard_filename(shard_index)
    print(f"\nWriting final shard {shard_index} to {filename}...")
    write_datafile(filename, all_tokens_np[:token_count])

print(f"\n✅ Tokenization complete. Files are saved in your Google Drive folder: {drive_path}")