In [25]:
from datasets import load_dataset
# Download the full Rust subset to disk (non-streaming)
# This will download all Rust files — may take a while depending on size
rust_ds = load_dataset(
    "bigcode/the-stack-dedup",
    data_dir="data/rust",
    split="train",
)

print(f"Total Rust samples: {len(rust_ds)}")
print(f"Columns: {rust_ds.column_names}")
print(rust_ds)

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

Total Rust samples: 1386585
Columns: ['hexsha', 'size', 'ext', 'lang', 'max_stars_repo_path', 'max_stars_repo_name', 'max_stars_repo_head_hexsha', 'max_stars_repo_licenses', 'max_stars_count', 'max_stars_repo_stars_event_min_datetime', 'max_stars_repo_stars_event_max_datetime', 'max_issues_repo_path', 'max_issues_repo_name', 'max_issues_repo_head_hexsha', 'max_issues_repo_licenses', 'max_issues_count', 'max_issues_repo_issues_event_min_datetime', 'max_issues_repo_issues_event_max_datetime', 'max_forks_repo_path', 'max_forks_repo_name', 'max_forks_repo_head_hexsha', 'max_forks_repo_licenses', 'max_forks_count', 'max_forks_repo_forks_event_min_datetime', 'max_forks_repo_forks_event_max_datetime', 'content', 'avg_line_length', 'max_line_length', 'alphanum_fraction']
Dataset({
    features: ['hexsha', 'size', 'ext', 'lang', 'max_stars_repo_path', 'max_stars_repo_name', 'max_stars_repo_head_hexsha', 'max_stars_repo_licenses', 'max_stars_count', 'max_stars_repo_stars_event_min_datetime', 'ma

In [15]:
%pip install tokenizers

Looking in indexes: https://pypi.org/simple/
Collecting tokenizers
  Downloading tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl.metadata (7.3 kB)
Downloading tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.22.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Fix torch_shm_manager permission issue (required for streaming datasets with torch installed)
import subprocess, site, os
shm_path = os.path.join(site.getsitepackages()[0], "torch", "bin", "torch_shm_manager")
if os.path.exists(shm_path):
    subprocess.run(["chmod", "+x", shm_path], check=True)
    print(f"Fixed permissions on {shm_path}")
else:
    print("torch_shm_manager not found — skipping")

In [26]:
# the-stack-v2-dedup only has metadata (no source code content).
# Switch to the-stack-dedup (v1) which includes the actual code in a "content" column.
# NOTE: This is a gated dataset — accept terms at https://huggingface.co/datasets/bigcode/the-stack-dedup
from datasets import load_dataset

rust_code_ds = load_dataset(
    "bigcode/the-stack-dedup",
    data_dir="data/rust",
    split="train"
)

# Preview first sample
sample = next(iter(rust_code_ds))
print(f"Columns: {list(sample.keys())}")
print(f"Language: {sample['lang']}")
print(f"Size: {sample['size']} bytes")
print(f"Content preview:\n{sample['content'][:500]}")

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

Columns: ['hexsha', 'size', 'ext', 'lang', 'max_stars_repo_path', 'max_stars_repo_name', 'max_stars_repo_head_hexsha', 'max_stars_repo_licenses', 'max_stars_count', 'max_stars_repo_stars_event_min_datetime', 'max_stars_repo_stars_event_max_datetime', 'max_issues_repo_path', 'max_issues_repo_name', 'max_issues_repo_head_hexsha', 'max_issues_repo_licenses', 'max_issues_count', 'max_issues_repo_issues_event_min_datetime', 'max_issues_repo_issues_event_max_datetime', 'max_forks_repo_path', 'max_forks_repo_name', 'max_forks_repo_head_hexsha', 'max_forks_repo_licenses', 'max_forks_count', 'max_forks_repo_forks_event_min_datetime', 'max_forks_repo_forks_event_max_datetime', 'content', 'avg_line_length', 'max_line_length', 'alphanum_fraction']
Language: Rust
Size: 4965 bytes
Content preview:
use crate::interactive::{
    widgets::{
        Entries, EntriesProps, Footer, FooterProps, Header, HelpPane, HelpPaneProps, MarkPane,
        MarkPaneProps, COLOR_MARKED,
    },
    AppState, DisplayOpti

In [28]:
# Train a BPE tokenizer on a subset of Rust code
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

# Collect training corpus by streaming a subset
TRAIN_SAMPLES = 50_000  # number of files to use for tokenizer training

def batch_iterator(dataset, batch_size=1000):
    batch = []
    for i, sample in enumerate(dataset):
        if i >= TRAIN_SAMPLES:
            break
        batch.append(sample["content"])
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

# Initialize a BPE tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()

# Configure the trainer
trainer = trainers.BpeTrainer(
    vocab_size=32_000,
    min_frequency=2,
    special_tokens=["<|endoftext|>", "<|padding|>"],
    show_progress=True,
)

# Re-create the streaming dataset for training
train_stream = load_dataset(
    "bigcode/the-stack-dedup",
    data_dir="data/rust",
    split="train",
    streaming=True,
)

# Train the tokenizer
tokenizer.train_from_iterator(batch_iterator(train_stream), trainer=trainer)
print(f"Vocab size: {tokenizer.get_vocab_size()}")

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]




Vocab size: 32000


In [29]:
# Test the tokenizer on a Rust code snippet
test_code = """fn main() {
    let mut v: Vec<i32> = Vec::new();
    v.push(42);
    println!("Hello, Rust! {}", v[0]);
}"""

encoded = tokenizer.encode(test_code)
print(f"Tokens ({len(encoded.ids)}): {encoded.ids[:30]}...")
print(f"Decoded: {encoded.tokens[:30]}...")

# Verify round-trip
decoded = tokenizer.decode(encoded.ids)
print(f"\nRound-trip decoded:\n{decoded}")

Tokens (40): [648, 1885, 241, 239, 227, 268, 399, 527, 27, 659, 29, 74, 362, 31, 238, 659, 225, 355, 356, 227, 527, 15, 794, 9, 2351, 261, 227, 1109, 510, 5761]...
Decoded: ['fn', 'Ġmain', '()', 'Ġ{', 'ĊĠĠĠ', 'Ġlet', 'Ġmut', 'Ġv', ':', 'ĠVec', '<', 'i', '32', '>', 'Ġ=', 'ĠVec', '::', 'new', '();', 'ĊĠĠĠ', 'Ġv', '.', 'push', '(', '42', ');', 'ĊĠĠĠ', 'Ġprintln', '!("', 'Hello']...

Round-trip decoded:
fn main() {
    let mut v: Vec<i32> = Vec::new();
    v.push(42);
    println!("Hello, Rust! {}", v[0]);
}


In [None]:
from IPython.display import HTML, display
import json, html as html_mod

# --- Tiktokenizer-style interactive visualizer ---
COLORS = [
    "#bae6fd", "#fde68a", "#bfdbfe", "#bbf7d0", "#fed7aa",
    "#a5f3fc", "#e5e7eb", "#e9d5ff", "#c7d2fe", "#d9f99d",
    "#fecdd3", "#ddd6fe", "#fef08a", "#a7f3d0", "#e4e4e7",
    "#fecaca", "#f5d0fe", "#fbcfe8", "#99f6e4",
]

def visualize_tokens(text, tokenizer):
    encoded = tokenizer.encode(text)
    tokens = encoded.tokens
    ids = encoded.ids
    offsets = encoded.offsets

    # Build colored spans HTML
    spans_html = []
    ids_html = []
    for i, (tok, tid, (start, end)) in enumerate(zip(tokens, ids, offsets)):
        color = COLORS[i % len(COLORS)]
        # Decode the displayed text from offsets for accuracy
        display_text = text[start:end] if end > start else tok
        escaped = html_mod.escape(display_text)
        # Show whitespace visually
        escaped = escaped.replace(" ", '<span style="opacity:0.5">\u00b7</span>')
        escaped = escaped.replace("\n", '<span style="opacity:0.5">\\n</span>\n')
        escaped = escaped.replace("\t", '<span style="opacity:0.5">\\t</span>')
        spans_html.append(
            f'<span class="tok" data-i="{i}" style="background:{color};'
            f'padding:1px 0;border-radius:3px;cursor:pointer;"'
            f' onmouseenter="hlTok({i})" onmouseleave="hlTok(-1)">'
            f'{escaped}</span>'
        )
        ids_html.append(
            f'<span class="tid" data-i="{i}" style="background:{color};'
            f'padding:1px 4px;border-radius:3px;margin:1px;display:inline-block;'
            f'font-size:12px;cursor:pointer;"'
            f' onmouseenter="hlTok({i})" onmouseleave="hlTok(-1)">'
            f'{tid}</span>'
        )

    page = f"""
    <style>
      .tv-box {{ border:1px solid #e2e8f0; border-radius:8px; padding:16px;
                 font-family:ui-monospace,monospace; background:#f8fafc; margin:4px 0;
                 min-height:100px; white-space:pre-wrap; word-break:break-all;
                 line-height:1.8; }}
      .tv-label {{ font-weight:600; font-size:13px; color:#64748b; margin-bottom:6px; }}
      .tok.hl, .tid.hl {{ outline:2px solid #334155; z-index:1; position:relative; }}
    </style>
    <div style="font-family:system-ui,sans-serif; max-width:900px;">
      <div style="display:flex; align-items:center; gap:16px; margin-bottom:12px;">
        <span style="font-size:18px; font-weight:700;">Rust Tokenizer Visualizer</span>
        <span style="background:#e2e8f0; padding:4px 12px; border-radius:6px;
               font-size:14px; font-weight:600;">{len(ids)} tokens</span>
        <span style="background:#e2e8f0; padding:4px 12px; border-radius:6px;
               font-size:14px; font-weight:600;">{len(text)} chars</span>
      </div>
      <div class="tv-label">Text</div>
      <div class="tv-box">{''.join(spans_html)}</div>
      <div class="tv-label" style="margin-top:12px;">Token IDs</div>
      <div class="tv-box" style="line-height:2.2;">{''.join(ids_html)}</div>
    </div>
    <script>
    function hlTok(idx) {{
      document.querySelectorAll('.tok,.tid').forEach(el => {{
        if (idx >= 0 && el.dataset.i == idx) el.classList.add('hl');
        else el.classList.remove('hl');
      }});
    }}
    </script>
    """
    display(HTML(page))

# --- Visualize the Rust test snippet ---
test_code = """fn main() {
    let mut v: Vec<i32> = Vec::new();
    v.push(42);
    println!("Hello, Rust! {}", v[0]);
}"""

visualize_tokens(test_code, tokenizer)

In [32]:
# Interactive version — type any Rust code and see live tokenization
import ipywidgets as widgets
from IPython.display import HTML, display, clear_output
import html as html_mod

COLORS = [
    "#bae6fd", "#fde68a", "#bfdbfe", "#bbf7d0", "#fed7aa",
    "#a5f3fc", "#e5e7eb", "#e9d5ff", "#c7d2fe", "#d9f99d",
    "#fecdd3", "#ddd6fe", "#fef08a", "#a7f3d0", "#e4e4e7",
    "#fecaca", "#f5d0fe", "#fbcfe8", "#99f6e4",
]

def render_tokens(text, tokenizer):
    encoded = tokenizer.encode(text)
    tokens, ids, offsets = encoded.tokens, encoded.ids, encoded.offsets

    spans_html, ids_html = [], []
    for i, (tok, tid, (start, end)) in enumerate(zip(tokens, ids, offsets)):
        color = COLORS[i % len(COLORS)]
        display_text = text[start:end] if end > start else tok
        escaped = html_mod.escape(display_text)
        escaped = escaped.replace(" ", '<span style="opacity:0.4">\u00b7</span>')
        escaped = escaped.replace("\n", '<span style="opacity:0.4">\\n</span>\n')
        escaped = escaped.replace("\t", '<span style="opacity:0.4">\\t</span>')
        spans_html.append(
            f'<span class="tok" data-i="{i}" style="background:{color};'
            f'padding:1px 0;border-radius:3px;cursor:pointer;"'
            f' onmouseenter="hlTok({i})" onmouseleave="hlTok(-1)">{escaped}</span>'
        )
        ids_html.append(
            f'<span class="tid" data-i="{i}" style="background:{color};'
            f'padding:1px 4px;border-radius:3px;margin:1px;display:inline-block;'
            f'font-size:12px;cursor:pointer;"'
            f' onmouseenter="hlTok({i})" onmouseleave="hlTok(-1)">{tid}</span>'
        )

    return f"""
    <style>
      .tv-box {{ border:1px solid #e2e8f0; border-radius:8px; padding:16px;
                 font-family:ui-monospace,monospace; background:#f8fafc; margin:4px 0;
                 min-height:60px; white-space:pre-wrap; word-break:break-all;
                 line-height:1.8; }}
      .tv-label {{ font-weight:600; font-size:13px; color:#64748b; margin-bottom:6px; }}
      .tok.hl, .tid.hl {{ outline:2px solid #334155; z-index:1; position:relative; }}
    </style>
    <div style="font-family:system-ui,sans-serif;">
      <div style="display:flex; align-items:center; gap:12px; margin-bottom:10px;">
        <span style="font-size:16px; font-weight:700;">Rust Tokenizer Visualizer</span>
        <span style="background:#dbeafe; padding:3px 10px; border-radius:6px;
               font-size:13px; font-weight:600;">{len(ids)} tokens</span>
        <span style="background:#e2e8f0; padding:3px 10px; border-radius:6px;
               font-size:13px; font-weight:600;">{len(text)} chars</span>
        <span style="background:#fef3c7; padding:3px 10px; border-radius:6px;
               font-size:13px; font-weight:600;">ratio: {len(ids)/max(len(text),1):.2f} tok/char</span>
      </div>
      <div class="tv-label">Tokenized Text</div>
      <div class="tv-box">{''.join(spans_html)}</div>
      <div class="tv-label" style="margin-top:10px;">Token IDs</div>
      <div class="tv-box" style="line-height:2.2;">{''.join(ids_html)}</div>
    </div>
    <script>
    function hlTok(idx) {{
      document.querySelectorAll('.tok,.tid').forEach(el => {{
        if (idx >= 0 && el.dataset.i == idx) el.classList.add('hl');
        else el.classList.remove('hl');
      }});
    }}
    </script>
    """

default_code = """fn main() {
    let mut v: Vec<i32> = Vec::new();
    v.push(42);
    println!("Hello, Rust! {}", v[0]);
}"""

textarea = widgets.Textarea(
    value=default_code, layout=widgets.Layout(width="100%", height="180px"),
    description="Rust code:", style={"description_width": "80px"},
)
output = widgets.Output()

def on_change(change):
    with output:
        clear_output(wait=True)
        display(HTML(render_tokens(change["new"], tokenizer)))

textarea.observe(on_change, names="value")

display(textarea, output)
# Trigger initial render
with output:
    display(HTML(render_tokens(default_code, tokenizer)))

Textarea(value='fn main() {\n    let mut v: Vec<i32> = Vec::new();\n    v.push(42);\n    println!("Hello, Rust…

Output()

In [31]:
# Tokenize the full dataset and save as token IDs
# Stream the dataset again and tokenize each file
EOT_ID = tokenizer.token_to_id("<|endoftext|>")

tokenized_stream = load_dataset(
    "bigcode/the-stack-dedup",
    data_dir="data/rust",
    split="train",
    streaming=True,
)

import numpy as np

all_tokens = []
MAX_FILES = 100_000  # limit for manageable size; increase as needed

for i, sample in enumerate(tokenized_stream):
    if i >= MAX_FILES:
        break
    ids = tokenizer.encode(sample["content"]).ids
    all_tokens.extend(ids)
    all_tokens.append(EOT_ID)  # separate documents with EOT token
    if (i + 1) % 10_000 == 0:
        print(f"Tokenized {i + 1} files, {len(all_tokens):,} tokens so far...")

all_tokens = np.array(all_tokens, dtype=np.uint16)
print(f"\nTotal tokens: {len(all_tokens):,}")
print(f"Array size: {all_tokens.nbytes / 1e6:.1f} MB")

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Tokenized 10000 files, 19,895,335 tokens so far...
Tokenized 20000 files, 39,470,086 tokens so far...
Tokenized 30000 files, 59,801,287 tokens so far...
Tokenized 40000 files, 77,448,586 tokens so far...


KeyboardInterrupt: 

In [None]:
# Train/val split and save
split_idx = int(len(all_tokens) * 0.9)
train_tokens = all_tokens[:split_idx]
val_tokens = all_tokens[split_idx:]

np.save("data/rust_train_tokens.npy", train_tokens)
np.save("data/rust_val_tokens.npy", val_tokens)

print(f"Train tokens: {len(train_tokens):,}")
print(f"Val tokens:   {len(val_tokens):,}")
print("Saved to data/rust_train_tokens.npy and data/rust_val_tokens.npy")