In [None]:
%pip install datasets -q

In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable instead of
# being pasted into the notebook, where it would be saved (and shared) together
# with the file. When the variable is unset or empty, `token=None` makes
# `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
from datasets import load_dataset
import torch
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from tokenizers.normalizers import Sequence, Lowercase, NFKC
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import UnigramTrainer

# Select the compute device: CUDA when available, otherwise CPU.
# NOTE(review): `device` is not referenced by the cells visible here; it is
# kept in case other cells rely on it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Load the dataset in streaming mode.
# Upper bound on the number of documents taken from the stream.
MAX_DOCUMENTS = 10_000_000
dataset = load_dataset("Skylion007/openwebtext", split="train", streaming=True)


class _TextStream:
    """Re-iterable view over the ``"text"`` field of a dataset's examples.

    A bare generator expression can be consumed only once, so re-running a
    cell that iterates it would silently see an empty stream. Each call to
    ``iter()`` on this object starts a fresh pass over the underlying source.
    """

    def __init__(self, source):
        # `source` must itself be re-iterable and yield mappings with a "text" key.
        self._source = source

    def __iter__(self):
        return (example["text"] for example in self._source)


texts = _TextStream(dataset.take(MAX_DOCUMENTS))

In [None]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from tokenizers.normalizers import Sequence, Lowercase, NFKC
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import UnigramTrainer
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from tqdm import tqdm
import os

# 2. Load the Llama tokenizer; here it serves as the source of special tokens.
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

# 3. Build the special-token list: Llama special tokens, "[UNK]", then the
#    reserved placeholders.
# A value in `special_tokens_map` may be a single token or a list of tokens
# (e.g. `additional_special_tokens`), so flatten before de-duplicating;
# passing a list straight to `set()` would raise TypeError.
llama_specials = []
for value in llama_tokenizer.special_tokens_map.values():
    if isinstance(value, (list, tuple)):
        llama_specials.extend(value)
    else:
        llama_specials.append(value)

# dict.fromkeys removes duplicates while keeping first-seen order. Iterating a
# set of strings can yield a different order from one kernel session to the
# next, which would make the order of the special tokens handed to the trainer
# non-reproducible.
special_tokens = list(dict.fromkeys(llama_specials))
if "[UNK]" not in special_tokens:
    special_tokens.append("[UNK]")

# Reserve placeholder tokens; they are appended at the end of the
# special-token list.
RESERVED_TOKENS = 256
reserved_tokens = [f"[RESERVED_{i}]" for i in range(RESERVED_TOKENS)]
special_tokens += reserved_tokens

# 4. Create the tokenizer and its trainer.
# Trainer settings: requested vocabulary size, the special tokens collected
# above, and the string used for unknown tokens.
trainer = UnigramTrainer(
    unk_token="[UNK]",
    special_tokens=special_tokens,
    vocab_size=128256,
)

# Unigram model; input text is NFKC-normalized, then lowercased, and split
# with the Whitespace pre-tokenizer.
text_normalizer = Sequence([NFKC(), Lowercase()])
tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = text_normalizer
tokenizer.pre_tokenizer = Whitespace()

# Train the tokenizer on the text stream, showing a progress bar.
corpus_progress = tqdm(texts, desc="Training Tokenizer")
tokenizer.train_from_iterator(corpus_progress, trainer)

# Save the trained tokenizer as <save_dir>/tokenizer.json.
save_dir = "llama3_unigram_tokenizer"
tokenizer_path = os.path.join(save_dir, "tokenizer.json")
os.makedirs(save_dir, exist_ok=True)
tokenizer.save(tokenizer_path)

# 5. Wrap the saved file in a PreTrainedTokenizerFast.
# The pad/bos/eos/cls/sep/mask tokens are copied from the Llama tokenizer
# (whatever each attribute currently holds); the unknown token is "[UNK]".
copied_roles = ("pad", "bos", "eos", "cls", "sep", "mask")
llama_role_tokens = {
    f"{role}_token": getattr(llama_tokenizer, f"{role}_token")
    for role in copied_roles
}
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=tokenizer_path,
    unk_token="[UNK]",
    **llama_role_tokens,
)

# Register the reserved tokens explicitly as additional special tokens.
fast_tokenizer.add_special_tokens({"additional_special_tokens": reserved_tokens})

# 6. Save everything in the Hugging Face tokenizer format.
fast_tokenizer.save_pretrained(save_dir)

print(f"✅ Tokenizer saved to folder: {save_dir}")