In [1]:
import sentencepiece as spm
import os

# Train a SentencePiece BPE tokenizer on the corpus, then reload the saved
# model and print its vocabulary size and the pieces stored at ids 0-3.
#
# The paths and the vocabulary size are defined once here so that the
# existence check, the error message, the trainer call and the verification
# step cannot drift apart.
CORPUS_PATH = 'nanollama_training_corpus.txt'
MODEL_PREFIX = 'nanollama_tokenizer'
MODEL_PATH = f'{MODEL_PREFIX}.model'
VOCAB_SIZE = 16000

# Fail early, naming the file that was actually checked.
if not os.path.exists(CORPUS_PATH):
    raise FileNotFoundError(f"Input file '{CORPUS_PATH}' not found!")

# Train the tokenizer
# Thanks to https://github.com/google/sentencepiece for the relevant documentation
try:
    spm.SentencePieceTrainer.train(
        input=CORPUS_PATH,
        model_prefix=MODEL_PREFIX,
        vocab_size=VOCAB_SIZE,
        model_type='bpe',
        # Pin the special pieces to fixed ids 0-3.
        pad_id=0, pad_piece='<pad>',
        unk_id=1, unk_piece='<unk>',
        bos_id=2, bos_piece='<s>',
        eos_id=3, eos_piece='</s>',
        # NOTE(review): these pieces are already declared through the
        # *_piece arguments above. Listing them again here presumably makes
        # the literal strings in raw text encode to the reserved ids --
        # confirm against the SentencePiece docs that this is intended.
        user_defined_symbols=['<pad>', '<s>', '</s>']
    )
    print(f'Tokenizer trained. Saved as "{MODEL_PATH}"')

    # Verify the model by loading it back from disk.
    sp = spm.SentencePieceProcessor(model_file=MODEL_PATH)
    print(f"Vocabulary size: {sp.get_piece_size()}")
    print(f"Special tokens: {sp.id_to_piece(0)}, {sp.id_to_piece(1)}, {sp.id_to_piece(2)}, {sp.id_to_piece(3)}")

except Exception as e:
    # Print a short summary for the notebook reader, then re-raise so the
    # cell still fails with the full traceback.
    print(f"Error training tokenizer: {e}")
    raise

Tokenizer trained. Saved as "nanollama_tokenizer.model"
Vocabulary size: 16000
Special tokens: <pad>, <unk>, <s>, </s>
