# In [None]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
from transformers import PreTrainedTokenizerFast
import os

# --- CONFIGURATION ---
# Path to the plain-text training corpus (about 5 million lines)
DATA_PATH = "/kaggle/input/konkani-book-corpus/konkani_book_corpus.txt"  
# Directory the trained tokenizer files are written to
SAVE_DIR = "konkani-tokenizer-v3-32k"
VOCAB_SIZE = 32000  # Target vocabulary size; optimized for 160M parameter models

def train_konkani_tokenizer() -> None:
    """Train a byte-level BPE tokenizer and save it in Hugging Face format.

    Reads the text file at ``DATA_PATH``, trains a BPE model with a target
    vocabulary of ``VOCAB_SIZE`` entries, wraps the result in
    ``PreTrainedTokenizerFast`` and writes it to ``SAVE_DIR``. Progress is
    reported on stdout via ``print``.

    Note:
        No BOS/EOS template is installed on the tokenizer: the ByteLevel
        post-processor only adjusts offsets. ``<s>`` / ``</s>`` exist in the
        vocabulary but are not inserted automatically when encoding.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(SAVE_DIR, exist_ok=True)

    print(f"🚀 Initializing BPE Tokenizer for {VOCAB_SIZE} vocab size...")

    # 1. Initialize BPE Model
    # [UNK] is kept as a conventional fallback. With the byte-level
    # pre-tokenizer and the full byte alphabet below, every input byte maps to
    # a known symbol, so [UNK] should be unreachable in practice.
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

    # 2. Pre-tokenizer
    # ByteLevel maps text to a byte-based alphabet, so any UTF-8 string
    # (including Konkani's Devanagari script) can be encoded.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

    # 3. Trainer Configuration
    # Special tokens are reserved up front so they get stable ids; the
    # instruction markers will be used in Phase 2 (Instruction Tuning).
    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=[
            "<s>",      # Beginning of Sequence
            "<pad>",    # Padding
            "</s>",     # End of Sequence
            "[UNK]",    # Unknown
            "[INST]",   # User Instruction Start
            "[/INST]",  # User Instruction End
        ],
        # initial_alphabet ensures all base byte symbols are in the vocab
        # before merging, even if some never appear in the corpus.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )

    # 4. Train the Tokenizer
    # This might take a few minutes on 5M lines
    print(f"🔥 Training started on {DATA_PATH}...")
    tokenizer.train(files=[DATA_PATH], trainer=trainer)

    # 5. Post-Processing & Decoding
    # The ByteLevel post-processor trims whitespace from the reported offsets;
    # it does NOT add special tokens. The ByteLevel decoder maps the byte-level
    # symbols back to regular text.
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.decoder = decoders.ByteLevel()

    # 6. Save as Hugging Face "Fast" Tokenizer
    # This wrapper allows it to be loaded easily with AutoTokenizer or
    # PreTrainedTokenizerFast, and records which tokens play the
    # BOS/EOS/PAD/UNK roles.
    fast_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="[UNK]",
    )

    fast_tokenizer.save_pretrained(SAVE_DIR)
    print(f"✅ Success! Tokenizer saved to: {SAVE_DIR}")
    print(f"📊 Final Vocab Size: {len(fast_tokenizer)}")

# Run training only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    train_konkani_tokenizer()