In [12]:
# -*- coding: utf-8 -*-
"""
# Hybrid Telugu Tokenizer Project

This notebook will guide you through the complete process of creating a sophisticated Telugu tokenizer. We will use a hybrid approach that combines:

1.  A Rule-Based Sandhi Engine: To intelligently split words based on grammatical rules.
2.  A Frequency-Based Vocabulary: To validate the splits and ensure they result in real, meaningful words.
3.  A BPE (Byte-Pair Encoding) Model:To learn tokenization patterns from our pre-processed corpus, resulting in a fast and robust final model.

"""

# @title ## Phase 1: Setup and Environment

import os
# --- 1.1: Create Project Directories ---
print("Creating project folder structure...")
project_root = "telugu_tokenizer"
data_folder = os.path.join(project_root, "data")
src_folder = os.path.join(project_root, "src")
model_folder = os.path.join(project_root, "model")

os.makedirs(data_folder, exist_ok=True)
os.makedirs(src_folder, exist_ok=True)
os.makedirs(model_folder, exist_ok=True)
print(f"Folders created: {project_root}, {data_folder}, {src_folder}, {model_folder}")

# --- 1.2: Install Dependencies ---
print("\nInstalling the 'sentencepiece' library...")
!pip install sentencepiece -q
print("Installation complete.")

Creating project folder structure...
Folders created: telugu_tokenizer, telugu_tokenizer/data, telugu_tokenizer/src, telugu_tokenizer/model

Installing the 'sentencepiece' library...
Installation complete.


In [11]:


# @title ## Phase 2: Upload Your Data

vocab_path = os.path.join(data_folder, "te_full.txt")
corpus_path = os.path.join(data_folder, "telugu_corpus.txt")

if os.path.exists(vocab_path) and os.path.exists(corpus_path):
    print("✅ Success! Both te_full.txt and telugu_corpus.txt are in the correct location.")
else:
    print("❌ Files not found. Please make sure you have uploaded both files to the 'telugu_tokenizer/data/' directory.")

❌ Files not found. Please make sure you have uploaded both files to the 'telugu_tokenizer/data/' directory.


In [13]:
# @title ## Phase 3.1: Create the Sandhi Splitter (Rule Engine)

%%writefile telugu_tokenizer/src/sandhi_splitter.py
# -*- coding: utf-8 -*-

# తెలుగు అక్షరాల నిర్వచనం
telugu_consonants = "కఖగఘఙచఛజఝఞటఠడఢణతథదధనపఫబభమయరలవశషసహళక్షఱ"
virama = '్'
vowels = "అఆఇఈఉఊఋౠఎఏఐఒఓఔ"
matra_map = {
    'ా': 'ఆ', 'ి': 'ఇ', 'ీ': 'ఈ', 'ు': 'ఉ', 'ూ': 'ఊ', 'ృ': 'ఋ', 'ౄ': 'ౠ',
    'ె': 'ఎ', 'ే': 'ఏ', 'ై': 'ఐ', 'ొ': 'ఒ', 'ో': 'ఓ', 'ౌ': 'ఔ'
}
vowel_to_matra = {v: k for k, v in matra_map.items()}

def find_consonant_cluster_start(word, end_index):
    """Helper to find the start of a consonant cluster before a given index."""
    start_index = -1
    j = end_index - 1
    while j >= 0:
        current_char = word[j]
        if current_char in telugu_consonants:
            if j == 0 or word[j-1] != virama:
                start_index = j
                break
        elif current_char != virama:
            break
        j -= 1
    return start_index

# --- తెలుగు సంధులు ---

def split_savarnadeergha(word):
    """సవర్ణదీర్ఘ సంధి నియమాల ప్రకారం పదాన్ని విడదీస్తుంది."""
    sandhi_map = {'ా': ('అ', ''), 'ీ': ('ఇ', 'ి'), 'ూ': ('ఉ', 'ు'), 'ౄ': ('ఋ', 'ృ')}
    splits = []
    for i, char in enumerate(word):
        if char in sandhi_map:
            start_of_cluster = find_consonant_cluster_start(word, i)
            if start_of_cluster != -1:
                short_vowel_char, short_matra = sandhi_map[char]
                prefix = word[:start_of_cluster]
                consonant = word[start_of_cluster:i]
                word1 = prefix + consonant + short_matra if short_matra else prefix + consonant
                word2 = short_vowel_char + word[i+1:]
                splits.append(f"'{word1}' + '{word2}'")
    return splits

def split_utva_ikara_sandhi(word):
    """ఉకార/ఇకార (లోప) సంధి ప్రకారం విడదీస్తుంది. (ఉదా: రాముడితడు)"""
    splits = []
    possible_elided_matras = {'ు': 'ఉ', 'ి': 'ఇ'}
    for i in range(1, len(word)):
        current_char = word[i]
        if current_char in matra_map:
            start_of_cluster = find_consonant_cluster_start(word, i)
            if start_of_cluster != -1:
                prefix = word[:start_of_cluster]
                consonant = word[start_of_cluster:i]
                word2_start_vowel = matra_map[current_char]
                word2 = word2_start_vowel + word[i+1:]
                for matra, vowel in possible_elided_matras.items():
                    word1 = prefix + consonant + matra
                    splits.append(f"'{word1}' + '{word2}'")
    return splits

def split_yadagama_sandhi(word):
    """యడాగమ సంధి ప్రకారం విడదీస్తుంది. (ఉదా: మణియంత)"""
    splits = []
    for i in range(1, len(word) - 1):
        if word[i] == 'య' and i > 1 and word[i-1] == 'ి' and word[i-2] in telugu_consonants:
            if (i + 1 < len(word)) and (word[i+1] not in matra_map and word[i+1] in telugu_consonants):
                 word1 = word[:i]
                 word2 = "అ" + word[i+1:]
                 splits.append(f"'{word1}' + '{word2}'")
            elif (i + 1 < len(word)) and word[i+1] in matra_map:
                 word1 = word[:i]
                 word2 = matra_map[word[i+1]] + word[i+2:]
                 splits.append(f"'{word1}' + '{word2}'")
    return splits

def split_amredita_sandhi(word):
    """ఆమ్రేడిత సంధి ప్రకారం విడదీస్తుంది. (ఉదా: ఔరౌర)"""
    splits = []
    mid = len(word) // 2
    if len(word) > 1 and len(word) % 2 == 0 and word[:mid] == word[mid:]:
        splits.append(f"'{word[:mid]}' + '{word[mid:]}'")
    return splits

def split_gasada_dava_adesa(word):
    """గసడదవాదేశ సంధి ప్రకారం విడదీస్తుంది. (ఉదా: వాడుగొట్టె)"""
    splits = []
    reverse_map = {'గ': 'క', 'స': 'చ', 'డ': 'ట', 'ద': 'త', 'వ': 'ప'}
    for i, char in enumerate(word):
        if char in reverse_map:
            start_of_cluster = find_consonant_cluster_start(word, i)
            if start_of_cluster != -1:
                word1_candidate = word[:start_of_cluster]
                if len(word1_candidate) > 1:
                    word1 = word1_candidate
                    original_consonant = reverse_map[char]
                    suffix = word[start_of_cluster:]
                    word2 = original_consonant + suffix[1:]
                    splits.append(f"'{word1}' + '{word2}'")
    return splits

def split_trika(word):
    """త్రికసంధి ప్రకారం విడదీస్తుంది. (ఉదా: అక్కన్య)"""
    splits = []
    trika_vowels = {'అ': 'ఆ', 'ఇ': 'ఈ', 'ఎ': 'ఏ'}
    if len(word) > 3 and word[0] in trika_vowels:
        if word[2] == '్' and word[1] == word[3]:
            word1 = trika_vowels[word[0]]
            word2 = word[3:]
            splits.append(f"'{word1}' + '{word2}'")
    return splits

# --- సంస్కృత సంధులు ---

def split_guna(word):
    """గుణ సంధి ప్రకారం విడదీస్తుంది."""
    sandhi_map = {'ే': ('ఇ', 'ఈ'), 'ో': ('ఉ', 'ఊ')}
    splits = []
    for i, char in enumerate(word):
        if char in sandhi_map:
            start_of_cluster = find_consonant_cluster_start(word, i)
            if start_of_cluster != -1:
                prefix = word[:start_of_cluster]
                consonant = word[start_of_cluster:i]
                word1_a = prefix + consonant
                vowel1, vowel2 = sandhi_map[char]
                suffix = word[i+1:]
                splits.append(f"'{word1_a}' + '{vowel1 + suffix}'")
                splits.append(f"'{word1_a}' + '{vowel2 + suffix}'")
    return splits

def split_vriddhi(word):
    """వృద్ధి సంధి ప్రకారం విడదీస్తుంది."""
    sandhi_map = {'ై': ('ఏ', 'ఐ'), 'ౌ': ('ఓ', 'ఔ')}
    splits = []
    for i, char in enumerate(word):
        if char in sandhi_map:
            start_of_cluster = find_consonant_cluster_start(word, i)
            if start_of_cluster != -1:
                prefix = word[:start_of_cluster]
                consonant = word[start_of_cluster:i]
                word1_a = prefix + consonant
                vowel1, vowel2 = sandhi_map[char]
                suffix = word[i+1:]
                splits.append(f"'{word1_a}' + '{vowel1 + suffix}'")
                splits.append(f"'{word1_a}' + '{vowel2 + suffix}'")
    return splits

def split_yanadesa(word):
    """యణాదేశ సంధి ప్రకారం విడదీస్తుంది."""
    splits = []
    for i in range(1, len(word)):
        if word[i] == '్' and i + 1 < len(word):
            base_consonant_char = word[i-1]
            adesa_char = word[i+1]

            original_matra = None
            if adesa_char == 'య': original_matra = 'ి'
            elif adesa_char == 'వ': original_matra = 'ు'
            elif adesa_char == 'ర': original_matra = 'ృ'

            if original_matra:
                cluster_and_matra = word[i+2:]
                next_char = cluster_and_matra[0] if cluster_and_matra else ''
                vowel_sound = matra_map.get(next_char, 'అ')

                word1 = word[:i-1] + base_consonant_char + original_matra
                word2 = vowel_sound + (cluster_and_matra[1:] if next_char in matra_map else cluster_and_matra)
                splits.append(f"'{word1}' + '{word2}'")
    return splits

def split_jashtva(word):
    """జశ్త్వ సంధి ప్రకారం విడదీస్తుంది. (ఉదా: వాగీశుడు)"""
    splits = []
    reverse_map = {'గ': 'క', 'జ': 'చ', 'డ': 'ట', 'ద': 'త', 'బ': 'ప'}
    for i, char in enumerate(word):
        if char in reverse_map:
            vowel_of_char = 'అ'
            suffix_start_index = i + 1
            if i + 1 < len(word) and word[i+1] in matra_map:
                vowel_of_char = matra_map[word[i+1]]
                suffix_start_index = i + 2

            word1 = word[:i] + reverse_map[char] + virama
            word2 = vowel_of_char + word[suffix_start_index:]
            splits.append(f"'{word1}' + '{word2}'")
    return splits

def split_schutva(word):
    """శ్చుత్వ సంధి ప్రకారం విడదీస్తుంది. (ఉదా: సచ్చరిత్ర)"""
    splits = []
    if 'చ్చ' in word:
        index = word.find('చ్చ')
        word1 = word[:index] + 'త్'
        word2 = 'చ' + word[index+2:]
        splits.append(f"'{word1}' + '{word2}'")
    if 'శ్శ' in word:
        index = word.find('శ్శ')
        word1 = word[:index] + 'స్'
        word2 = 'శ' + word[index+2:]
        splits.append(f"'{word1}' + '{word2}'")
    return splits

def split_anunasika(word):
    """అనునాసిక సంధి ప్రకారం విడదీస్తుంది. (ఉదా: జగన్నాథుడు)"""
    splits = []
    reverse_map = {'ఙ': 'క', 'ఞ': 'చ', 'ణ': 'ట', 'న': 'త', 'మ': 'ప'}
    for i in range(1, len(word)):
        if word[i] == '్' and word[i-1] in reverse_map:
            nasal = word[i-1]
            word1 = word[:i-1] + reverse_map[nasal] + virama
            word2 = word[i+1:]
            splits.append(f"'{word1}' + '{word2}'")
    if 'న్న' in word:
        index = word.find('న్న')
        word1 = word[:index] + 'త్'
        word2 = 'న' + word[index+2:]
        splits.append(f"'{word1}' + '{word2}'")
    return splits


def find_all_splits(word):
    """అన్ని సంధి ఫంక్షన్‌లను అమలు చేస్తుంది."""
    all_splits = {
        "సవర్ణదీర్ఘ సంధి": split_savarnadeergha(word),
        "గుణ సంధి": split_guna(word),
        "వృద్ధి సంధి": split_vriddhi(word),
        "యణాదేశ సంధి": split_yanadesa(word),
        "ఉత్వ/ఇత్వ సంధి (లోప)": split_utva_ikara_sandhi(word),
        "యడాగమ సంధి": split_yadagama_sandhi(word),
        "గసడదవాదేశ సంధి": split_gasada_dava_adesa(word),
        "ఆమ్రేడిత సంధి": split_amredita_sandhi(word),
        "త్రిక సంధి": split_trika(word),
        "జశ్త్వ సంధి": split_jashtva(word),
        "శ్చుత్వ సంధి": split_schutva(word),
        "అనునాసిక సంధి": split_anunasika(word),
    }
    return {sandhi: list(set(splits)) for sandhi, splits in all_splits.items() if splits}



Overwriting telugu_tokenizer/src/sandhi_splitter.py


In [14]:
# -*- coding: utf-8 -*-
import os
import sys

project_root = "/content/telugu_tokenizer"
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

from sandhi_splitter import find_all_splits

def load_vocabulary_with_frequency(filepath):
    """
    Loads the vocabulary from the user-provided file (e.g., 'te_full.txt').
    It parses each line to get the word and its frequency, storing them
    in a dictionary for fast lookups.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Vocabulary file not found at: {filepath}")

    vocab = {}
    print(f"Loading vocabulary from '{filepath}'...")
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            parts = line.split()
            if len(parts) >= 2:
                word = ' '.join(parts[:-1])
                freq_str = parts[-1]

                if freq_str.isdigit():
                    vocab[word] = int(freq_str)
                else:

                    vocab[line] = 1
            else:

                vocab[line] = 1

    print(f"Loaded {len(vocab)} words into the vocabulary.")
    return vocab

def get_best_frequency_split(word, vocab):
    """
    This is the core intelligent function. It gets all possible rule-based splits
    and then validates them against the loaded vocabulary. It chooses the best split
    based on the frequency of the resulting morphemes.
    """
    possible_splits = find_all_splits(word)
    if not possible_splits:
        return [word]

    valid_splits = []
    for sandhi_type, splits in possible_splits.items():
        for split_str in splits:
            parts = [p.strip().strip("'") for p in split_str.split('+')]

            if len(parts) == 2:
                part1, part2 = parts

                p1_in_vocab = part1 in vocab
                p2_in_vocab = part2 in vocab

                score = 0
                if p1_in_vocab and p2_in_vocab:
                    score = 1000 + min(vocab.get(part1, 1), vocab.get(part2, 1))
                elif p1_in_vocab or p2_in_vocab:

                    score = 100 + (vocab.get(part1, 1) if p1_in_vocab else vocab.get(part2, 1))

                if score > 0:
                    valid_splits.append({'parts': [part1, part2], 'score': score, 'type': sandhi_type})

    if not valid_splits:
        return [word]

    # Choose the split with the highest score
    best_split = max(valid_splits, key=lambda x: x['score'])
    return best_split['parts']

def pre_tokenize_corpus(input_filepath, output_filepath, vocab):
    """
    Reads a corpus file line by line, applies the smart splitting to each word,
    and writes the pre-tokenized sentences to an output file.
    """
    print(f"\nStarting pre-tokenization of '{input_filepath}'...")
    with open(input_filepath, 'r', encoding='utf-8') as infile, \
         open(output_filepath, 'w', encoding='utf-8') as outfile:

        total_lines = 0
        try:
            with open(input_filepath, 'r', encoding='utf-8') as f_count:
                total_lines = sum(1 for line in f_count)
        except Exception:
            total_lines = -1

        infile.seek(0)

        for i, line in enumerate(infile):
            words = line.strip().split()
            pre_tokenized_words = []
            for word in words:
                # Also split words connected by non-breaking space or other common joiners
                sub_words = word.replace('\u200c', ' ').split()
                for sub_word in sub_words:
                    split_parts = get_best_frequency_split(sub_word, vocab)
                    pre_tokenized_words.extend(split_parts)

            outfile.write(' '.join(pre_tokenized_words) + '\n')

            progress_msg = f"  ...processed {i+1} lines"
            if total_lines > 0:
                progress_msg = f"  ...processed {i+1} of {total_lines} lines"

            if (i + 1) % 500 == 0 or (total_lines > 0 and (i + 1) == total_lines):
                print(progress_msg)

    print(f"Pre-tokenization complete. Output saved to '{output_filepath}'.")


def main():
    """Main function to run the entire pipeline."""
    project_root_main = "telugu_tokenizer"
    vocab_file = os.path.join(project_root_main, "data", "te_full.txt")
    input_corpus = os.path.join(project_root_main, "data", "telugu_corpus.txt")
    output_corpus = os.path.join(project_root_main, "data", "corpus.pretokenized.txt")

    try:
        vocabulary = load_vocabulary_with_frequency(vocab_file)
        pre_tokenize_corpus(input_corpus, output_corpus, vocabulary)

        print("\n--- Pipeline Finished ---")
        print(f"The pre-tokenized corpus is ready at: '{output_corpus}'")

    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Please make sure your data files are uploaded to the correct directory.")

if __name__ == "__main__":
    main()



Error: Vocabulary file not found at: telugu_tokenizer/data/te_full.txt
Please make sure your data files are uploaded to the correct directory.


In [None]:
# @title ## Phase 3.3: Confirm File Creation
#
# Let's quickly verify that both Python modules were created successfully in the `src` directory.

import os
src_folder = "telugu_tokenizer/src"
sandhi_file = os.path.join(src_folder, "sandhi_splitter.py")
tokenizer_file = os.path.join(src_folder, "frequency_based_tokenizer.py")

if os.path.exists(sandhi_file):
  print(f" Success! sandhi_splitter.py has been created.")
else:
  print(f" Error! sandhi_splitter.py was not created.")

if os.path.exists(tokenizer_file):
  print(f" Success! frequency_based_tokenizer.py has been created.")
else:
  print(f"Error! frequency_based_tokenizer.py was not created.")


 Success! sandhi_splitter.py has been created.
Error! frequency_based_tokenizer.py was not created.


In [None]:
# @title ## Phase 4: Run the Pre-tokenization Pipeline
#
# Now we execute the script we just created.
#
# It will:
# 1. Load the vocabulary from `te_full.txt`.
# 2. Read your `telugu_corpus.txt`.
# 3. Apply the Sandhi rules and validate the splits for every word.
# 4. Write the results to a new file: `corpus.pretokenized.txt`.
#
# This step might take some time depending on the size of your corpus.

!python telugu_tokenizer/src/frequency_based_tokenizer.py


python3: can't open file '/content/telugu_tokenizer/src/frequency_based_tokenizer.py': [Errno 2] No such file or directory


In [None]:


# @title ## Phase 5: Train the BPE Tokenizer Model
#
# With our grammatically pre-processed corpus ready, we can now train the final SentencePiece model.
# This model will learn the morphemes we have identified.
#
# The output will be two files saved in the `model` directory:
# * `telugu_bpe.model`: The trained tokenizer model.
# * `telugu_bpe.vocab`: A human-readable vocabulary file.

print("Starting SentencePiece model training...")

import sentencepiece as spm

input_file = os.path.join(data_folder, "corpus.pretokenized.txt")
model_prefix = os.path.join(model_folder, "telugu_bpe")
vocab_size = 16000 # This is a good starting point, can be tuned.
model_type = "bpe"

spm.SentencePieceTrainer.train(
    f'--input={input_file} '
    f'--model_prefix={model_prefix} '
    f'--vocab_size={vocab_size} '
    f'--model_type={model_type} '
    f'--character_coverage=1.0'
)

print(f"\n✅ Training complete! Your tokenizer model is saved at '{model_prefix}.model'")

Starting SentencePiece model training...


OSError: Not found: "telugu_tokenizer/data/corpus.pretokenized.txt": No such file or directory Error #2

In [None]:


# @title ## Phase 6: Test Your New Tokenizer
#
# Let's see our new tokenizer in action! We will load the model we just trained
# and use it to tokenize some sample sentences.

import sentencepiece as spm

# Load the trained model
tokenizer = spm.SentencePieceProcessor()
tokenizer.load(f'{model_prefix}.model')

# --- Test Sentences ---
test_sentences = [
    "రామాలయం చాలా అందంగా ఉంది.",
    "ప్రభుత్వం రాష్ట్రాలలోని సమస్యలను పరిష్కరించింది.",
    "అతను అత్యంత ముఖ్యమైన వ్యక్తి.",
    "విద్యార్థి పాఠశాలకు వెళ్ళాడు.",
    "ఔరౌర, ఎంత గొప్ప ప్రదర్శన!",
    "సీతా దేవి ఉదయాన్నే తోటలో నడిచింది.",
    "ఈ రోజు వాతావరణం చల్లగా ఉంది.",
    "మన సంస్కృతి ప్రపంచంలో ప్రత్యేక స్థానం పొందింది.",
    "రాత్రి ఆకాశంలో నక్షత్రాలు మెరుస్తున్నాయి.",
    "పిల్లలు ఆనందంగా ఆటలు ఆడుతున్నారు.",
    "పుస్తకం చదివి కొత్త విషయాలు నేర్చుకున్నాను.",
    "రాముడు మరియు లక్ష్మణుడు అడవికి వెళ్లారు.",
    "చిన్న పక్షి చెట్టుపైన గూడు కట్టింది.",
    "మేఘాలు కమ్ముకొని వర్షం కురిసింది.",
    "ఆమె చిరునవ్వు అందరినీ ఆకట్టుకుంది."
]


print("--- Tokenizer Test Results ---")
for sentence in test_sentences:
    print(f"\nOriginal:  {sentence}")
    tokenized = tokenizer.encode_as_pieces(sentence)
    # The ' ' character represents a space.
    print(f"Tokenized: {tokenized}")

In [None]:


# @title ## Phase 7: Save Your Work
#
# The Colab environment is temporary. To save your trained model and the important
# data files for future use, run this cell. It will download the files to your local computer.

from google.colab import files

print("Downloading the trained tokenizer model...")
files.download(f'{model_prefix}.model')

print("\nDownloading the vocabulary file...")
files.download(f'{model_prefix}.vocab')

print("\nDownloading the pre-tokenized corpus...")
files.download(os.path.join(data_folder, "corpus.pretokenized.txt"))

print("\n✅ All essential files have been downloaded.")

# --- End of Notebook ---
