<a href="https://colab.research.google.com/github/Rishpraveen/Natural-Language-Processing-21MID0151/blob/main/TamilLexicon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating a Lexicon of Words for Tamil

In [None]:
import pandas as pd
import random

# The Tamil Script
Tamil letters come in **two** main groups:

####**உயிரெழுத்துக்கள் (Independent Vowels)**:
Example: அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ

####**மெய்யெழுத்துக்கள் (Consonants)**:
Written with a "virama" (்), also called a pulli, to indicate that they carry no vowel sound by themselves. For instance: க், ங், ச், ஞ், ...

When forming a syllable, a consonant (after removing the virama) combines with a vowel marker. For example,

- Base: க் (which is actually "k" with a virama)
- Combined with the vowel "அ" (the inherent vowel) gives you க
- With other vowels, you attach a marker. For instance:
 - "ஆ" → marker ா gives கா
 - "இ" → marker ி gives கி

| Vowel | Marker | Resulting Form (with base "க்") |
|---|---|---|
| அ | (none) | க |
| ஆ | ா | கா |
| இ | ி | கி |
| ஈ | ீ | கீ |
| உ | ு | கு |
| ஊ | ூ | கூ |
| எ | ெ | கெ |
| ஏ | ே | கே |
| ஐ | ை | கை |
| ஒ | ொ | கொ |
| ஓ | ோ | கோ |
| ஔ | ௌ | கௌ |

# Define Allowed Syllable Structures
For a more natural word, we want syllables that follow Tamil phonotactics. A simplified set of allowed syllable types is:

- V: A vowel-only syllable.
- CV: A consonant-vowel pair (most common).

(Note: Real Tamil can have more complex structures, but CV is a good starting point.)

In [None]:
# Tamil character inventory used by the generator.
# The twelve independent vowels, in traditional order.
tamil_vowels = [
    'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ',
    'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ',
]
# The eighteen consonants, written without pulli (i.e. carrying the inherent 'அ').
tamil_consonants = [
    'க', 'ங', 'ச', 'ஞ', 'ட', 'ண',
    'த', 'ந', 'ப', 'ம', 'ய', 'ர',
    'ல', 'வ', 'ழ', 'ள', 'ற', 'ன',
]

# Independent vowel -> dependent sign attached to a consonant.
# 'அ' pairs with the empty string because it is the consonant's inherent vowel.
vowel_markers = dict(zip(
    tamil_vowels,
    ['', 'ா', 'ி', 'ீ', 'ு', 'ூ', 'ெ', 'ே', 'ை', 'ொ', 'ோ', 'ௌ'],
))

def generate_tamil_syllable():
    """
    Produce one random Tamil syllable.

    The syllable is either a bare independent vowel (V) or a consonant
    carrying a vowel sign (CV).
    """
    # CV is drawn four times as often as V (weights 80 vs 20).
    (shape,) = random.choices(['V', 'CV'], weights=[20, 80])

    if shape == 'V':
        # Vowel-only syllable: an independent vowel (உயிரெழுத்து)
        return random.choice(tamil_vowels)

    # CV syllable: pick the consonant first, then the vowel whose sign it carries.
    base = random.choice(tamil_consonants)
    sign = vowel_markers[random.choice(tamil_vowels)]

    # The sign for 'அ' is the empty string, so the bare consonant comes back
    # unchanged in that case (it already carries the inherent vowel).
    return base + sign

def generate_tamil_word(min_syllables=2, max_syllables=4):
    """
    Build a Tamil-like word out of randomly generated syllables.

    The syllable count is drawn uniformly from [min_syllables, max_syllables].
    At least one syllable is always produced.
    """
    syllable_total = random.randint(min_syllables, max_syllables)

    # The opening syllable may be either V or CV.
    pieces = [generate_tamil_syllable()]

    remaining = syllable_total - 1
    while remaining > 0:
        remaining -= 1

        if pieces[-1][-1] not in tamil_vowels:
            # Word currently ends in a consonant/sign: any syllable type is fine.
            pieces.append(generate_tamil_syllable())
            continue

        # Word currently ends in an independent vowel: force a CV syllable so
        # that two bare vowels never sit next to each other.
        base = random.choice(tamil_consonants)
        sign = vowel_markers[random.choice(tamil_vowels)]
        pieces.append(base + sign)

    return ''.join(pieces)

def generate_tamil_text(num_words=5, min_syllables=1, max_syllables=4):
    """
    Return ``num_words`` generated Tamil-like words joined by single spaces.

    ``min_syllables`` and ``max_syllables`` are passed through to
    ``generate_tamil_word`` for every word.
    """
    return ' '.join(
        generate_tamil_word(min_syllables, max_syllables)
        for _ in range(num_words)
    )

# Create a visually appealing output in Colab
from IPython.display import HTML, display
import pandas as pd

def display_generated_text(num_examples=10):
    """
    Generate sample texts and render them in the notebook.

    Shows, in order: a DataFrame with one row per example, a highlighted box
    holding one extra random text, and an HTML panel listing the vowels and
    consonants the generator draws from.

    Returns the list of example dicts (keys "Example", "Generated Text",
    "Word Count") that backs the DataFrame.
    """
    rows = []
    for number in range(1, num_examples + 1):
        # Draw the word count first, then the text, one example at a time.
        n_words = random.randint(1, 5)
        rows.append({
            "Example": number,
            "Generated Text": generate_tamil_text(n_words, 1, 4),
            "Word Count": n_words
        })

    display(pd.DataFrame(rows))

    # Highlight box: one more text of 3-7 words, each 2-4 syllables long.
    showcase_text = generate_tamil_text(random.randint(3, 7), 2, 4)
    showcase_html = f"""
    <div style="background: linear-gradient(45deg, #FF9671, #FFC75F);
                border-radius: 10px;
                padding: 20px;
                margin: 20px 0;
                color: white;
                font-size: 24px;
                text-align: center;
                box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
        {showcase_text}
    </div>
    """
    display(HTML(showcase_html))

    # Explanation panel listing the character inventory in use.
    vowel_listing = ', '.join(tamil_vowels)
    consonant_listing = ', '.join(tamil_consonants)
    notes_html = f"""
    <div style="background: blue;
                border-left: 5px solid #4285F4;
                padding: 15px;
                margin: 20px 0;">
        <h3>Tamil Script Explanation</h3>
        <p>This generator creates Tamil-like text following these rules:</p>
        <ul>
            <li><strong>உயிரெழுத்துக்கள் (Independent Vowels)</strong>: {vowel_listing}</li>
            <li><strong>மெய்யெழுத்துக்கள் (Consonants)</strong>: {consonant_listing}</li>
            <li>Vowel markers are only applied to consonants, not to independent vowels</li>
            <li>Words are formed from syllables that follow Tamil phonotactics (V or CV structure)</li>
        </ul>
    </div>
    """
    display(HTML(notes_html))

    return rows

# Run the generation and display.
# Prints a short banner, then renders ten examples; the returned list of
# example dicts is kept in the module-level name `examples`.
print("🇮🇳 Tamil-Like Word Generator 🇮🇳")
print("==================================")
print("Generating examples based on Tamil script rules...")
examples = display_generated_text(10)

# Interactive generation
from ipywidgets import interact, IntSlider

@interact(words=IntSlider(min=1, max=10, step=1, value=3, description="Words:"),
          min_syllables=IntSlider(min=1, max=5, step=1, value=1, description="Min Syllables:"),
          max_syllables=IntSlider(min=1, max=7, step=1, value=3, description="Max Syllables:"))
def generate_interactive(words, min_syllables, max_syllables):
    """
    Slider callback: generate a text and show it with a per-word breakdown.

    Args:
        words (int): number of words to generate
        min_syllables (int): lower bound on syllables per word
        max_syllables (int): upper bound on syllables per word
    """
    # The two sliders move independently, so the minimum can be dragged above
    # the maximum. random.randint() raises ValueError on such an empty range;
    # treat the larger value as the upper bound instead of crashing the widget.
    max_syllables = max(min_syllables, max_syllables)

    text = generate_tamil_text(words, min_syllables, max_syllables)
    html = f"""
    <div style="background: linear-gradient(45deg, #845EC2, #D65DB1);
                border-radius: 10px;
                padding: 15px;
                margin: 10px 0;
                color: white;
                font-size: 20px;
                text-align: center;">
        {text}
    </div>
    """
    display(HTML(html))

    # Show the breakdown of syllables.
    # Every dependent vowel sign, concatenated once instead of on every
    # character inspected in the scan below.
    marker_chars = ''.join(vowel_markers.values())

    parts = []
    for word in text.split():
        # This is a simplified syllable breakdown and won't be perfect:
        # a character followed by a vowel sign is grouped with that sign,
        # everything else stands alone.
        syllables = []
        i = 0
        while i < len(word):
            if i + 1 < len(word) and word[i+1] in marker_chars:
                syllables.append(word[i:i+2])
                i += 2
            else:
                syllables.append(word[i])
                i += 1
        parts.append(syllables)

    breakdown_html = f"""
    <div style="background: blue;
                border: 1px solid #ddd;
                border-radius: 5px;
                padding: 10px;
                margin: 10px 0;">
        <h4>Word Breakdown:</h4>
        <ul style="list-style-type: none; padding-left: 0;">
            {"".join(f"<li>{' + '.join(word)} = {' '.join(word)}</li>" for word in parts)}
        </ul>
    </div>
    """
    display(HTML(breakdown_html))

🇮🇳 Tamil-Like Word Generator 🇮🇳
Generating examples based on Tamil script rules...


Unnamed: 0,Example,Generated Text,Word Count
0,1,செஏ றோரெ ர நௌதே ஙறொடௌணு,5
1,2,ணிலா டூறெஈடௌ சீஊ அசோபௌ,4
2,3,டுநேபொசை ரேஎ பிதீகெறீ,3
3,4,யெதீசீ உணௌடௌஒ,2
4,5,ஞெ,1
5,6,ளினீஇளே,1
6,7,ழளஅறெ லு செஆயீஞி மபொசௌஙொ உ,5
7,8,ஏ மு நௌ,3
8,9,றொ,1
9,10,தை ழௌளூ ஞோநீரெடௌ வி,4


interactive(children=(IntSlider(value=3, description='Words:', max=10, min=1), IntSlider(value=1, description=…

# Syllable Generator
- Above, we wrote a function that generates a single syllable according to these rules: it randomly decides whether to create a vowel-only (V) syllable or a consonant-vowel (CV) syllable, favouring CV. A more advanced version might also vary the choice based on the syllable's position in the word.


# Word Generator Using the Syllable Generator
- We then combined syllables to form a word: we pick a number of syllables (between 2 and 4 by default) and join them. For a more natural feel, the first syllable may be vowel-only, and any syllable that follows a bare vowel is forced to be CV.

In [None]:
# Tamil to Roman transliteration system

# Mapping dictionary for Tamil characters to Roman.
# Every key is a single Tamil character; every value is its Latin spelling.
# The mapping is lossy: ண/ந/ன all become 'n', ர/ற both become 'r' and
# ல/ள both become 'l'.
romanized_map = {
    # Independent vowels
    'அ': 'a', 'ஆ': 'aa', 'இ': 'i', 'ஈ': 'ii', 'உ': 'u', 'ஊ': 'uu',
    'எ': 'e', 'ஏ': 'ee', 'ஐ': 'ai', 'ஒ': 'o', 'ஓ': 'oo', 'ஔ': 'au',

    # Consonants (base form; the inherent 'a' is added by the romanizer)
    'க': 'k', 'ங': 'ng', 'ச': 'ch', 'ஞ': 'nj', 'ட': 't', 'ண': 'n',
    'த': 'th', 'ந': 'n', 'ப': 'p', 'ம': 'm', 'ய': 'y', 'ர': 'r',
    'ல': 'l', 'வ': 'v', 'ழ': 'zh', 'ள': 'l', 'ற': 'r', 'ன': 'n',

    # Grantha letter ஹ. This entry used to be written the wrong way round
    # ('ha': 'ஹ'), which could never match a Tamil character. It is spelled
    # with its vowel here because ஹ is not in the tamil_consonants list below.
    'ஹ': 'ha',

    # Vowel markers (combining signs)
    'ா': 'aa', 'ி': 'i', 'ீ': 'ii', 'ு': 'u', 'ூ': 'uu',
    'ெ': 'e', 'ே': 'ee', 'ை': 'ai', 'ொ': 'o', 'ோ': 'oo', 'ௌ': 'au',

    # Special characters
    '்': '',  # pulli (removes inherent vowel)
    'ஃ': 'h',  # aaytham
}

# List of Tamil consonants (base letters, without pulli).
# Used by romanize_tamil_word to decide when to append the inherent 'a'.
tamil_consonants = [
    'க', 'ங', 'ச', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ப', 'ம',
    'ய', 'ர', 'ல', 'வ', 'ழ', 'ள', 'ற', 'ன'
]

# List of vowel markers: the dependent vowel signs plus the pulli ('்').
# NOTE(review): this rebinds the module-level name `vowel_markers`, which the
# generator cell earlier in the notebook defines as a dict (vowel -> sign).
# After this cell runs, code that indexes it by vowel or calls .values() on
# it will fail until the earlier cell is re-run.
vowel_markers = ['ா', 'ி', 'ீ', 'ு', 'ூ', 'ெ', 'ே', 'ை', 'ொ', 'ோ', 'ௌ', '்']

def romanize_tamil_word(word):
    """
    Transliterate a single Tamil word into Latin script.

    Args:
        word (str): Tamil word to be romanized

    Returns:
        str: Romanized version of the Tamil word

    Notes:
        - A consonant followed by a vowel sign is spelled consonant + vowel
        - A consonant followed by pulli ('்') is spelled without any vowel
        - A consonant followed by anything else gets the inherent vowel 'a'
        - Independent vowels and other mapped characters are looked up directly
        - A vowel sign that does not follow a consonant is dropped
        - Characters with no mapping are copied through unchanged
    """
    pieces = []
    length = len(word)
    pos = 0

    while pos < length:
        current = word[pos]

        if current in tamil_consonants:
            base = romanized_map[current]
            has_sign = pos + 1 < length and word[pos + 1] in vowel_markers

            if not has_sign:
                # Bare consonant: supply the inherent vowel.
                pieces.append(base + 'a')
                pos += 1
                continue

            sign = word[pos + 1]
            # Pulli suppresses the vowel; any other sign contributes its own.
            pieces.append(base if sign == '்' else base + romanized_map[sign])
            pos += 2
            continue

        if current in vowel_markers:
            # Stray sign with no consonant in front of it: skip it.
            pos += 1
            continue

        # Mapped characters are translated; everything else (digits,
        # punctuation, ...) is kept as is.
        pieces.append(romanized_map.get(current, current))
        pos += 1

    return "".join(pieces)

# Function to handle multiple words or sentences
def romanize_tamil_text(text):
    """
    Romanize a Tamil text (can contain multiple words).

    Args:
        text (str): Tamil text to romanize

    Returns:
        str: Romanized version of the Tamil text
    """
    words = text.split()
    romanized_words = [romanize_tamil_word(word) for word in words]
    return " ".join(romanized_words)

# Example usage: romanize a few sample words and one sentence and print the
# results.
if __name__ == "__main__":
    # Test with various Tamil words
    test_words = [
        "காதல்",       # kaathal (love)
        "தமிழ்",       # thamizh (Tamil)
        "வணக்கம்",     # vanakkam (hello)
        "நன்றி",       # nanri (thank you)
        "இந்தியா"      # inthiyaa (India)
    ]

    print("Tamil to Roman Transliteration Examples:")
    print("---------------------------------------")

    for word in test_words:
        romanized = romanize_tamil_word(word)
        print(f"Tamil: {word} → Romanized: {romanized}")

    # Test with a sentence
    tamil_sentence = " தமிழ் மொழி மிகவும் அழகானது"  # romanizes to: thamizh mozhi mikavum azhakaanathu
    romanized_sentence = romanize_tamil_text(tamil_sentence)
    print("\nTamil sentence:")
    print(f"Original: {tamil_sentence}")
    print(f"Romanized: {romanized_sentence}")

Tamil to Roman Transliteration Examples:
---------------------------------------
Tamil: காதல் → Romanized: kaathal
Tamil: தமிழ் → Romanized: thamizh
Tamil: வணக்கம் → Romanized: vanakkam
Tamil: நன்றி → Romanized: nanri
Tamil: இந்தியா → Romanized: inthiyaa

Tamil sentence:
Original:  தமிழ் மொழி மிகவும் அழகானது
Romanized: thamizh mozhi mikavum azhakaanathu


In [None]:
# Roman to Tamil transliteration system

# Roman -> Tamil lookup for standalone vowels and vowel-less consonants.
# The romanization is lossy (ண/ந/ன are all 'n', ர/ற are both 'r'), so each
# ambiguous spelling can name only one Tamil letter. The dict literal used to
# list 'n' and 'r' twice; Python silently keeps the last value, so the
# effective entries were always ன் and ற். They are now written once.
tamil_map = {
    # Vowels (independent forms)
    'a': 'அ', 'aa': 'ஆ', 'i': 'இ', 'ii': 'ஈ', 'u': 'உ', 'uu': 'ஊ',
    'e': 'எ', 'ee': 'ஏ', 'ai': 'ஐ', 'o': 'ஒ', 'oo': 'ஓ', 'au': 'ஔ',

    # Consonants without vowels (with pulli)
    'k': 'க்', 'ng': 'ங்', 'ch': 'ச்', 'nj': 'ஞ்', 't': 'ட்',
    'n': 'ன்',  # alveolar ன (not dental ந் or retroflex ண்)
    'th': 'த்', 'p': 'ப்', 'm': 'ம்', 'y': 'ய்',
    'r': 'ற்',  # ற (not ர்)
    'l': 'ல்', 'v': 'வ்', 'zh': 'ழ்',

    # Special character
    'h': 'ஃ',  # aaytham
    # 'ha' (ஹ) is not mapped in this table.
}

# Dictionary for consonants with vowels: Roman syllable -> Tamil letter,
# e.g. 'ka' -> 'க', 'kaa' -> 'கா', 'zhi' -> 'ழி'.
#
# Built from a consonant table and a vowel-sign table rather than typed out
# entry by entry. The hand-written literal listed the 'n', 'r' and 'l' rows
# several times (ந/ன/ண, ர/ற, ல/ள); duplicate keys in a dict literal silently
# keep the LAST value, so despite comments naming ந, ர and ல the effective
# letters were ண, ற and ள. Those effective letters are kept here so results
# do not change; the romanization itself cannot tell the variants apart.
consonant_vowel_combinations = {
    roman_consonant + roman_vowel: letter + sign
    for roman_consonant, letter in (
        ('k', 'க'), ('ng', 'ங'), ('ch', 'ச'), ('nj', 'ஞ'),
        ('t', 'ட'), ('th', 'த'),
        ('n', 'ண'),   # retroflex ண stands in for every 'n'
        ('p', 'ப'), ('m', 'ம'), ('y', 'ய'),
        ('r', 'ற'),   # ற stands in for every 'r'
        ('l', 'ள'),   # retroflex ள stands in for every 'l'
        ('v', 'வ'), ('zh', 'ழ'),
    )
    for roman_vowel, sign in (
        ('a', ''),    # inherent vowel: bare consonant, no sign
        ('aa', 'ா'), ('i', 'ி'), ('ii', 'ீ'), ('u', 'ு'), ('uu', 'ூ'),
        ('e', 'ெ'), ('ee', 'ே'), ('ai', 'ை'),
        ('o', 'ொ'), ('oo', 'ோ'), ('au', 'ௌ'),
    )
}

# List of consonant prefixes: the Roman consonant spellings that
# convert_roman_to_tamil tries to match at each position.
# NOTE(review): 'ha' has no entry in this cell's tamil_map or
# consonant_vowel_combinations, so it never produces a match here; 'h'
# (aaytham in tamil_map) is not in this list, so it is never matched either.
consonant_prefixes = [
    'k', 'ng', 'ch', 'nj', 't', 'th', 'n', 'p',
    'm', 'y', 'r', 'l', 'v', 'zh','ha'
]
# List of vowel suffixes: the Roman vowel spellings, used both after a
# consonant prefix and on their own as independent vowels.
vowel_suffixes = [
    'a', 'aa', 'i', 'ii', 'u', 'uu',
    'e', 'ee', 'ai', 'o', 'oo', 'au'
]

def convert_roman_to_tamil(romanized_text):
    """
    Convert romanized text to Tamil.

    Each whitespace-separated word is scanned left to right. At every
    position the first of these that matches is consumed:

    1. a consonant prefix followed by a vowel suffix that together form a key
       of ``consonant_vowel_combinations`` (longer spellings tried first);
    2. a vowel suffix that is a key of ``tamil_map`` (independent vowel);
    3. a consonant prefix that is a key of ``tamil_map`` (consonant + pulli).

    If nothing matches, the single character is copied through unchanged.

    Args:
        romanized_text (str): The romanized text to convert

    Returns:
        str: Tamil text, words joined by single spaces
    """
    words = romanized_text.split()
    if not words:
        return ""

    # Longest spellings first so that e.g. 'th' wins over 't' and 'aa' over
    # 'a'. Sorted once per call; the sort order does not depend on the input.
    prefixes = sorted(consonant_prefixes, key=len, reverse=True)
    vowels = sorted(vowel_suffixes, key=len, reverse=True)

    def match_at(word, start):
        """Return (tamil, consumed_length) for the token at ``start``, or None."""
        # Step 1: consonant + vowel syllable.
        for prefix in prefixes:
            if not word.startswith(prefix, start):
                continue
            for suffix in vowels:
                combo = prefix + suffix
                if combo in consonant_vowel_combinations and word.startswith(combo, start):
                    return consonant_vowel_combinations[combo], len(combo)

        # Steps 2 and 3: standalone vowel, then bare consonant.
        for token in vowels + prefixes:
            if token in tamil_map and word.startswith(token, start):
                return tamil_map[token], len(token)

        return None

    tamil_words = []
    for word in words:
        pieces = []
        i = 0
        while i < len(word):
            hit = match_at(word, i)
            if hit is None:
                # Unknown character: keep it as is.
                pieces.append(word[i])
                i += 1
            else:
                tamil, consumed = hit
                pieces.append(tamil)
                i += consumed
        tamil_words.append("".join(pieces))

    return " ".join(tamil_words)

# Example usage: convert a few romanized words and one sentence back to Tamil
# and print the results. The trailing comments give the intended Tamil
# spelling, which the converter does not always reproduce because the
# romanization does not distinguish the different n/r/l letters.
if __name__ == "__main__":
    # Test with romanized Tamil words
    test_words = [
        "kaathal",      # காதல் (love)
        "thamizh",      # தமிழ் (Tamil)
        "vanakkam",     # வணக்கம் (hello)
        "nanri",        # நன்றி (thank you)
        "inthiyaa"      # இந்தியா (India)
    ]

    print("Roman to Tamil Transliteration Examples:")
    print("---------------------------------------")

    for word in test_words:
        tamil = convert_roman_to_tamil(word)
        print(f"Romanized: {word} → Tamil: {tamil}")

    # Test with a romanized sentence
    # NOTE(review): 'h' is not in consonant_prefixes, so the 'h' in "mihavum"
    # and "azhahanathu" is copied through as a Latin letter.
    romanized_sentence = " thamizh mozhi mihavum azhahanathu"  # தமிழ் மொழி மிகவும் அழகானது
    tamil_sentence = convert_roman_to_tamil(romanized_sentence)
    print("\nRomanized sentence:")
    print(f"Original: {romanized_sentence}")
    print(f"Tamil: {tamil_sentence}")

Roman to Tamil Transliteration Examples:
---------------------------------------
Romanized: kaathal → Tamil: காதல்
Romanized: thamizh → Tamil: தமிழ்
Romanized: vanakkam → Tamil: வணக்கம்
Romanized: nanri → Tamil: ணன்றி
Romanized: inthiyaa → Tamil: இன்தியா

Romanized sentence:
Original:  thamizh mozhi mihavum azhahanathu
Tamil: தமிழ் மொழி மிhஅவும் அழhஅணது


**Syllable Structure:** Tamil characters are syllabic. A typical syllable is formed by combining a consonant with a vowel. When a consonant stands alone, it usually carries an inherent vowel sound or is marked with a special symbol (the "pulli") to indicate the absence of a vowel.

**Complete Words:** Genuine Tamil words usually consist of both vowels (உயிரெழுத்துக்கள்) and consonants (மெய்யெழுத்துக்கள்). Using only consonants, as in the dummy function, produces strings that do not represent standard syllable structures, making them less natural or pronounceable.

In [None]:
# Enhanced Tamil transliteration system with grammar rules
import re

# ===== TRANSLITERATION MAPPINGS =====

# Mapping dictionary for Tamil characters to Roman.
# Every key is a single Tamil character; every value is its Latin spelling.
# Consonants map to their bare form: romanize_tamil_word appends the inherent
# 'a' or the vowel sign's spelling itself.
romanized_map = {
    # Independent vowels
    'அ': 'a', 'ஆ': 'aa', 'இ': 'i', 'ஈ': 'ii', 'உ': 'u', 'ஊ': 'uu',
    'எ': 'e', 'ஏ': 'ee', 'ஐ': 'ai', 'ஒ': 'o', 'ஓ': 'oo', 'ஔ': 'au',

    # Consonants
    'க': 'k', 'ங': 'ng', 'ச': 'ch', 'ஞ': 'nj', 'ட': 't', 'ண': 'n',
    'த': 'th', 'ந': 'n', 'ப': 'p', 'ம': 'm', 'ய': 'y', 'ர': 'r',
    'ல': 'l', 'வ': 'v', 'ழ': 'zh', 'ள': 'l', 'ற': 'r', 'ன': 'n',
    # ஹ is treated as a consonant by romanize_tamil_word, so it must map to
    # the bare 'h'. With 'ha' the appended vowel produced 'haa' for ஹ and
    # 'hai' for ஹி.
    'ஹ': 'h',

    # Vowel markers (combining signs)
    'ா': 'aa', 'ி': 'i', 'ீ': 'ii', 'ு': 'u', 'ூ': 'uu',
    'ெ': 'e', 'ே': 'ee', 'ை': 'ai', 'ொ': 'o', 'ோ': 'oo', 'ௌ': 'au',

    # Special characters
    '்': '',  # pulli (removes inherent vowel)
    'ஃ': 'h'  # aaytham
}

# Reverse mapping for Roman to Tamil: standalone vowels and vowel-less
# consonants. The romanization is lossy (ண/ந/ன are all 'n', ர/ற are both
# 'r'), so each ambiguous spelling can name only one Tamil letter. The dict
# literal used to list 'n' and 'r' twice; Python silently keeps the last
# value, so the effective entries were always ன் and ற். They are now written
# once.
tamil_map = {
    # Vowels (independent forms)
    'a': 'அ', 'aa': 'ஆ', 'i': 'இ', 'ii': 'ஈ', 'u': 'உ', 'uu': 'ஊ',
    'e': 'எ', 'ee': 'ஏ', 'ai': 'ஐ', 'o': 'ஒ', 'oo': 'ஓ', 'au': 'ஔ',

    # Consonants without vowels (with pulli)
    'k': 'க்', 'ng': 'ங்', 'ch': 'ச்', 'nj': 'ஞ்', 't': 'ட்',
    'n': 'ன்',  # alveolar ன (not dental ந் or retroflex ண்)
    'th': 'த்', 'p': 'ப்', 'm': 'ம்', 'y': 'ய்',
    'r': 'ற்',  # ற (not ர்)
    'l': 'ல்', 'v': 'வ்', 'zh': 'ழ்',

    # Special character
    'h': 'ஃ',  # aaytham
    'ha': 'ஹ'
}

# Dictionary for consonants with vowels: Roman syllable -> Tamil letter,
# e.g. 'ka' -> 'க', 'kaa' -> 'கா', 'zhi' -> 'ழி'.
#
# Built from a consonant table and a vowel-sign table rather than typed out
# entry by entry. The hand-written literal listed the 'n', 'r' and 'l' rows
# several times (ந/ன/ண, ர/ற, ல/ள); duplicate keys in a dict literal silently
# keep the LAST value, so despite comments naming ந, ர and ல the effective
# letters were ண, ற and ள. Those effective letters are kept here so results
# do not change; the romanization itself cannot tell the variants apart.
consonant_vowel_combinations = {
    roman_consonant + roman_vowel: letter + sign
    for roman_consonant, letter in (
        ('k', 'க'), ('ng', 'ங'), ('ch', 'ச'), ('nj', 'ஞ'),
        ('t', 'ட'), ('th', 'த'),
        ('n', 'ண'),   # retroflex ண stands in for every 'n'
        ('p', 'ப'), ('m', 'ம'), ('y', 'ய'),
        ('r', 'ற'),   # ற stands in for every 'r'
        ('l', 'ள'),   # retroflex ள stands in for every 'l'
        ('v', 'வ'), ('zh', 'ழ'),
    )
    for roman_vowel, sign in (
        ('a', ''),    # inherent vowel: bare consonant, no sign
        ('aa', 'ா'), ('i', 'ி'), ('ii', 'ீ'), ('u', 'ு'), ('uu', 'ூ'),
        ('e', 'ெ'), ('ee', 'ே'), ('ai', 'ை'),
        ('o', 'ொ'), ('oo', 'ோ'), ('au', 'ௌ'),
    )
}

# ===== GRAMMATICAL CATEGORIES =====

# Classification of Rationality
# Two word lists that split Tamil nouns and pronouns into a "rational" and an
# "irrational" class (presumably the grammatical uyarthinai / ahrinai
# distinction — confirm with the author).
# NOTE(review): neither list is referenced in the part of the notebook
# reviewed here.
rational_nouns = [
    'ஆண்', 'பெண்', 'மனிதன்', 'அவன்', 'அவள்', 'நான்', 'நீ', 'நாம்', 'நீங்கள்',
    'அவர்', 'இவர்', 'உவர்', 'தாம்', 'தாங்கள்', 'யார்', 'ஒருவர்'
]

irrational_nouns = [
    'குழந்தை', 'நாய்', 'பூனை', 'மரம்', 'வீடு', 'கல்', 'பொருள்', 'செடி', 'யானை'
    # Add more irrational nouns
]

# Vallinam consonants for special rules.
# `vallinam_consonants` holds the six letters with pulli; `vallinam_with_u`
# holds the same six letters carrying the 'u' sign. romanize_tamil_word checks
# word endings against `vallinam_with_u` to mark a shortened final 'u'.
vallinam_consonants = ['க்', 'ச்', 'ட்', 'த்', 'ப்', 'ற்']
vallinam_with_u = ['கு', 'சு', 'டு', 'து', 'பு', 'று']  # For Kutriyalukaram

# Case suffixes for noun declension.
# Maps a grammatical case name to the Tamil suffix for that case; the
# nominative is the empty string (no suffix). The keys are the same ten case
# names used by `roman_case_suffixes`.
case_suffixes = {
    'nominative': '',
    'accusative': 'ஐ',
    'instrumental': 'ஆல்',
    'sociative': 'ஓடு',
    'dative': 'க்கு',
    'benefactive': 'க்காக',
    'ablative': 'இலிருந்து',
    'genitive': 'உடைய',
    'locative': 'இல்',
    'vocative': 'ஏ'
}

# Roman equivalents of case suffixes.
# Each value is the spelling of the matching `case_suffixes` entry under this
# file's own romanization scheme (long vowels doubled: ஆ = 'aa', ஓ = 'oo',
# ஏ = 'ee'); the keys are the same ten case names.
roman_case_suffixes = {
    'nominative': '',
    'accusative': 'ai',
    'instrumental': 'aal',
    # ஓடு starts with the long vowel ஓ, which this scheme spells 'oo'.
    # The previous 'otu' spelled ஒடு (short ஒ) instead.
    'sociative': 'ootu',
    'dative': 'kku',
    'benefactive': 'kkaaka',
    'ablative': 'ilirunthu',
    'genitive': 'utaiya',
    'locative': 'il',
    'vocative': 'ee'
}

# ===== TRANSLITERATION FUNCTIONS =====

def romanize_tamil_word(word):
    """
    Convert a Tamil word into its romanized form.

    Characters are transliterated through ``romanized_map``:

    - a consonant followed by a vowel sign becomes consonant + vowel;
    - a consonant followed by pulli ('்') becomes the bare consonant;
    - any other consonant gets the inherent vowel 'a';
    - a vowel sign that does not follow a consonant is dropped;
    - unmapped characters are copied through unchanged.

    Three markings are then applied to the result:

    - a word ending in one of ``vallinam_with_u`` has its final 'u' written
      'ŭ' (Kutriyalukaram);
    - a word longer than one character that starts with 'ஐ' has its first
      'ai' written 'ai̯' (Aikarakurukkam);
    - a word longer than one character that starts with 'ஔ' has its first
      'au' written 'au̯' (Aukarakurukkam).

    Args:
        word (str): Tamil word to be romanized

    Returns:
        str: Romanized version of the Tamil word
    """
    # Lookup tables are built once per call; they used to be rebuilt for
    # every character of the word inside the scan loop.
    consonants = frozenset((
        'க', 'ங', 'ச', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ப', 'ம',
        'ய', 'ர', 'ல', 'வ', 'ழ', 'ள', 'ற', 'ன', 'ஹ',
    ))
    # Dependent vowel signs plus the pulli.
    signs = frozenset((
        'ா', 'ி', 'ீ', 'ு', 'ூ', 'ெ', 'ே', 'ை', 'ொ', 'ோ', 'ௌ', '்',
    ))

    pieces = []
    i = 0

    while i < len(word):
        char = word[i]

        # Case 1: Character is a consonant
        if char in consonants:
            consonant_roman = romanized_map[char]

            # Look ahead for vowel marker or pulli
            if i + 1 < len(word) and word[i+1] in signs:
                if word[i+1] == '்':  # pulli - no vowel
                    pieces.append(consonant_roman)
                else:  # vowel marker
                    pieces.append(consonant_roman + romanized_map[word[i+1]])
                i += 2
            else:
                # No vowel marker: add inherent 'a'
                pieces.append(consonant_roman + 'a')
                i += 1

        # Case 2: Vowel marker appearing independently (unusual): skip it
        elif char in signs:
            i += 1

        # Case 3: Independent vowel or other mapped character
        elif char in romanized_map:
            pieces.append(romanized_map[char])
            i += 1

        # Case 4: Character not in our mapping
        else:
            pieces.append(char)  # Keep as is (numbers, punctuation, etc.)
            i += 1

    romanized = "".join(pieces)

    # Kutriyalukaram (shortened 'u'): mark the final 'u' of a word that ends
    # in a vallinam consonant carrying the 'u' sign.
    # NOTE(review): the length check is always true for the two-character
    # entries currently in vallinam_with_u, so it does not exclude anything;
    # it is kept only to preserve the original condition.
    if (len(word) > 1
            and romanized.endswith('u')
            and word.endswith(tuple(vallinam_with_u))):
        romanized = romanized[:-1] + 'ŭ'  # Mark shortened 'u'

    # Aikarakurukkam (shortened 'ai'): only the first 'ai' is rewritten.
    if word.startswith('ஐ') and len(word) > 1:
        romanized = romanized.replace('ai', 'ai̯', 1)

    # Aukarakurukkam (shortened 'au'): only the first 'au' is rewritten.
    if word.startswith('ஔ') and len(word) > 1:
        romanized = romanized.replace('au', 'au̯', 1)

    return romanized

def romanize_tamil_text(text):
    """
    Romanize a Tamil text (can contain multiple words).

    Args:
        text (str): Tamil text to romanize

    Returns:
        str: Romanized version of the Tamil text
    """
    words = text.split()
    romanized_words = [romanize_tamil_word(word) for word in words]
    return " ".join(romanized_words)

def convert_roman_to_tamil(romanized_text):
    """
    Convert romanized text to Tamil.

    Each whitespace-separated word is scanned left to right. At every
    position the following are tried in order, and the first hit consumes
    the matched characters:

    1. a consonant+vowel syllable found in consonant_vowel_combinations
       (longest consonant prefix first, then longest vowel suffix first),
    2. a non-empty romanized case suffix from roman_case_suffixes,
    3. a standalone vowel found in tamil_map (longest first),
    4. a standalone consonant found in tamil_map (longest first),
    5. otherwise the single character is copied through unchanged.

    Args:
        romanized_text (str): The romanized text to convert

    Returns:
        str: Tamil text, words joined by single spaces
    """
    # List of consonant prefixes and vowel suffixes from original code
    consonant_prefixes = [
        'k', 'ng', 'ch', 'nj', 't', 'th', 'n', 'p',
        'm', 'y', 'r', 'l', 'v', 'zh', 'ha'
    ]

    vowel_suffixes = [
        'a', 'aa', 'i', 'ii', 'u', 'uu',
        'e', 'ee', 'ai', 'o', 'oo', 'au'
    ]

    # The longest-first orderings never change, so build them once instead
    # of re-sorting for every character of every word. sorted() is stable,
    # so ties keep their list order exactly as before.
    prefixes_by_length = sorted(consonant_prefixes, key=len, reverse=True)
    vowels_by_length = sorted(vowel_suffixes, key=len, reverse=True)

    tamil_words = []

    for word in romanized_text.split():
        # Undo the special shortened-vowel marks (Kutriyalukaram,
        # Aikarakurukkam, Aukarakurukkam) before matching.
        word = word.replace('ŭ', 'u').replace('ai̯', 'ai').replace('au̯', 'au')

        pieces = []
        i = 0

        while i < len(word):
            matched = False

            # 1. Consonant-vowel combinations (longest matches first)
            for prefix in prefixes_by_length:
                if not word.startswith(prefix, i):
                    continue
                for suffix in vowels_by_length:
                    combo = prefix + suffix
                    if word.startswith(combo, i) and combo in consonant_vowel_combinations:
                        pieces.append(consonant_vowel_combinations[combo])
                        i += len(combo)
                        matched = True
                        break
                if matched:
                    break

            # 2. Case suffixes; empty suffixes (nominative) are skipped
            if not matched:
                for case, suffix in roman_case_suffixes.items():
                    if suffix and word.startswith(suffix, i):
                        pieces.append(convert_case_suffix_to_tamil(case))
                        i += len(suffix)
                        matched = True
                        break

            # 3. Standalone vowels (independent forms)
            if not matched:
                for vowel in vowels_by_length:
                    if word.startswith(vowel, i) and vowel in tamil_map:
                        pieces.append(tamil_map[vowel])
                        i += len(vowel)
                        matched = True
                        break

            # 4. Standalone consonants (with pulli)
            if not matched:
                for consonant in prefixes_by_length:
                    if word.startswith(consonant, i) and consonant in tamil_map:
                        pieces.append(tamil_map[consonant])
                        i += len(consonant)
                        matched = True
                        break

            # 5. Nothing matched: keep the character as is
            if not matched:
                pieces.append(word[i])
                i += 1

        tamil_words.append("".join(pieces))

    return " ".join(tamil_words)

# ===== GRAMMATICAL FUNCTIONALITY =====

def is_rational(word):
    """
    Classify a noun as rational (உயர்திணை) or irrational (அஃறிணை).

    The known-word lists take priority; a word found in neither list is
    treated as rational only when it ends in one of ன், ள் or ர்.

    Args:
        word (str): Tamil word to check

    Returns:
        bool: True if rational, False if irrational (the default for
        unknown words)
    """
    # Explicit list membership wins over the ending heuristic.
    if word in rational_nouns:
        return True
    if word in irrational_nouns:
        return False

    # Endings that commonly mark rational nouns.
    return word.endswith(('ன்', 'ள்', 'ர்'))

def _attach_suffix(stem, suffix):
    """
    Join a suffix to a stem, fusing a pulli-final consonant with a
    vowel-initial suffix.

    When the stem ends in consonant + pulli and the suffix starts with an
    independent vowel, the pulli is dropped and the vowel is written as its
    dependent sign (மனிதன் + ஐ → மனிதனை). If the stem is a single short
    syllable (கல், பெண், என்) the final consonant is doubled first
    (கல் + ஐ → கல்லை). Every other combination is plain concatenation.
    """
    vowel_to_sign = {
        'அ': '', 'ஆ': 'ா', 'இ': 'ி', 'ஈ': 'ீ', 'உ': 'ு', 'ஊ': 'ூ',
        'எ': 'ெ', 'ஏ': 'ே', 'ஐ': 'ை', 'ஒ': 'ொ', 'ஓ': 'ோ', 'ஔ': 'ௌ',
    }
    short_vowels = ('அ', 'இ', 'உ', 'எ', 'ஒ')
    short_signs = ('ி', 'ு', 'ெ', 'ொ')

    fusible = (
        len(stem) >= 2
        and stem.endswith('்')
        and suffix[:1] in vowel_to_sign
    )
    if not fusible:
        return stem + suffix

    def is_consonant(ch):
        # Tamil consonant letters occupy U+0B95 (க) .. U+0BB9 (ஹ).
        return '\u0b95' <= ch <= '\u0bb9'

    head = stem[:-2]  # everything before the final consonant + pulli
    single_short_syllable = (
        (len(head) == 1 and (head in short_vowels or is_consonant(head)))
        or (len(head) == 2 and is_consonant(head[0]) and head[1] in short_signs)
    )

    tail = vowel_to_sign[suffix[0]] + suffix[1:]
    if single_short_syllable:
        # Keep the pulli-consonant and add a second copy carrying the vowel.
        return stem + stem[-2] + tail
    return stem[:-1] + tail


def decline_noun(noun, case, is_plural=False):
    """
    Decline a Tamil noun according to case and number.

    Plural: கள் is appended to the noun as written; a final ம் becomes ங்
    first (மரம் → மரங்கள்). The pulli is no longer stripped, which used to
    turn the final consonant into a consonant+a syllable (மனிதனகள்).

    Dative: க்கு after a final ய் (நாய்க்கு), உக்கு fused onto any other
    pulli-final stem (மனிதனுக்கு), plain க்கு otherwise.

    Other cases: the suffix from case_suffixes is attached with
    _attach_suffix so that a vowel-initial suffix is written as a vowel
    sign on the final consonant (மனிதன் + ஐ → மனிதனை).

    NOTE: oblique stems (e.g. மரம் → மரத்து) and sandhi for vowel-final
    stems are not modelled; the euphonic-increment branch below still
    concatenates its increment verbatim, as before.

    Args:
        noun (str): Tamil noun in its base form
        case (str): Grammatical case ('nominative', 'accusative', etc.)
        is_plural (bool): Whether the noun is plural

    Returns:
        str: Declined noun in Tamil script
    """
    base = noun

    # Add plural marker if needed
    if is_plural:
        if base.endswith('ம்'):
            # Final ம் assimilates to ங் before கள்.
            base = base[:-2] + 'ங்'
        base += 'கள்'

    # No change for nominative case
    if case == 'nominative':
        return base

    # Euphonic increment for certain cases, only on stems that do not end
    # in a pulli (behaviour unchanged from the original).
    needs_increment = case in ['accusative', 'dative', 'genitive'] and not base.endswith('ம்')
    if needs_increment and not base.endswith('்'):
        if is_rational(noun):
            base += 'இன்'
        else:
            base += 'அத்'

    if case == 'dative':
        if base.endswith('ய்'):
            base += 'க்கு'
        elif base.endswith('்'):
            base = _attach_suffix(base, 'உக்கு')
        else:
            base += 'க்கு'
    elif case in case_suffixes:
        base = _attach_suffix(base, case_suffixes[case])

    return base

def convert_case_suffix_to_tamil(case):
    """
    Look up the Tamil-script suffix registered for a grammatical case.

    Args:
        case (str): Name of the grammatical case

    Returns:
        str: The entry of case_suffixes for that case, or an empty string
        when the case name is not present
    """
    if case in case_suffixes:
        return case_suffixes[case]
    return ''

def handle_kutriyalukaram(word):
    """
    Detect whether the Kutriyalukaram (shortened 'u') rule applies.

    The word itself is not modified; the result is only a flag.

    Args:
        word (str): Tamil word to inspect

    Returns:
        bool: True when the word is longer than one character and ends
        with one of the vallinam + 'u' forms in vallinam_with_u,
        otherwise False
    """
    ends_in_vallinam_u = any(word.endswith(ending) for ending in vallinam_with_u)
    return ends_in_vallinam_u and len(word) > 1

def handle_aaytham(word):
    """
    Detect an Aaytham (ஃ) in word-internal position.

    This is a simplified positional check: only the first ஃ is examined,
    and the neighbouring letters are not inspected. The word itself is
    not modified.

    Args:
        word (str): Tamil word to inspect

    Returns:
        bool: True when the first ஃ is neither the first nor the last
        character of the word, otherwise False
    """
    position = word.find('ஃ')  # -1 when the word has no aaytham
    return 0 < position < len(word) - 1

# ===== DEMO FUNCTIONS =====

def demo_noun_declension():
    """
    Print singular and plural declensions, with romanization, for a few
    sample nouns across five cases.
    """
    print("\nNoun Declension Examples:")
    print("------------------------")

    # (noun, flag used only for the printed Rational/Irrational label)
    sample_nouns = [('மனிதன்', True), ('மரம்', False), ('நாய்', False)]
    case_names = ['nominative', 'accusative', 'dative', 'genitive', 'locative']

    for noun, rational_flag in sample_nouns:
        noun_class = '(Rational)' if rational_flag else '(Irrational)'
        print(f"\nDeclension of {noun} {noun_class}:")
        for case in case_names:
            title = case.capitalize()
            # Index 0 is the singular form, index 1 the plural form.
            forms = [decline_noun(noun, case, plural) for plural in (False, True)]
            romanized = [romanize_tamil_word(form) for form in forms]

            print(f"  {title}: {forms[0]} (romanized: {romanized[0]})")
            print(f"  {title} Plural: {forms[1]} (romanized: {romanized[1]})")

def demo_special_rules():
    """
    Print romanizations of sample words for three special phonological
    rules: Kutriyalukaram, Aikarakurukkam and Aaytham.
    """
    print("\nSpecial Phonological Rules Examples:")
    print("---------------------------------")

    # Kutriyalukaram: flag words ending in a shortened 'u'
    print("\nKutriyalukaram (shortened 'u'):")
    for sample in ('வரவு', 'பாடு', 'எடுப்பு'):
        shortened = handle_kutriyalukaram(sample)
        romanized = romanize_tamil_word(sample)
        note = '(shortened)' if shortened else ''
        print(f"  {sample} → {romanized} {note}")

    # Aikarakurukkam: words beginning with ஐ
    print("\nAikarakurukkam (shortened 'ai'):")
    for sample in ('ஐந்து', 'ஐம்பது'):
        romanized = romanize_tamil_word(sample)
        print(f"  {sample} → {romanized}")

    # Aaytham: words containing ஃ
    print("\nAaytham:")
    for sample in ('அஃது', 'எஃகு'):
        found = handle_aaytham(sample)
        romanized = romanize_tamil_word(sample)
        note = '(with aaytham)' if found else ''
        print(f"  {sample} → {romanized} {note}")

def main():
    """
    Run the whole demonstration: transliteration in both directions, noun
    declension, the special phonological rules, and one full sentence.
    """
    print("Enhanced Tamil Transliteration System with Grammar Rules")
    print("======================================================")

    # Tamil → Roman
    tamil_samples = (
        "காதல்",       # kaathal (love)
        "தமிழ்",       # thamizh (Tamil)
        "வணக்கம்",     # vanakkam (hello)
        "நன்றி",       # nanri (thank you)
        "இந்தியா",     # inthiyaa (India)
    )

    print("\nTamil to Roman Transliteration Examples:")
    print("---------------------------------------")
    for tamil_word in tamil_samples:
        as_roman = romanize_tamil_word(tamil_word)
        print(f"Tamil: {tamil_word} → Romanized: {as_roman}")

    # Roman → Tamil
    roman_samples = (
        "kaathal",      # காதல் (love)
        "thamizh",      # தமிழ் (Tamil)
        "vanakkam",     # வணக்கம் (hello)
        "nanri",        # நன்றி (thank you)
        "inthiyaa",     # இந்தியா (India)
    )

    print("\nRoman to Tamil Transliteration Examples:")
    print("---------------------------------------")
    for roman_word in roman_samples:
        as_tamil = convert_roman_to_tamil(roman_word)
        print(f"Romanized: {roman_word} → Tamil: {as_tamil}")

    # Grammar demonstrations
    demo_noun_declension()
    demo_special_rules()

    # Whole-sentence romanization ("Hello to everyone")
    sentence = "அனைவருக்கும் இனிய வணக்கம்"
    sentence_roman = romanize_tamil_text(sentence)
    print("\nTamil sentence:")
    print(f"Original: {sentence}")
    print(f"Romanized: {sentence_roman}")

# Run the demonstration only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Enhanced Tamil Transliteration System with Grammar Rules

Tamil to Roman Transliteration Examples:
---------------------------------------
Tamil: காதல் → Romanized: kaathal
Tamil: தமிழ் → Romanized: thamizh
Tamil: வணக்கம் → Romanized: vanakkam
Tamil: நன்றி → Romanized: nanri
Tamil: இந்தியா → Romanized: inthiyaa

Roman to Tamil Transliteration Examples:
---------------------------------------
Romanized: kaathal → Tamil: காதல்
Romanized: thamizh → Tamil: தமிழ்
Romanized: vanakkam → Tamil: வணக்கம்
Romanized: nanri → Tamil: ணன்றி
Romanized: inthiyaa → Tamil: இன்தியா

Noun Declension Examples:
------------------------

Declension of மனிதன் (Rational):
  Nominative: மனிதன் (romanized: manithan)
  Nominative Plural: மனிதனகள் (romanized: manithanakal)
  Accusative: மனிதன்ஐ (romanized: manithanai)
  Accusative Plural: மனிதனகள்ஐ (romanized: manithanakalai)
  Dative: மனிதனஉக்கு (romanized: manithanaukkŭ)
  Dative Plural: மனிதனகளஉக்கு (romanized: manithanakalaukkŭ)
  Genitive: மனிதன்உடைய (romani

In [None]:
import re

# ===== ENHANCED TRANSLITERATION WITH GRAMMAR RULES =====
# (Includes noun declension, pluralization, and special phonological rules)

# ... [Keep previous romanized_map, tamil_map, consonant_vowel_combinations] ...

# ===== GRAMMATICAL ENHANCEMENTS =====
# Based on Tamil Grammar Handbook (Pages 1-53)

# Noun classifications from handbook (Page 54-55)
# decline_noun tests membership in rational_nouns when the caller does not
# pass is_rational explicitly.
rational_nouns = [
    'மனிதன்', 'பெண்', 'அவன்', 'அவள்', 'குழந்தை', 'அரசன்',
    'ஆசிரியர்', 'மாணவன்', 'நண்பர்', 'குரு'
]

# NOTE(review): not read by any function visible in this cell — presumably
# kept for is_rational() from the earlier cell; confirm.
irrational_nouns = [
    'மரம்', 'வீடு', 'நாய்', 'புத்தகம்', 'மலை', 'நதி',
    'பூ', 'கல்', 'பழம்', 'ஆடு'
]

# Detailed case system from handbook (Page 57-61)
# Maps case name -> {'suffix': text appended by decline_noun,
#                    'example': illustrative declined form of மரம்}.
# Iteration order of this dict is the print order in grammar_aware_demo.
case_system = {
    'nominative': {'suffix': '', 'example': 'மரம்'},
    'accusative': {'suffix': 'ஐ', 'example': 'மரத்தை'},
    'instrumental': {'suffix': 'ஆல்', 'example': 'மரத்தால்'},
    'dative': {'suffix': 'க்கு', 'example': 'மரத்துக்கு'},
    'ablative': {'suffix': 'இலிருந்து', 'example': 'மரத்திலிருந்து'},
    'genitive': {'suffix': 'இன்', 'example': 'மரத்தின்'},
    'locative': {'suffix': 'இல்', 'example': 'மரத்தில்'},
    'vocative': {'suffix': 'ஏ', 'example': 'மரமே'}
}

# Plural markers from handbook (Page 55)
# apply_plural picks one of these by the noun's rational/irrational flag.
plural_markers = {
    'rational': 'கள்',
    'irrational': 'க்கள்'
}

# Phonological rules from handbook (Pages 9-20)
# Consonants (written with pulli) grouped into the three classes.
# enhanced_romanize reads only the 'vallinam' list.
special_consonants = {
    'vallinam': ['க்', 'ச்', 'ட்', 'த்', 'ப்', 'ற்'],
    'mellinam': ['ங்', 'ஞ்', 'ண்', 'ந்', 'ம்', 'ன்'],
    'idayinam': ['ய்', 'ர்', 'ல்', 'வ்', 'ழ்', 'ள்']
}

def apply_plural(noun, is_rational):
    """
    Return the plural form of a noun.

    Pulli-final nouns keep their final consonant and take கள்; a final ம்
    assimilates to ங் first (மரம் → மரங்கள்). Previously the pulli was
    stripped before the marker was added, which turned the final consonant
    into a consonant+a syllable (மனிதன் → மனிதனகள்).

    Nouns that do not end in a pulli are unchanged from before: the marker
    is taken from plural_markers according to is_rational and appended.

    NOTE: other junction changes before கள் (e.g. ல் → ற், ள் → ட்) are not
    modelled.

    Args:
        noun (str): Tamil noun in its base form
        is_rational (bool): Selects the plural_markers entry for nouns
            that do not end in a pulli

    Returns:
        str: The pluralized noun
    """
    if noun.endswith('ம்'):
        return noun[:-2] + 'ங்கள்'
    if noun.endswith('்'):
        return noun + 'கள்'

    marker = plural_markers['rational'] if is_rational else plural_markers['irrational']
    return noun + marker

def _attach_suffix(stem, suffix):
    """
    Join a suffix to a stem, fusing a pulli-final consonant with a
    vowel-initial suffix.

    When the stem ends in consonant + pulli and the suffix starts with an
    independent vowel, the pulli is dropped and the vowel is written as its
    dependent sign (மனிதன் + ஐ → மனிதனை). If the stem is a single short
    syllable (கல், பெண், என்) the final consonant is doubled first
    (கல் + ஐ → கல்லை). Every other combination is plain concatenation.
    """
    vowel_to_sign = {
        'அ': '', 'ஆ': 'ா', 'இ': 'ி', 'ஈ': 'ீ', 'உ': 'ு', 'ஊ': 'ூ',
        'எ': 'ெ', 'ஏ': 'ே', 'ஐ': 'ை', 'ஒ': 'ொ', 'ஓ': 'ோ', 'ஔ': 'ௌ',
    }
    short_vowels = ('அ', 'இ', 'உ', 'எ', 'ஒ')
    short_signs = ('ி', 'ு', 'ெ', 'ொ')

    fusible = (
        len(stem) >= 2
        and stem.endswith('்')
        and suffix[:1] in vowel_to_sign
    )
    if not fusible:
        return stem + suffix

    def is_consonant(ch):
        # Tamil consonant letters occupy U+0B95 (க) .. U+0BB9 (ஹ).
        return '\u0b95' <= ch <= '\u0bb9'

    head = stem[:-2]  # everything before the final consonant + pulli
    single_short_syllable = (
        (len(head) == 1 and (head in short_vowels or is_consonant(head)))
        or (len(head) == 2 and is_consonant(head[0]) and head[1] in short_signs)
    )

    tail = vowel_to_sign[suffix[0]] + suffix[1:]
    if single_short_syllable:
        # Keep the pulli-consonant and add a second copy carrying the vowel.
        return stem + stem[-2] + tail
    return stem[:-1] + tail


def decline_noun(noun, case, is_plural=False, is_rational=None):
    """
    Enhanced noun declension based on handbook (Pages 57-61).

    The noun is no longer stripped of its final pulli up front: doing so
    produced a nominative like மனிதன (for மனிதன்) and made every later
    pulli check unreachable for singular nouns.

    Dative of a pulli-final stem:
      * final ள் → ட்கு (அவள் → அவட்கு); the earlier slice removed only
        the pulli and produced ...ளட்கு,
      * final ய் → plain க்கு (நாய்க்கு),
      * any other final consonant → fused உக்கு (மனிதனுக்கு).
    All other combinations attach the case_system suffix via
    _attach_suffix, so a vowel-initial suffix becomes a vowel sign on the
    final consonant (மனிதன் + ஐ → மனிதனை).

    NOTE: oblique stems (மரம் → மரத்து) and glide insertion after
    vowel-final stems are not modelled; those are still concatenated.

    Args:
        noun (str): Tamil noun in its base form
        case (str): Case name; unknown names behave like 'nominative'
        is_plural (bool): Pluralize with apply_plural before declining
        is_rational (bool | None): Noun class; when None it is looked up
            in rational_nouns

    Returns:
        str: Declined noun in Tamil script
    """
    # Determine noun type
    if is_rational is None:
        is_rational = noun in rational_nouns

    base = apply_plural(noun, is_rational) if is_plural else noun

    # Special handling for dative case (Page 59)
    if case == 'dative' and base.endswith('்'):
        if base.endswith('ள்'):
            # ள் is two code points (ள + pulli); drop both.
            return base[:-2] + 'ட்கு'
        if base.endswith('ய்'):
            return base + 'க்கு'
        return _attach_suffix(base, 'உக்கு')

    # Get case suffix (unknown cases fall back to the nominative entry)
    suffix = case_system.get(case, case_system['nominative'])['suffix']
    return _attach_suffix(base, suffix)

# Enhanced transliteration with grammar rules
def enhanced_romanize(word):
    """
    Romanize a word and mark two shortened-vowel rules.

    Kutriyalukaram: when the Tamil word ends in a vallinam consonant
    carrying the vowel sign ு and the romanization ends in 'u', that final
    'u' becomes 'ŭ'. The vallinam entries in special_consonants are stored
    with a pulli, so the pulli is removed before the sign is appended; the
    earlier test for consonant + pulli + ு could never match a word, and
    the earlier replacement rewrote the first 'u' instead of the final one.

    Aikarakurukkam: when the word starts with ஐ and is longer than one
    character, the first 'ai' of the romanization becomes 'aⁱ'.

    Args:
        word (str): Tamil word to romanize

    Returns:
        str: Romanized word with the marks above applied
    """
    romanized = romanize_tamil_word(word)

    # Kutriyalukaram (Page 15): shortened 'u' after vallinam consonants
    vallinam_u_endings = tuple(
        consonant.rstrip('்') + 'ு' for consonant in special_consonants['vallinam']
    )
    if word.endswith(vallinam_u_endings) and romanized.endswith('u'):
        romanized = romanized[:-1] + 'ŭ'

    # Aikarakurukkam (Page 15)
    if word.startswith('ஐ') and len(word) > 1:
        romanized = romanized.replace('ai', 'aⁱ', 1)

    return romanized

# ===== DEMONSTRATION =====
def grammar_aware_demo():
    """Print every case form, singular then plural, for a few sample nouns."""
    print("\nGrammar-Enhanced Tamil Processing")
    print("=================================")

    def show_paradigm(noun, plural):
        # One line per case, in case_system order: declined form → romanization.
        for case in case_system:
            form = decline_noun(noun, case, is_plural=plural)
            print(f"{case.capitalize():<12} {form} → {enhanced_romanize(form)}")

    # (noun, flag used only for the printed Rational/Irrational label)
    samples = [
        ('மனிதன்', True),    # Rational
        ('மரம்', False),     # Irrational
        ('நாய்', False),     # Irrational
        ('பெண்', True)       # Rational
    ]

    for noun, rational in samples:
        label = 'Rational' if rational else 'Irrational'
        print(f"\nNoun: {noun} ({label})")

        # Singular forms
        show_paradigm(noun, False)

        # Plural forms
        plural_base = decline_noun(noun, 'nominative', is_plural=True)
        print(f"\nPlural Base: {plural_base}")
        show_paradigm(noun, True)

def main():
    """Run the grammar demo, then print two special-rule romanizations."""
    grammar_aware_demo()
    print("\nSpecial Phonological Rules:")

    kutriyalukaram_sample = enhanced_romanize('மரத்து')
    print("Kutriyalukaram (குற்றியலுகரம்): மரத்து →", kutriyalukaram_sample)

    aikarakurukkam_sample = enhanced_romanize('ஐந்து')
    print("Aikarakurukkam (ஐகாரக் குறுக்கம்): ஐந்து →", aikarakurukkam_sample)

# Run the demonstration only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Grammar-Enhanced Tamil Processing

Noun: மனிதன் (Rational)
Nominative   மனிதன → 
Accusative   மனிதனஐ → 
Instrumental மனிதனஆல் → 
Dative       மனிதனக்கு → 
Ablative     மனிதனஇலிருந்து → 
Genitive     மனிதனஇன் → 
Locative     மனிதனஇல் → 
Vocative     மனிதனஏ → 

Plural Base: மனிதனகள்
Nominative   மனிதனகள் → 
Accusative   மனிதனகள்ஐ → 
Instrumental மனிதனகள்ஆல் → 
Dative       மனிதனகளட்கு → 
Ablative     மனிதனகளஇலிருந்து → 
Genitive     மனிதனகளஇன் → 
Locative     மனிதனகளஇல் → 
Vocative     மனிதனகள்ஏ → 

Noun: மரம் (Irrational)
Nominative   மரம → 
Accusative   மரமஐ → 
Instrumental மரமஆல் → 
Dative       மரமக்கு → 
Ablative     மரமஇலிருந்து → 
Genitive     மரமஇன் → 
Locative     மரமஇல் → 
Vocative     மரமஏ → 

Plural Base: மரமக்கள்
Nominative   மரமக்கள் → 
Accusative   மரமக்கள்ஐ → 
Instrumental மரமக்கள்ஆல் → 
Dative       மரமக்களட்கு → 
Ablative     மரமக்களஇலிருந்து → 
Genitive     மரமக்களஇன் → 
Locative     மரமக்களஇல் → 
Vocative     மரமக்கள்ஏ → 

Noun: நாய் (Irrational)
Nominative   நாய → 


The **புணர்ச்சி விதிகள் (Punarchi Vidigal)** in Tamil grammar govern the formation of compound words through phonological and orthographic adjustments. Below is a detailed explanation of the rules and special cases:

---

### **1. Types of Joining**
#### **இயல்பு புணர்ச்சி (Natural Joining)**  
- **Definition**: No changes occur between the base (நிலைமொழி) and affixed word (வருமொழி).  
- **Conditions**:  
  - Base ends in a vowel, and affixed word begins with a consonant.  
  - Example:  
    - வாழை (banana) + மரம் (tree) → வாழைமரம் (banana tree).  
    - நீர் (water) + கோப்பை (cup) → நீர்கோப்பை (water cup).  

#### **விகாரப் புணர்ச்சி (Modified Joining)**  
Modifications occur at the junction, categorized into:  
- **தோன்றல் (Insertion)**:  
  - A glide consonant (ய், வ்) or nasal (ங், ம்) is inserted between vowels or consonant clusters.  
  - Examples:  
    - மா (mango) + அடி (base) → மாவடி (māvaḍi) [வ் inserted].  
    - பூ (flower) + கொடி (vine) → பூங்கொடி (pūṅkoṭi) [ங் inserted].  

- **திரிதல் (Transformation)**:  
  - The final consonant of the base changes to harmonize with the affixed word.  
  - Examples:  
    - மரம் (tree) + கிளை (branch) → மரக்கிளை (marakkiḷai) [final ம் → க்].  
    - பல் (tooth) + பூ (flower) → பல்பூ (palpū) [final ல் remains].  

- **கெடுதல் (Deletion)**:  
  - A letter is dropped from the base or affixed word.  
  - Examples:  
    - தெங்கு (coconut tree) + காய் (fruit) → தேங்காய் (thēngāy) [final உ deleted].  
    - வடக்கு (north) + கிழக்கு (east) → வடகிழக்கு (vaḍakizhakku) [final கு deleted].  

---

### **2. Phonological Adjustments**
#### **Final Sound of Base + Initial Sound of Affixed Word**  
- **Vowel + Vowel**: Insert ய் or வ்.  
  - Example: கோ (king) + அரசன் (ruler) → கோயரசன் (kōyaracaṉ).  
- **Vowel + Consonant**: Natural joining (no change).  
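  - Example: வாழை (banana) + மரம் (tree) → வாழைமரம் (vāḻaimaram). Contrast மலை (mountain) + ஏறு (climb) → மலையேறு (malaiyēṟu): there the second word begins with a vowel, so the glide ய் is inserted (the Vowel + Vowel rule).  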
- **Consonant + Vowel**: Double the consonant or transform.  
  - Example: நாடு (country) + மொழி (language) → நாட்டுமொழி (nāṭṭumoḻi).  
- **Consonant + Consonant**: Insert a vowel or modify.  
  - Example: மண் (earth) + கோவில் (temple) → மண்கோவில் (maṇkōvil).  

---

### **3. Specific Compound Categories**
#### **திசைப் பெயர்ப் புணர்ச்சி (Directional Compounds)**  
- Combine directional terms with deletions.  
  - Example: வடக்கு (north) + மேற்கு (west) → வடமேற்கு (northwest).  

#### **மையீற்றுப் பண்புப் பெயர்ப் புணர்ச்சி (Adjectival Compounds)**  
- Form adjectives by combining qualifiers.  
  - Example: மா (great) + பெரிய (big) → மாபெரிய (māperiya, "huge").  

#### **பூப்பெயர்ப் புணர்ச்சி (Floral Compounds)**  
- Insert ங் when பூ (flower) combines with another word.  
  - Example: பூ + கொடி (vine) → பூங்கொடி (pūṅkoṭi, "flower vine").  

#### **தேங்காய் புணர்ச்சி (Coconut Compounds)**  
- Special deletion and vowel elongation.  
  - Example: தெங்கு (theṅgu) + காய் (kāy) → தேங்காய் (thēngāy, "coconut").  

---

### **4. Special Cases and Exceptions**
- **ஆய்தம் (ஃ)**: Words ending with ஃ may drop it or merge.  
  - Example: அஃது (that) + போல (like) → அதுபோல (athupōla, "like that").  
- **Nasal Consonants**: Final ம், ன், ண் may change to stops (க், ட்).  
  - Example: சாமி (god) + கோவில் (temple) → சாமியார் கோவில் (sāmiyār kōvil).  
- **Loanwords**: Borrowed words follow Tamil rules.  
  - Example: டீ (tea) + கோப்பை (cup) → டீக்கோப்பை (ṭīkkōppai).  

---

### **5. Importance of Rules**  
- **Clarity**: Prevents ambiguity (e.g., மரத்திலை vs. மர இலை).  
- **Euphony**: Ensures smooth pronunciation.  
- **Consistency**: Standardizes compound formation.  

---

### **Summary**  
The புணர்ச்சி விதிகள் ensure systematic and harmonious word formation in Tamil. By applying rules of insertion, transformation, deletion, and category-specific adjustments, Tamil maintains its phonetic elegance and semantic precision. Mastery of these rules is essential for correct usage in literature, speech, and everyday communication.

In [None]:
class TamilGrammarRules:
    """
    Helpers for inspecting Tamil word edges and joining two words with a
    small set of sandhi (புணர்ச்சி) rules.
    """
    # ... (existing class code remains the same)

    # Independent vowel letters.
    _VOWELS = frozenset(
        {'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ'}
    )
    # Dependent vowel sign -> the independent vowel it stands for.
    _SIGN_TO_VOWEL = {
        'ி': 'இ', 'ீ': 'ஈ', 'ு': 'உ', 'ூ': 'ஊ',
        'ெ': 'எ', 'ே': 'ஏ', 'ை': 'ஐ', 'ொ': 'ஒ',
        'ோ': 'ஓ', 'ா': 'ஆ', 'ௌ': 'ஔ'
    }
    # Independent vowel -> dependent sign ('அ' is inherent, so no sign).
    _VOWEL_TO_SIGN = {
        'அ': '', 'ஆ': 'ா', 'இ': 'ி', 'ஈ': 'ீ', 'உ': 'ு', 'ஊ': 'ூ',
        'எ': 'ெ', 'ஏ': 'ே', 'ஐ': 'ை', 'ஒ': 'ொ', 'ஓ': 'ோ', 'ஔ': 'ௌ'
    }

    def ends_with_vowel(self, word):
        """Check if a word ends with a vowel or vowel sign."""
        return bool(word) and (
            word[-1] in self._VOWELS or word[-1] in self._SIGN_TO_VOWEL
        )

    def starts_with_consonant(self, word):
        """Check if a word starts with a consonant.

        Anything that is not an independent vowel or a vowel sign counts
        as a consonant here; an empty word returns False.
        """
        return bool(word) and not (
            word[0] in self._VOWELS or word[0] in self._SIGN_TO_VOWEL
        )

    def starts_with_vowel(self, word):
        """Check if a word starts with an independent vowel letter.

        apply_sandhi_rules has always called this method, but the class
        body did not define it.
        """
        return bool(word) and word[0] in self._VOWELS

    def _first_vowel_in(self, chars):
        """Return the first vowel met in chars (signs mapped to vowels), or None."""
        for char in chars:
            if char in self._VOWELS:
                return char
            if char in self._SIGN_TO_VOWEL:
                return self._SIGN_TO_VOWEL[char]
        return None

    def get_last_vowel(self, word):
        """Get the last vowel (or equivalent) in a word, or None."""
        return self._first_vowel_in(reversed(word))

    def get_first_vowel(self, word):
        """Get the first vowel (or equivalent) in a word, or None.

        The vowel set used here previously held an empty string where
        'ஈ' belongs, so a leading ஈ was skipped.
        """
        return self._first_vowel_in(word)

    def apply_sandhi_rules(self, word1, word2):
        """Apply Tamil sandhi rules to combine two words.

        Rules are tried in this order:

        1. Either word empty: plain concatenation.
        2. Transformation (திரிதல்): final ம் before க becomes க்
           (மரம் + கிளை → மரக்கிளை); final ன் before ட becomes ட். The
           consonant and its pulli are both replaced; slicing off only the
           pulli used to give மரமக்கிளை.
        3. Lexical rules before a word starting with க:
           தெங்கு → தேங் (தேங்காய்) and பூ → பூங் (பூங்கொடி). They are
           checked before natural joining, which used to shadow them
           (தெங்குகாய், பூகொடி), and are limited to a following க because
           the inserted nasal ங் is only written before க.
        4. Natural joining (இயல்பு புணர்ச்சி): vowel-final + consonant-
           initial is concatenated unchanged.
        5. Insertion (தோன்றல்): vowel-final + vowel-initial inserts a
           glide, வ for ஆ + அ and ய otherwise, written as one syllable
           with the second word's vowel (மா + அடி → மாவடி).
        6. Anything else: plain concatenation.

        The former generic கு-deletion branch is omitted: a கு-final word
        ends in a vowel sign, so steps 4 and 5 always handled it first.
        """
        if not word1 or not word2:
            return word1 + word2

        # 2. Transformation of a final nasal ('ம்' / 'ன்' are two code points)
        if word1.endswith('ம்') and word2.startswith('க'):
            return word1[:-2] + 'க்' + word2
        if word1.endswith('ன்') and word2.startswith('ட'):
            return word1[:-2] + 'ட்' + word2

        # 3. Lexical special cases
        if word2.startswith('க'):
            if word1 == 'தெங்கு':
                return 'தேங்' + word2
            if word1 == 'பூ':
                return 'பூங்' + word2

        if self.ends_with_vowel(word1):
            # 4. Natural joining
            if self.starts_with_consonant(word2):
                return word1 + word2

            # 5. Glide insertion between two vowels
            if self.starts_with_vowel(word2):
                last_vowel = self.get_last_vowel(word1)
                first_vowel = self.get_first_vowel(word2)
                glide = 'வ' if (last_vowel == 'ஆ' and first_vowel == 'அ') else 'ய'
                # The glide carries word2's initial vowel as a vowel sign.
                return word1 + glide + self._VOWEL_TO_SIGN[word2[0]] + word2[1:]

        # 6. Default case: concatenate
        return word1 + word2

    # ... (rest of the class remains the same)

In [None]:
def apply_sandhi_rules(self, word1, word2):
    """Apply Tamil sandhi rules to combine two words.

    Rules are tried in this order:

    1. Either word empty: plain concatenation.
    2. Transformation (திரிதல்): final ம் before க becomes க்
       (மரம் + கிளை → மரக்கிளை); final ன் before ட becomes ட். The
       consonant and its pulli are both replaced; slicing off only the
       pulli used to give மரமக்கிளை.
    3. Lexical rules before a word starting with க:
       தெங்கு → தேங் (தேங்காய்) and பூ → பூங் (பூங்கொடி). They are checked
       before natural joining, which used to shadow them, and are limited
       to a following க because the inserted nasal ங் is only written
       before க.
    4. Natural joining (இயல்பு புணர்ச்சி): vowel-final + consonant-initial
       is concatenated unchanged.
    5. Insertion (தோன்றல்): vowel-final + vowel-initial inserts a glide,
       வ for ஆ + அ and ய otherwise, written as one syllable with the
       second word's vowel (மா + அடி → மாவடி).
    6. Anything else: plain concatenation.

    The former generic கு-deletion branch is omitted: a கு-final word ends
    in a vowel sign, so steps 4 and 5 handle it first (assuming
    self.ends_with_vowel treats a trailing vowel sign as a vowel).

    Args:
        self: Object providing ends_with_vowel, starts_with_consonant,
            starts_with_vowel, get_last_vowel and get_first_vowel
        word1 (str): Base word (நிலைமொழி)
        word2 (str): Word being attached (வருமொழி)

    Returns:
        str: The combined word
    """
    if not word1 or not word2:
        return word1 + word2

    # 2. Transformation of a final nasal ('ம்' / 'ன்' are two code points)
    if word1.endswith('ம்') and word2.startswith('க'):
        return word1[:-2] + 'க்' + word2
    if word1.endswith('ன்') and word2.startswith('ட'):
        return word1[:-2] + 'ட்' + word2

    # 3. Lexical special cases
    if word2.startswith('க'):
        if word1 == 'தெங்கு':
            return 'தேங்' + word2
        if word1 == 'பூ':
            return 'பூங்' + word2

    if self.ends_with_vowel(word1):
        # 4. Natural joining
        if self.starts_with_consonant(word2):
            return word1 + word2

        # 5. Glide insertion between two vowels
        if self.starts_with_vowel(word2):
            last_vowel = self.get_last_vowel(word1)
            first_vowel = self.get_first_vowel(word2)
            glide = 'வ' if (last_vowel == 'ஆ' and first_vowel == 'அ') else 'ய'

            vowel_to_sign = {
                'அ': '', 'ஆ': 'ா', 'இ': 'ி', 'ஈ': 'ீ', 'உ': 'ு', 'ஊ': 'ூ',
                'எ': 'ெ', 'ஏ': 'ே', 'ஐ': 'ை', 'ஒ': 'ொ', 'ஓ': 'ோ', 'ஔ': 'ௌ',
            }
            sign = vowel_to_sign.get(word2[0])
            if sign is None:
                # word2 does not open with an independent vowel letter:
                # keep the glide as a bare consonant, as before.
                return word1 + glide + '்' + word2
            # The glide carries word2's initial vowel as a vowel sign.
            return word1 + glide + sign + word2[1:]

    # 6. Default case: concatenate
    return word1 + word2

In [None]:
# Exercise the sandhi rules on three sample compounds; the trailing
# comments give the intended result for each call.
grammar = TamilGrammarRules()
print(grammar.apply_sandhi_rules('தெங்கு', 'காய்'))  # Output: 'தேங்காய்'
print(grammar.apply_sandhi_rules('மரம்', 'கிளை'))     # Output: 'மரக்கிளை'
print(grammar.apply_sandhi_rules('பூ', 'கொடி'))       # Output: 'பூங்கொடி'

தெங்குகாய்
மரமக்கிளை
பூகொடி
