In [16]:
import random
import nltk
from nltk.corpus import indian # added for testing, but didn't find a useful database.

In [17]:
class TrieNode:
    """One position in the trie: outgoing character edges plus a terminal flag."""

    def __init__(self) -> None:
        # Set when a word inserted into the trie ends exactly at this node.
        self.is_end_of_word: bool = False
        # Maps the next character to the child node reached through it.
        self.children: dict = {}

class Trie:
    """Prefix tree of words, stored one character per edge below a shared root."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie, creating any nodes missing along its path."""
        current = self.root
        for symbol in word:
            following = current.children.get(symbol)
            if following is None:
                # No edge for this character yet: grow the trie by one node.
                following = TrieNode()
                current.children[symbol] = following
            current = following
        # Flag the final node so a lookup can tell a full word from a prefix.
        current.is_end_of_word = True

    def search(self, word):
        """Return True only if ``word`` itself was inserted.

        A string that is merely a prefix of an inserted word yields False.
        """
        current = self.root
        for symbol in word:
            current = current.children.get(symbol)
            if current is None:
                return False
        return current.is_end_of_word

def load_hindi_words(trie):
    """Insert every token of the NLTK Indian corpus file 'hindi.pos' into ``trie``.

    The trie is filled in place; nothing is returned.
    """
    for token in indian.words('hindi.pos'):
        trie.insert(token)

def generate_nonword(trie, length=3):
    """Generate a random consonant/vowel string that is not a word in ``trie``.

    Units alternate consonant, vowel, consonant, ... starting with a
    consonant. Currently we use letters; ideally we should use phonemes with
    phonotactic rules.

    Args:
        trie: Object with a ``search(word)`` method that returns a truthy
            value when ``word`` is a known word.
        length: Number of letter units to draw. The vowels 'अं' and 'अः'
            each count as one unit although they are two code points, so the
            result may have more code points than ``length``. A value of 0 or
            less produces the empty string.

    Returns:
        The first generated candidate for which ``trie.search`` is falsy.

    Note:
        Candidates are redrawn until one is rejected by ``trie.search``; if
        every possible candidate is a known word this never returns.
    """
    # Keep the vowels as separate entries: 'अं' and 'अः' are two code points
    # each, so choosing from a plain string could yield a bare anusvara or
    # visarga and would over-weight 'अ'.
    vowels = ('अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'अं', 'अः')
    consonants = 'कखगघचछजझटठडढतथदधनपफबभमयरलवशषसह'

    # Basic CV pattern; indexed with i % 2, so it works for any length
    # (the previous length-scaled list was empty for length == 1).
    pattern = (consonants, vowels)

    while True:
        nonword = ''.join(random.choice(pattern[i % 2]) for i in range(length))

        # Only return candidates that are not real words.
        if not trie.search(nonword):
            return nonword

# Build the lexicon trie once from the Hindi corpus.
trie = Trie()
load_hindi_words(trie)

# Example: print 10 nonwords for each size, two-letter first, then three-letter.
for heading, size in (("Two-letter Nonwords:", 2), ("\nThree-letter Nonwords:", 3)):
    print(heading)
    for _ in range(10):
        print(generate_nonword(trie, size))

Two-letter Nonwords:
धए
वअ
फई
यउ
यअ
लअ
कइ
बअ
झआ
लउ

Three-letter Nonwords:
ठअप
रइच
छअग
पऐर
मअय
खअठ
धओथ
घउर
धइम
ठइख
