In [15]:
import pandas as pd
import random

In [16]:
class UnmodifiedLettersException(Exception):
    """Signals that a requested error could not be applied to the word.

    Raised by ``compute_error`` and caught by ``commit_error``, which then
    retries with a different random cause/position.
    """

In [17]:
# Kinds of spelling error that can be injected into a word.
ERROR_CAUSE = {"deletion", "insertion", "substitution", "transposition"}

# Inclusive bounds on how many misspelt variants commit_error() draws per word.
ERROR_WORDS_MIN, ERROR_WORDS_MAX = 2, 5

# Inclusive bounds on how many errors are injected into each variant.
ERRORS_PER_WORD_MIN, ERRORS_PER_WORD_MAX = 1, 1

# NOTE(review): not referenced anywhere in the visible code -- presumably meant
# to scale the number of errors with word length; confirm before relying on it.
ERRORS_PER_WORD_BASED_ON_LEN_FACTOR = 4

In [18]:
# Lookup table of Tamil symbols. As used by the code in this notebook:
#   column 0 -> code point of the symbol (compared against ord() values),
#   column 2 -> symbol-type name (e.g. contains "TAMIL-LETTER"),
#   column 3 -> "-"-separated code points of alternative symbols.
src = "./src/tamil_letters_similar_pronounc_b10_vowel_name_changed.csv"
tamil_letters_similar_pronounc = pd.read_csv(src, header=None)

_symbol_names = tamil_letters_similar_pronounc[2]
_is_letter = _symbol_names.str.contains("TAMIL-LETTER")
# "TAMIL-VOWEL" is a substring of "TAMIL-VOWEL-SIGN", so signs are excluded explicitly.
_is_vowel = (
    _symbol_names.str.contains("TAMIL-VOWEL")
    & ~_symbol_names.str.contains("TAMIL-VOWEL-SIGN")
)

# Code points of every vowel (not vowel sign) and every letter.
tamil_vowels_and_letters = (
    tamil_letters_similar_pronounc[_is_vowel | _is_letter].iloc[:, 0].tolist()
)

# Code points of the letters only.
tamil_letters = tamil_letters_similar_pronounc[_is_letter].iloc[:, 0].tolist()


def get_symbol_type(letter_b10):
    """Return the symbol-type name (column 2) of the given code point.

    The first row of ``tamil_letters_similar_pronounc`` whose column 0 equals
    ``letter_b10`` is used. Raises ``IndexError`` when no row matches.
    """
    code_points = tamil_letters_similar_pronounc[0]
    matching_rows = tamil_letters_similar_pronounc.loc[code_points == letter_b10]
    return matching_rows.iloc[0, 2]

def letters_b10_to_symbol_type(letters_b10):
    """Map every code point in ``letters_b10`` to its symbol-type name.

    Returns a new list, in the same order as the input.
    """
    symbol_types = []
    for code_point in letters_b10:
        symbol_types.append(get_symbol_type(code_point))
    return symbol_types


def _transposition_partner_test(symbol_type):
    """Return a predicate selecting symbol types that may be swapped with ``symbol_type``.

    Returns ``None`` when ``symbol_type`` belongs to no swappable class.
    """
    # Order matters: "TAMIL-VOWEL" is a substring of "TAMIL-VOWEL-SIGN".
    if "VOWEL-SIGN" in symbol_type:
        return lambda other: "VOWEL-SIGN" in other
    if "TAMIL-LETTER" in symbol_type:
        return lambda other: "TAMIL-LETTER" in other
    if "TAMIL-VOWEL" in symbol_type:
        return lambda other: "VOWEL" in other and "VOWEL-SIGN" not in other
    return None


def compute_error(cause, chosen_letter, letters_b10):
    """Inject one spelling error of kind ``cause`` at position ``chosen_letter``.

    Parameters
    ----------
    cause : str
        One of "deletion", "insertion", "substitution", "transposition".
    chosen_letter : int
        Index into ``letters_b10`` of the symbol the error is applied to.
    letters_b10 : list of int
        Code points of the word. The list is modified in place.

    Returns
    -------
    list of int
        The same list object that was passed in, after modification.

    Raises
    ------
    UnmodifiedLettersException
        When the requested error cannot change the word (no alternatives for
        the symbol, no suitable swap partner, unsupported symbol type, ...).
    Exception
        When the chosen code point is not present exactly once in the lookup
        table, or ``cause`` is not a known error kind.
    """
    chosen_code = letters_b10[chosen_letter]
    letter_b10 = tamil_letters_similar_pronounc[
        tamil_letters_similar_pronounc[0] == chosen_code
    ]

    if len(letter_b10) != 1:
        raise Exception(f"Unknown letter with unicode b10-{chosen_code}")

    # Exactly one row matched, so read the symbol type straight from it.
    symbol_type = letter_b10.iloc[0, 2]

    # Column 3 is a "-"-separated list of code points; a bare "-" splits into
    # two empty strings and marks a symbol that has no alternatives.
    alternatives = letter_b10.iloc[0, 3].split("-")
    if alternatives == ['', '']:
        raise UnmodifiedLettersException

    alternatives = [int(b10) for b10 in alternatives]

    if cause == "deletion":
        # When a vowel sign follows the chosen symbol, remove both of them.
        if chosen_letter < len(letters_b10) - 1:
            if "VOWEL-SIGN" in get_symbol_type(letters_b10[chosen_letter + 1]):
                letters_b10.pop(chosen_letter)

        letters_b10.pop(chosen_letter)

    elif cause == "insertion":
        # NOTE(review): positions within the last two symbols are rejected;
        # the reason is not visible in this file -- confirm the intent.
        if chosen_letter >= len(letters_b10) - 2:
            raise UnmodifiedLettersException

        if "VOWEL-SIGN" in symbol_type:
            letters_b10.insert(chosen_letter + 1, random.choice(tamil_vowels_and_letters))
        elif "LETTER" in symbol_type:
            letters_b10.insert(chosen_letter + 1, random.choice(tamil_letters))
        else:
            raise UnmodifiedLettersException

    elif cause == "substitution":
        if "TAMIL-LETTER" not in symbol_type and "TAMIL-VOWEL" not in symbol_type:
            # Previously this fell through and returned the word unchanged.
            raise UnmodifiedLettersException

        # Replacing a symbol with itself would not be an error at all.
        replacements = [b10 for b10 in alternatives if b10 != chosen_code]
        if not replacements:
            raise UnmodifiedLettersException

        letters_b10[chosen_letter] = random.choice(replacements)

    elif cause == 'transposition':
        is_partner = _transposition_partner_test(symbol_type)
        if is_partner is None:
            # Previously this fell through and returned the word unchanged.
            raise UnmodifiedLettersException

        syms = letters_b10_to_symbol_type(letters_b10)

        # Work with positions, not values: list.index() only finds the first
        # occurrence, which is the wrong slot when a symbol is repeated.
        # Partners holding the same code point are skipped because swapping
        # them would leave the word unchanged.
        partners = [
            idx
            for idx, sym in enumerate(syms)
            if idx != chosen_letter
            and is_partner(sym)
            and letters_b10[idx] != chosen_code
        ]
        if not partners:
            raise UnmodifiedLettersException

        other = random.choice(partners)
        letters_b10[other], letters_b10[chosen_letter] = (
            letters_b10[chosen_letter],
            letters_b10[other],
        )

    else:
        raise Exception(f"Unknown Error Cause: {cause}")

    return letters_b10

def letters_to_word(letters_b10):
    """Build the string whose characters have the given code points."""
    return "".join(chr(code_point) for code_point in letters_b10)

    

def commit_error(word, max_attempts=10):
    """Generate misspelt variants of ``word``.

    Between ``ERROR_WORDS_MIN`` and ``ERROR_WORDS_MAX`` variants are attempted.
    Each one starts from the original word and receives a random number of
    errors (bounded by ``ERRORS_PER_WORD_MIN`` / ``ERRORS_PER_WORD_MAX`` and
    half the word length) via ``compute_error``.

    Parameters
    ----------
    word : str
        The correctly spelt word.
    max_attempts : int, optional
        How many random (cause, position) pairs to try per error before
        giving up on that error.

    Returns
    -------
    list of str
        The variants that actually differ from ``word``. Variants that came
        out unchanged or empty are dropped, so the list may be shorter than
        the number attempted (possibly empty).
    """
    if not word:
        # Nothing to corrupt; random.randrange(0) would raise otherwise.
        return []

    # Sorted so the draw order does not depend on set iteration order.
    causes = sorted(ERROR_CAUSE)
    errors = []

    for _variant in range(random.randint(ERROR_WORDS_MIN, ERROR_WORDS_MAX)):
        letters_b10 = [ord(letter) for letter in word]

        # Clamp the upper bound so very short words (len // 2 == 0) do not
        # produce an empty randint range.
        max_errors = max(
            ERRORS_PER_WORD_MIN,
            min(ERRORS_PER_WORD_MAX, len(letters_b10) // 2),
        )
        nerr = random.randint(ERRORS_PER_WORD_MIN, max_errors)

        for _err in range(nerr):
            if not letters_b10:
                # Earlier deletions consumed the whole word.
                break

            for _attempt in range(max_attempts):
                cause = random.choice(causes)
                chosen_letter = random.randrange(len(letters_b10))
                try:
                    letters_b10 = compute_error(cause, chosen_letter, letters_b10)
                    break
                except UnmodifiedLettersException:
                    continue

        variant = letters_to_word(letters_b10)
        # A variant equal to the input is not a misspelling, and an empty one
        # is useless (and breaks "-"-joined output), so neither is reported.
        if variant and variant != word:
            errors.append(variant)

    return errors



In [21]:
src = "./src/spell_error_wiki.csv"

wiki_error = pd.read_csv(src, header=None)

# Resume support: column 1 holds "-" for rows that have not been processed.
# Find the first such row and continue from there.
first_pending = 0
for marker in wiki_error.iloc[:, 1]:
    if marker == "-":
        break
    first_pending += 1

print(first_pending)

# Start at the first pending row so each word is written back to its own row.
# Iterating from row 0 while writing from the resume offset would misalign
# results and eventually index past the end of the frame.
for row_idx in range(first_pending, len(wiki_error)):
    word = wiki_error.iloc[row_idx, 0]

    # Skip cells that are not text (e.g. NaN) and single-character words.
    if not isinstance(word, str) or len(word) == 1:
        continue

    for _ in range(10):
        try:
            wiki_error.iloc[row_idx, 1] = "-".join(commit_error(word))
            break
        except Exception:
            # Best effort: any failure just triggers another attempt; the row
            # is marked "ERROR" below if every attempt fails.
            pass
    else:
        wiki_error.iloc[row_idx, 1] = "ERROR"

# The results overwrite the input file, which is what makes resuming possible.
target = "./src/spell_error_wiki.csv"
wiki_error.to_csv(target, index=False, header=False)


0
