## Data Collection

In [1]:
import json
import requests

In [2]:
# Shared by every download below so a stalled connection cannot hang the cell.
REQUEST_TIMEOUT = 30  # seconds

# 1. https://github.com/eymenefealtun/all-words-in-all-languages
# This source is a single text file that is split on commas below.
words1 = set()
url_comma_txt = 'https://raw.githubusercontent.com/eymenefealtun/all-words-in-all-languages/main/Georgian/Georgian.txt'

response = requests.get(url_comma_txt, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of parsing the error page as words.
response.raise_for_status()

for word in response.text.split(','):
    word = word.strip()
    if word:  # drop empty fragments left by the split
        words1.add(word)

print(f'1 found {len(words1)} words')

# 2. https://github.com/AleksandreSukh/GeorgianWordsDataBase
# This source is spread over several JSON chunks; every item obtained by
# iterating a decoded chunk is collected.
words2 = set()
json_urls = [
    'https://raw.githubusercontent.com/AleksandreSukh/GeorgianWordsDataBase/master/wordsChunk_0.json',
    'https://raw.githubusercontent.com/AleksandreSukh/GeorgianWordsDataBase/master/wordsChunk_1.json',
    'https://raw.githubusercontent.com/AleksandreSukh/GeorgianWordsDataBase/master/wordsChunk_2.json',
]

for url in json_urls:
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Same guard as above: a failed download must not reach json.loads.
    response.raise_for_status()

    words = json.loads(response.text)
    words2.update(words)

print(f'2 found {len(words2)} words')

# take intersection of all sets to only leave correct words
all_words = words1 & words2

# filter only correct words: first and last characters must lie in the
# 'ა'..'ჰ' range, and entries starting with 'ა-' or 'აა-' are dropped
all_words = {
    word
    for word in all_words
    if "ა" <= word[0] <= "ჰ"
    and "ა" <= word[-1] <= "ჰ"
    and not word.startswith(('ა-', 'აა-'))
}

# Sorting gives the saved file (and anything iterating the list) a stable order.
all_words_list = sorted(all_words)

print()
print("Total unique Georgian words:", len(all_words_list))

with open('georgian_words.txt', 'w', encoding='utf-8') as f:
    f.write(','.join(all_words_list))

print('Saved georgian_words.txt')

1 found 193210 words
2 found 271787 words

Total unique Georgian words: 77916
Saved georgian_words.txt


## Data Preparation

To generate incorrect words, three types of typos are simulated:

1. **Keyboard typos** - Characters are replaced with neighboring keys based on the Georgian keyboard layout, including accidental 'Backspace' and 'Shift' key errors

2. **Character swaps** - Adjacent characters are transposed to simulate typing out of sequence

3. **Character doubling** - Characters are duplicated to simulate pressing a key twice

In [3]:
import random
from collections import defaultdict

In [4]:
# Grid of Georgian letters as they sit on a standard keyboard. Each cell is a
# string; keyboard_typo treats the character index inside a cell as the shift
# state (index 0, then index 1 where the cell has two characters).
# None = Backspace, i.e. the current character is skipped.
georgian_keyboard_map = [
    ['ქ', 'წჭ', 'ე', 'რღ', 'ტთ', 'ყ', 'უ', 'ი', 'ო', 'პ'],
    ['ა', 'სშ', 'დ', 'ფ', 'გ', 'ჰ', 'ჯჟ', 'კ', 'ლ', None],
    ['ზძ', 'ხ', 'ცჩ', 'ვ', 'ბ', 'ნ', 'მ', None, None, None],
]

# Grid dimensions: number of rows, and number of columns taken from the first row.
n = len(georgian_keyboard_map)
m = len(georgian_keyboard_map[0])

# All nine (row delta, column delta) offsets in row-major order; index 4 is
# (0, 0), meaning "stay on the same key".
dirs = [(d_row, d_col) for d_row in (-1, 0, 1) for d_col in (-1, 0, 1)]

# Weights for the first move, laid out in the same 3x3 order as `dirs`.
step_probabilities = [
    0.00005, 0.00020, 0.00005,
    0.00020, 0.99900, 0.00020,
    0.00005, 0.00200, 0.00005,
]

# Weights for the optional second move, also aligned with `dirs`.
second_step_probabilities = [
    0.000005, 0.000020, 0.000005,
    0.000020, 0.999900, 0.000020,
    0.000005, 0.000200, 0.000005,
]

In [5]:
def keyboard_typo(word, second_step_prob = 0.15, shift_change_prob = 0.05):
    """Return `word` with simulated keyboard slips applied character by character.

    For every character found in `georgian_keyboard_map`:
      1. a direction is drawn from `dirs` using `step_probabilities`;
      2. if that direction is not "stay", a second offset drawn with
         `second_step_probabilities` is added with probability `second_step_prob`;
      3. if the resulting position is off the grid, the original character is kept;
      4. if it lands on a `None` cell, the character is dropped;
      5. otherwise the character at that cell is emitted, switching to the
         cell's other shift variant with probability `shift_change_prob`
         when the cell has more than one character.

    Characters that are not on the map are copied through unchanged.

    Args:
        word: text to corrupt.
        second_step_prob: chance of adding a second offset after a non-zero first one.
        shift_change_prob: chance of flipping the shift variant on a multi-character cell.

    Returns:
        The corrupted string (it can be shorter than `word` when characters are dropped).

    Uses the module-level `random` state, so results follow `random.seed`.
    """
    # Bounds come from the map itself rather than from the notebook globals
    # `n`/`m`, which any other cell could overwrite.
    row_count = len(georgian_keyboard_map)

    # character -> (row, column, index inside the cell)
    char_to_pos = {
        char: (i, j, shift_idx)
        for i, row in enumerate(georgian_keyboard_map)
        for j, cell in enumerate(row)
        if cell is not None
        for shift_idx, char in enumerate(cell)
    }

    corrupted_chars = []

    for char in word:
        if char not in char_to_pos:
            corrupted_chars.append(char)
            continue

        row, col, shift_idx = char_to_pos[char]

        direction_idx = random.choices(range(len(dirs)), weights = step_probabilities)[0]
        dr, dc = dirs[direction_idx]

        new_row = row + dr
        new_col = col + dc

        # A second offset is only considered once the first one actually moved.
        if (dr != 0 or dc != 0) and random.random() < second_step_prob:
            second_direction_idx = random.choices(range(len(dirs)), weights = second_step_probabilities)[0]
            dr2, dc2 = dirs[second_direction_idx]
            new_row += dr2
            new_col += dc2

        # The column bound is checked against the target row's own length;
        # `and` short-circuits, so the row is only indexed when it is valid.
        in_bounds = (
            0 <= new_row < row_count
            and 0 <= new_col < len(georgian_keyboard_map[new_row])
        )
        if not in_bounds:
            corrupted_chars.append(char)
            continue

        target_cell = georgian_keyboard_map[new_row][new_col]

        if target_cell is None:  # None = Backspace = skip
            continue

        if random.random() < shift_change_prob and len(target_cell) > 1:
            new_shift_idx = 1 - shift_idx if shift_idx < 2 else 0
            new_shift_idx = min(new_shift_idx, len(target_cell) - 1)
        else:
            # Clamp so a shifted source character still maps onto a one-character cell.
            new_shift_idx = min(shift_idx, len(target_cell) - 1)

        corrupted_chars.append(target_cell[new_shift_idx])

    return ''.join(corrupted_chars)

def swap_adjacent_chars(word, swap_prob = 0.005):
    """Return `word` with some neighbouring characters transposed.

    Positions are scanned left to right. At each visited position the
    character is swapped with its right neighbour with the current
    probability (initially `swap_prob`). After a swap the probability is
    divided by 10 and the scan resumes after the swapped pair.
    """
    letters = list(word)
    threshold = swap_prob
    resume_at = 0  # first position the scan may look at again

    for pos in range(len(letters) - 1):
        if pos < resume_at:
            # Right half of a pair that was just swapped; not re-examined.
            continue
        if random.random() < threshold:
            letters[pos], letters[pos + 1] = letters[pos + 1], letters[pos]
            threshold /= 10  # make further swaps in the same word rarer
            resume_at = pos + 2

    return ''.join(letters)

def double_char(word, double_prob = 0.005):
    """Return `word` with some characters duplicated.

    Each character is emitted once, and emitted a second time with the
    current probability (initially `double_prob`). Every duplication
    divides that probability by 10 for the rest of the word.
    """
    pieces = []
    threshold = double_prob

    for letter in word:
        if random.random() < threshold:
            pieces.append(letter * 2)
            threshold /= 10  # make further duplications in the same word rarer
        else:
            pieces.append(letter)

    return ''.join(pieces)

def corrupt_word(word, second_step_prob = 0.1, shift_change_prob = 0.05, swap_prob = 0.005, double_prob = 0.005):
    """Run `word` through the three typo simulators, in a fixed order.

    The stages are keyboard_typo, then swap_adjacent_chars, then
    double_char; each stage receives the previous stage's output.
    The probability arguments are forwarded to the matching stage.
    """
    after_keyboard = keyboard_typo(word, second_step_prob, shift_change_prob)
    after_swaps = swap_adjacent_chars(after_keyboard, swap_prob)
    return double_char(after_swaps, double_prob)

# Levenshtein (edit) distance between two words
def edit_distance(a, b):
    """Return the minimum number of single-character insertions, deletions
    and substitutions needed to turn `a` into `b`.

    Only two rows of the dynamic-programming table are held at a time:
    `previous[j]` is the distance between the prefix of `a` processed so far
    and the first `j` characters of `b`.
    """
    # Distance from the empty prefix of `a` to each prefix of `b`.
    previous = list(range(len(b) + 1))

    for i, char_a in enumerate(a, start=1):
        # Column 0: turning the first i characters of `a` into "" costs i deletions.
        current = [i]
        for j, char_b in enumerate(b, start=1):
            if char_a == char_b:
                current.append(previous[j - 1])
            else:
                current.append(1 + min(previous[j], current[j - 1], previous[j - 1]))
        previous = current

    return previous[-1]

In [6]:
# Fixed seed so the generated corruptions are the same on every run.
random.seed(42)

# One (corrupted, original) pair per word, in the order of all_words_list.
pairs = []
for word in all_words_list:
    pairs.append((corrupt_word(word), word))

# Pairs where the corruption pipeline left the word untouched.
identity_count = sum(noisy == clean for noisy, clean in pairs)

print("Total pairs:", len(pairs))
print("Already correct inputs:", identity_count)

# Histogram: edit distance -> number of pairs at that distance.
distance_groups = defaultdict(int)

for corrupted, correct in pairs:
    distance_groups[edit_distance(corrupted, correct)] += 1

print("Edit distance distribution:")
for d in sorted(distance_groups):
    print(f"\tdistance {d}: {distance_groups[d]}")



Total pairs: 77916
Already correct inputs: 64648
Edit distance distribution:
	distance 0: 64648
	distance 1: 9898
	distance 2: 3061
	distance 3: 290
	distance 4: 18
	distance 5: 1
