In [1]:
import torch
import random

from utils import corrupt_word, edit_distance, encode_word, decode_indices
from models import Encoder, Decoder, Seq2Seq

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

model_path = 'georgian_spellcheck_seq2seq.pt'
checkpoint = torch.load(model_path, map_location = device)

itos = checkpoint['vocab']['itos']
stoi = checkpoint['vocab']['stoi']
pad_idx = checkpoint['vocab']['pad_idx']
sos_idx = checkpoint['vocab']['sos_idx']
eos_idx = checkpoint['vocab']['eos_idx']
unk_idx = checkpoint['vocab']['unk_idx']

embedding_dim = checkpoint['config']['embedding_dim']
hidden_dim = checkpoint['config']['hidden_dim']
num_layers = checkpoint['config']['num_layers']
dropout = checkpoint['config']['dropout']
max_target_len = checkpoint['config']['max_target_len']

vocab_size = len(itos)

Using device: cpu


In [3]:
encoder = Encoder(vocab_size, embedding_dim, hidden_dim, pad_idx, num_layers, dropout)
decoder = Decoder(vocab_size, embedding_dim, hidden_dim, pad_idx, num_layers, dropout)

model = Seq2Seq(
    encoder,
    decoder,
    sos_idx,
    eos_idx,
    pad_idx,
    max_target_len,
    vocab_size
).to(device)

model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

print('Model loaded')

Model loaded


In [4]:
def correct_word(word, model = model):
    src_indices = encode_word(word, stoi, unk_idx, eos_idx)
    src_tensor = torch.tensor(src_indices, dtype = torch.long).unsqueeze(0).to(device)
    src_length = torch.tensor([len(src_indices)], dtype = torch.long).to(device)

    with torch.no_grad():
        logits = model(src_tensor, src_length, teacher_forcing_ratio = 0.0)

    token_indices = logits.argmax(dim = 2)[0].tolist()
    return decode_indices(token_indices, itos, eos_idx, pad_idx)

In [5]:
with open('georgian_words.txt', 'r', encoding = 'utf-8') as f:
    all_words_list = [w for w in f.read().split(',') if w]

print("Loaded words:", len(all_words_list))

Loaded words: 77263


## Check on words from the dataset

In [6]:
random.seed(42)

sample_words_seen = random.sample(all_words_list, 1000)
results_seen = []

for word in sample_words_seen:
    corrupted = corrupt_word(word)
    corrected = correct_word(corrupted)
    results_seen.append((word, corrupted, corrected))

correct_cnt_seen = 0
print('Correctly fixed seen words:\n')

for real_world, corrupted, predicted in results_seen:
    if corrupted != real_world and predicted == real_world:
        correct_cnt_seen += 1
        print(corrupted, '->', predicted, '| real_world:', real_world)

print()
print('Fixed a total of', correct_cnt_seen, 'seen words')

print('\n==============================================================\n')

incorrect_cnt_seen = 0
print('Not fixed seen words:\n')

for real_world, corrupted, predicted in results_seen:
    if corrupted != real_world and predicted != real_world:
        incorrect_cnt_seen += 1
        print(corrupted, '->', predicted, '| real_world:', real_world)

print()
print('Couldn\'t fix a total of', incorrect_cnt_seen, 'seen words')

Correctly fixed seen words:

კანიშ -> კანის | real_world: კანის
დათქქმულ -> დათქმულ | real_world: დათქმულ
ჭევრ -> წევრ | real_world: წევრ
პირობიტად -> პირობითად | real_world: პირობითად
ნაწარმოებსი -> ნაწარმოებში | real_world: ნაწარმოებში
აშლიშ -> აშლის | real_world: აშლის
ჩამოვჟექი -> ჩამოვჯექი | real_world: ჩამოვჯექი
გადავიწყებბული -> გადავიწყებული | real_world: გადავიწყებული
ითამაშშეს -> ითამაშეს | real_world: ითამაშეს
ზურეებიანი -> ზურებიანი | real_world: ზურებიანი
იილი -> ილი | real_world: ილი
სააამისო -> საამისო | real_world: საამისო
საცცოლედ -> საცოლედ | real_world: საცოლედ
იძინებშ -> იძინებს | real_world: იძინებს
წყალივიტ -> წყალივით | real_world: წყალივით
სეჰპირდა -> შეჰპირდა | real_world: შეჰპირდა
ილაპარაკოშ -> ილაპარაკოს | real_world: ილაპარაკოს
მმოხვიდე -> მოხვიდე | real_world: მოხვიდე
ჩამოტვლისაშ -> ჩამოთვლისას | real_world: ჩამოთვლისას
ფღინველით -> ფრინველით | real_world: ფრინველით
შაწოლებს -> საწოლებს | real_world: საწოლებს
წაიქჩა -> წაიქცა | real_world: წაიქცა
ფუნქჩიონირ

In [7]:
total_corrupted_seen = correct_cnt_seen + incorrect_cnt_seen

if total_corrupted_seen > 0:
    accuracy_seen = (correct_cnt_seen / total_corrupted_seen) * 100
    print(f'Correction Accuracy (Seen Words): {accuracy_seen:.2f}%')
else:
    print('No corrupted words were found in the seen results.')

Correction Accuracy (Seen Words): 53.29%


## Check on words not from the dataset

In [8]:
import requests
import re
from bs4 import BeautifulSoup
import time

In [9]:
urls = [
    'https://www.ambebi.ge/',
    'https://www.ambebi.ge/article/336991-didi-siprtxilisken-movucodeb-qvelas-ra-unda-g/',
    'https://www.ambebi.ge/article/336981-nickelodeon-is-varskvlavis-tragikuli-dacema-msaxi/',
    'https://www.palitravideo.ge/video/185656-xvicha-kvaracxelia-videomimartvas-avrcelebs/',
    'https://www.palitravideo.ge/video/185653-axali-gadacqvetileba-romelic-2026-clis-1-eli-ianvridan-amokmeddeba-da-mzgolebs-sheexeba-ris-gamo-dajarimdebit-100-laris-odenobit/'
]

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

all_scraped_words = set()

print('Starting multi-link scrape for unseen words...')

for url in urls:
    try:
        response = requests.get(url, headers = headers, timeout = 10)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        
        page_text = soup.get_text()
        found = re.findall(r'[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ]{4,}', page_text)
        all_scraped_words.update(found)
        
        print(f'Successfully scraped {len(found)} words from: {url[:50]}...')
        time.sleep(1) 
    except Exception as e:
        print(f'Failed to scrape {url}: {e}')

existing_words_set = set(all_words_list)
unseen_words = [w for w in all_scraped_words if w not in existing_words_set]

print(f'\nTotal unique unseen words found: {len(unseen_words)}')

results_unseen = []

for word in unseen_words:
    corrupted = corrupt_word(word)
    corrected = correct_word(corrupted)
    results_unseen.append((word, corrupted, corrected))

correct_cnt_unseen = 0
print('\nCorrectly fixed unseen words:\n')

for real_world, corrupted, predicted in results_unseen:
    if corrupted != real_world and predicted == real_world:
        correct_cnt_unseen += 1
        print(corrupted, '->', predicted, '| real_world:', real_world)

print()
print('Fixed a total of', correct_cnt_unseen, 'unseen words')

print('\n==============================================================\n')

incorrect_cnt_unseen = 0
print('Not fixed unseen words:\n')

for real_world, corrupted, predicted in results_unseen:
    if corrupted != real_world and predicted != real_world:
        incorrect_cnt_unseen += 1
        if incorrect_cnt_unseen <= 15:
            print(corrupted, '->', predicted, '| real_world:', real_world)

print()
print('Couldn\'t fix a total of', incorrect_cnt_unseen, 'unseen words')

Starting multi-link scrape for unseen words...
Successfully scraped 817 words from: https://www.ambebi.ge/...
Successfully scraped 520 words from: https://www.ambebi.ge/article/336991-didi-siprtxil...
Successfully scraped 459 words from: https://www.ambebi.ge/article/336981-nickelodeon-i...
Successfully scraped 132 words from: https://www.palitravideo.ge/video/185656-xvicha-kv...
Successfully scraped 131 words from: https://www.palitravideo.ge/video/185653-axali-gad...

Total unique unseen words found: 247

Correctly fixed unseen words:

ისაუბღებენ -> ისაუბრებენ | real_world: ისაუბრებენ
გვიჭყობს -> გვიწყობს | real_world: გვიწყობს
გაუარეშდება -> გაუარესდება | real_world: გაუარესდება
ვერკჰაიზეღმა -> ვერკჰაიზერმა | real_world: ვერკჰაიზერმა
გაიმეტოო -> გაიმეტო | real_world: გაიმეტო
იშგან -> ისგან | real_world: ისგან
ცაუყენონ -> ჩაუყენონ | real_world: ჩაუყენონ
დაკალიბრებულიი -> დაკალიბრებული | real_world: დაკალიბრებული
ბავშვისთვიშაც -> ბავშვისთვისაც | real_world: ბავშვისთვისაც
ტრევიშ -> ტრე

In [10]:
total_corrupted_unseen = correct_cnt_unseen + incorrect_cnt_unseen

if total_corrupted_unseen > 0:
    accuracy_unseen = (correct_cnt_unseen / total_corrupted_unseen) * 100
    print(f'Correction Accuracy (Unseen Words): {accuracy_unseen:.2f}%')
else:
    print('No corrupted words were found in the unseen results.')

Correction Accuracy (Unseen Words): 52.17%


Based on the results, models performs decently on both seen words (but with different errors) and unseen errors (completely different errors). If dataset was bigger, results would be even better, but training such a model would take a lot more resources.