<a href="https://colab.research.google.com/github/ShiftLove/Spelling-Checker-Autocorrect/blob/main/Venacular_(Luo_Language)_autocorrect_Spell_Checker_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import re
import string
from collections import Counter
import numpy as np
import tensorflow as tf


In [5]:
def read_corpus(filename, encoding="utf-8"):
  r"""Read a text corpus and return its lowercased word tokens.

  Args:
    filename: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    A list with every ``\w+`` match found in the file, lowercased, in the
    order the tokens appear (duplicates are kept).
  """
  words = []
  with open(filename, "r", encoding=encoding) as file:
    # Iterate the file object directly so the corpus is streamed line by line
    # instead of being loaded into memory first.
    for line in file:
      words.extend(re.findall(r'\w+', line.lower()))

  return words

In [6]:
# Tokenise the corpus file once; `words` keeps duplicates and is the input for
# the vocabulary and frequency cells below.
words = read_corpus("./Luo.txt")
print(f"There are {len(words)} total words in the corpus")


There are 6179 total words in the corpus


In [7]:
# The vocabulary is the set of distinct tokens; membership in it is what the
# checker treats as "spelt correctly".
vocabs = set(words)
print(f"There are {len(vocabs)} unique words in the vocabulary")

There are 1817 unique words in the vocabulary


In [8]:
# Count how many times each token occurs in the corpus.
word_counts = Counter(words)
# Spot-check one entry of the counter.
print(word_counts["mane"])

53


In [9]:
# Convert raw counts to relative frequencies: each word's count divided by the
# total number of tokens in the corpus.
total_word_count = float(sum(word_counts.values()))
word_probas = {
  token: occurrences / total_word_count
  for token, occurrences in word_counts.items()
}

In [10]:
# Relative frequency of "mondo": its count divided by the total token count.
print(word_probas["mondo"])

0.008901116685547824


In [11]:
def split(word):
  """Return every (left, right) pair obtained by cutting `word` at one position.

  Cut positions run from 0 to len(word) inclusive, so the first pair has an
  empty left part and the last pair has an empty right part.
  """
  pairs = []
  for cut in range(len(word) + 1):
    left, right = word[:cut], word[cut:]
    pairs.append((left, right))
  return pairs

In [12]:
# Demo: all cut positions of a four-letter word (five pairs).
print(split("nege"))

[('', 'nege'), ('n', 'ege'), ('ne', 'ge'), ('neg', 'e'), ('nege', '')]


In [13]:
def delete(word):
  """Return every string formed by removing exactly one character from `word`.

  Results are ordered by the position of the removed character.
  """
  shortened = []
  for left, right in split(word):
    if not right:
      # The final split has nothing on the right, so there is no character to drop.
      continue
    shortened.append(left + right[1:])
  return shortened

In [14]:
# Demo: one result per character removed from "mondo".
print(delete("mondo"))

['ondo', 'mndo', 'modo', 'mono', 'mond']


In [15]:
def swap(word):
  """Return every string formed by transposing two adjacent characters of `word`.

  Results are ordered by the position of the transposed pair.
  """
  transposed = []
  for head, tail in split(word):
    if len(tail) < 2:
      # Fewer than two characters remain, so there is no adjacent pair to swap.
      continue
    first, second, rest = tail[0], tail[1], tail[2:]
    transposed.append(head + second + first + rest)
  return transposed

In [16]:
# Demo: each adjacent pair of "mondo" transposed in turn.
print(swap("mondo"))

['omndo', 'mnodo', 'modno', 'monod']


In [17]:
# The alphabet that replace() and insert() draw candidate letters from.
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [18]:
def replace(word):
  """Return every string formed by substituting one character of `word`.

  Each position is replaced by each lowercase ASCII letter in turn, so the
  result also contains `word` itself (a letter replaced by the same letter).
  """
  variants = []
  for head, tail in split(word):
    if tail:
      remainder = tail[1:]
      variants.extend(head + letter + remainder for letter in string.ascii_lowercase)
  return variants

In [19]:
# Demo: 26 substitutions for each of the five positions in "mondo".
print(replace("mondo"))

['aondo', 'bondo', 'condo', 'dondo', 'eondo', 'fondo', 'gondo', 'hondo', 'iondo', 'jondo', 'kondo', 'londo', 'mondo', 'nondo', 'oondo', 'pondo', 'qondo', 'rondo', 'sondo', 'tondo', 'uondo', 'vondo', 'wondo', 'xondo', 'yondo', 'zondo', 'mando', 'mbndo', 'mcndo', 'mdndo', 'mendo', 'mfndo', 'mgndo', 'mhndo', 'mindo', 'mjndo', 'mkndo', 'mlndo', 'mmndo', 'mnndo', 'mondo', 'mpndo', 'mqndo', 'mrndo', 'msndo', 'mtndo', 'mundo', 'mvndo', 'mwndo', 'mxndo', 'myndo', 'mzndo', 'moado', 'mobdo', 'mocdo', 'moddo', 'moedo', 'mofdo', 'mogdo', 'mohdo', 'moido', 'mojdo', 'mokdo', 'moldo', 'momdo', 'mondo', 'moodo', 'mopdo', 'moqdo', 'mordo', 'mosdo', 'motdo', 'moudo', 'movdo', 'mowdo', 'moxdo', 'moydo', 'mozdo', 'monao', 'monbo', 'monco', 'mondo', 'moneo', 'monfo', 'mongo', 'monho', 'monio', 'monjo', 'monko', 'monlo', 'monmo', 'monno', 'monoo', 'monpo', 'monqo', 'monro', 'monso', 'monto', 'monuo', 'monvo', 'monwo', 'monxo', 'monyo', 'monzo', 'monda', 'mondb', 'mondc', 'mondd', 'monde', 'mondf', 'mondg', 

In [20]:
def insert(word):
  """Return every string formed by inserting one lowercase ASCII letter into `word`.

  Letters are inserted at every position, including before the first and
  after the last character.
  """
  lengthened = []
  for head, tail in split(word):
    for letter in string.ascii_lowercase:
      lengthened.append(head + letter + tail)
  return lengthened

In [21]:
# Demo: 26 insertions at each of the six gaps in "mondo".
print(insert("mondo"))

['amondo', 'bmondo', 'cmondo', 'dmondo', 'emondo', 'fmondo', 'gmondo', 'hmondo', 'imondo', 'jmondo', 'kmondo', 'lmondo', 'mmondo', 'nmondo', 'omondo', 'pmondo', 'qmondo', 'rmondo', 'smondo', 'tmondo', 'umondo', 'vmondo', 'wmondo', 'xmondo', 'ymondo', 'zmondo', 'maondo', 'mbondo', 'mcondo', 'mdondo', 'meondo', 'mfondo', 'mgondo', 'mhondo', 'miondo', 'mjondo', 'mkondo', 'mlondo', 'mmondo', 'mnondo', 'moondo', 'mpondo', 'mqondo', 'mrondo', 'msondo', 'mtondo', 'muondo', 'mvondo', 'mwondo', 'mxondo', 'myondo', 'mzondo', 'moando', 'mobndo', 'mocndo', 'modndo', 'moendo', 'mofndo', 'mogndo', 'mohndo', 'moindo', 'mojndo', 'mokndo', 'molndo', 'momndo', 'monndo', 'moondo', 'mopndo', 'moqndo', 'morndo', 'mosndo', 'motndo', 'moundo', 'movndo', 'mowndo', 'moxndo', 'moyndo', 'mozndo', 'monado', 'monbdo', 'moncdo', 'monddo', 'monedo', 'monfdo', 'mongdo', 'monhdo', 'monido', 'monjdo', 'monkdo', 'monldo', 'monmdo', 'monndo', 'monodo', 'monpdo', 'monqdo', 'monrdo', 'monsdo', 'montdo', 'monudo', 'monvdo',

In [22]:
#Level 1 Edit
def edit1(word):
  """Return the set of strings one delete, swap, replace or insert away from `word`."""
  candidates = set()
  for generate in (delete, swap, replace, insert):
    candidates.update(generate(word))
  return candidates

In [23]:
# Demo: the de-duplicated union of all four edit types applied to "mondo".
print(edit1("mondo"))

{'zmondo', 'moqndo', 'mlndo', 'nondo', 'eondo', 'mondgo', 'mqondo', 'mondoc', 'mnodo', 'mondp', 'tmondo', 'mondfo', 'mondmo', 'mondi', 'londo', 'moncdo', 'mongo', 'mondz', 'moendo', 'mondqo', 'oondo', 'mondox', 'sondo', 'mondoh', 'tondo', 'monpdo', 'mondd', 'lmondo', 'gmondo', 'mondto', 'moedo', 'xmondo', 'mmndo', 'monhdo', 'moddo', 'dondo', 'monyo', 'mopndo', 'monbdo', 'msndo', 'mundo', 'monzo', 'mondt', 'monzdo', 'meondo', 'moqdo', 'emondo', 'msondo', 'mondoe', 'monxo', 'zondo', 'monjo', 'mondh', 'morndo', 'monio', 'monduo', 'moodo', 'mocndo', 'pondo', 'moydo', 'mhondo', 'mojndo', 'monmdo', 'mondoy', 'mqndo', 'wmondo', 'mondv', 'dmondo', 'mondom', 'myondo', 'bondo', 'momndo', 'mgndo', 'mondx', 'monod', 'imondo', 'monlo', 'mlondo', 'moldo', 'hmondo', 'monto', 'uondo', 'montdo', 'mofdo', 'mrondo', 'mofndo', 'pmondo', 'mvondo', 'monido', 'mokdo', 'monudo', 'mondg', 'mondoo', 'mohndo', 'mocdo', 'monvdo', 'mondco', 'mpndo', 'momdo', 'smondo', 'mondc', 'mtndo', 'mzondo', 'moyndo', 'aondo',

In [24]:
#Level 2 Edit
def edit2(word):
  """Return the set of strings reachable by applying edit1 twice to `word`."""
  twice_edited = set()
  for once_edited in edit1(word):
    twice_edited |= edit1(once_edited)
  return twice_edited

In [25]:
# Demo: every string produced by applying edit1 to each edit1 result of "mondo"
# (the output is large).
print(edit2("mondo"))

{'monidj', 'monthdo', 'monwdb', 'moxndw', 'imonio', 'asndo', 'gonzo', 'fodo', 'modns', 'smoendo', 'mosjo', 'mwndu', 'rmlndo', 'mmnwdo', 'lmonfo', 'jmondou', 'monydj', 'mindzo', 'xonjdo', 'kmondwo', 'mnngo', 'mpnbdo', 'miaondo', 'mondqao', 'qmondob', 'wmonado', 'gmjndo', 'zmonro', 'monotd', 'monaf', 'moidz', 'pmonds', 'mojwdo', 'moqnmdo', 'msohndo', 'nmwondo', 'monhdko', 'mondyn', 'movnsdo', 'vmkondo', 'mogdeo', 'mwnndo', 'mynmo', 'yondr', 'mnqdo', 'conado', 'aovdo', 'zmondwo', 'noydo', 'moxndi', 'smonko', 'mbnlo', 'mnondjo', 'moldfo', 'monrhdo', 'yonudo', 'motqndo', 'fmondmo', 'mondbon', 'wmlondo', 'muondio', 'mhomndo', 'mpjdo', 'qando', 'monndf', 'qmondbo', 'mojeo', 'mondspo', 'mosbo', 'monsdk', 'tmdondo', 'monor', 'gemondo', 'kondj', 'ymondo', 'mondopf', 'mkond', 'mounde', 'mofvdo', 'mondxjo', 'qdndo', 'mondof', 'flondo', 'mlnbdo', 'moodzo', 'monkado', 'mondoal', 'mgbndo', 'noxdo', 'mhfndo', 'mtiondo', 'xmoendo', 'pondob', 'mjondof', 'mojnao', 'xonrdo', 'msonds', 'monmdeo', 'mondbco'

In [26]:
def correct_spelling(word, vocabulary, word_probabilities):
  """Suggest in-vocabulary corrections for a possibly misspelt word.

  Candidates one edit away from `word` are tried first; only when none of them
  is in `vocabulary` are candidates two edits away considered.

  Args:
    word: The word to check.
    vocabulary: Container of known words (membership is tested with `in`).
    word_probabilities: Mapping from each known word to its probability.

  Returns:
    None if `word` is already in `vocabulary` (a message is printed in that
    case). Otherwise a list of (candidate, probability) tuples sorted by
    descending probability, ties broken alphabetically; the list is empty when
    no candidate within two edits is in the vocabulary.
  """
  if word in vocabulary:
    print(f"{word} is already correctly spelt")
    return None

  def known(candidates):
    # Keep only the generated strings that are real vocabulary words.
    return [w for w in candidates if w in vocabulary]

  # Filter BEFORE falling back: edit1() always returns a non-empty set (an
  # insertion is always possible), so `edit1(word) or edit2(word)` would never
  # reach the level-two candidates.
  best_guesses = known(edit1(word)) or known(edit2(word))
  # Sets have no stable iteration order; sort so the result is deterministic.
  best_guesses.sort(key=lambda w: (-word_probabilities[w], w))
  return [(w, word_probabilities[w]) for w in best_guesses]

In [32]:
word = "tich"
corrections = correct_spelling(word, vocabs, word_probas)

if corrections:
  print(corrections)
  # Pick the candidate with the highest probability; on ties max() keeps the
  # first one, matching what argmax over the probabilities would select.
  correct = max(corrections, key=lambda c: c[1])[0]
  print(f"{correct} is suggested for {word}")
elif corrections is not None:
  # An empty list means the word is unknown and no candidate was found.
  # None means the word is already correct, which correct_spelling has
  # reported itself, so nothing more is printed in that case.
  print('No word suggestion.')

tich is already correctly spelt
No word suggestion.


In [28]:
class SpellChecker(object):
  """Frequency-ranked spelling corrector built from a plain-text corpus.

  Attributes set by the constructor:
    vocabs: Set of distinct lowercased tokens found in the corpus.
    word_counts: Counter mapping each token to its number of occurrences.
    word_probas: Dict mapping each token to count / total token count.
  """

  def __init__(self, corpus_file_path, encoding="utf-8"):
    r"""Tokenise the corpus file and build the vocabulary and frequency tables.

    Args:
      corpus_file_path: Path of the text file to read.
      encoding: Text encoding used to decode the file. Defaults to UTF-8 so
        the result does not depend on the platform's locale encoding.
    """
    words = []
    with open(corpus_file_path, "r", encoding=encoding) as file:
      # Stream the file line by line; tokens are runs of \w characters.
      for line in file:
        words.extend(re.findall(r'\w+', line.lower()))

    self.vocabs = set(words)
    self.word_counts = Counter(words)
    total_words = float(sum(self.word_counts.values()))
    self.word_probas = {word: count / total_words for word, count in self.word_counts.items()}

  def _level_one_edits(self, word):
    """Return the set of strings one delete, swap, replace or insert away from `word`."""
    letters = string.ascii_lowercase
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l, r in splits if r]
    swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r) > 1]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    inserts = [l + c + r for l, r in splits for c in letters]

    return set(deletes + swaps + replaces + inserts)

  def _level_two_edits(self, word):
    """Return the set of strings reachable by applying a level-one edit twice."""
    return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

  def _known(self, candidates):
    """Return the subset of `candidates` that are vocabulary words."""
    return {w for w in candidates if w in self.vocabs}

  def check(self, word):
    """Return vocabulary words close to `word`, most probable first.

    Words one edit away are preferred; words two edits away are used only when
    no one-edit candidate is in the vocabulary. A word that is itself in the
    vocabulary appears among its own one-edit candidates, because replacing a
    letter with the same letter reproduces it.

    Returns:
      A list of (candidate, probability) tuples sorted by descending
      probability, ties broken alphabetically; empty when nothing within two
      edits is in the vocabulary.
    """
    # Filter BEFORE falling back: the raw level-one set is never empty (an
    # insertion is always possible), so testing it directly would make the
    # level-two fallback unreachable.
    candidates = self._known(self._level_one_edits(word)) or self._known(self._level_two_edits(word))
    return sorted(((c, self.word_probas[c]) for c in candidates), key=lambda tup: (-tup[1], tup[0]))


In [29]:
# Build the checker from the same corpus file used by the function-based cells.
checker = SpellChecker("./Luo.txt")

In [30]:
# Demo: ranked (candidate, probability) suggestions for a word with a missing letter.
checker.check("nomiy")

[('nomiyo', 0.0006473539407671144), ('nomiye', 0.0003236769703835572)]