In [2]:
import re
import string
from collections import Counter
import numpy as np

In [3]:
def read_corpus(filename):
  with open(filename, "r") as file:
    lines = file.readlines()
    words = []
    for line in lines:
      words += re.findall(r'\w+', line.lower())

  return words

In [6]:
words=read_corpus("t8.shakespeare.txt")
print(f'the length of my dataset is {len(words)}')

the length of my dataset is 929396


In [7]:
vocabs = set(words)
print(f"There are {len(vocabs)} unique words in the vocabulary")

There are 23902 unique words in the vocabulary


In [10]:
word_counts = Counter(words)
print(word_counts["come"])

2519


In [11]:
class SpellChecker(object):

  def __init__(self, corpus_file_path):
    with open(corpus_file_path, "r") as file:
      lines = file.readlines()
      words = []
      for line in lines:
        words += re.findall(r'\w+', line.lower())

    self.vocabs = set(words)
    self.word_counts = Counter(words)
    total_words = float(sum(self.word_counts.values()))
    self.word_probas = {word: self.word_counts[word] / total_words for word in self.vocabs}

  def _level_one_edits(self, word):
    letters = string.ascii_lowercase
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l,r in splits if r]
    swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r)>1]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    inserts = [l + c + r for l, r in splits for c in letters]

    return set(deletes + swaps + replaces + inserts)

  def _level_two_edits(self, word):
    return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

  def check(self, word):
    candidates = self._level_one_edits(word) or self._level_two_edits(word) or [word]
    valid_candidates = [w for w in candidates if w in self.vocabs]
    return sorted([(c, self.word_probas[c]) for c in valid_candidates], key=lambda tup: tup[1], reverse=True)


In [12]:
ccc=SpellChecker("t8.shakespeare.txt")

In [17]:
if(ccc.check('usman')==None):
  ccc.vocabs.add('usman')
else:
  ccc.check('usman')


In [18]:
ccc.check('usman')

[]