In [10]:
from tensorflow import keras
model = keras.models.load_model('mymodel3.h5')

In [2]:
from keras.preprocessing.text import Tokenizer
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import re
from keras.utils import to_categorical


In [3]:
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
import re

In [4]:
corpus = []
for line in fetch_20newsgroups().data:
    line = line.replace('\n', ' ').replace('\t', ' ').lower()
    line = re.sub('[^a-z ]', ' ', line) # to remove single characters words
    tokens = line.split(' ')
    tokens = [token for token in tokens if len(token) > 0]
    corpus.extend(tokens)

corpus = Counter(corpus)

In [5]:
class SpellCheck:
    def __init__(self, dictionary=None, verbose=0):
        self.verbose = verbose
        self.dictionary = dictionary
        
    def correction(self, text):
        return ''

In [6]:
class SpellCorrector(SpellCheck):
    def __init__(self, dictionary, verbose=0):
        super().__init__(dictionary=dictionary, verbose=verbose)

    def words(text):
        return re.findall(r'\w+', text.lower())

    def P(self, word): 
        "Probability of `word`."
        N = sum(self.dictionary.values())
        return self.dictionary[word] / N

    def correction(self, word): 
        "Most probable spelling correction for word."
        return max(self.candidates(word ,display = False), key=self.P)

    def candidates(self, word, verbose=0, display = True): 
        "Generate possible spelling corrections for word."
        
        known_result = self.known([word])
        edit1_result = self.known(self.edits1(word))
        edit2_result = self.known(self.edits2(word))
        
        if (self.verbose > 0 or verbose > 0) and display:
            print('Known Result: ', known_result)
            print('Edit1 Result: ', edit1_result)
            print('Edit2 Result: ', edit2_result)
        
        return (known_result or edit1_result or edit2_result or [word])

    def known(self, words):
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in words if w in self.dictionary)

    def edits1(self, word):
        "All edits that are one edit away from `word`."
        letters    = 'abcdefghijklmnopqrstuvwxyz'
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
        inserts    = [L + c + R               for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word): 
        "All edits that are two edits away from `word`."
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))

In [14]:
spell_corrector.known(spell_corrector.candidates("ello"))

Known Result:  set()
Edit1 Result:  {'ella', 'ell', 'elco', 'elmo', 'llo', 'hello', 'elbo', 'eelo'}
Edit2 Result:  {'ills', 'tell', 'elx', 'belle', 'llb', 'eclr', 'eico', 'ehl', 'esl', 'elkp', 'jll', 'elr', 'elie', 'elte', 'ewlm', 'zllq', 'solo', 'egl', 'hilo', 'kell', 'holo', 'hells', 'valo', 'enzo', 'anello', 'll', 'eslx', 'dlvo', 'gallo', 'lle', 'alla', 'ejo', 'elbow', 'lno', 'ezl', 'kolo', 'llly', 'eko', 'lo', 'etla', 'evo', 'alle', 'kelly', 'clio', 'eio', 'elco', 'vll', 'elmo', 'mlo', 'else', 'llm', 'lli', 'sll', 'elw', 'vllu', 'sell', 'pelle', 'klao', 'esao', 'ellas', 'telli', 'eso', 'llq', 'elson', 'eller', 'erla', 'hallo', 'palo', 'elton', 'ill', 'krlo', 'erno', 'lll', 'ewl', 'eliz', 'yellow', 'cells', 'wllz', 'menlo', 'eero', 'belli', 'ecco', 'lfo', 'mella', 'edlr', 'enli', 'eozo', 'delco', 'colo', 'ulo', 'eml', 'elc', 'ebo', 'exo', 'wells', 'eclu', 'leno', 'lbo', 'elh', 'mll', 'efo', 'elof', 'kelso', 'elin', 'kella', 'vlo', 'fellow', 'ally', 'els', 'lho', 'elp', 'eopo', 'epld

{'eelo', 'elbo', 'elco', 'ell', 'ella', 'elmo', 'hello', 'llo'}

In [7]:
f = open("big.txt")
data = f.read()
len(data)

3603366

In [8]:
cleaned = re.sub(r'\W+', ' ',data[:300000]).lower()
tokens = word_tokenize(cleaned)
train_len = 3+1
text_sequences = []
for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)
sequences = {}
count = 1
for i in range(len(tokens)):
    if tokens[i] not in sequences:
        sequences[tokens[i]] = count
        count += 1
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [9]:
r = True
seq_len = 3
spell_corrector = SpellCorrector(dictionary=corpus, verbose=1)#creating an object
while r:
    from keras.preprocessing.sequence import pad_sequences
    input_text = input().strip().lower()
    #add spell correction here
    #converts the input text to encoded values 
    encoded_text = tokenizer.texts_to_sequences([input_text])[0]
    
    #selects the last 3 encoded values , incase of low number of words 0 is added at the begining of the array
    pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
    #print(encoded_text, pad_encoded)
    
    for i in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
      pred_word = tokenizer.index_word[i]
      print("Next word suggestion:",pred_word)
    spell = input_text.split(" ")[-1]
    print("spell Suggest:\t",spell_corrector.correction(spell))
    if input_text == "quit":
        r = False

how are you 
[90, 48, 12] [[90 48 12]]
Next word suggestion: the
Next word suggestion: really
Next word suggestion: he
spell Suggest:	 you
how are yoo
[90, 48] [[ 0 90 48]]
Next word suggestion: it
Next word suggestion: of
Next word suggestion: that
spell Suggest:	 yoo
ello
[] [[0 0 0]]
Next word suggestion: the
Next word suggestion: holmes
Next word suggestion: to
spell Suggest:	 hello
how are yo
[90, 48] [[ 0 90 48]]
Next word suggestion: it
Next word suggestion: of
Next word suggestion: that
spell Suggest:	 yo
quit
[] [[0 0 0]]
Next word suggestion: the
Next word suggestion: holmes
Next word suggestion: to
spell Suggest:	 quit
