In [1]:
import sys
import os
sys.path.append('..')

In [2]:
from enum import Enum
import numpy as np
from Levenshtein import distance as levenshtein_distance_function

class EditType(Enum):
    none = 0
    insert = 1
    replace = 2
    delete = 3
    transpose = 4
    
    def __lt__(self, other):
        self.value < other.value
        
    def probability(self):
        """
            A paper by Kukich (1992) suggested the following
            error rates based on an analysis of spelling errors
            in various databases:

            Substitution errors: 80%
            Deletion errors: 10%
            Insertion errors: 5%
            Transposition errors: 5%
        """
        
        
        if self == EditType.none:
            return 0
        elif self == EditType.insert:
            return 0.05
        elif self == EditType.replace:
            return 0.8
        elif self == EditType.delete:
            return 0.1
        else:
            return 0.05

class Edit:

    def __init__(self, 
                 word, 
                 edit,
                 edit_type: EditType,
                 edit_word_probability,
                 keyboard_distance,
                 prev_edit = None):
        self.word = word # the misspelled word
        self.edit = edit # the edit that would fix the misspelled word
        self.edit_type = edit_type # edit type, refer to the enum above
        self.prev_edit = prev_edit # reference to the previous edit, if any
        self.edit_word_probability = edit_word_probability # the probability of the suggested edit to occur in the whole text curpus
        self.keyboard_distance = keyboard_distance # for example, if edit_type is replace, keyboard_distance is the manhattan distance between the correct and wrong letters on a qwerty layout keyabord
        self.levenshtein_distance = levenshtein_distance_function(word, edit)
        
    def __hash__(self):
        return hash((self.edit, self.edit_type))

    def __eq__(self,other):
        return self.edit == other.edit and self.edit_type == other.edit_type
    
    def __repr__(self):
        return f"""
        Edit of
            content = {self.word}
            edit = {self.edit}
            type = {self.edit_type.name}
            edit word probaility = {self.edit_word_probability}
            keyboard_distance = {self.keyboard_distance}
            levenshtein distance = {self.levenshtein_distance}
            previous edit = 
                {self.prev_edit}
        """
    
    def probability(self, keyboard_weight = 1.0, edit_type_weight = 0.1, word_probability_weight = 20.0):
        # Scale the distances and probabilities so they have similar magnitudes.
        keyboard_distance_score = 1 / (1 + self.keyboard_distance + self.levenshtein_distance) ** keyboard_weight
        edit_type_probability = self.edit_type.probability() ** edit_type_weight
        word_probability = self.edit_word_probability ** word_probability_weight

        # Combine them using multiplication: this will return high probability only if all factors are high.
        combined_probability = keyboard_distance_score * edit_type_probability * word_probability
        return combined_probability




In [7]:
import re
from collections import Counter
from core.constants import keyboard_letters, neighbour_letters, keyboard_layouts

class SpellChecker:
    
    def __init__(self, language = "en", override_file = None):
        
        if override_file:
            file_to_open = override_file
        else:
            file_to_open = f'texts/big_{language}.txt'
        
        with open(file_to_open, "r") as file:
            raw_text = file.read()
            raw_words = re.findall(r'\w+', raw_text.lower())
            self.WORDS = Counter(raw_words)
        
        self.N = sum(self.WORDS.values())
        self.letters = keyboard_letters[language]
        self.neighbour_letters = neighbour_letters[language]
        self.keyboard_layout = keyboard_layouts[language]
        
    def keyboard_distance(self, key1, key2):

        key_coordinates = {}

        for i, row in enumerate(self.keyboard_layout):
            for j, key in enumerate(row):
                key_coordinates[key] = (i, j)

        if key1 not in key_coordinates or key2 not in key_coordinates:
            return 0

        x1, y1 = key_coordinates[key1]
        x2, y2 = key_coordinates[key2]

        return abs(x1 - x2) + abs(y1 - y2)
        
    def P(self, word): 
        "Probability of `word`."
        
        # first sort by probability, then sort by edit_type, then sort by keyboard distance, if any
        return self.WORDS[word] / self.N

    def correction(self, word): 
        "Most probable spelling correction for word."
        
        # correction is defined as following:
        candidates = list(self.candidates(word))
        
        candidates.sort(key = lambda x: self.P(x), reverse = True)
        candidates.sort(key = lambda x: x.keyboard_distance)
        candidates.sort(key = lambda x: x.edit_type.value)
        
        return candidates[0]

    def candidates(self, word): 
        "Generate possible spelling corrections for word."
        return (self.known([Edit(word, word, EditType.none, 0, 0)]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or [Edit(word, word, EditType.none, 0, 0)])

    def known(self, edits): 
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in edits if w.edit in self.WORDS)

    def edits1(self, word, prev_edit: Edit = None):
        "All edits that are one edit away from `word`."
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        
        deletes    = set(self.__edits_deletes(word, splits, prev_edit))
        transposes = set(self.__edits_transposes(word, splits, prev_edit))
        replaces   = set(self.__edits_replaces(word, splits, prev_edit))
        inserts    = set(self.__edits_inserts(word, splits, prev_edit))
        all_set = deletes.union(transposes).union(replaces).union(inserts)
        print([i.edit for i in replaces])
        
        return all_set

    def edits2(self, word): 
        "All edits that are two edits away from `word`."
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1.edit, prev_edit = e1))
    
    def __edits_transposes(self, word, splits, prev_edit: Edit):
        for L, R in splits:
            if len(R) > 1:
                edit_word = L + R[1] + R[0] + R[2:]
                yield Edit(
                    word = word, 
                    edit = edit_word,
                    edit_type = EditType.transpose,
                    prev_edit = prev_edit, 
                    edit_word_probability = self.P(edit_word),
                    keyboard_distance = 1
                )
    
    def __edits_inserts(self, word, splits, prev_edit: Edit):
        for L, R in splits:
            for c in self.letters:
                
                # Left's distance
                if len(L) > 0:
                    left = L[-1]
                    L_distance = self.keyboard_distance(left, c)
                else:
                    L_distance = -1
                
                edit_word = L + c + R
                yield Edit(
                    word = word,
                    edit = edit_word,
                    edit_type = EditType.insert,
                    prev_edit = prev_edit,
                    edit_word_probability = self.P(edit_word),
                    keyboard_distance = L_distance)
    
    def __edits_deletes(self, word, splits, prev_edit: Edit):
        for L, R in splits:
            if R:
                omitted = R[0]
                
                # Left's distance
                if len(L) > 0:
                    left = L[-1]
                    L_distance = self.keyboard_distance(left, omitted)
                else:
                    L_distance = -1
                    
                # Left's distance
                if len(R) > 0:
                    right = R[-1]
                    R_distance = self.keyboard_distance(right, omitted)
                else:
                    R_distance = -1
                
                key_distance = min(L_distance, R_distance)
                edit_word = L + R[1:]
                yield Edit(
                    word = word,
                    edit = edit_word,
                    edit_type = EditType.delete,
                    prev_edit = prev_edit, 
                    edit_word_probability = self.P(edit_word),
                    keyboard_distance = key_distance
                )
    
    def __edits_replaces(self, word, splits, prev_edit: Edit):
        for L, R in splits:
            if R:
                for c in self.letters:
                    
                    omitted = R[0]
                    key_distance = self.keyboard_distance(c, omitted)
                    edit_word = L + c + R[1:]
                    yield Edit(
                        word = word, 
                        edit = edit_word,
                        edit_type = EditType.replace, 
                        prev_edit = prev_edit,
                        edit_word_probability = self.P(edit_word),
                        keyboard_distance = key_distance
                    )
    

In [8]:
correct = ["տետր", "պատուհան", "Կարինե", "կարտոֆիլ", "գրիչ", "անհավանական", "քննություն", "արհեստական", "բնական", "համակարգիչ", "մատիտ", "սեղան", "գրատախտակ", "ականջակալ", "քաղաք", "հանրապետություն", "թուղթ", "գրադարան", "վարձակալություն", "գրագետ"]
correct = ["տետհ", "պատուեան", "ուարինե", "կարտոֆիլ", "գրիչ", "անհավանական", "քննություն", "արհեստական", "բնական", "համակարգիչ", "մատիտ", "սեղան", "գրատախտակ", "ականջակալ", "քաղաք", "հանրապետություն", "թուղթ", "գրադարան", "վարձակալություն", "գրագետ"]


In [11]:
checker = SpellChecker(language = "am", override_file = "../texts/big_am.txt")


In [13]:
checker.correction("քննություլ")

['քնէություլ', 'քննությոգլ', 'ոննություլ', 'քչնություլ', 'քննութծուլ', 'քղնություլ', 'քճնություլ', 'ւննություլ', 'փննություլ', 'քննոխթյուլ', 'քննզւթյուլ', 'քննոյթյուլ', 'քննություվ', 'քննություկ', 'քնմություլ', 'քննգւթյուլ', 'քննությոփլ', 'քննություփ', 'շննություլ', 'քննությում', 'քթնություլ', 'քննությորլ', 'քննություս', 'քնղություլ', 'քննութչուլ', 'քննությոծլ', 'քննությդւլ', 'քւնություլ', 'քննություպ', 'քննությզւլ', 'քննութձուլ', 'յննություլ', 'քհնություլ', 'քննությոիլ', 'քննփւթյուլ', 'քշնություլ', 'քննճւթյուլ', 'քննլւթյուլ', 'քննուբյուլ', 'քնկություլ', 'քննութւուլ', 'քնշություլ', 'քննությտւլ', 'քննոդթյուլ', 'քննուեյուլ', 'քննութիուլ', 'քննությրւլ', 'քննողթյուլ', 'քննություա', 'քննուֆյուլ', 'քննությււլ', 'քննոձթյուլ', 'քննություո', 'քննություգ', 'քնւություլ', 'քննությքւլ', 'քննուըյուլ', 'քննությոկլ', 'քննությոոլ', 'քննութնուլ', 'քննությռւլ', 'քննուվյուլ', 'քնբություլ', 'քնննւթյուլ', 'քննոհթյուլ', 'քննոեթյուլ', 'քննութօուլ', 'քննուճյուլ', 'քնրություլ', 'քննոճթյուլ', 'քննությոթլ', 'քննո


        Edit of
            content = քննություլ
            edit = քննություն
            type = replace
            edit word probaility = 3.924292548160881e-06
            keyboard_distance = 5
            levenshtein distance = 1
            previous edit = 
                None
        

In [116]:
len(checker.WORDS)

32198

In [117]:
checker.correction("receit")


        Edit of
            content = receit
            edit = receipt
            type = insert
            edit word probaility = 1.1653078877898144e-05
            keyboard_distance = 2
            levenshtein distance = 1
            previous edit = 
                None
        

In [118]:
candidates = list(checker.candidates("receit")) # receipt
candidates.sort(key = lambda x: checker.P(x), reverse = True)
candidates.sort(key = lambda x: x.keyboard_distance)
candidates.sort(key = lambda x: x.edit_type.value)
candidates

[
         Edit of
             content = receit
             edit = receipt
             type = insert
             edit word probaility = 1.1653078877898144e-05
             keyboard_distance = 2
             levenshtein distance = 1
             previous edit = 
                 None
         ,
 
         Edit of
             content = receit
             edit = deceit
             type = replace
             edit word probaility = 3.585562731660967e-06
             keyboard_distance = 2
             levenshtein distance = 1
             previous edit = 
                 None
         ,
 
         Edit of
             content = receit
             edit = recent
             type = replace
             edit word probaility = 4.750870619450781e-05
             keyboard_distance = 4
             levenshtein distance = 1
             previous edit = 
                 None
         ]

In [108]:
candidates = list(checker.candidates("receit")) # receipt
[(edit.probability(), edit.edit) for edit in candidates]

[(0.0, 'deceit'), (0.0, 'recent'), (0.0, 'receipt')]

In [110]:
checker.correction("mre")


        Edit of
            content = mre
            edit = more
            type = insert
            edit word probaility = 0.001790092193781738
            keyboard_distance = 4
            levenshtein distance = 1
            previous edit = 
                None
        

In [111]:
candidates = checker.candidates("mre") # more
candidates

{
         Edit of
             content = mre
             edit = are
             type = replace
             edit word probaility = 0.0032538981789823275
             keyboard_distance = 7
             levenshtein distance = 1
             previous edit = 
                 None
         ,
 
         Edit of
             content = mre
             edit = ere
             type = replace
             edit word probaility = 8.963906829152417e-07
             keyboard_distance = 6
             levenshtein distance = 1
             previous edit = 
                 None
         ,
 
         Edit of
             content = mre
             edit = ire
             type = replace
             edit word probaility = 8.963906829152417e-07
             keyboard_distance = 3
             levenshtein distance = 1
             previous edit = 
                 None
         ,
 
         Edit of
             content = mre
             edit = mare
             type = insert
             edit word pro

In [87]:
checker.correction("evar")


        Edit
            content = ever
            type = replace
            previous edit = 
                None
            keyboard_distance = 3
        

In [35]:
candidates = checker.candidates("evar") # ever
candidates

{
         Edit
             content = ear
             type = delete
             previous edit = 
                 None
             keyboard_distance = 2
         ,
 
         Edit
             content = eva
             type = delete
             previous edit = 
                 None
             keyboard_distance = 0
         ,
 
         Edit
             content = ever
             type = replace
             previous edit = 
                 None
             keyboard_distance = 3
         }

In [3]:
file_to_open = f'big_{"en"}.txt'
        
with open(file_to_open, "r") as file:
    corpus = file.read()
    


In [14]:
from collections import defaultdict

def train_language_model(corpus):
    # A function to train a trigram language model
    model = defaultdict(lambda: defaultdict(lambda: 0))

    for sentence in corpus:
        sentence = sentence.split()

        for w1, w2, w3 in zip(sentence[:-2], sentence[1:-1], sentence[2:]):
            model[(w1, w2)][w3] += 1  # Increase the count each time a trigram is encountered

    # Normalizing the counts to get probabilities
    for w1_w2 in model:
        total_count = float(sum(model[w1_w2].values()))
        for w3 in model[w1_w2]:
            model[w1_w2][w3] /= total_count

    return model


In [15]:
def predict_next_word(sentence, possible_words, model):
    # A function to predict the next word using a trigram language model
    sentence = sentence.split()
    if len(sentence) < 2:
        return None

    w1, w2 = sentence[-2], sentence[-1]
    probabilities = {word: model[(w1, w2)][word] for word in possible_words}
    print(probabilities)
    return max(probabilities, key=probabilities.get)


In [17]:
model = train_language_model(corpus)

In [21]:
model[("going", "to")]["him"]

0

In [131]:
len(raw_text)

6488665

In [135]:
predict_next_word("and according", ["to", "eavesdropping", "Anaconda", "me", "him"], model)

{'to': 0, 'eavesdropping': 0, 'Anaconda': 0, 'me': 0, 'him': 0}


'to'

In [12]:
import nltk
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline

def train_model(n, tokenized_text):
    train_data, padded_sentences = padded_everygram_pipeline(n, tokenized_text)
    model = Laplace(n)
    model.fit(train_data, padded_sentences)
    return model

def predict_next_word(model, sentence, list_of_words):
    tokenized_sentence = list(map(str.lower, nltk.word_tokenize(sentence)))
    # Using the model's vocabulary, find the missing words not in the vocabulary and give them a score of 0
    missing_words = set(list_of_words) - set(model.vocab)
    scores = {word: 0 for word in missing_words}
    
    # For words in the model's vocabulary, estimate the probability of the next word
    for word in model.vocab:
        if word in list_of_words:
            scores[word] = model.score(word, tokenized_sentence[-(model.order - 1):])
    print(scores)
    # Return the word with the highest probability
    return max(scores, key=scores.get)

In [9]:
# Training
n = 3
tokenized_text = [list(map(str.lower, nltk.word_tokenize(sent))) for sent in corpus]
model = train_model(n, tokenized_text)

In [13]:
# Prediction
sentence = "I am going"
list_of_words = ["to", "me", "Anaconda", "eavesdropping", "him"]
predicted_word = predict_next_word(model, sentence, list_of_words)

{'him': 0, 'eavesdropping': 0, 'to': 0, 'me': 0, 'Anaconda': 0}


In [11]:
predicted_word

'him'

In [22]:
from nltk.util import ngrams
from collections import Counter

def predict_next_word(sentence, possible_words, n=3):
    # split sentence into tokens
    tokens = nltk.word_tokenize(sentence)
    
    # create ngrams
    grams = list(ngrams(tokens, n))
    
    # count ngrams
    ngram_counts = Counter(grams)
    preceding_bigram = tuple(tokens[-(n-1):])  # get last (n-1) words from sentence

    # count preceding bigrams
    preceding_bigram_counts = Counter({key:val for key,val in ngram_counts.items() if key[:-1] == preceding_bigram})
    total_preceding_bigram_counts = sum(preceding_bigram_counts.values())

    # get trigram probabilities
    trigram_probabilities = {}
    for word in possible_words:
        trigram = preceding_bigram + (word,)
        trigram_count = ngram_counts.get(trigram, 0) + 1  # add-one Laplace Smoothing
        trigram_probability = trigram_count / (total_preceding_bigram_counts + len(possible_words))  # add V to denominator
        trigram_probabilities[word] = trigram_probability

    return trigram_probabilities
