### Upgrade pip and install TextBlob

In [0]:
# Upgrade pip using the interpreter at /databricks/python3/bin/python, then install TextBlob.
# NOTE(review): the first command names the interpreter by absolute path while the second
# uses whichever `pip` is first on PATH -- confirm both target the kernel's environment
# (the `%pip` magic is the usual way to guarantee that).
!/databricks/python3/bin/python -m pip install --upgrade pip
!pip install textblob

### Imports and create list of misspelt words

In [0]:
import textblob
from textblob import TextBlob

# Sample phrases (mostly misspelt) fed to TextBlob in the cells below.
words = [
    'ageunt orage', 'agint orange', 'ageant oragne', 'ageant orange',
    'agetn orange', 'asian orange',
    'camp legum', 'camp legume', 'camp legoon', 'camp lejoon',
    'Camp Lejeune', 'Camp Lejune', 'Camp Lajeune',
]

### Run TextBlob on misspelt words

In [0]:
# Display every sample phrase next to the text returned by TextBlob's correct().
[' ------> '.join((word, str(TextBlob(word).correct()))) for word in words]

### Add words to the spelling dictionary

In [0]:
# Add two extra entries to TextBlob's English spelling dictionary.
# NOTE(review): the key 'Agent Orange' contains a space; presumably the corrector looks
# words up one token at a time, in which case this entry can never be suggested --
# confirm against TextBlob's spelling implementation.
textblob.en.spelling.update({'Lejeune':1})
textblob.en.spelling.update({'Agent Orange':1})

### Rerun TextBlob after the new words have been added to the dictionary

In [0]:
# Same display as before: each sample phrase followed by TextBlob's corrected text.
[' ------> '.join((word, str(TextBlob(word).correct()))) for word in words]

### Wrap the misspelt-words model in a function

In [0]:

def spelling_corrector(words):
    """
    Run TextBlob's spelling correction over every phrase in ``words``.

    Args:
        words: iterable of strings (single words or multi-word phrases).

    Returns:
        A list with one string per input, formatted as
        ``"<original> ------> <corrected>"``.

    Side effects:
        Adds the entries 'Lejeune' and 'burn pit' to TextBlob's English
        spelling dictionary on every call.
    """
    # Imported inside the function so the definition itself does not require
    # TextBlob to be importable.
    import textblob
    from textblob import TextBlob

    # NOTE(review): 'burn pit' contains a space; presumably the corrector works
    # token by token, so this entry may never be used -- confirm.
    textblob.en.spelling.update({'Lejeune': 1, 'burn pit': 1})

    return [word + ' ------> ' + str(TextBlob(word).correct()) for word in words]

In [0]:
# Generic misspelt single words.
words1 = ['applaus', 'macth', 'unabele', 'findinh', 'eeople']

# Phrase-level samples used as input for the scoring cell.
words2 = [
    'ageunt orage', 'agint orange', 'ageant oragne', 'ageant orange', 'agetn orange',
    'agent orangerel', 'agent orag', 'angentorange', 'east orange', 'asian orange',
    'camp legum', 'camp legume', 'camp legoon', 'camp lejoon',
    'Camp Lejeune', 'Camp Lejune', 'Camp Lajeune',
    'burn oit', 'burnpit',
    'camp lejun', 'camp lejune', 'cqmp jejun', 'camp le juen', 'camp leguen',
    'czmp legum', 'camp legume', 'camp legun', 'csmp lejoun', 'camp leju',
    'camp lejun', 'camp lejuen',
]

### Run misspelt words model and score it (correction accuracy)

In [0]:
import re

# Scores spelling_corrector() on `model_input`: a phrase counts as an accurate
# correction when TextBlob's output is one of `answers`, or -- failing that --
# when one of the regex fallbacks below recognises the phrase.

# list of acceptable answers
answers = ['agent orange','camp lejeune','burn pit']

# set input here
model_input = words2

num_corrections = 0
num_words = 0
prefixes = ['camp','burn','agent']
suffixes = ['lejeune','pit','orange']

regex_lejeune = 'l[aeiou][jg][aeiou]{1,2}[nm][e]{0,1}'
regex_agent = 'a[nei]{0,1}[gs][aei]{0,2}n[t]{0,1}'
regex_orange = 'ora{0,1}n{0,1}ge{0,1}'
# NOTE(review): every character before the final 't' is optional, so re.match
# accepts any phrase that merely starts with 't' -- confirm this is intended.
regex_burnpit = 'b*u*r*n*[ ]{0,1}[po]{0,1}[oi]{0,1}t'

# text that spelling_corrector() places between the original and corrected phrase
separator = ' ------> '

for word in spelling_corrector(model_input):
    num_words += 1

    # Split on the full separator rather than on '-' / '> ', so a phrase that
    # itself contains a hyphen is not truncated.
    parts = word.partition(separator)
    original = parts[0].lower()   # original phrase
    corrected = parts[2].lower()  # corrected phrase

    # First word, and everything after it with the spaces removed.
    # str.split always returns at least one element, so no IndexError handling
    # is needed and all four names are defined on every iteration.
    original_tokens = original.split(' ')
    first_word_original = original_tokens[0]
    second_word_original = ''.join(original_tokens[1:])

    corrected_tokens = corrected.split(' ')
    first_word_corrected = corrected_tokens[0]
    second_word_corrected = ''.join(corrected_tokens[1:])

    print('Original:', original)

    # get correction
    correction = corrected
    # check answer
    if correction in answers:
        num_corrections += 1
        print('Correction:', correction)
        print('\n')
        continue  # don't run regex module if correction is accurate

    # IF CORRECTION IS NOT ACCURATE, RUN REGEX MODULE
    #
    # The checks run in the same order as before and a later match overrides
    # an earlier one, but the phrase is counted at most once (previously each
    # matching check incremented num_corrections, so accuracy could exceed 1).
    regex_correction = None

    # if first corrected word is one of the prefixes, test the original second word
    if first_word_corrected in prefixes:
        if re.match(regex_lejeune, second_word_original):
            regex_correction = 'Camp Lejeune'
        if re.match(regex_orange, second_word_original):
            regex_correction = 'Agent Orange'
    # if second corrected word is one of the suffixes, test the original first word
    if second_word_corrected in suffixes and re.match(regex_agent, first_word_original):
        regex_correction = 'Agent Orange'
    # if 'agent' and 'orange' don't have a space
    if re.match(regex_agent + regex_orange, first_word_original):
        regex_correction = 'Agent Orange'
    # if burn pit misspelt, whether there is a space or not between 'burn' and 'pit'
    if re.match(regex_burnpit, original):
        regex_correction = 'burn pit'

    if regex_correction is not None:
        num_corrections += 1
        correction = regex_correction

    # print final correction
    print('Correction:', correction)
    print('\n')

# guard against an empty input list
correct_accuracy = num_corrections / num_words if num_words else 0.0
print('\nNumber of words:', num_words)
print('Number of accurate corrections:',num_corrections)
print('Correction accuracy:', correct_accuracy)

In [0]:
spelling_corrector(words2)

In [0]:
#### Revision From Ryan

In [0]:
### Class Based 

### Version 1.1

In [0]:
import re
from typing import List

class TextCorrector:
    """
    Phrase-level corrector for 'agent orange', 'camp lejeune' and 'burn pit'.

    Each input phrase is first passed through TextBlob's spelling correction.
    If the result is not one of ``self.answers``, a set of regular expressions
    is tried against the phrase as a fallback.
    """

    # Text placed between the original and the corrected phrase by
    # _spelling_corrector(); apply() splits on exactly this string.
    _SEPARATOR = ' ------> '

    def __init__(self):
        self.acceptable = []
        self.answers = ['agent orange','camp lejeune','burn pit']

        self.prefixes = ['camp','burn','agent']
        self.suffixes = ['lejeune','pit','orange']

        self.regex_lejeune = 'l[aeiou][jg][aeiou]{1,2}[nm][e]{0,1}'
        self.regex_agent = 'a[nei]{0,1}[gs][aei]{0,2}n[t]{0,1}'
        self.regex_orange = 'ora{0,1}n{0,1}ge{0,1}'
        # NOTE(review): every character before the final 't' is optional, so
        # re.match accepts any phrase that starts with 't' -- confirm intended.
        self.regex_burnpit = 'b*u*r*n*[ ]{0,1}[po]{0,1}[oi]{0,1}t'

    def _spelling_corrector(self,words: List[str]) -> List[str]:
        """
        Run TextBlob's spelling correction over every phrase in ``words``.

        Returns one string per input, formatted as
        ``"<original> ------> <corrected>"``. Also adds 'Lejeune' and
        'burn pit' to TextBlob's English spelling dictionary on every call.
        """
        # Imported lazily so the class can be defined without TextBlob present.
        import textblob
        from textblob import TextBlob

        # NOTE(review): 'burn pit' contains a space; presumably the corrector
        # works token by token, so this entry may never be used -- confirm.
        textblob.en.spelling.update({'Lejeune': 1, 'burn pit': 1})

        return [word + self._SEPARATOR + str(TextBlob(word).correct()) for word in words]

    def _regex_fallback(self, original: str, corrected: str):
        """
        Try the regex rules on a lower-cased phrase.

        Args:
            original: the phrase as given by the caller, lower-cased.
            corrected: TextBlob's corrected phrase, lower-cased.

        Returns:
            'camp lejeune', 'agent orange' or 'burn pit' when a rule matches,
            otherwise None. Rules run in a fixed order and a later match
            overrides an earlier one.
        """
        # First word, and everything after it with the spaces removed.
        original_tokens = original.split(' ')
        first_word_original = original_tokens[0]
        second_word_original = ''.join(original_tokens[1:])

        corrected_tokens = corrected.split(' ')
        first_word_corrected = corrected_tokens[0]
        second_word_corrected = ''.join(corrected_tokens[1:])

        match = None
        # first corrected word is a known prefix -> test the original second word
        if first_word_corrected in self.prefixes:
            if re.match(self.regex_lejeune, second_word_original):
                match = 'camp lejeune'
            if re.match(self.regex_orange, second_word_original):
                match = 'agent orange'
        # second corrected word is a known suffix -> test the original first word
        if second_word_corrected in self.suffixes and re.match(self.regex_agent, first_word_original):
            match = 'agent orange'
        # 'agent' and 'orange' written without a space
        if re.match(self.regex_agent + self.regex_orange, first_word_original):
            match = 'agent orange'
        # 'burn pit', with or without a space between the two words
        if re.match(self.regex_burnpit, original):
            match = 'burn pit'
        return match

    def apply(self,model_input: List[str]) -> List[str]:
        """
        Correct every phrase in ``model_input`` and print a short score report.

        Args:
            model_input: list of phrases.

        Returns:
            A list of lower-cased corrections with exactly one entry per input
            phrase, in input order. (Previously a phrase could contribute
            several entries because each matching rule appended its own.)
        """
        num_corrections = 0
        num_words = 0
        corrections = []

        for word in self._spelling_corrector(model_input):
            num_words += 1

            # Split on the full separator so hyphenated phrases stay intact.
            parts = word.partition(self._SEPARATOR)
            original = parts[0].lower()
            corrected = parts[2].lower()

            # TextBlob already produced an accepted answer: no regex needed.
            if corrected in self.answers:
                num_corrections += 1
                corrections.append(corrected)
                continue

            fallback = self._regex_fallback(original, corrected)
            if fallback is None:
                # nothing recognised: keep TextBlob's output unchanged
                corrections.append(corrected)
            else:
                num_corrections += 1  # counted once per phrase
                corrections.append(fallback)

        # show score and accuracy (guarding against an empty input list)
        correct_accuracy = num_corrections / num_words if num_words else 0.0
        print('\nNumber of words:', num_words)
        print('Number of accurate corrections:',num_corrections)
        print('Correction accuracy:', correct_accuracy)

        return [x.lower() for x in corrections]

          

In [0]:
# Token-level sample input: one list element per word.
words1 = [
    'ageunt ', 'orage', 'agint', 'orange', 'ageant', 'oragne', 'ageant', 'orange',
    'agetn', 'orange', 'agent', 'orangerel', 'agent', 'orag', 'angentorange',
    'east', 'orange', 'asian', 'orange',
    'camp', 'legum', 'camp', 'legume', 'camp', 'legoon', 'camp', 'lejoon',
    'Camp', 'Lejeune', 'Camp', 'Lejune', 'Camp', 'Lajeune',
    'burn', 'oit', 'burnpit',
    'camp', 'lejun', 'camp', 'lejune', 'cqmp', 'jejun', 'camp', 'le', 'juen',
    'camp', 'leguen', 'czmp', 'legum', 'camp', 'legume', 'camp', 'legun',
    'csmp', 'lejoun', 'camp', 'leju', 'camp', 'lejun', 'camp', 'lejuen',
]

In [0]:
from textblob import TextBlob
output = [str(TextBlob(word).correct()) for word in words1]
print(output)

In [0]:
# Phrase-level sample input: one list element per phrase.
words2 = [
    'ageunt orage', 'agint orange', 'ageant oragne', 'ageant orange', 'agetn orange',
    'agent orangerel', 'agent orag', 'angentorange', 'east orange', 'asian orange',
    'camp legum', 'camp legume', 'camp legoon', 'camp lejoon',
    'Camp Lejeune', 'Camp Lejune', 'Camp Lajeune',
    'burn oit', 'burnpit',
    'camp lejun', 'camp lejune', 'cqmp jejun', 'camp le juen', 'camp leguen',
    'czmp legum', 'camp legume', 'camp legun', 'csmp lejoun', 'camp leju',
    'camp lejun', 'camp lejuen',
]

### Create an instance of the model and run it on the given list of words/phrases

In [0]:
# Instantiate the corrector and run it on words1; the returned list is the cell output.
# NOTE(review): words1 holds single tokens, while this version's apply() derives a
# first and a second word from each element -- presumably the phrase list words2
# was the intended input; confirm.
corrector = TextCorrector()
corrector.apply(words1)

### Version 1.2

In [0]:
import re
from typing import List

class TextCorrector:
    """
    Token-stream corrector for 'agent orange', 'camp lejeune' and 'burn pit'.

    The input is a flat list of single words. Every word is spell-corrected
    with TextBlob, then each adjacent pair of words (plus one word of
    look-ahead for the split form 'le' + 'jeune') is checked against the
    known phrases, first by exact comparison and then by regular expression.
    """

    def __init__(self):
        self.acceptable = []
        self.answers = ['agent orange','camp lejeune','burn pit']

        self.prefixes = ['camp','burn','agent']
        self.suffixes = ['lejeune','pit','orange']

        self.regex_lejeune = 'l[aeiou][jg][aeiou]{1,2}[nm][e]{0,1}'
        self.regex_le = 'l[aeiou]'
        self.regex_jeune = '[jg][aeiou]{1,2}[nm][e]{0,1}'
        self.regex_agent = 'a[nei]{0,1}[gs][aei]{0,2}n[t]{0,1}'
        self.regex_orange = 'ora{0,1}n{0,1}ge{0,1}'
        # 'agent' immediately followed by 'orange' (written without a space)
        self.regex_agentorange = self.regex_agent + self.regex_orange
        self.regex_burnpit = '[burn]{4}[ ]{0,1}[po]{0,1}[oi]{0,1}t'
        self.regex_burn = 'bu[rn]{1,2}'
        self.regex_pit = '[op]{0,1}[it]{2}'

        # when True, apply() prints a short score report
        self.debug_on = False

    def _spelling_corrector(self,words: List[str]) -> List[str]:
        """
        Run TextBlob's spelling correction over every word in ``words``.

        Returns the corrected text of each input, in input order. Also adds
        'Lejeune' and 'burn pit' to TextBlob's English spelling dictionary on
        every call.
        """
        # Imported lazily so the class can be defined without TextBlob present.
        import textblob
        from textblob import TextBlob

        # NOTE(review): 'burn pit' contains a space; presumably the corrector
        # works token by token, so this entry may never be used -- confirm.
        textblob.en.spelling.update({'Lejeune': 1, 'burn pit': 1})

        return [str(TextBlob(word).correct()) for word in words]

    def _match_phrase(self, first_word, second_word, first_word_original,
                      second_word_original, third_word_original):
        """
        Decide whether a pair of adjacent words forms one of the known phrases.

        Args:
            first_word, second_word: spell-corrected words, lower-cased.
            first_word_original, second_word_original: the same two words as
                given by the caller, lower-cased.
            third_word_original: the following input word, lower-cased, or ''
                when there is none.

        Returns:
            The canonical phrase, or None when nothing matches.
        """
        # 'burnpit' / 'burnt' style single token
        if re.match(self.regex_burnpit, first_word_original):
            return 'burn pit'

        # corrected first word is a known prefix: check what follows it
        if first_word in self.prefixes:
            phrase = first_word + ' ' + second_word
            # accept only real pairs; a bare prefix/suffix test would also
            # accept combinations such as 'agent pit'
            if phrase in self.answers:
                return phrase
            if first_word == 'camp':
                if re.match(self.regex_lejeune, second_word_original):
                    return 'camp lejeune'
                # 'le' and 'jeune' given as two separate words
                if (re.match(self.regex_le, second_word_original)
                        and re.match(self.regex_jeune, third_word_original)):
                    return 'camp lejeune'
            elif first_word == 'agent':
                if re.match(self.regex_orange, second_word_original):
                    return 'agent orange'
            elif first_word == 'burn':
                if re.match(self.regex_pit, second_word_original):
                    return 'burn pit'

        # 'agentorange' written as one token
        if re.match(self.regex_agentorange, first_word_original):
            return 'agent orange'
        # misspelt 'agent' followed by (possibly misspelt) 'orange'
        if re.match(self.regex_agent, first_word_original):
            if second_word == 'orange' or re.match(self.regex_orange, second_word_original):
                return 'agent orange'
        return None

    def apply(self,model_input: List[str], num_phrases: int = 32) -> List[str]:
        """
        Find the known phrases in a flat list of (possibly misspelt) words.

        Args:
            model_input: list of single words.
            num_phrases: number of phrases the input is expected to contain;
                used only as the denominator of the accuracy printed when
                ``self.debug_on`` is True. Defaults to 32, the value that was
                previously hard-coded.

        Returns:
            The lower-cased canonical phrase for every recognised pair, in
            input order.
        """
        # Lower-case both streams: the regexes are lower-case only, so
        # capitalised input such as 'Camp', 'Lejune' would otherwise be missed.
        corrected_words = [w.lower() for w in self._spelling_corrector(model_input)]
        original_words = [w.lower() for w in model_input]

        num_corrections = 0
        corrections = []

        # The last word has no successor, so it never starts a pair.
        for idx in range(len(corrected_words) - 1):
            # look-ahead word for the 'le' + 'jeune' case; '' past the end
            # instead of raising IndexError
            if idx + 2 < len(original_words):
                third_word_original = original_words[idx + 2]
            else:
                third_word_original = ''

            phrase = self._match_phrase(
                corrected_words[idx],
                corrected_words[idx + 1],
                original_words[idx],
                original_words[idx + 1],
                third_word_original,
            )
            if phrase is not None:
                num_corrections += 1
                corrections.append(phrase)

        # show score and accuracy
        correct_accuracy = num_corrections / num_phrases if num_phrases else 0.0

        # only print if debug set to True:
        if self.debug_on:
            print('\nNumber of phrases:', num_phrases)
            print('Number of accurate corrections:',num_corrections)
            print('Correction accuracy:', correct_accuracy)

        return [x.lower() for x in corrections]

In [0]:
# Token-level sample input, split into three groups.
words2 = [
    'ageunt', 'orage', 'agint', 'orange', 'ageant', 'oragne', 'ageant', 'orange',
    'agetn', 'orange', 'agent', 'orangerel', 'agent', 'orag', 'angentorange',
    'east', 'orange', 'asian', 'orange',
]

words3 = ['burn', 'oit', 'burnpit', 'bunr', 'pit', 'burnt', 'pit']

words4 = [
    'camp', 'legum', 'camp', 'legume', 'camp', 'legoon', 'camp', 'lejoon',
    'Camp', 'Lejeune', 'Camp', 'Lejune', 'Camp', 'Lajeune',
    'camp', 'lejun', 'camp', 'lejune', 'cqmp', 'jejun', 'camp', 'le', 'juen',
    'camp', 'leguen', 'czmp', 'legum', 'camp', 'legume', 'camp', 'legun',
    'csmp', 'lejoun', 'camp', 'leju', 'camp', 'lejuen',
]

# The full list is the three groups back to back, in this order.
words1 = words2 + words3 + words4
from textblob import TextBlob
[str(TextBlob(word).correct()) for word in words1]

In [0]:
corrector = TextCorrector()
corrector.apply(words1)