In [1]:
import stanza, re, benepar, psutil, gc, json, csv, time, string, os

import numpy as np
import networkx as nx
import pandas as pd
import seaborn as sns

from textstat import textstat
from datetime import datetime
from collections import Counter

import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk import Tree
from textblob import Word
from spellchecker import SpellChecker

import matplotlib.pyplot as plt
from matplotlib.ticker import LogLocator, LogFormatter

In [2]:
def memcheck():
    gc.collect()
    memory_info = psutil.virtual_memory()

    # Display the memory information in GB
    total_memory = memory_info.total / (1024 ** 3)
    available_memory = memory_info.available / (1024 ** 3)
    used_memory = memory_info.used / (1024 ** 3)

    print(f"Total Memory: {total_memory:.2f} GB")
    print(f"Available Memory: {available_memory:.2f} GB")
    print(f"Used Memory: {used_memory:.2f} GB")

def time_taken(start_time, end_time):
    time_taken = end_time - start_time
    hours, remainder = divmod(time_taken.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f"Time Taken: {hours} hours, {minutes} minutes, {seconds} seconds")

In [3]:
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse,sentiment,ner')
nlp_tokens = stanza.Pipeline(lang='en', processors='tokenize', use_gpu=False)

2024-10-28 17:13:07 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-10-28 17:13:07 INFO: Downloaded file to C:\Users\Roland\stanza_resources\resources.json
2024-10-28 17:13:09 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| pos       | combined_charlm           |
| lemma     | combined_nocharlm         |
| depparse  | combined_charlm           |
| sentiment | sstplus_charlm            |
| ner       | ontonotes-ww-multi_charlm |

2024-10-28 17:13:09 INFO: Using device: cpu
2024-10-28 17:13:09 INFO: Loading: tokenize
2024-10-28 17:13:11 INFO: Loading: mwt
2024-10-28 17:13:11 INFO: Loading: pos
2024-10-28 17:13:11 INFO: Loading: lemma
2024-10-28 17:13:11 INFO: Loading: depparse
2024-10-28 17:13:12 INFO: Loading: sentiment
2024-10-28 17:13:12 INFO: Loading: ner
2024-10-28 17:13:12 INFO: Done loading processors!
2024-10-28 17:13:12 INFO: Checking for updates to resources.json in

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-10-28 17:13:13 INFO: Downloaded file to C:\Users\Roland\stanza_resources\resources.json
2024-10-28 17:13:13 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2024-10-28 17:13:13 INFO: Using device: cpu
2024-10-28 17:13:13 INFO: Loading: tokenize
2024-10-28 17:13:13 INFO: Loading: mwt
2024-10-28 17:13:13 INFO: Done loading processors!


In [4]:
def display_text_in_context(text, words, window = 75):
#Finds the first occurrence of each word in target text and displays it
    
    for word in words:
        ind = text.find(word)
        if ind == -1:
            print (f"{word} not found in text")
        else:
            display_text = text[ind-window:ind+window+1]
            display_text = display_text.replace("\n", " ")
            display_text = display_text.replace("\t", " ")
            print(display_text)

def display_words_in_context(text_words, words, repeat = False, window = 2):
# Finds the first occurrence of each word in target list of words. Optionally finds every subsequent occurrence     
    if not repeat:
        for word in words:
            try:
                index = text_words.index(word)
                #print(f"The index of '{word}' is: {ind}")
                display_text = ' '.join(text_words[index - window:index])
                display_text = display_text + " [" + word + "] "
                display_text = display_text + ' '.join(text_words[index + 1 : index + window + 1])
                print(display_text)
            except ValueError:
                print(f"'{word}' not found in the text.")
    else:
        for word in words:
            indices = [index for index, value in enumerate(text_words) if value == word]
            if len(indices) == 0:
                print(f"'{word}' not found in the text.")
            else:
                for index in indices:
                    display_text = ' '.join(text_words[index - window:index])
                    display_text = display_text + " [" + word + "] "
                    display_text = display_text + ' '.join(text_words[index + 1 : index + window + 1])
                    print(display_text)
                

In [5]:
def where_is(word):
    # finds if spell checker has flagged a word.  Must run spell checker first.
    if (word in spell_set_1): 
        print ("In spell set 1")
    if (word in blob_set_1):
        print ("In blob set 1")
    if (word in spell_set_1) or (word in blob_set_1):
        display_words_in_context(test_words_1, [word], False, 6)
    if (word in spell_set_2): 
        print ("In spell set 2")
    if (word in blob_set_2):
        print ("In blob  set 2")
    if (word in spell_set_2) or (word in blob_set_2):
        display_words_in_context(test_words_2, [word], False, 6)
    if (word in spell_set_3): 
        print ("In spell set 3")
    if (word in blob_set_3):
        print ("In blob set 3")
    if (word in spell_set_1) or (word in blob_set_3):
        display_words_in_context(test_words_3, [word], False, 6)        
        

In [6]:
def find_hyphenated_words(text):
    # Use regex to find words that are hyphenated across lines
    # Match sequences where a hyphen is at the end of a line, followed by a newline, and then continued with a word
    hyphenated_words = re.findall(r"(\w+)-\n(\w+)", text)

    for first_part, second_part in hyphenated_words:
        print(f"Found broken word: {first_part}-{second_part}")
    print(f"{len(hyphenated_words)} hyphenated words found, text length is: {len(text)}")
    


In [7]:
def replace_hyphenated_words(text, show_changes = False):
    # This regular expression captures words separated by a hyphen, with letters on both sides.
    pattern = r'(\b\w+)-(\w+\b)'
    
    # Function to handle replacement and printing
    def replacement(match):
        # Original hyphenated word
        original_word = match.group(0)
        # Replacement word (with space instead of hyphen)
        altered_word = match.group(1) + " " + match.group(2)
        
        # Print the hyphenated word that was altered
        if show_changes: print(f"Altered: {original_word} -> {altered_word}")
        
        return altered_word

    # Replace hyphenated words and call the replacement function
    result = re.sub(pattern, replacement, text)
    
    return result


In [8]:
source_texts= list()
source_texts.append("Kazuo Ishiguro - Never Let Me Go.txt")
source_texts.append("Kazuo Ishiguro - The Remains of the Day")
source_texts.append("Kazuo Ishiguro - A Pale View of Hills-Knopf Doubleday Publishing Group (1990)")
source_texts.append("Kazuo-Ishiguro-When-We-Were-Orphans-Alfred-A.-Knopf_Vintage-_2001_")
source_texts.append("The Buried Giant (Kazuo Ishiguro) (Z-Library)-1")
source_texts.append("Kazuo Ishiguro - The Unconsoled-Vintage (1996)")
directory_path = "C:/Users/Roland/Documents/AI/stylometry/"
file_path = directory_path+source_texts[1]+".txt"
print (file_path)

C:/Users/Roland/Documents/AI/stylometry/Kazuo Ishiguro - The Remains of the Day.txt


In [9]:
text_choice = 1

file_path = directory_path+source_texts[text_choice]+" edited_2.txt"
if os.path.isfile(file_path):
    print ("file found at ", file_path)
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
else:
    print("file not found at ", file_path)
    file_path = directory_path+source_texts[text_choice]+" edited_1.txt"
    if os.path.isfile(file_path):
        print ("file found at ", file_path)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        print("file not found at ", file_path)
        file_path = directory_path+source_texts[text_choice]+" edited.txt"
        if os.path.isfile(file_path):
            print ("file found at ", file_path)
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        else:
            print("file not found at ", file_path)


file not found at  C:/Users/Roland/Documents/AI/stylometry/Kazuo Ishiguro - The Remains of the Day edited_2.txt
file found at  C:/Users/Roland/Documents/AI/stylometry/Kazuo Ishiguro - The Remains of the Day edited_1.txt


In [10]:
print (len(text))

423406


Prepare words for spell checking.

The first set has punctuation removed

The second set has punctuation removed after replacing hyphenated words with spaces

The third set is also made lower case

In [11]:
test_words_1 = text.translate(str.maketrans('', '', string.punctuation)).split()
unhyphenated = replace_hyphenated_words(text)
test_words_2 = unhyphenated.translate(str.maketrans('', '', string.punctuation)).split()
test_words_3 = unhyphenated.translate(str.maketrans('', '', string.punctuation)).lower().split()

In [12]:
# Use spellchecker to validate words
# Initialize spell checker
spell = SpellChecker()


# Find words not in the dictionary
spell_set_1 = {word for word in test_words_1 if word not in spell}
spell_set_2 = {word for word in test_words_2 if word not in spell}
spell_set_3 = {word for word in test_words_3 if word not in spell}

print(len(spell_set_1), len(spell_set_2), len(spell_set_3))


499 413 393


In [13]:
print(f"errors found with punctuation removed only: {len(spell_set_1)}")
print(f"errors found with punctuation removed after replacing hyphens: {len(spell_set_2)}")
print(f"errors found with case lowered too:  {len(spell_set_3)}")     
print(f"errors eliminated by removing hyphens:  {len(spell_set_1 - spell_set_2)}")
print(f"errors introduced by removing hyphens:  {len(spell_set_2 - spell_set_1)}")
print(f"errors eliminated by lowering case:  {len(spell_set_2 - spell_set_3)}")
print(f"errors introduced by lowering case:  {len(spell_set_3 - spell_set_2)}")

errors found with punctuation removed only: 499
errors found with punctuation removed after replacing hyphens: 413
errors found with case lowered too:  393
errors eliminated by removing hyphens:  102
errors introduced by removing hyphens:  16
errors eliminated by lowering case:  245
errors introduced by lowering case:  225


In [14]:
oddities = list(spell_set_2 - spell_set_1)
print (' '.join(oddities))
print("\n")
# display_words_in_context(test_words_2, oddities, True, 3)

ordinated empting ofhand Karl Heinz pre Semitic martialling vident humouredness un sheetings inwaiting loverStevens co longue




Select the ones that could be errors and find them in the original text:

In [None]:
selected_oddities = ["team of ladies", "sleights-of-hand", "re a nature-lover", "have been self-vident"]
#These should appear in the comprehensive search later.

In [None]:
display_text_in_context(text, selected_oddities)

Select the errors and add them to the replacements dictionary.  The rest of the oddities are not spelling errors.

In [None]:
spell_set_2_lowered = {word.lower() for word in spell_set_2}
len(spell_set_2_lowered - spell_set_3)

Check words that are picked out by spell checker when made lower case:

In [None]:
case_oddities = list(spell_set_3 - spell_set_2)
print (' '.join(case_oddities))
print("\n")
# display_words_in_context(test_words_3, case_oddities, False, 3)

Check words that picked out by spell checker with the original case but passed when made lower case:

In [None]:
inverse_case_oddities = list(spell_set_2 - spell_set_3)
print (' '.join(inverse_case_oddities))
print("\n")
# display_words_in_context(test_words_2, inverse_case_oddities, False, 3)

Run text blob, an alternative spell checker:

In [None]:
start_time = datetime.now()
print(f"Starting processing at: {start_time.strftime('%H:%M:%S')}")



# Find invalid words
blob_set_1 = set()
for word in test_words_1:
    result = Word(word).spellcheck()
    if result[0][1] < 1.0 or result[0][0] != word:
        blob_set_1.add(word)

blob_set_2 = set()
for word in test_words_2:
    result = Word(word).spellcheck()
    if result[0][1] < 1.0 or result[0][0] != word:
        blob_set_2.add(word)

blob_set_3 = set()
for word in test_words_3:
    result = Word(word).spellcheck()
    if result[0][1] < 1.0 or result[0][0] != word:
        blob_set_3.add(word)
        

end_time = datetime.now()
print(f"Completed processing at: {end_time.strftime('%H:%M:%S')}")
time_taken(start_time, end_time)

print(len(blob_set_1), len(blob_set_2), len(blob_set_3))

Now look at how many each spell checker finds that the other didn't find:

In [None]:
spell_only_1 = spell_set_1 - blob_set_1
spell_only_2 = spell_set_2 - blob_set_2
spell_only_3 = spell_set_3 - blob_set_3

blob_only_1 = blob_set_1 - spell_set_1
blob_only_2 = blob_set_2 - spell_set_2
blob_only_3 = blob_set_3 - spell_set_3

both_3 = spell_set_3 & blob_set_3
either_3 = spell_set_3 | blob_set_3 # this is an OR symbol

print(f"spell / blob 1 have: {len(spell_set_1)} / {len(blob_set_1)}")
print(f"spell / blob 2 have: {len(spell_set_2)} / {len(blob_set_2)}")
print(f"spell / blob 3 have: {len(spell_set_3)} / {len(blob_set_3)}")
print(f"spell unique / blob unique 1 have: {len(spell_only_1)} / {len(blob_only_1)}")
print(f"spell unique / blob unique 2 have: {len(spell_only_2)} / {len(blob_only_2)}")
print(f"spell unique / blob unique 3 have: {len(spell_only_3)} / {len(blob_only_3)}")


In [None]:
# display_words_in_context(test_words_1, spell_only_1, False, 3)

In [None]:
blob_oddities = list(blob_set_2 - blob_set_1)
print (' '.join(blob_oddities))
print("\n")
# display_words_in_context(test_words_2, oddities, True, 3)

In [None]:
len(spell_only_2)

In [None]:

print(' '.join(word for word in both_3))
# display_words_in_context(test_words_2, spell_only_2 - spell_only_1, False, 3)

In [None]:
errors_raw = "witha bethat selfvident notseek distinguishedand redding civvy ting carryon putout ell arec simplyaccepting gotto thatsall donttake andthe itinvolved istrue tosit tj1e ifi inwaiting seehe allhis lto ohnothing ill1terest het villagesir imvery loc lotyou verypleased docrucially avery friendsand anysuch wer thevery iarrived ican startseeing canonly allshot contemplatin8 infact ot 0f roas agre ofthe dh owardst itsrather ofhand confidentia1 cansee prise loverstevens canbe chainbers lookingback thelikes beyondsuch farradayscircle al1"
errors_raw = errors_raw.split()
errors_raw = sorted(errors_raw)
print(len(errors_raw))
print (errors_raw)

In [None]:
errors = ['0f', 'agre', 'al1', 'allhis', 'andthe', 'anysuch', 'arec', 'avery', 'be-that', 'beyondsuch', 'canbe', 'canonly', 'cansee', 'carryon', 'Chainbers', 'confidentia1', 'contemplatin8', 'Dh', "'distinguished',and", 'docrucially', "don'ttake", 'T ell', "Farraday'scircle", 'friendsand', 'gotto', 'het', 'Iarrived', 'Ican', "if'-I", 'ill1terest', "I'mvery", 'Infact', 'istrue', 'it,involved', "it'srather", 'loc ked', 'lookingback', 'lot,you', 'lover,Stevens', 'lto', 'not-seek', 'ofthe', 'Oh,nothing', 'ot', 'owardst', 'putout', 'roas ting', 'seehe', 'self-vident', 'simplyaccepting', 'startseeing', "that'sall", 'thelikes', 'thevery', 'tJ1e', 'tosit', 'verypleased', 'village,sir', 'wer', 'with-a']


In [None]:
print(len(errors))

In [None]:
print("distinguishedand" in errors_raw)
print("distinguishedand" in errors)

In [None]:
errors_tt == errors_raw

In [None]:
corrections = ['of', 'agree', 'all', 'all his', 'and the', 'any such', 'care', 'a very', 'be that', 'beyond such', 'can be', 'can only', 'can see', 'carry on', 'Chambers', 'confidential', 'contemplating', 'Oh', "'distinguished', and", 'do crucially', "don't take", 'Tell', 'Farradays Circle', 'friends and', 'got to', 'the', 'I arrived', 'I can', 'if I', 'interest', "I'm very", 'In fact', 'is true', 'it involved', "its rather", 'locked', 'looking back', 'lot, you', 'lover, Stevens', 'to', 'not seek', 'of the', 'Oh, nothing', 'to', 'towards', 'put out', 'roasting', 'see he', 'self-evident', 'simply accepting', 'start seeing', "that's all", 'the likes', 'the very', 'the', 'to sit', 'very pleased', 'village, sir', 'were', 'with a']

In [None]:
set_1 = set(errors_tt)
set_2 = set(errors)
print(len(set_1), len(set_2))
print(set_1 - set_2)
print(set_2 - set_1)

In [None]:
counter_spell_1 = 0
counter_spell_2 = 0
counter_spell_3 = 0
counter_blob_1 = 0
counter_blob_2 = 0
counter_blob_3 = 0

for word in errors_tt:
    if word in spell_set_1: counter_spell_1 += 1 
    if word in spell_set_2: counter_spell_2 += 1
    if word in spell_set_3: counter_spell_3 += 1
    if word in blob_set_1: counter_blob_1 += 1
    if word in blob_set_2: counter_blob_2 += 1
    if word in blob_set_3: counter_blob_3 += 1
print (counter_spell_1, counter_spell_2, counter_spell_3)
print (counter_blob_1, counter_blob_2, counter_blob_3)
    

In [None]:
for word in errors_tt:
    if word not in blob_set_3: print(word)

In [None]:
counter_spell_1 = 0
counter_spell_2 = 0
counter_spell_3 = 0
counter_blob_1 = 0
counter_blob_2 = 0
counter_blob_3 = 0

for word in errors:
    if word in spell_set_1: counter_spell_1 += 1 
    if word in spell_set_2: counter_spell_2 += 1
    if word in spell_set_3: counter_spell_3 += 1
    if word in blob_set_1: counter_blob_1 += 1
    if word in blob_set_2: counter_blob_2 += 1
    if word in blob_set_3: counter_blob_3 += 1
print (counter_spell_1, counter_spell_2, counter_spell_3)
print (counter_blob_1, counter_blob_2, counter_blob_3)
    

In [None]:
print(len(errors_raw), len(errors), len(corrections))

In [None]:
counter_spell_1 = 0
counter_spell_2 = 0
counter_spell_3 = 0
counter_blob_1 = 0
counter_blob_2 = 0
counter_blob_3 = 0
counter_both_3 = 0

for word in errors_raw:
    if word in spell_set_1: counter_spell_1 += 1 
    if word in spell_set_2: counter_spell_2 += 1
    if word in spell_set_3: counter_spell_3 += 1
    else: print("not in spell_3: ", word)
    if word in blob_set_1: counter_blob_1 += 1
    if word in blob_set_2: counter_blob_2 += 1
    if word in blob_set_3: counter_blob_3 += 1
    else: print("not in blob_3: ", word)
    if word in both_3: counter_both_3 += 1
    else: print("not in both_3: ", word)
print (counter_spell_1, counter_spell_2, counter_spell_3)
print (counter_blob_1, counter_blob_2, counter_blob_3)
print(counter_both_3)
    

In [None]:
print(len(corrections))

In [None]:
for error, correction in zip(errors, corrections):
    if error == correction:
        display_words_in_context(test_words_3, [error], False, 20)

In [None]:
for word in errors:
    if word not in text:
        display_words_in_context(test_words_3, [word], False, 6)

In [None]:
display_text_in_context(text, ["never forgive me"])

In [None]:
print([word in blob_set_3 for word in errors])

In [None]:
where_is("bethat")

In [None]:
"be that" in corrections

In [None]:
"allshot" in both_3

In [None]:
display_text_in_context(text,["ell"])

In [None]:
replacements = {error: correction for error, correction in zip (errors, corrections)}

In [None]:
print(replacements["with-a"])

In [None]:
print(' '.join(word for word in (blob_only_2 - blob_only_1)))
display_words_in_context(test_words_2, blob_only_2 - blob_only_1, False, 3)

In [None]:
print(' '.join(word for word in (spell_only_3 - spell_only_2 - spell_only_1)))
display_words_in_context(test_words_3, spell_only_3 - spell_only_2 - spell_only_1, False, 3)

In [None]:
print(' '.join(word for word in (blob_only_3 - blob_only_2 - blob_only_1)))
display_words_in_context(test_words_3, blob_only_3 - blob_only_2 - blob_only_1, False, 3)

In [None]:
counter = 0
for word in blob_set_2:
    if word.lower() in blob_set_3:
        print(f"{word} in both")
        print(Word(word).spellcheck())
        if word != word.lower():
            print(Word(word.lower()).spellcheck())
    else:
        print (f"{word} in 2 but not in 3")
        print(Word(word).spellcheck())
    print("\n")    
    counter = counter + 1
    if counter > 15: break    

In [None]:
errors = "istrue canbe im tj1e overath laval itsrather wer silverss gotto thelikes ifi 0f prise ritz ofthe andthe lto lookingback al1 verypleased het thevery agre bethat seehe ot selfvident sleightsofhand cansee thatsall distinguishedand villagesir docrucially arec farradayscircle witha canonly barnets redding tosit clementss newtmating ill1terest anysuch dh lewiss donttake allhis contemplatin8 startseeing symons owardst imvery simplyaccepting selftraining wellcontented evercourteous donttake allhis lastminute civvy"

In [None]:
bad_words=["ohnothing", "iarrived", "friendsand", "allshot"]
for word in bad_words:
    print(word in errors_split)

In [None]:
suggestions_split = suggestions.split(' ')
suggestions_sorted = sorted(suggestions_split)
print(suggestions_sorted)

In [None]:
errors_sorted = sorted(errors_split)
print (errors_sorted)

In [None]:
for word in bad_words:
    errors_split.append(word)
print(errors_split)

In [None]:
print(suggestions)

In [None]:
errors_split = ['istrue', 'canbe', 'im', 'tj1e', 'overath', 'laval', 'itsrather', 'wer', 'silverss', 'gotto', 'thelikes', 'ifi', '0f', 'prise', 'ritz', 'ofthe', 'andthe', 'lto', 'lookingback', 'al1', 'verypleased', 'het', 'thevery', 'agre', 'bethat', 'seehe', 'ot', 'selfvident', 'sleightsofhand', 'cansee', 'thatsall', 'distinguishedand', 'villagesir', 'docrucially', 'arec', 'farradayscircle', 'witha', 'canonly', 'barnets', 'redding', 'tosit', 'clementss', 'newtmating', 'ill1terest', 'anysuch', 'dh', 'lewiss', 'donttake', 'allhis', 'contemplatin8', 'startseeing', 'symons', 'owardst', 'imvery', 'simplyaccepting', 'selftraining', 'wellcontented', 'evercourteous', 'donttake', 'allhis', 'lastminute', 'civvy', 'ohnothing', 'iarrived', 'friendsand', 'allshot']

In [None]:


for bad_word in bad_words:
    ind = text.find(bad_word)
    if ind == -1:
        print(f"{bad_word} not found")
    else:
        window = text[ind-50:ind+50]
        window = window.replace("\n", " ")
        window = window.replace("\t", " ")
        print(window)
    #test_result = (Word(test_word).spellcheck())
    #print(test_result)


In [None]:
test_words = text.translate(str.maketrans('', '', string.punctuation)).lower().split()
bad_words=["ohnothing", "iarrived", "friendsand", "allshot"]
bad_words = ["ell", "ting"]
for bad_word in bad_words:
    try:
        ind = test_words.index(bad_word)
        print(f"The index of '{bad_word}' is: {ind}")
        print(' '.join(test_words[ind - 10:ind + 10]))
    except ValueError:
        print(f"'{item_to_find}' not found in the list.")


In [None]:
print(' '.join(blob_only))

In [None]:
both = spell_set & blob_set
print(len(both))

In [None]:
print(' '.join(word for word in both))

In [None]:
original_errors = "istrue canbe im tj1e overath laval itsrather wer silverss gotto thelikes ifi 0f prise ritz ofthe andthe lto lookingback al1 verypleased het thevery agre bethat seehe ot selfvident sleightsofhand cansee thatsall distinguishedand villagesir docrucially arec farradayscircle witha canonly barnets redding tosit clementss newtmating ill1terest anysuch dh lewiss donttake allhis contemplatin8 startseeing symons owardst imvery simplyaccepting selftraining wellcontented evercourteous donttake allhis lastminute civvy"

In [None]:
original_errors_split = set(original_errors.split())

In [None]:
missed = {word for word in original_errors_split if word not in errors}

In [None]:
print ([(word, word in blob_set_3) for word in missed])

In [None]:
print ([(word, word in both_3) for word in missed])

In [None]:
print ([(word, word in spell_set_3) for word in missed])


In [None]:
for word in errors_split:
    if word not in text:
        if word not in cleaned_suggestions:
            print(word)


In [None]:
where_is("distinguishedand")

In [None]:
print([word for word in blob_set_3 if word not in blob_only])

In [None]:
suggestions = "Overath Laval 1e with-a it'srather Silvers's Farraday'scircle Barnet's Clements's newt-mating Lewis's don'ttake I'mvery self-training well-contented ever-courteous don'ttake last-minute Ritz Redding self-vident sleights-of-hand be-that that'sall d-h Symons"
cleaned_suggestions = suggestions.translate(str.maketrans('', '', string.punctuation)).lower().split()

In [None]:
for word in suggestions.split(' '):
    if word not in text:
        print(word)

In [None]:
for word in errors_split:
    if word not in cleaned_suggestions:`
        if word not in test_words:
            print(word)

In [None]:
" prise " in text