# Spell Checking via Text Prediction Exploration
## Nicholas Miklaucic & Peabody Work Duty Group

In [1]:
import spellcheck
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from string import punctuation, whitespace
from tqdm import tqdm
from multiprocessing import Pool

%matplotlib inline

In [2]:
# Quick check of the sentence-level corrector on input with a misspelled word ("timem").
spellcheck.sentence_correct("A blob is nice this timem of day.")

'A blog is nice this time of day.'

In [3]:
# Second check: "urude" looks like a misreading of "crude"; the recorded output leaves it unchanged.
spellcheck.sentence_correct("Name urude quartzite stone.")

'Name urude quartzite stone.'

In [4]:
# Characters deleted by word_parse: all punctuation and whitespace except the
# plain space (word separator) and the hyphen / apostrophe, which can occur
# inside a word.
unwanted_chars = [
    ch for ch in list(punctuation) + list(whitespace)
    if ch not in (' ', '-', "'")
]


def word_parse(string):
    """Normalize a raw text value for word-level processing.

    The value is trimmed at both ends and lowercased, then every character
    listed in ``unwanted_chars`` is deleted. Hyphens, apostrophes and plain
    spaces are kept; tabs, newlines and other punctuation are removed.
    Runs of interior spaces are left as they are.
    """
    deletion_table = str.maketrans('', '', ''.join(unwanted_chars))
    return string.strip().lower().translate(deletion_table)

In [5]:
# Read the records from test.csv and turn line breaks embedded in any
# cell into single spaces, so each value is one line of text.
other_df = pd.read_csv("test.csv").replace({r'\n': ' '}, regex=True)
other_df.head()

Unnamed: 0.1,Unnamed: 0,Cat Number,Site Number,Locality,Site,Name,Situation,AccNum,fileLocation
0,0,1,M50/1,"Locality Squibnocket Head, so Martha' 3 Vineya...",Site Squibnocket Cliff .,Name Butt of arrowhead.,Situation on sand under shell just south of st...,1,peabody_files/Accession Files/1/1_0001.pdf.png
1,1,2,M50/1,"Lmnmw Squibnocket Head, so Martha's Vineyard, M",Site Squibnocket Cliff.,Name Butt of quartz knife.,Situation Black sandy loam near stake 2.,1,peabody_files/Accession Files/1/1_0002.pdf.png
2,2,3,M50/1,"Locality Squibnocket Head, sou Martha' 3 Viney...",Site Squibnocket Cliff .,Name Crude quartz point.,"Situation Black sandy loam, 1M. east of stake A.",1,peabody_files/Accession Files/1/1_0003.pdf.png
3,3,4,M50/1,"Locality Squibnooket Head, so Martha's Vineyar...",Site Squibnocket Cliff.,Name Urude quartzite apea:,Situation Top of first shell layer.,1,peabody_files/Accession Files/1/1_0004.pdf.png
4,4,5,M50/1,"Locality Squibnocket Head, sou Martha' 3 Viney...",Site Squibnocket Cliff .,Name Crude chopper.,Situation Underneath lowest shell layer.,1,peabody_files/Accession Files/1/1_0005.pdf.png


In [6]:
# Text columns that are normalized here and spell-checked afterwards.
fields = ["Locality", "Site", "Name", "Situation"]
# Every cell is coerced to str first (so a missing value becomes the text
# "nan") and then passed through word_parse.
normalize_cells = np.vectorize(lambda cell: word_parse(str(cell)))
other_df[fields] = other_df[fields].apply(normalize_cells)
N_WORKERS = 29  # number of worker processes in the pool


def correctRow(tup):
    """Spell-check the text fields of one ``(index, row)`` pair.

    Parameters
    ----------
    tup : tuple
        An ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.

    Returns
    -------
    tuple
        ``(index, row)`` where each non-empty Locality / Site / Name /
        Situation value has been replaced by the result of
        ``spellcheck.sentence_correct``.

    Notes
    -----
    The corrected row is returned instead of being written to ``other_df``
    here. This function runs in pool worker processes, and an assignment to
    a global DataFrame made in a worker never reaches the parent process.
    """
    i, row = tup
    for field in ("Locality", "Site", "Name", "Situation"):
        if len(row[field]) > 0:
            row[field] = spellcheck.sentence_correct(row[field])
    return i, row


rows = list(other_df.iterrows())
# The context manager shuts the workers down even if the map is interrupted
# or raises, so no stray worker processes are left behind.
with Pool(processes=N_WORKERS) as pool:
    corrected_rows = pool.map(correctRow, rows)
# Write the results back in the parent process, where other_df actually lives.
for i, row in corrected_rows:
    other_df.loc[i] = row
def _strip_field_label(text, label):
    """Return ``text`` without its leading ``label`` word.

    ``text`` is stripped of surrounding whitespace first. If its first
    space-delimited word equals ``label`` (ignoring case), everything after
    the first space is returned; a value consisting of the label alone
    becomes ``''``. Otherwise the stripped text is returned unchanged.
    """
    text = text.strip()
    first_word, _, remainder = text.partition(" ")
    if first_word.lower() == label.lower():
        return remainder
    return text


# Each value starts with its own column name (e.g. "Name Crude chopper.").
# The match ignores case because word_parse lowercases these columns before
# this point, so a capitalized label may no longer be present.
for field in tqdm(fields):
    other_df[field] = other_df[field].map(
        lambda text, label=field: _strip_field_label(text, label)
    )

Process ForkPoolWorker-3:
  File "/Users/localhost/.pyenv/versions/anaconda3-4.1.0/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-1:
Process ForkPoolWorker-4:
Process ForkPoolWorker-2:
Traceback (most recent call last):
  File "/Users/localhost/.pyenv/versions/anaconda3-4.1.0/lib/python3.5/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/localhost/.pyenv/versions/anaconda3-4.1.0/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/localhost/.pyenv/versions/anaconda3-4.1.0/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/Users/localhost/.pyenv/versions/anaconda3-4.1.0/lib/python3.5/multiprocessing/process.py", line 254, in _bootstrap
    self.ru

KeyboardInterrupt: 

In [None]:
other_df.head(5)

# Count how often each space-separated token occurs across the text columns.
words = Counter()
for field in fields:
    # ' '.join builds the combined text in a single pass; adding the row
    # strings together one by one re-copies the growing result each time
    # and breaks on an empty column.
    words.update(' '.join(other_df[field]).strip().split(' '))
# Give the title-cased form of every token the same count as the token.
# The most_common() order is kept because it decides which count wins when
# several tokens share one title-cased form.
for word, _ in words.most_common():
    words[word.title()] = words[word]
# Splitting on ' ' yields empty tokens wherever two spaces are adjacent.
del words['']

In [None]:
# Show the ten entries with the highest counts.
words.most_common(10)

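The current idea I have for mixing frequencies is as follows: any word that is already in the corpus keeps its observed frequency. Every other word from the general English word list gets a frequency equal to a constant (`ALPHA`) times its position in that list, counting upward from the last line of the file.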

In [None]:
ALPHA = .05 # determines how much normal English words are weighted
# for the value, you should probably use the wanted "top" value / 10000
with open("google-10000-english-no-swears.txt", 'r') as corpusfile:
    # Walk the file from its last line to its first, so the assigned weight
    # grows toward the top of the file.
    for i, line in enumerate(reversed(list(corpusfile))):
        word = line.strip()
        # Skip blank lines (they would add an empty-string entry) and words
        # that already have a frequency.
        if not word or word in words:
            continue
        weight = int(ALPHA * i)
        words[word] = weight
        words[word.title()] = weight

In [None]:
# Show the fifty entries with the highest counts after merging the word list.
words.most_common(50)

In [None]:
BETA = 200  # flat frequency given to every name that has no entry yet
with open("name_list.dat", 'r') as namelistfile:
    for line in namelistfile:
        processed = line.strip().lower()
        # Skip blank lines (they would add an empty-string entry) and names
        # that already have a frequency.
        if not processed or processed in words:
            continue
        words[processed] = BETA
        words[processed.title()] = BETA

In [None]:
# Drop the entry for 'i' before exporting.
del words['i']
# Write both output files in one pass, highest count first: a plain word
# list (one word per line) and a "word,frequency" listing.
with open("spellcheckcorpus.dat", 'w') as outputfile, \
        open("spellcheckcorpuswithfreqs.csv", 'w') as outputfreqs:
    for entry, count in words.most_common():
        outputfile.write(entry + '\n')
        outputfreqs.write("{},{}\n".format(entry, count))

In [None]:
# algorithm for computing edit distance
# Good artists copy, great artists steal
# this is from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
# and I take no credit for it whatsoever
def levenshtein(source, target):
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``source`` into ``target``.

    Parameters
    ----------
    source, target : str
        The strings to compare. The distance is symmetric, so the order
        does not affect the result.

    Returns
    -------
    int
        The edit distance.
    """
    if len(source) < len(target):
        return levenshtein(target, source)

    # So now we have len(source) >= len(target).
    if len(target) == 0:
        return len(source)

    # tuple() forces numpy to treat each string as a sequence of characters
    # ('c', 'a', 't', 's') rather than as one scalar value.
    source = np.array(tuple(source))
    target = np.array(tuple(target))

    # Dynamic programming over a (len(source)+1) x (len(target)+1) table,
    # keeping only the previous row. Column j of a row holds the distance
    # between the source prefix consumed so far and target[:j].
    offsets = np.arange(target.size + 1)
    previous_row = offsets.copy()
    for s in source:
        # Drop the current source character: one more edit than the cell above.
        current_row = previous_row + 1

        # Align the current source character with each target character:
        # free when they match, one edit when they differ.
        current_row[1:] = np.minimum(
                current_row[1:],
                previous_row[:-1] + (target != s))

        # Add target characters that have no source counterpart. Such a gap
        # may be several characters long, so column j needs
        #     min over k <= j of current_row[k] + (j - k),
        # which is a running minimum of (current_row - column index).
        # A single neighbour comparison only extends a gap by one column
        # and overestimates distances such as "pabcxx" vs "pxxabc".
        current_row = np.minimum.accumulate(current_row - offsets) + offsets

        previous_row = current_row

    return int(previous_row[-1])

In [None]:
# testing examples; the expected output is one distance per line:
#   "history" vs "herstory" -> 2: add 'r', change 'i' to 'e'
#   "t3sstin" vs "testing"  -> 3: change '3' to 'e', delete 's', add a 'g'
for first, second in (("history", "herstory"), ("t3sstin", "testing")):
    print(levenshtein(first, second))

In [None]:
# now to spell-check a single word, we find its closest thing in the list of words we have and then settle ties by commonality in the list
def correct(input_word, vocabulary=None, distance=None):
    """Return the vocabulary words closest to ``input_word``, most frequent first.

    Parameters
    ----------
    input_word : str
        The word to correct.
    vocabulary : mapping of str to number, optional
        Candidate words mapped to their frequencies. Defaults to the
        module-level ``words`` counter.
    distance : callable, optional
        ``distance(word, input_word)`` returning a comparable number.
        Defaults to the module-level ``levenshtein``.

    Returns
    -------
    list of str
        Every word that attains the smallest distance to ``input_word``,
        sorted by descending frequency. Empty only if the vocabulary is empty.
    """
    if vocabulary is None:
        vocabulary = words
    if distance is None:
        distance = levenshtein

    candidates = []
    # Start from infinity so the first word always becomes a candidate,
    # however far away it is; a fixed numeric cap would return nothing for
    # inputs whose nearest word lies beyond the cap.
    best_distance = float('inf')
    for word in vocabulary:
        current = distance(word, input_word)
        if current < best_distance:
            candidates = [word]
            best_distance = current
        elif current == best_distance:
            candidates.append(word)
    candidates.sort(key=lambda candidate: vocabulary[candidate], reverse=True)
    return candidates

In [None]:
# Run the corrector on a few sample inputs and print the ranked candidates for each.
for sample in ("arrop oint", "or1g1nal3fdf", "history", "mcmurphy"):
    print(correct(sample))

I feel really good about this system, especially if I ever get the actual English dictionary to back the wordlist up (EDIT: done!) and figure out how to properly mix those. Things to think about improving:
 * If the OCR splits one word into several or merges several words into one, that's really hard for this to catch.
 * I thought about doing totally next-level optical edit distance stuff, but that seems like overkill.
 * Knowing when to apply this system is also important: locations and names have to stay that way, and Massachusetts has some *weird* place names that can't be spell-checked and might change constantly.
 * Word embeddings a la `word2vec` would significantly improve this and I'm working on that.

The really nifty thing would be to use this algorithm to train a neural network to do deep learning on spell-checking (which has been done successfully), but that REALLY seems like overkill. I'll plug this into the whole pipeline and get the dictionary sorted before I do that.