# Spelling Correction

In [1]:
import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('newwords.txt').read()))

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [15]:
if 'chutney' in WORDS:
    print 'yes'
else:
    print 'no'

yes


In [16]:
# discarded graphlab
#import graphlab as gl

# used pandas for reading csv file
import pandas as pd

In [17]:
# discarded graphlab
#df=gl.SFrame.read_csv('datasets\\3.data_case_folded.csv')

# read the csv file using pandas
df=pd.read_csv("datasets\\5.data_stemmed.csv",encoding="latin1")

In [18]:
type(df)
len(df)
df['Reviews'][6]

u"day visit gulati years, it' superb food. dahi ke kebab kakori kebab melt mouth highli recommended. khasta roti paneer lababdar, gravi thick, paneer soft masala perfect. mocktail dont recal rememb it, pineapple, mango icecream, them. place doubt legendari outlet delhi north indian delicacies. staff excel service. quantiti serv true money charge."

In [19]:
from IPython.display import clear_output
import thread
def spell_correct(min,max):
    for i in range(min,max):
        df['Reviews'][i]=" ".join([correction(x) for x in df['Reviews'][i].split()])
        print i
        clear_output(wait=True)
for i in range(0,1000):
    thread.start_new_thread(spell_correct,(i*len(df)/1000,(i+1)*int(len(df)/1000)))

5402


In [38]:
df['Reviews'][6]

3077


In [42]:
df['Reviews'][178]

u'tam lunch iguru glaze buffet read hospitality. doubt ambience a food cervix commendable enjoy aam panna a masala lemonade tandoori mushroom a ananas dahi ke kababs min dessert toast buffet decent priced.'

In [44]:
# write the data back to a new csv file
df.to_csv("datasets\\6.data_spell_corrected.csv",mode='w',headers=False,encoding="latin1")