In [65]:
import re
import math
from pprint import pprint
from collections import Counter, defaultdict

In [66]:
'''Word Probability'''
def words(text):
    """Return the lowercase word tokens found in ``text``.

    The text is lowercased first; every maximal run of regex word
    characters is then returned, in order of appearance.
    """
    token_pattern = re.compile(r'\w+')
    lowered = text.lower()
    return token_pattern.findall(lowered)

# Build the unigram model from the training corpus.
# The context manager closes the corpus file as soon as it has been read,
# instead of leaving the handle open until garbage collection.
with open('big.txt') as corpus:
    count_word = Counter(words(corpus.read()))
# Total number of tokens in the corpus.
Nw = sum(count_word.values())
# Relative frequency of every word seen in the corpus.
Pdist = {word: float(count) / Nw for word, count in count_word.items()}

def Pw(word):
    """Return the unigram probability of ``word``.

    Words present in ``Pdist`` get their stored relative frequency.
    Any other word gets ``10 / 10**len(word) / Nw``, a value that shrinks
    by a factor of ten for every additional character.
    """
    if word not in Pdist:
        return 10 / 10**len(word) / Nw
    return Pdist[word]

In [67]:
'''Channel Probability'''
# Observed count of each single edit, keyed by the (w, c) pair parsed from
# the "w|c" field of count_1edit.txt.
count_1edit = defaultdict(int)
# Sum of the observed counts of all edits sharing the same right-hand side c.
count_c = defaultdict(int)
# Each data line has the form "w|c<TAB>count".  The context manager makes
# sure the file handle is closed once the table has been loaded.
with open('count_1edit.txt') as edit_file:
    for line in edit_file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        fields = line.split('\t')
        # int() ignores the trailing newline on the count field.
        edit, count = fields[0], int(fields[1])
        parts = edit.split('|')
        w, c = parts[0], parts[1]
        count_1edit[(w, c)] = count
        count_c[c] += count

# Frequency-of-frequency table used by smooth():
# N[i] = number of distinct edits that were observed exactly i times,
# for i in 1 .. r + 1.
r = 10
_edits_per_count = Counter(count_1edit.values())
N = {i: _edits_per_count[i] for i in range(1, r + 2)}

# N[0] = estimated number of edits that were never observed: a fixed total of
# 26**4 + 2 * 26**3 + 26**2 minus the edits tallied in N so far.
# NOTE(review): sum(N.values()) only covers edits seen at most r + 1 times, so
# edits seen more often are not subtracted here; subtracting Nall (all distinct
# observed edits) may have been intended -- confirm before changing, since it
# alters every smoothed probability.
N[0] = 26 * 26 * 26 * 26 + 2 * 26 * 26 * 26 + 26 * 26 - sum(N.values())
# Number of distinct edits observed in the data file.
Nall = len(count_1edit)

In [68]:
def smooth(count, r=10):
    """Return the Good-Turing smoothed version of an observed edit count.

    For ``count <= r`` the result is ``(count + 1) * N[count + 1] / N[count]``,
    where ``N[i]`` is the number of distinct edits observed exactly ``i``
    times.  Counts above ``r`` are returned unchanged.

    When the table has no usable entry for ``count`` or ``count + 1``
    (missing key or an empty bucket) the raw ``count`` is returned instead.
    This avoids a ZeroDivisionError / KeyError and avoids turning a seen
    edit into a zero probability, which ``math.log`` cannot handle later.

    Parameters:
        count: observed count of an edit (0 for an unseen edit).
        r: largest count that is still smoothed.

    Returns:
        The smoothed count (float) or the unchanged ``count``.
    """
    if count > r:
        return count
    n_here = N.get(count, 0)
    n_next = N.get(count + 1, 0)
    if not n_here or not n_next:
        # Estimate undefined or degenerate for this bucket: keep the raw count.
        return count
    return (count + 1) * n_next / n_here

In [69]:
smooth(0)

0.0006445696531727264

In [70]:
Nall

1587

In [71]:
count_c['e']

3610

In [72]:
def Pedit(w, c):
    """Return the channel probability of the edit ``(w, c)``.

    * Edit present in ``count_1edit``: its smoothed count divided by
      ``count_c[c]``.
    * Edit absent but ``c`` present in ``count_c``: the smoothed count of an
      unseen edit, ``smooth(0)``, divided by ``count_c[c]``.
    * ``c`` never seen at all: the constant floor ``10**(-20)``.
    """
    edit = (w, c)
    if edit in count_1edit:
        return smooth(count_1edit[edit]) / count_c[c]
    if c not in count_c:
        return 10**(-20)
    return smooth(0) / count_c[c]

In [73]:
Pedit("e","i")

0.3411458333333333

In [74]:
'''Combining channel probability with word probability to score states'''
def P(pedit, pw):
    """Combine two probabilities into one log-space score.

    Returns ``log(pedit) + log(pw)``.  The result is symmetric in its two
    arguments.  A probability of exactly zero contributes ``-inf`` rather
    than raising ``ValueError`` from ``math.log(0)``, so that a single
    impossible candidate sorts last instead of aborting the ranking.

    Parameters:
        pedit: channel (edit) probability.
        pw: word probability.

    Raises:
        ValueError: if either argument is negative (from ``math.log``).
    """
    log_edit = math.log(pedit) if pedit else float('-inf')
    log_word = math.log(pw) if pw else float('-inf')
    return log_edit + log_word

In [75]:
'''Next States'''
letters = 'abcdefghijklmnopqrstuvwxyz'
def next_states(state):
    """Expand one search state by consuming the first character of ``R``.

    A state is the tuple ``(L, R, edits, prob, pedit)``:
    ``L`` is the already processed prefix, ``R`` the remaining suffix,
    ``edits`` the list of ``(w, c)`` edits applied so far, ``prob`` the word
    probability of the current candidate and ``pedit`` the accumulated edit
    probability.

    Keeping the character multiplies ``pedit`` by 0.8.  Once two edits have
    been applied, keeping the character is the only successor.  Otherwise
    the successors are returned in this order: keep, delete, insert (one per
    letter), replace (one per letter), and transpose (only when at least
    two characters remain).

    ``R`` must be non-empty.
    """
    left, right, edits, prob, pedit = state
    head, tail = right[0], right[1:]

    unchanged = (left + head, tail, edits, prob, pedit * 0.8)
    if len(edits) == 2:
        return [unchanged]

    # Last character of the prefix ('' when the prefix is empty); the delete
    # edit is expressed relative to it.
    prev = left[-1:]
    deleted = [(
        left,
        tail,
        edits + [(prev + head, prev)],
        Pw(left + tail),
        pedit * Pedit(prev + head, prev),
    )]

    replaced = []
    for c in letters:
        replaced.append((
            left + c,
            tail,
            edits + [(head, c)],
            Pw(left + c + tail),
            pedit * Pedit(head, c),
        ))

    inserted = []
    for c in letters:
        inserted.append((
            left + head + c,
            tail,
            edits + [(head, head + c)],
            Pw(left + head + c + tail),
            pedit * Pedit(head, head + c),
        ))

    transposed = []
    if len(tail) > 0:
        following, rest = tail[0], tail[1:]
        transposed.append((
            left + following,
            head + rest,
            edits + [(head + following, following + head)],
            Pw(left + following + head + rest),
            pedit * Pedit(head + following, following + head),
        ))

    return [unchanged] + deleted + inserted + replaced + transposed

In [76]:
# Using Pw(word) or P(word) may result in different answers
def correction(word):
    """Return up to three best candidate states for ``word``.

    Runs a beam search with one expansion step per character of ``word``.
    After each step the states are de-duplicated by candidate word
    (``L + R``), keeping the first state with the fewest edits.  They are
    then ordered by number of edits, ties broken by descending score, and
    cut to the beam width.  The surviving states are finally ranked by
    descending score alone.

    The score is ``P(prob, pedit)``, i.e. the arguments are passed in the
    opposite order of P's parameter names.

    Returns:
        A list of at most three ``(L, R, edits, prob, pedit)`` tuples.
    """
    frontier = [("", word, [], Pw(word), 1)]
    beam_width = 550

    def score(candidate):
        return P(candidate[3], candidate[4])

    for _ in range(len(word)):
        expanded = []
        for current in frontier:
            expanded.extend(next_states(current))

        # Keep one state per candidate word: the one with the fewest edits,
        # and the earliest one among equals.
        best_by_word = {}
        for candidate in expanded:
            left, right, edits, _prob, _pedit = candidate
            key = left + right
            if key in best_by_word and len(edits) >= len(best_by_word[key][2]):
                continue
            best_by_word[key] = candidate

        frontier = list(best_by_word.values())
        frontier.sort(key=score, reverse=True)
        # Stable sort: fewer edits first, score order kept within each group.
        frontier.sort(key=lambda candidate: len(candidate[2]))
        frontier = frontier[:beam_width]

    frontier.sort(key=score, reverse=True)
    return frontier[:3]

In [77]:
correction('appearant')

[('apparent',
  '',
  [('pe', 'p'), ('a', 'e')],
  3.764840868244015e-05,
  0.01018676187828061),
 ('appearance',
  '',
  [('n', 'nc'), ('t', 'e')],
  0.00012101274219355764,
  0.0011182874238227151),
 ('appearing',
  '',
  [('a', 'i'), ('t', 'g')],
  2.061698570705056e-05,
  0.0009722564755838646)]

In [78]:
correction("runing")

[('turning',
  '',
  [('r', 't'), ('u', 'ur')],
  0.0001864492620463703,
  0.0042981549815498165),
 ('ruining', '', [('u', 'ui')], 2.6891720487457255e-06, 0.27703854545454554),
 ('ringing',
  '',
  [('u', 'i'), ('n', 'ng')],
  2.8684501853287736e-05,
  0.01890037453183521)]

In [79]:
correction("particpate")

[('participate', '', [('c', 'ci')], 3.585562731660967e-06, 0.0648964838681319),
 ('participated',
  '',
  [('c', 'ci'), ('e', 'ed')],
  2.6891720487457255e-06,
  0.060988483927167755),
 ('participates',
  '',
  [('c', 'ci'), ('e', 'es')],
  8.963906829152417e-07,
  0.06499725480581534)]

In [80]:
correction("beleive")

[('believe', '', [('ei', 'ie')], 0.00016403949497348924, 0.15302185645933022),
 ('believed',
  '',
  [('ei', 'ie'), ('e', 'ed')],
  7.977877077945652e-05,
  0.14380703663604938),
 ('believes',
  '',
  [('ei', 'ie'), ('e', 'es')],
  8.963906829152418e-06,
  0.1532594680376832)]

In [81]:
correction('writtung')

[('written',
  '',
  [('u', 'e'), ('ng', 'n')],
  0.00010487770990108329,
  0.0004765429362880889),
 ('writhing',
  '',
  [('t', 'h'), ('u', 'i')],
  3.585562731660967e-06,
  0.0008601237842617159),
 ('writing',
  '',
  [('t', 'i'), ('iu', 'i')],
  6.185095712115169e-05,
  5.986394557823132e-06)]

In [82]:
correction('happy')

[('happy', '', [], 0.00019541316887552272, 0.32768000000000014),
 ('happen',
  '',
  [('p', 'pe'), ('y', 'n')],
  8.874267760860894e-05,
  0.002143868312757202),
 ('sappy', '', [('h', 's')], 1.7927813658304835e-06, 0.0034092746730083247)]

In [83]:
#%save lab3_noisy_channel.py 65-82

In [85]:
correction('thenks')

[('thanks', '', [('e', 'a')], 3.495923663369443e-05, 0.09240674698795184),
 ('think',
  '',
  [('e', 'i'), ('ks', 'k')],
  0.0004992896103837897,
  0.005557575757575758),
 ('thinks', '', [('e', 'i')], 2.2409767072881046e-05, 0.11178666666666669)]