In [2]:
import re
import math
from collections import Counter, defaultdict
from pprint import pprint

In [3]:
'''Word Probability'''
def words(text):
    return re.findall(r'\w+', text.lower())

count_word = Counter(words(open('big.txt').read()))
Nw = sum(count_word.values())
Pdist = {word: float(count) / Nw for word, count in count_word.items()}

def Pw(word):
    return Pdist[word] if word in Pdist else 10 / 10**len(word) / Nw

In [4]:
'''Channel Probability'''
count_1edit = defaultdict(lambda: 0)
count_c = defaultdict(lambda: 0)
for line in open('count_1edit.txt'):
    edit, count = line.split('\t')[0], int(line.split('\t')[1].replace('\n', ''))
    w, c = edit.split('|')[0], edit.split('|')[1]
    count_1edit[(w, c)] += count
    count_c[c] += 1

r = 10
N = dict()
for i in range(1, r):
    N[i] = (sum(count for count in count_1edit.values() if count == i)) // i
N[0] = 26 * 26 * 26 * 26 + 2 * 26 * 26 * 26 + 26 * 26 - sum(N.values())

In [5]:
def smooth(count, r=10):
    if count <= r:
        return (count + 1) * N[count + 1] / N[count]
    else:
        return count

In [6]:
def Pedit(w, c):
    if count_c[c] > 0:
        return smooth(count_c[c]) / count_c[c]
    else:
        return 0

In [7]:
'''Combining channel probability with word probability to score states'''
def P(pedit, pw):
    return (pedit * pw) * 10 ** 7

In [8]:
'''Next States'''
letters = 'abcdefghijklmnopqrstuvwxyz'
def next_states(state):
    L, R, edits, pw, pedit = state  # (str, str, list, float, float)
    R0, R1 = R[0], R[1:]
    if edits == 2:
        return [(L + R0, R1, edits, pw, pedit * 0.8)]
    noedit = [(L + R0, R1, edits, pw, pedit * 0.8)]
    if len(L) > 0:
        delete = [(L, R1, edits + 1, Pw(L + R1), P(Pedit(L[-1], L[-1] + R0), Pw(L + R1)))]
    else:
        delete = [(L, R1, edits + 1, Pw(L + R1), P(Pedit('', R0), Pw(L + R1)))]
    insert = [(L + R0 + c, R1, edits + 1, Pw(L + R0 + c + R1), P(Pedit(R0, R0 + c), Pw(L + R0 + c + R1))) for c in letters]
    replace = [(L + c, R1, edits + 1, Pw(L + c + R1), P(Pedit(R0, R1), Pw(L + c + R1))) for c in letters]
    if len(R1) > 0:
        transpose = [(L + R1[0], R0 + R1[1:], edits + 1, Pw(L + R1[0] + R0 + R1[1:]), P(Pedit(R0 + R1[0], R1[0] + R0), Pw(L + R1[0] + R0 + R1[1:])))]
    else:
        transpose = []
    return noedit + delete + insert + replace + transpose

In [9]:
'''Correcting'''
MAXBEAM = 1000
def correction(word):
    states = [('', word, 0, Pw(word), 1)]  # initial state
    for i in range(len(word)):
        states = [newstates for state in states for newstates in next_states(state)]
        states = [state for state in states if state[4] > 0]

        temp = defaultdict(list)
        for state in states:
            L, R, edits, pw, pedit = state
            temp[L + R].append(state)
        states = [min(substates, key=lambda x: x[2]) for wd, substates in temp.items()]

        states = sorted(states, key=lambda x: x[4], reverse=True)
        states = sorted(states, key=lambda x: x[2])[:MAXBEAM]

    states = [state for state in states if state[4] > 0]

    return sorted(states, key=lambda x: x[4], reverse=True)[:3]

In [10]:
correction("appearant")

[('appearance', '', 2, 0.00012101274219355764, 1404.816691710701),
 ('appearing', '', 2, 2.061698570705056e-05, 239.3391400692305),
 ('apparent', '', 2, 3.764840868244015e-05, 194.4622152814735)]

In [11]:
correction("runing")

[('running', '', 1, 0.00012549469560813384, 414.85272593381006),
 ('ruin', '', 2, 4.302675277993161e-05, 347.2539558597741),
 ('ring', '', 2, 4.9301487560338295e-05, 203.72232077106747)]

In [12]:
correction("particpate")

[('participated', '', 2, 2.6891720487457255e-06, 21.703372241235883),
 ('participate', '', 1, 3.585562731660967e-06, 11.85293502668029),
 ('participates', '', 2, 8.963906829152417e-07, 7.234457413745293)]

In [13]:
correction("beleive")

[('believe', '', 1, 0.00016403949497348924, 677.839721838279),
 ('believed', '', 2, 7.977877077945652e-05, 643.8667098233311),
 ('believes', '', 2, 8.963906829152418e-06, 72.34457413745294)]

In [14]:
correction('writtung')

[('written', '', 2, 0.00010487770990108329, 846.4315174081994),
 ('writing', '', 2, 6.185095712115169e-05, 319.4736393909922),
 ('writtung', '', 0, 8.963906829152418e-14, 0.1677721600000001)]

In [15]:
correction('happy')

[('happen', '', 2, 8.874267760860894e-05, 1030.1989072545139),
 ('apply', '', 2, 3.85447993653554e-05, 248.8653350328381),
 ('hay', '', 2, 3.764840868244015e-05, 243.07776910184185)]

In [16]:
%save lab3_noisy_channel.py 2-10

File `lab3_noisy_channel.py` exists. Overwrite (y/[N])?  y
The following commands were written to file `lab3_noisy_channel.py`:
import re
import math
from collections import Counter, defaultdict
from pprint import pprint
'''Word Probability'''
def words(text):
    return re.findall(r'\w+', text.lower())

count_word = Counter(words(open('big.txt').read()))
Nw = sum(count_word.values())
Pdist = {word: float(count) / Nw for word, count in count_word.items()}

def Pw(word):
    return Pdist[word] if word in Pdist else 10 / 10**len(word) / Nw
'''Channel Probability'''
count_1edit = defaultdict(lambda: 0)
count_c = defaultdict(lambda: 0)
for line in open('count_1edit.txt'):
    edit, count = line.split('\t')[0], int(line.split('\t')[1].replace('\n', ''))
    w, c = edit.split('|')[0], edit.split('|')[1]
    count_1edit[(w, c)] += count
    count_c[c] += 1

r = 10
N = dict()
for i in range(1, r):
    N[i] = (sum(count for count in count_1edit.values() if count == i)) // i
N[0] = 26 * 26 * 26 