In [52]:
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
import sys
import contractions

In [53]:
# Step 1 

!python fetch_gutenberg.py > data/gutenberg.txt

[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/tilemahos/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [54]:
# Step 2

# Read the gutenberg.txt file word by word
# and create a disctionary that holds as keys the tokens(words) and as values the number of appearance
Dict = {}
with open('./data/gutenberg.txt') as file:
    # reading each line   
    for line in file:
        # reading each word       
        for word in line.split():
            if word not in Dict:
                Dict[word] = 1
            else:
                Dict[word] = Dict[word] + 1

# Filter rare tokens
Dict = {key: val for key, val in Dict.items() if val >= 5}
        
# Write the Dict in an output file
with open("./vocab/vocab.txt",'w') as f:
    for word in Dict.keys():
        f.write("{}\t{}\n".format(word,Dict[word]))

In [55]:
# Step 3

# Create chars.syms containing all ascii characters indexed
with open("./vocab/chars.syms",'w') as f:
    f.write("<eps>\t0\n")
    for i in range(97,123):
        f.write("{}\t{}\n".format(chr(i),i-96))
 
 # Create words.syms containing all words in corpus indexed
with open("./vocab/words.syms",'w') as f:
    i = 0
    f.write("<eps>\t0\n")
    i = i+1
    for word in Dict.keys():
        f.write("{}\t{}\n".format(word,i))
        i = i+1

In [56]:
# Step 4

# Levenhstein .fst

with open("./fsts/L.fst",'w') as f:
    for i in range(97,123):
        f.write("0\t0\t{}\t{}\t0\n".format(chr(i),chr(i))) # chr -> chr
        f.write("0\t0\t{}\t<eps>\t1\n".format(chr(i))) # chr -> <eps>
        f.write("0\t0\t<eps>\t{}\t1\n".format(chr(i))) # <eps> -> chr
        for j in range(97,123):
            if j != i:
                f.write("0\t0\t{}\t{}\t1\n".format(chr(i),chr(j))) # chr -> chr
    f.write("0\t0\n") # end state
                
!fstcompile -isymbols=./vocab/chars.syms -osymbols=./vocab/chars.syms ./fsts/L.fst ./fsts/L.binfst
!fstdraw -isymbols=./vocab/chars.syms -osymbols=./vocab/chars.syms -portrait ./fsts/L.binfst | dot -Tpng >./fsts/L.png
!display ./fsts/L.png

In [57]:
# Step 5

# Dictionary acceptor .fst

with open("./fsts/V.fst",'w') as f:
    j = 0
    for word in Dict.keys():
        i = 0
        for letter in word:
            if i == 0:
                f.write("{}\t{}\t{}\t{}\t0\n".format(i,j+1,letter,word)) # word[0] -> word
                j = j+1
            else:    
                f.write("{}\t{}\t{}\t<eps>\t0\n".format(j,j+1,letter)) # word[i>0] -> <eps>
                j = j+1
            i = i+1
        f.write("{}\t0\n".format(j)) # end state

In [58]:
!fstcompile -isymbols=./vocab/chars.syms -osymbols=./vocab/words.syms ./fsts/V.fst ./fsts/V.binfst
!fstrmepsilon ./fsts/V.binfst | fstdeterminize | fstminimize >./fsts/V_opt.binfst
# !fstdraw -isymbols=./fsts/chars.syms -osymbols=./fsts/words.syms -portrait ./fsts/V_opt.binfst | dot -Tpng >./fsts/V_opt.png
# !display ./fsts/V_opt.png

In [59]:
# Step 6

# Compose L (Levensthein) with V (Dictionary acceptor) to build the naive spell checker S

!fstarcsort --sort_type=olabel ./fsts/L.binfst ./fsts/L.binfst
!fstarcsort --sort_type=ilabel ./fsts/V_opt.binfst ./fsts/V_opt.binfst
!fstcompose ./fsts/L.binfst ./fsts/V_opt.binfst ./fsts/S.binfst

!./predict.sh ./fsts/S.binfst cwt 


cut

In [60]:
!./predict.sh ./fsts/S.binfst cit 

city

In [61]:
!./predict.sh ./fsts/S.binfst antheaterry 

entreaty

In [62]:
# Step 7 

# Correcting the first 20 words of spell_test.txt

import subprocess

with open('./data/spell_test.txt') as file:
    # reading each line 
    j = 0
    for line in file:
        j = j+1
        if j == 20:
            break
        i = 0
        # reading each word       
        for word in line.split():
            i = i+1
            if i == 1:
                print("True \n{}".format(word))
            elif i == 2:
                print("False \n{}".format(word))
                print("Corrected ")
                subprocess.call(['bash','predict.sh', './fsts/S.binfst' , word])
                print('\n')
            else:
                break

True 
contented:
False 
contenpted
Corrected 
contented

True 
beginning:
False 
begining
Corrected 
beginning

True 
problem:
False 
problam
Corrected 
problem

True 
driven:
False 
dirven
Corrected 
given

True 
ecstasy:
False 
exstacy
Corrected 
ecstasy

True 
juice:
False 
guic
Corrected 
guil

True 
locally:
False 
localy
Corrected 
local

True 
compare:
False 
compair
Corrected 
company

True 
pronunciation:
False 
pronounciation
Corrected 
provocation

True 
transportability:
False 
transportibility
Corrected 
respectability

True 
minuscule:
False 
miniscule
Corrected 
ridicule

True 
independent:
False 
independant
Corrected 
independent

True 
arranged:
False 
aranged
Corrected 
ranged

True 
poetry:
False 
poartry
Corrected 
party

True 
level:
False 
leval
Corrected 
legal

True 
basically:
False 
basicaly
Corrected 
sickly

True 
triangular:
False 
triangulaur
Corrected 
triangular

True 
unexpected:
False 
unexpcted
Corrected 
unexpected

True 
standardizing:
False 
stane

**Part 1**

In [63]:
# Step 8

!./word_edits.sh abandonned abandoned

n	<eps>


In [64]:
# Create a file with the edits in wiki.txt

with open('./data/wiki.txt') as file:
    # reading each line 
    for line in file:
        # reading words       
        words = line.split()
        subprocess.call(['bash','word_edits_savetofile.sh', words[0] , words[1]])
            

In [65]:
# Create Dict with frequency(value) of each edit(key)

Edits = {}
with open('./data/edits.txt') as file:
    # reading each line   
    for line in file:
        # reading each edit      
        edit =  tuple(line.split())
        if edit not in Edits:
            Edits[edit] = 1
        else:
            Edits[edit] = Edits[edit] + 1

print(len(Edits))
print(24 + 24 + 24*23)

342
600


In [66]:
from math import log10

# E .fst (edit weight == -log10(edit freq))

total = sum(Edits.values())

with open("./fsts/E.fst",'w') as f:
    for i in range(97,123):
        f.write("0\t0\t{}\t{}\t0\n".format(chr(i),chr(i))) # chr -> chr
        
        if (chr(i), '<eps>') not in Edits:
            f.write("0\t0\t{}\t<eps>\t10000\n".format(chr(i))) # chr -> <eps>
        else:
            f.write("0\t0\t{}\t<eps>\t{}\n".format(chr(i),-log10(Edits[(chr(i),'<eps>')]/total))) # chr -> <eps>
            
        if ('<eps>', chr(i)) not in Edits:
            f.write("0\t0\t<eps>\t{}\t10000\n".format(chr(i))) # <eps> -> chr
        else:
            f.write("0\t0\t<eps>\t{}\t{}\n".format(chr(i),-log10(Edits[('<eps>', chr(i))]/total))) # <eps> -> chr
        
        for j in range(97,123):
            if j != i:
                if (chr(i), chr(j)) not in Edits:
                    f.write("0\t0\t{}\t{}\t10000\n".format(chr(i),chr(j))) # chr -> chr
                else:
                    f.write("0\t0\t{}\t{}\t{}\n".format(chr(i),chr(j),-log10(Edits[(chr(i), chr(j))]/total))) # chr -> chr
    
    f.write("0\t0\n") # end state
                
!fstcompile -isymbols=./vocab/chars.syms -osymbols=./vocab/chars.syms ./fsts/E.fst ./fsts/E.binfst
# !fstdraw -isymbols=./vocab/chars.syms -osymbols=./vocab/chars.syms -portrait ./fsts/L.binfst | dot -Tpng >./fsts/L.png
# !display ./fsts/L.png

In [67]:
# Compose E with V (Dictionary acceptor) to build the spell checker EV

!fstarcsort --sort_type=olabel ./fsts/E.binfst ./fsts/E.binfst
!fstarcsort --sort_type=ilabel ./fsts/V_opt.binfst ./fsts/V_opt.binfst
!fstcompose ./fsts/E.binfst ./fsts/V_opt.binfst ./fsts/EV.binfst

!./predict.sh ./fsts/EV.binfst cwt 

wit

In [68]:
!./predict.sh ./fsts/EV.binfst cit 

clit

In [69]:
!./predict.sh ./fsts/EV.binfst antheaterry 

theatre

In [70]:
# Step 7 

# Correcting the first 20 words of spell_test.txt with EV

import subprocess

with open('./data/spell_test.txt') as file:
    # reading each line 
    j = 0
    for line in file:
        j = j+1
        if j == 20:
            break
        i = 0
        # reading each word       
        for word in line.split():
            i = i+1
            if i == 1:
                print("True \n{}".format(word))
            elif i == 2:
                print("False \n{}".format(word))
                print("Corrected ")
                subprocess.call(['bash','predict.sh', './fsts/EV.binfst' , word])
                print('\n')
            else:
                break

True 
contented:
False 
contenpted
Corrected 
contented

True 
beginning:
False 
begining
Corrected 
beginning

True 
problem:
False 
problam
Corrected 
problem

True 
driven:
False 
dirven
Corrected 
driven

True 
ecstasy:
False 
exstacy
Corrected 
exactly

True 
juice:
False 
guic
Corrected 
guil

True 
locally:
False 
localy
Corrected 
local

True 
compare:
False 
compair
Corrected 
compare

True 
pronunciation:
False 
pronounciation
Corrected 
pronouncing

True 
transportability:
False 
transportibility
Corrected 
respectability

True 
minuscule:
False 
miniscule
Corrected 
mince

True 
independent:
False 
independant
Corrected 
independent

True 
arranged:
False 
aranged
Corrected 
arranged

True 
poetry:
False 
poartry
Corrected 
poetry

True 
level:
False 
leval
Corrected 
level

True 
basically:
False 
basicaly
Corrected 
busily

True 
triangular:
False 
triangulaur
Corrected 
triangular

True 
unexpected:
False 
unexpcted
Corrected 
unexpected

True 
standardizing:
False 
stan