### Extract Corpus Bahasa Indonesia

In [1]:
# Read the entire corpus into memory as a list of lines (line endings kept).
with open('wiki.txt', mode='r', encoding='utf-8') as f:
    lines = list(f)

# Show how many lines the corpus holds.
print(len(lines))

6507702


In [4]:
import re
from collections import Counter

# build language model
def words(text):
    "Lowercase `text` and return every run of word characters, in order."
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word-frequency table for the corpus: token -> number of occurrences.
# The file is opened in a `with` block so the handle is always closed, and
# tokens are streamed line by line instead of materialising the whole corpus
# and its full token list at once. The counts are unchanged because `\w+`
# can never match across a line break.
with open('wiki.txt', encoding='utf-8') as _corpus:
    WORDS = Counter(token for line in _corpus for token in words(line))

def P(word, N=sum(WORDS.values())):
    "Relative frequency of `word`: its count in WORDS divided by `N`."
    # N defaults to the total token count, computed once when P is defined.
    count = WORDS[word]
    return count / N

# selection mechanism
def correction(word):
    "Return the candidate for `word` with the highest probability under P."
    options = candidates(word)
    return max(options, key=P)

# candidate model
def candidates(word):
    "Known spellings nearest to `word`, falling back to `word` itself."
    # Try each tier in turn and stop at the first non-empty one, so the more
    # expensive edit sets are only generated when the cheaper tiers fail.
    exact = known([word])
    if exact:
        return exact

    one_away = known(edits1(word))
    if one_away:
        return one_away

    two_away = known(edits2(word))
    if two_away:
        return two_away

    return [word]

# error model
def known(words):
    "Return, as a set, the members of `words` that are keys of WORDS."
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word, letters='abcdefghijklmnopqrstuvwxyz'):
    """Return the set of strings exactly one simple edit away from `word`.

    A simple edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character, or inserting a character.

    Parameters
    ----------
    word : str
        The string to edit.
    letters : str, optional
        Alphabet used for replacements and insertions. Defaults to the 26
        lowercase ASCII letters, so existing one-argument calls behave
        exactly as before; pass a different alphabet to support other
        character sets.

    Returns
    -------
    set of str
        Every distinct string produced by a single edit. Because a character
        may be replaced by itself, `word` is part of the result whenever both
        `word` and `letters` are non-empty.
    """
    # Every way of cutting `word` into a (head, tail) pair, including empty ends.
    splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes    = [head + tail[1:] for head, tail in splits if tail]
    transposes = [head + tail[1] + tail[0] + tail[2:]
                  for head, tail in splits if len(tail) > 1]
    replaces   = [head + c + tail[1:] for head, tail in splits if tail for c in letters]
    inserts    = [head + c + tail for head, tail in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    "Lazily yield every string reachable from `word` by two successive edits."
    first_pass = edits1(word)
    # Duplicates are not removed here; the result is a generator, not a set.
    return (twice for once in first_pass for twice in edits1(once))

In [10]:
# Spot-check the corrector on one token; the cell displays the returned string.
correction('mrka')

'maka'

In [5]:
import time

# Sample inputs used to exercise and time the corrector.
test_words = ['kcing', 'memkan', 'mrdeka', 'mnyedihkan', 'gimna',
              'terdpt', 'mrmpersulit', 'mhon', 'banos', 'begimana']

# Time only the correction calls. perf_counter() is the monotonic,
# high-resolution clock intended for measuring short durations, whereas
# time.time() follows the wall clock and can jump if the system clock changes.
start = time.perf_counter()
corrections = [(w, correction(w)) for w in test_words]
end = time.perf_counter()

# Printing happens after the clock stops so console I/O is not counted.
for w, fixed in corrections:
    print(f"'{w}' -> '{fixed}'")
print(f'- Peter Norvig: {end-start} detik')
print()

'kcing' -> 'kucing'
'memkan' -> 'memakan'
'mrdeka' -> 'merdeka'
'mnyedihkan' -> 'menyedihkan'
'gimna' -> 'gmina'
'terdpt' -> 'terdpat'
'mrmpersulit' -> 'mempersulit'
'mhon' -> 'moon'
'banos' -> 'banos'
'begimana' -> 'bagimana'
- Peter Norvig: 0.009091615676879883 detik

