### Imports and Helper Functions

In [40]:
import re
from collections import defaultdict

### Edit-distance and Candidate Functions

In [41]:
def words(text):
    """Extract all words from a text (lowercase)"""
    return re.findall(r'\w+', text.lower())

def edits1(word):
    """Generate all possible edits 1 edit away from word"""
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """Generate all possible edits 2 edits away from word"""
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

def known(words_list, word_dict):
    """Filter words_list to include only those present in word_dict"""
    return set(w for w in words_list if w in word_dict)

def candidates(word, word_dict):
    """Generate possible spelling corrections for word"""
    return (known([word], word_dict) or known(edits1(word), word_dict) or known(edits2(word), word_dict) or [word])

### Load Corpus

In [42]:
with open('big.txt', 'r') as f:
    corpus = f.read().lower()
    all_words = words(corpus)
    UNIGRAMS = Counter(all_words)
    BIGRAMS = Counter(zip(all_words, all_words[1:]))

### Bigram Probability and Correction Logic

In [43]:
def bigram_prob(w1, w2):
    """Calculate the probability of bigram (w1, w2)"""
    count_w1 = UNIGRAMS.get(w1, 0)
    if count_w1 == 0:
        return 0.0
    return BIGRAMS.get((w1, w2), 0) / count_w1

def correct_bigram(w1, w2):
    """Correct a bigram (w1, w2) by finding the best candidate pair"""
    if bigram_prob(w1, w2) > 0:
        return (w1, w2)
    
    c1_candidates = candidates(w1, UNIGRAMS)
    c2_candidates = candidates(w2, UNIGRAMS)
    
    best_pair = (w1, w2)
    max_prob = -1
    
    for c1 in c1_candidates:
        for c2 in c2_candidates:
            prob = bigram_prob(c1, c2)
            if prob > max_prob:
                max_prob = prob
                best_pair = (c1, c2)
    
    return best_pair

def correct_sentence(sentence):
    """Correct all bigrams in a sentence"""
    words_in = words(sentence)
    if len(words_in) < 2:
        return sentence
    
    corrected = []
    for i in range(len(words_in) - 1):
        w1 = words_in[i]
        w2 = words_in[i+1]
        c1, c2 = correct_bigram(w1, w2)
        
        if i == 0:
            corrected.append(c1)
        corrected.append(c2)
    
    return ' '.join(corrected)

### Example Usage and Output

In [44]:

test_sentences = [
    "thn appl",
    "korrect thn appl",
    "he wrriten the lettir and sehnd it tomorow",
    "diki beach is beauiful and crowsed",
    "this sentence has no mistakes",
    "hello mello",
    "ohhh my myn",
    "wrng example and teste",
    "this, is an exampel; indeed!",
    "ThIs Is A tEstt",
]


for sentence in test_sentences:
    print("Original:", sentence)
    print("Corrected:", correct_sentence(sentence))
    print("-" * 50)

Original: thn appl
Corrected: the apple
--------------------------------------------------
Original: korrect thn appl
Corrected: correct the apple
--------------------------------------------------
Original: he wrriten the lettir and sehnd it tomorow
Corrected: he writes the letter and send it tomorrow
--------------------------------------------------
Original: diki beach is beauiful and crowsed
Corrected: wiki beach is beautiful and crossed
--------------------------------------------------
Original: this sentence has no mistakes
Corrected: this sentence has no mistakes
--------------------------------------------------
Original: hello mello
Corrected: hello mellow
--------------------------------------------------
Original: ohhh my myn
Corrected: oh my men
--------------------------------------------------
Original: wrng example and teste
Corrected: wrong example and test
--------------------------------------------------
Original: this, is an exampel; indeed!
Corrected: this is an 