In [2]:
# Install into the running kernel's environment; pinned to the version this notebook was run with.
%pip install editdistance==0.8.1


Collecting editdistance
  Downloading editdistance-0.8.1-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Downloading editdistance-0.8.1-cp311-cp311-win_amd64.whl (79 kB)
   ---------------------------------------- 0.0/79.7 kB ? eta -:--:--
   ----- ---------------------------------- 10.2/79.7 kB ? eta -:--:--
   ----- ---------------------------------- 10.2/79.7 kB ? eta -:--:--
   --------------- ------------------------ 30.7/79.7 kB 186.2 kB/s eta 0:00:01
   --------------- ------------------------ 30.7/79.7 kB 186.2 kB/s eta 0:00:01
   --------------- ------------------------ 30.7/79.7 kB 186.2 kB/s eta 0:00:01
   --------------- ------------------------ 30.7/79.7 kB 186.2 kB/s eta 0:00:01
   --------------- ------------------------ 30.7/79.7 kB 186.2 kB/s eta 0:00:01
   --------------- ------------------------ 30.7/79.7 kB 186.2 kB/s eta 0:00:01
   --------------- ------------------------ 30.7/79.7 kB 186.2 kB/s eta 0:00:01
   --------------- ------------------------ 30.7/79.7 kB 186.2

In [3]:
from collections import Counter
import editdistance

# Toy corpus backing the unigram model: every token is one observation.
corpus = [
    "hello",
    "world",
    "python",
    "programming",
    "machine",
    "learning",
    "artificial",
    "intelligence",
    "data",
    "science",
]
# Per-word occurrence counts.
word_counts = Counter(corpus)
# Each corpus entry contributes exactly one count, so the total equals the corpus length.
total_words = len(corpus)

# Prior P(C): relative frequency of each vocabulary word
def calculate_prior(word_counts, total_words):
    """Map every word in `word_counts` to its relative frequency.

    Args:
        word_counts: mapping of word -> occurrence count.
        total_words: denominator used for every word (count / total_words).

    Returns:
        A new dict of word -> count / total_words, in the iteration order
        of `word_counts`.
    """
    return {token: freq / total_words for token, freq in word_counts.items()}

# Simplified stand-in for P(W|C), driven by edit distance
def calculate_likelihood(misspelled, correct):
    """Score how plausibly `misspelled` is a typo of `correct`.

    The score is 1 / (1 + d), where d is the value returned by
    editdistance.eval(misspelled, correct), so a larger distance yields
    a smaller score.
    """
    edits = editdistance.eval(misspelled, correct)
    return 1 / (1 + edits)

# Rank vocabulary words as corrections using prior * likelihood
def autocorrect_naive_bayes(word, word_counts, total_words, max_candidates=5):
    """Return the best-scoring vocabulary words for a possibly misspelled `word`.

    Every key of `word_counts` is scored as
    calculate_prior(...)[candidate] * calculate_likelihood(word, candidate).

    Args:
        word: the input string to correct.
        word_counts: mapping of vocabulary word -> occurrence count.
        total_words: denominator passed through to calculate_prior.
        max_candidates: upper bound on the number of results (list slice).

    Returns:
        A list of (candidate, score) tuples sorted by descending score;
        candidates with equal scores keep their `word_counts` order.
    """
    priors = calculate_prior(word_counts, total_words)

    # Unnormalised posterior for each vocabulary entry.
    scored = [
        (candidate, priors.get(candidate, 0) * calculate_likelihood(word, candidate))
        for candidate in word_counts
    ]

    # sorted() is stable, so ties stay in vocabulary order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:max_candidates]

# Demo: rank vocabulary words as corrections for a single misspelled input
misspelled_word = "helo"
suggestions = autocorrect_naive_bayes(
    word=misspelled_word,
    word_counts=word_counts,
    total_words=total_words,
)

# Report the candidates from highest to lowest score
print(f"Suggestions for '{misspelled_word}':")
for suggestion, prob in suggestions:
    print(f"{suggestion} (Probability: {prob})")


Suggestions for 'helo':
hello (Probability: 0.05)
world (Probability: 0.020000000000000004)
data (Probability: 0.020000000000000004)
python (Probability: 0.016666666666666666)
machine (Probability: 0.014285714285714285)
