In [33]:
from shannlp import spell_correct, SpellCorrector, is_correct_spelling

In [34]:
print(is_correct_spelling("ၶွမ်ႊၽိဝ်ႊတိူဝ်ႊ"))
print(spell_correct("ၶွမ်ႊၽိဝ်ႊတိူဝ်ႊ"))

False
[]


In [70]:
def example_basic_correction():
    """Example 1: Basic spell correction."""
    print("=" * 60)
    print("Example 1: Basic Spell Correction")
    print("=" * 60)

    # Check if a word is correct
    word1 = "ၸိူင်ၼႆႉ"  # Correct Shan word
    print(f"\nIs '{word1}' correct? {is_correct_spelling(word1)}")

    # Get suggestions for a word
    suggestions = spell_correct(word1)
    print(f"\nSuggestions for '{word1}':")
    for word, confidence in suggestions[:3]:
        print(f"  - {word} (confidence: {confidence:.2f})")

    print()

example_basic_correction()

Example 1: Basic Spell Correction

Is 'ၸိူင်ၼႆႉ' correct? False

Suggestions for 'ၸိူင်ၼႆႉ':



In [71]:
def example_misspelled_word():
    """Example 2: Correcting a misspelled word."""
    print("=" * 60)
    print("Example 2: Correcting Misspelled Words")
    print("=" * 60)

    # Simulate a misspelled word (for demonstration)
    # In real usage, this would be actual typos
    word = "ၸိူၼႈ ၼႆ"  # A common Shan word

    print(f"\nWord: {word}")
    print(f"Is correct? {is_correct_spelling(word)}")

    suggestions = spell_correct(word, max_suggestions=5)
    print(f"\nTop 5 suggestions:")
    for i, (suggestion, confidence) in enumerate(suggestions, 1):
        print(f"  {i}. {suggestion} (confidence: {confidence:.2f})")

    print()

example_misspelled_word()

Example 2: Correcting Misspelled Words

Word: ၸိူၼႈ ၼႆ
Is correct? False

Top 5 suggestions:



In [37]:
def example_custom_dictionary():
    """Example 3: Using custom dictionary."""
    print("=" * 60)
    print("Example 3: Custom Dictionary")
    print("=" * 60)

    # Create a custom dictionary with specific words
    custom_words = {"ပီႊ", "မႂ်ႇ", "ၼႆႉ", "ၵွၺ်း"}

    print(f"\nCustom dictionary: {custom_words}")

    # Check words in custom dictionary
    test_word = "ပိႊမႂ်ႇ"
    print(f"\nIs '{test_word}' in custom dictionary? {is_correct_spelling(test_word, custom_dict=custom_words)}")

    # Get suggestions using custom dictionary
    suggestions = spell_correct(test_word, custom_dict=custom_words)
    print(f"\nSuggestions for '{test_word}' from custom dictionary:")
    for word, confidence in suggestions[:3]:
        print(f"  - {word} (confidence: {confidence:.2f})")

    print()

example_custom_dictionary()

Example 3: Custom Dictionary

Custom dictionary: {'ပီႊ', 'ၵွၺ်း', 'မႂ်ႇ', 'ၼႆႉ'}

Is 'ပိႊမႂ်ႇ' in custom dictionary? False

Suggestions for 'ပိႊမႂ်ႇ' from custom dictionary:



In [38]:
def example_spell_corrector_class():
    """Example 4: Using SpellCorrector class."""
    print("=" * 60)
    print("Example 4: SpellCorrector Class")
    print("=" * 60)

    # Create a spell corrector instance
    corrector = SpellCorrector(max_edit_distance=2, use_phonetic=True)

    print("\nSpellCorrector instance created")
    print("Settings: max_edit_distance=2, use_phonetic=True")

    # Check if words are correct
    words_to_check = ["ပီႊ", "မႂ်ႇ", "ပီႊ", "ၼႆႉ", "ၵွၺ်း"]
    print("\nChecking words:")
    for word in words_to_check:
        is_correct = corrector.is_correct(word)
        print(f"  - '{word}': {'✓ correct' if is_correct else '✗ incorrect'}")

    # Get corrections
    word = "ပီႊမႂ်ႇ"
    suggestions = corrector.correct(word, max_suggestions=3)
    print(f"\nCorrections for '{word}':")
    for suggestion, confidence in suggestions:
        print(f"  - {suggestion} (confidence: {confidence:.2f})")

    # Add custom words
    print("\nAdding custom word 'TestWord'")
    corrector.add_word("TestWord")
    print(f"Is 'TestWord' correct now? {corrector.is_correct('TestWord')}")

    # Remove word
    corrector.remove_word("TestWord")
    print(f"After removal, is 'TestWord' correct? {corrector.is_correct('TestWord')}")

    print()

example_spell_corrector_class()

Example 4: SpellCorrector Class

SpellCorrector instance created
Settings: max_edit_distance=2, use_phonetic=True

Checking words:
  - 'ပီႊ': ✓ correct
  - 'မႂ်ႇ': ✓ correct
  - 'ပီႊ': ✓ correct
  - 'ၼႆႉ': ✓ correct
  - 'ၵွၺ်း': ✗ incorrect

Corrections for 'ပီႊမႂ်ႇ':
  - ပီမႂ်ႇ (confidence: 0.60)
  - ၼီႈမႂ်ႇ (confidence: 0.24)

Adding custom word 'TestWord'
Is 'TestWord' correct now? True
After removal, is 'TestWord' correct? False



In [39]:
def example_advanced_options():
    """Example 5: Advanced options."""
    print("=" * 60)
    print("Example 5: Advanced Options")
    print("=" * 60)

    word = "မိူင်တႆပဵၼ်မိူင်းၶိုၼ်ႉယႂ်"  # Common Shan word

    # With frequency ranking
    print(f"\nWord: {word}")
    print("\n1. With frequency ranking (default):")
    suggestions = spell_correct(word, use_frequency=True, max_suggestions=3)
    for suggestion, confidence in suggestions:
        print(f"  - {suggestion} (confidence: {confidence:.2f})")

    # Without frequency ranking
    print("\n2. Without frequency ranking:")
    suggestions = spell_correct(word, use_frequency=False, max_suggestions=3)
    for suggestion, confidence in suggestions:
        print(f"  - {suggestion} (confidence: {confidence:.2f})")

    # With phonetic similarity
    print("\n3. With phonetic similarity (default):")
    suggestions = spell_correct(word, use_phonetic=True, max_suggestions=3)
    for suggestion, confidence in suggestions:
        print(f"  - {suggestion} (confidence: {confidence:.2f})")

    # Adjust edit distance
    print("\n4. Max edit distance = 1 (faster, fewer suggestions):")
    suggestions = spell_correct(word, max_edit_distance=1, max_suggestions=3)
    for suggestion, confidence in suggestions:
        print(f"  - {suggestion} (confidence: {confidence:.2f})")

    print()

example_advanced_options()

Example 5: Advanced Options

Word: မိူင်တႆပဵၼ်မိူင်းၶိုၼ်ႉယႂ်

1. With frequency ranking (default):

2. Without frequency ranking:

3. With phonetic similarity (default):

4. Max edit distance = 1 (faster, fewer suggestions):



In [40]:
def example_frequency_data():
    """Example 6: Working with frequency data."""
    print("=" * 60)
    print("Example 6: Frequency Data")
    print("=" * 60)

    from shannlp.spell.frequency import get_top_words, get_word_probability

    # Get top 10 most frequent words
    print("\nTop 10 most frequent Shan words:")
    top_words = get_top_words(10)
    for i, (word, prob) in enumerate(top_words, 1):
        print(f"  {i}. {word} (probability: {prob:.6f})")

    # Get probability of specific words
    print("\nWord probabilities:")
    test_words = ["ပီ", "မႂ်ႇ", "ပီ", "ၼႆႉ", "ၵူၺ်း"]
    for word in test_words:
        prob = get_word_probability(word)
        print(f"  - {word}: {prob:.6f}")

    print()

example_frequency_data()

Example 6: Frequency Data

Top 10 most frequent Shan words:
  1. ယဝ်ႉ (probability: 0.021304)
  2. မ (probability: 0.019400)
  3. တီႈ (probability: 0.019273)
  4. မီး (probability: 0.018646)
  5. သေ (probability: 0.016281)
  6. ၼႆႉ (probability: 0.015985)
  7. ဝၼ်း (probability: 0.015909)
  8. ပဵၼ် (probability: 0.015403)
  9. ၼၼ်ႉ (probability: 0.014498)
  10. ဢၼ် (probability: 0.011888)

Word probabilities:
  - ပီ: 0.004755
  - မႂ်ႇ: 0.000593
  - ပီ: 0.004755
  - ၼႆႉ: 0.015985
  - ၵူၺ်း: 0.001402



In [41]:
def example_edit_distance():
    """Example 7: Edit distance calculation."""
    print("=" * 60)
    print("Example 7: Edit Distance")
    print("=" * 60)

    from shannlp.spell.distance import shan_edit_distance, similarity_score

    # Compare similar words
    word1 = "မိူင်"
    word2 = "မိူင်း"
    word3 = "မူၼ်း"

    print("\nEdit distances:")
    dist1 = shan_edit_distance(word1, word2)
    print(f"  '{word1}' vs '{word2}': {dist1:.2f}")

    dist2 = shan_edit_distance(word1, word3)
    print(f"  '{word1}' vs '{word3}': {dist2:.2f}")

    # Similarity scores (0.0 to 1.0)
    print("\nSimilarity scores:")
    sim1 = similarity_score(word1, word2)
    print(f"  '{word1}' vs '{word2}': {sim1:.2f}")

    sim2 = similarity_score(word1, word3)
    print(f"  '{word1}' vs '{word3}': {sim2:.2f}")

    print()

example_edit_distance()

Example 7: Edit Distance

Edit distances:
  'မိူင်' vs 'မိူင်း': 1.00
  'မိူင်' vs 'မူၼ်း': 3.00

Similarity scores:
  'မိူင်' vs 'မိူင်း': 0.83
  'မိူင်' vs 'မူၼ်း': 0.40



In [42]:
def main():
    """Run all examples."""
    print("\n")
    print("╔" + "=" * 58 + "╗")
    print("║" + " " * 10 + "ShanNLP Spell Correction Examples" + " " * 14 + "║")
    print("╚" + "=" * 58 + "╝")
    print()

    examples = [
        example_basic_correction,
        example_misspelled_word,
        example_custom_dictionary,
        example_spell_corrector_class,
        example_advanced_options,
        example_frequency_data,
        example_edit_distance,
    ]

    for example in examples:
        try:
            example()
        except Exception as e:
            print(f"Error in {example.__name__}: {e}")
            import traceback
            traceback.print_exc()

    print("=" * 60)
    print("All examples completed!")
    print("=" * 60)
    print()

main()



║          ShanNLP Spell Correction Examples              ║

Example 1: Basic Spell Correction

Is 'ၽိင်ႈႁိတ်ႈၵၢၼ်' correct? False

Suggestions for 'ၽိင်ႈႁိတ်ႈၵၢၼ်':

Example 2: Correcting Misspelled Words

Word: ယမ်
Is correct? True

Top 5 suggestions:
  1. ယမ် (confidence: 0.93)

Example 3: Custom Dictionary

Custom dictionary: {'ပီႊ', 'ၵွၺ်း', 'မႂ်ႇ', 'ၼႆႉ'}

Is 'ပိႊမႂ်ႇ' in custom dictionary? False

Suggestions for 'ပိႊမႂ်ႇ' from custom dictionary:

Example 4: SpellCorrector Class

SpellCorrector instance created
Settings: max_edit_distance=2, use_phonetic=True

Checking words:
  - 'ပီႊ': ✓ correct
  - 'မႂ်ႇ': ✓ correct
  - 'ပီႊ': ✓ correct
  - 'ၼႆႉ': ✓ correct
  - 'ၵွၺ်း': ✗ incorrect

Corrections for 'ပီႊမႂ်ႇ':
  - ပီမႂ်ႇ (confidence: 0.60)
  - ၼီႈမႂ်ႇ (confidence: 0.24)

Adding custom word 'TestWord'
Is 'TestWord' correct now? True
After removal, is 'TestWord' correct? False

Example 5: Advanced Options

Word: မိူင်တႆပဵၼ်မိူင်းၶိုၼ်ႉယႂ်

1. With frequency ranking (default):

2

In [None]:
from shannlp.spell.context import ContextAwareCorrector

corrector = ContextAwareCorrector()
corrector.load_model('shan_bigram.msgpack')

result = corrector.correct_sentence("ၸိူင်ၼႆႉ")
print("Corrected Sentence:", result)

Model loaded from shan_bigram.msgpack (MessagePack format)
  - Vocabulary size: 5408
  - Unique contexts: 6422
N-gram model loaded (2-gram)
Corrected Sentence: ၸိုင် ၼႆႉ


In [59]:
from shannlp.spell.ngram import NgramModel

model = NgramModel.load('shan_bigram.msgpack')

# Count word frequencies
word_freq = {}
for context, words in model.ngram_counts.items():
    for word, count in words.items():
        word_freq[word] = word_freq.get(word, 0) + count

# Distribution
counts = list(word_freq.values())
print(f'Total unique words: {len(counts)}')
print(f'Words appearing 1 time: {sum(1 for c in counts if c == 1)}')
print(f'Words appearing 2-5 times: {sum(1 for c in counts if 2 <= c <= 5)}')
print(f'Words appearing 6-10 times: {sum(1 for c in counts if 6 <= c <= 10)}')
print(f'Words appearing 10+ times: {sum(1 for c in counts if c > 10)}')

Model loaded from shan_bigram.msgpack (MessagePack format)
  - Vocabulary size: 5408
  - Unique contexts: 6422
Total unique words: 5409
Words appearing 1 time: 0
Words appearing 2-5 times: 1672
Words appearing 6-10 times: 998
Words appearing 10+ times: 2739


In [69]:
from shannlp.spell.context import ContextAwareCorrector

corrector = ContextAwareCorrector()
corrector.load_model('shan_trigram.msgpack')

# Test with a real sentence containing a misspelling
result = corrector.correct_sentence("ၵူၼ်မိူင်း ၵိၼ် ၶဝ်ႈ")
print(result)

Model loaded from shan_trigram.msgpack (MessagePack format)
  - Vocabulary size: 5408
  - Unique contexts: 134612
N-gram model loaded (3-gram)
ၵူၼ် မိူင်း ၵိၼ် ၶဝ်ႈ


In [68]:
from shannlp import word_tokenize, spell_correct
from shannlp.corpus import shan_words

# 1. Check tokenization
print("Tokenization:", word_tokenize("ၸိူင်ၼႆႉ"))

# 2. Check if correct word is in dictionary
words = shan_words()
print("ၸိူဝ်း in dictionary:", "ၸိူဝ်း" in words)
print("ၸိူဝ်းၼႆႉ in dictionary:", "ၸိူဝ်းၼႆႉ" in words)
print("ၸိူင် in dictionary:", "ၸိူင်" in words)
print("ၸိုင် in dictionary:", "ၸိုင်" in words)

# 3. Check basic spell correction (without context)
print("\nBasic spell_correct for ၸိူင်:")
print(spell_correct("ၸိူင်"))

Tokenization: ['ၸိူင်', 'ၼႆႉ']
ၸိူဝ်း in dictionary: True
ၸိူဝ်းၼႆႉ in dictionary: True
ၸိူင် in dictionary: False
ၸိုင် in dictionary: True

Basic spell_correct for ၸိူင်:
[('ၸိုင်', 0.6903534427257799), ('ပိူင်', 0.6889404940078372), ('ၸိူင်ႉ', 0.6348846759319655), ('ၸိူင်း', 0.6071980039820328), ('လိူင်', 0.5897924261924)]


In [67]:
from shannlp import spell_correct

print("Testing ၸိူင် correction:")
results = spell_correct("ၸိူင်")
for word, score in results:
    print(f"  {word}: {score:.4f}")

print("\nIs ၸိူဝ်း in results?", any(w == "ၸိူဝ်း" for w, _ in results))

Testing ၸိူင် correction:
  ၸိုင်: 0.6904
  ပိူင်: 0.6889
  ၸိူင်ႉ: 0.6349
  ၸိူင်း: 0.6072
  လိူင်: 0.5898

Is ၸိူဝ်း in results? False


In [12]:
from pythainlp.spell import correct
from pythainlp.tokenize import word_tokenize

word_token = word_tokenize("เกิดเหตุผู้ต้องขังหลบหนจากเรือนจำชั่วคราวดอยฮาง")

for word in word_token:
    print(correct(word))


เกิดเหตุ
ผู้ต้องขัง
หลบ
หน
จาก
เรือนจำ
ชั่วคราว
ดอย
ฮา
ลง
