In [30]:
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict

In [6]:
# Sample sentence used to demonstrate NLTK word tokenization in the next cell.
statement = "Hello my name is tiger!"

In [7]:
# Tokenize the sample sentence; the output below shows the trailing "!" split off
# as its own token.
tokens = word_tokenize(statement)
tokens  # bare expression so the notebook displays the token list

['Hello', 'my', 'name', 'is', 'tiger', '!']

In [13]:
# Toy corpus for the term-frequency demo. The term-frequency cell enumerates this
# list, so a document's position in it serves as its document id.
documents = [
    "I love programming in Python as programming is interesting",
    "I think Phonetic spell correction is interesting",
    "Soundex is a useful algorithm for spelling"
]

In [14]:
# Term-frequency table: term -> {doc_id: number of occurrences in that document}.
term_frequency = defaultdict(dict)
for doc_id, doc in enumerate(documents):
    # Tokens here are lower-cased, whitespace-separated words.
    tokens = doc.lower().split()
    print("The tokenization for doc" ,doc_id, " yielded : ", tokens)
    for token in tokens:
        # The defaultdict supplies an empty per-term dict on first access, so a
        # single "current count (default 0) plus one" update handles both new
        # and previously seen terms.
        per_doc_counts = term_frequency[token]
        per_doc_counts[doc_id] = per_doc_counts.get(doc_id, 0) + 1
print(term_frequency)

The tokenization for doc 0  yielded :  ['i', 'love', 'programming', 'in', 'python', 'as', 'programming', 'is', 'interesting']
The tokenization for doc 1  yielded :  ['i', 'think', 'phonetic', 'spell', 'correction', 'is', 'interesting']
The tokenization for doc 2  yielded :  ['soundex', 'is', 'a', 'useful', 'algorithm', 'for', 'spelling']
defaultdict(<class 'dict'>, {'i': {0: 1, 1: 1}, 'love': {0: 1}, 'programming': {0: 2}, 'in': {0: 1}, 'python': {0: 1}, 'as': {0: 1}, 'is': {0: 1, 1: 1, 2: 1}, 'interesting': {0: 1, 1: 1}, 'think': {1: 1}, 'phonetic': {1: 1}, 'spell': {1: 1}, 'correction': {1: 1}, 'soundex': {2: 1}, 'a': {2: 1}, 'useful': {2: 1}, 'algorithm': {2: 1}, 'for': {2: 1}, 'spelling': {2: 1}})


In [15]:
# Show the term-frequency table, one term per line with its {doc_id: count} mapping
for term in term_frequency:
    doc_freq = term_frequency[term]
    print(f"{term}: {doc_freq}")


i: {0: 1, 1: 1}
love: {0: 1}
programming: {0: 2}
in: {0: 1}
python: {0: 1}
as: {0: 1}
is: {0: 1, 1: 1, 2: 1}
interesting: {0: 1, 1: 1}
think: {1: 1}
phonetic: {1: 1}
spell: {1: 1}
correction: {1: 1}
soundex: {2: 1}
a: {2: 1}
useful: {2: 1}
algorithm: {2: 1}
for: {2: 1}
spelling: {2: 1}


In [44]:
import nltk
from collections import defaultdict
from nltk.util import ngrams

In [36]:
# Tokenization
def tokenize(text):
    """Return the list of tokens NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = nltk.word_tokenize(text)
    return tokens

# Inverted Index Construction
def build_inverted_index(documents):
    """Build an inverted index mapping each token to the documents containing it.

    Args:
        documents: Iterable of document strings. A document's position in the
            iteration order is used as its document id.

    Returns:
        A ``defaultdict(list)`` whose keys are tokens (as returned by
        ``tokenize``, case preserved) and whose values are posting lists of
        document ids in ascending order. Each document id appears at most once
        per token, even when the token occurs several times in that document.
    """
    inverted_index = defaultdict(list)
    for doc_id, text in enumerate(documents):
        # dict.fromkeys de-duplicates the tokens of this document while keeping
        # first-occurrence order, so a repeated token does not add the same
        # doc_id to its posting list more than once.
        for token in dict.fromkeys(tokenize(text)):
            inverted_index[token].append(doc_id)
    return inverted_index


# Corpus for the inverted-index and spell-correction demo.
# NOTE: this rebinds `documents`, replacing the corpus assigned earlier in the
# notebook for the term-frequency demo.
documents = [
    "I love programming in Python",
    "Phonetic spell correction is interesting",
    "Soundex is a useful algorithm for spelling"
]

In [42]:
# Generate n-grams (n-character n-grams, default is 3)
def generate_ngrams(word, n=3):
    """Return the character n-grams of ``word`` as a list of strings.

    ``word`` is passed to ``ngrams`` as a sequence of characters; every n-gram
    it yields is joined back into a single string. ``n`` defaults to 3.
    """
    return list(map(''.join, ngrams(word, n)))

# N-Gram Based Spell Correction
def spell_correction_ngram(word, vocabulary, n=3):
    """Pick the vocabulary word whose character n-grams best match ``word``.

    Candidates are scored by the Jaccard similarity of their n-gram sets
    (size of the intersection divided by size of the union). The comparison is
    case-sensitive because the n-grams are taken from the strings as given.

    Args:
        word: The (possibly misspelled) word to correct.
        vocabulary: Iterable of candidate words.
        n: n-gram length passed to ``generate_ngrams``.

    Returns:
        The first candidate with the highest similarity strictly greater than
        zero, or ``None`` when no candidate shares any n-gram with ``word``.
    """
    query_grams = set(generate_ngrams(word, n))
    best_candidate, best_score = None, 0

    for candidate in vocabulary:
        candidate_grams = set(generate_ngrams(candidate, n))
        union_size = len(query_grams | candidate_grams)

        # Jaccard similarity; an empty union (neither word has any n-gram)
        # scores zero instead of dividing by zero.
        if union_size > 0:
            score = len(query_grams & candidate_grams) / union_size
        else:
            score = 0

        # Strict comparison: on ties the earliest candidate is kept.
        if score > best_score:
            best_candidate, best_score = candidate, score

    return best_candidate

# Soundex Algorithm for Phonetic Spell Correction
def soundex(word):
    """Return the four-character Soundex code for ``word``.

    The code is the upper-cased first character followed by up to three digits
    encoding the remaining consonants, padded with zeros to length four.

    Coding rules applied:
      * B, F, P, V -> 1; C, G, J, K, Q, S, X, Z -> 2; D, T -> 3; L -> 4;
        M, N -> 5; R -> 6. All other characters carry no digit.
      * Adjacent letters with the same digit are coded once. This includes the
        letter right after the first one: in "Pfister" the F is dropped because
        it shares digit 1 with the leading P.
      * H and W are skipped and do not separate letters with the same digit
        ("Ashcraft" -> "A261").
      * Any other uncoded character (vowels, Y, non-letters) does separate them,
        so the same digit may then repeat ("Tymczak" -> "T522").

    Args:
        word: The string to encode.

    Returns:
        The four-character code, or ``""`` when ``word`` is empty.
    """
    if not word:
        # There is no first character to anchor the code on.
        return ""

    word = word.upper()

    soundex_mapping = {
        "BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4",
        "MN": "5", "R": "6"
    }
    # Flatten the grouped mapping into a direct letter -> digit lookup.
    letter_codes = {
        letter: digit
        for letters, digit in soundex_mapping.items()
        for letter in letters
    }

    soundex_code = word[0]
    # Track the digit of the previous significant letter (not the last output
    # character), so the first letter's own digit also suppresses duplicates.
    previous_code = letter_codes.get(word[0], "")

    for char in word[1:]:
        if char in "HW":
            # Transparent letters: leave previous_code untouched.
            continue
        code = letter_codes.get(char, "")
        if code and code != previous_code:
            soundex_code += code
            if len(soundex_code) == 4:
                break
        # Uncoded characters reset this to "", allowing a repeat of the digit.
        previous_code = code

    return (soundex_code + "000")[:4]  # Pad with zeros if necessary

def spell_correction_soundex(word, vocabulary):
    """Correct ``word`` to a vocabulary entry that has the same Soundex code.

    Args:
        word: The (possibly misspelled) word to correct.
        vocabulary: Collection of known words. It is scanned up to twice (once
            for an exact match, once for a phonetic match), so it should be a
            re-iterable container such as a list, set, or dict view.

    Returns:
        ``word`` itself when it is empty or already present in ``vocabulary``;
        otherwise the first vocabulary entry whose Soundex code equals that of
        ``word``; otherwise ``word`` unchanged.
    """
    # An empty word has nothing to encode, and a word that is already in the
    # vocabulary must not be replaced by an earlier entry sharing its code.
    if not word or word in vocabulary:
        return word

    word_soundex = soundex(word)
    for vocab_word in vocabulary:
        # Empty entries are skipped: they have no first character to encode.
        if vocab_word and soundex(vocab_word) == word_soundex:
            return vocab_word
    return word

# Example Usage

In [37]:
# Tokenize the first document as a quick check of tokenize(); the inverted
# index itself is built in the next cell.
tokens = tokenize(documents[0])
tokens


['I', 'love', 'programming', 'in', 'Python']

In [39]:
# Build the token -> document-id index over the whole corpus and display it.
inverted_index = build_inverted_index(documents)
inverted_index

defaultdict(list,
            {'I': [0],
             'love': [0],
             'programming': [0],
             'in': [0],
             'Python': [0],
             'Phonetic': [1],
             'spell': [1],
             'correction': [1],
             'is': [1, 2],
             'interesting': [1],
             'Soundex': [2],
             'a': [2],
             'useful': [2],
             'algorithm': [2],
             'for': [2],
             'spelling': [2]})

In [40]:
# Test Spell Correction
# The vocabulary is the list of distinct index terms, in the order they were
# first inserted into the inverted index.
vocabulary = list(inverted_index.keys())
vocabulary

['I',
 'love',
 'programming',
 'in',
 'Python',
 'Phonetic',
 'spell',
 'correction',
 'is',
 'interesting',
 'Soundex',
 'a',
 'useful',
 'algorithm',
 'for',
 'spelling']

In [41]:
misspelled_word = "phonetick"  # Intentional typo

# N-Gram Based Correction
# Uses the default trigram size (n=3) of spell_correction_ngram.
corrected_ngram = spell_correction_ngram(misspelled_word, vocabulary)
print(f"N-Gram Corrected Word: {corrected_ngram}")


N-Gram Corrected Word: Phonetic


In [43]:

# Soundex-Based Correction
# Returns the first vocabulary entry whose Soundex code matches the typo's code,
# or the typo itself when nothing matches.
corrected_soundex = spell_correction_soundex(misspelled_word, vocabulary)
print(f"Soundex Corrected Word: {corrected_soundex}")


Soundex Corrected Word: Phonetic
