In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored there can be opened through ordinary file paths.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
def process_data(file_path):
    """Read a UTF-8 text file and return its lower-cased, whitespace-split tokens.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings in file order. Splitting is on whitespace only, so
        punctuation stays attached to the neighbouring word (e.g. 'gutenberg,').
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    # str.split() with no arguments collapses runs of whitespace and never
    # yields empty strings, so an empty file produces an empty list.
    return contents.lower().split()

# Tokenise the Shakespeare text stored on the mounted Drive and preview the
# first ten tokens as a quick sanity check.
file_path = '/content/drive/MyDrive/mydataset/shakespeare.txt'
words = process_data(file_path)
print(words[:10])


['this', 'is', 'the', '100th', 'etext', 'file', 'presented', 'by', 'project', 'gutenberg,']


In [None]:
from collections import defaultdict

def get_count(word_list):
    """Count how many times each word occurs in ``word_list``.

    Args:
        word_list: Iterable of hashable tokens (typically strings).

    Returns:
        A plain ``dict`` mapping each distinct token to its number of
        occurrences, with keys in order of first appearance.
    """
    tally = {}
    for token in word_list:
        tally[token] = tally.get(token, 0) + 1
    return tally




In [None]:
def compute_word_probabilities(word_counts, total_words):
    """Convert raw word counts into relative frequencies.

    Args:
        word_counts: Mapping from word to its occurrence count.
        total_words: Denominator applied to every count.

    Returns:
        A new dict mapping each word to ``count / total_words``.
    """
    return {
        token: occurrences / total_words
        for token, occurrences in word_counts.items()
    }






In [None]:
def delete_letter(word):
    """Return every string obtained by removing exactly one character from ``word``.

    Args:
        word: The input string.

    Returns:
        A set of strings, each one character shorter than ``word``. The set is
        empty when ``word`` is empty.
    """
    # Drop the character at each position in turn; the set collapses duplicates
    # that arise from repeated adjacent letters.
    return {word[:pos] + word[pos + 1:] for pos in range(len(word))}

# Example: all strings reachable from 'nice' by deleting a single character.
word = 'nice'
deleted_words = delete_letter(word)
print(deleted_words)


{'nie', 'nic', 'nce', 'ice'}


In [None]:
def replace_letter(word):
    """Return every string obtained by substituting one character of ``word``.

    Each position is replaced in turn by each lowercase ASCII letter. Because a
    letter may be replaced by itself, the original ``word`` is part of the
    result whenever it consists of lowercase ASCII letters.

    Args:
        word: The input string.

    Returns:
        A set of strings with the same length as ``word``; empty when ``word``
        is empty.
    """
    return {
        word[:pos] + letter + word[pos + 1:]
        for pos in range(len(word))
        for letter in 'abcdefghijklmnopqrstuvwxyz'
    }

# Example: all strings reachable from 'ear' by replacing a single character
# (the result also contains 'ear' itself).
word = 'ear'
replaced_words = replace_letter(word)
print(replaced_words)


{'jar', 'err', 'eal', 'lar', 'ecr', 'ejr', 'rar', 'ead', 'oar', 'tar', 'eaw', 'sar', 'ean', 'ekr', 'ear', 'enr', 'eay', 'ebr', 'esr', 'far', 'eur', 'eaj', 'car', 'exr', 'dar', 'eau', 'qar', 'eav', 'har', 'uar', 'eak', 'eaf', 'egr', 'eae', 'aar', 'xar', 'eir', 'gar', 'edr', 'yar', 'war', 'eer', 'ewr', 'efr', 'elr', 'eyr', 'etr', 'iar', 'kar', 'ezr', 'eao', 'eaq', 'bar', 'eor', 'eqr', 'par', 'ehr', 'eat', 'evr', 'eam', 'eap', 'eaz', 'eas', 'mar', 'eai', 'epr', 'eac', 'eab', 'emr', 'nar', 'var', 'eag', 'eax', 'zar', 'eaa', 'eah'}


In [None]:
def insert_letter(word):
    """Return every string obtained by inserting one lowercase letter into ``word``.

    Insertion is tried at every gap, including before the first and after the
    last character.

    Args:
        word: The input string.

    Returns:
        A set of strings, each one character longer than ``word``.
    """
    return {
        word[:gap] + letter + word[gap:]
        for gap in range(len(word) + 1)
        for letter in 'abcdefghijklmnopqrstuvwxyz'
    }

# Example: all strings reachable from 'cat' by inserting a single lowercase letter.
word = 'cat'
inserted_words = insert_letter(word)
print(inserted_words)


{'czat', 'ccat', 'icat', 'caet', 'catg', 'caxt', 'bcat', 'cabt', 'cpat', 'catp', 'cavt', 'capt', 'caft', 'csat', 'crat', 'scat', 'ckat', 'cwat', 'gcat', 'cuat', 'cqat', 'catm', 'fcat', 'cata', 'catt', 'xcat', 'cajt', 'cant', 'cagt', 'caat', 'cdat', 'wcat', 'cato', 'catw', 'rcat', 'catn', 'caqt', 'ciat', 'catb', 'qcat', 'catd', 'kcat', 'clat', 'ycat', 'catx', 'catz', 'ucat', 'catv', 'caht', 'ecat', 'caut', 'tcat', 'vcat', 'cyat', 'ctat', 'cast', 'pcat', 'chat', 'mcat', 'cact', 'catc', 'cayt', 'catu', 'cbat', 'hcat', 'cjat', 'cxat', 'cats', 'cait', 'cgat', 'zcat', 'cakt', 'cati', 'cate', 'cart', 'dcat', 'camt', 'coat', 'catl', 'caot', 'catr', 'cath', 'calt', 'catq', 'ncat', 'acat', 'caty', 'cmat', 'lcat', 'ceat', 'jcat', 'cawt', 'cnat', 'cfat', 'catf', 'catk', 'catj', 'cazt', 'cvat', 'cadt', 'ocat'}


In [None]:
def edit_one_letter(word):
    """Return all strings one delete, replace, or insert away from ``word``.

    Args:
        word: The input string.

    Returns:
        A new set that is the union of the results of ``delete_letter``,
        ``replace_letter`` and ``insert_letter`` applied to ``word``.
    """
    candidates = set()
    for generate in (delete_letter, replace_letter, insert_letter):
        candidates.update(generate(word))
    return candidates

# Example: every string within one delete, replace, or insert of 'cat'.
word = 'cat'
one_edit_away = edit_one_letter(word)
print(one_edit_away)


{'czat', 'ccat', 'icat', 'caet', 'ckt', 'cao', 'catg', 'caxt', 'cht', 'yat', 'pat', 'bcat', 'cabt', 'cit', 'cpat', 'bat', 'cak', 'cap', 'caw', 'catp', 'cavt', 'cax', 'cpt', 'capt', 'caft', 'csat', 'caj', 'crat', 'scat', 'ckat', 'cmt', 'cwat', 'gat', 'cjt', 'eat', 'iat', 'zat', 'cay', 'gcat', 'cuat', 'cav', 'cqat', 'catm', 'fcat', 'ca', 'can', 'nat', 'cata', 'catt', 'cai', 'cft', 'xcat', 'cdt', 'cajt', 'cant', 'cst', 'cagt', 'hat', 'cah', 'cqt', 'car', 'caat', 'cdat', 'cal', 'cac', 'cvt', 'wcat', 'cato', 'vat', 'catw', 'kat', 'rcat', 'catn', 'clt', 'cut', 'caqt', 'czt', 'cct', 'ciat', 'catb', 'qcat', 'cat', 'rat', 'cgt', 'cxt', 'catd', 'cbt', 'kcat', 'clat', 'ycat', 'catx', 'qat', 'catz', 'cau', 'ucat', 'catv', 'cam', 'caht', 'caa', 'ecat', 'caut', 'cwt', 'dat', 'cet', 'tcat', 'vcat', 'ctt', 'uat', 'cyat', 'ctat', 'cast', 'pcat', 'chat', 'mcat', 'cact', 'catc', 'cad', 'cayt', 'xat', 'catu', 'cbat', 'hcat', 'at', 'cjat', 'cnt', 'cxat', 'cats', 'cait', 'cgat', 'wat', 'ct', 'zcat', 'caz', 

In [None]:
def edit_two_letters(word):
    """Return all strings reachable by applying ``edit_one_letter`` twice.

    Args:
        word: The input string.

    Returns:
        A set containing the union of ``edit_one_letter(w)`` for every ``w``
        in ``edit_one_letter(word)``.
    """
    reachable = set()
    for intermediate in edit_one_letter(word):
        reachable |= edit_one_letter(intermediate)
    return reachable

# Example: every string reachable from 'cat' by two successive single-letter
# edits. The resulting set is large, so the printed output is long.
word = 'cat'
two_edits_away = edit_two_letters(word)
print(two_edits_away)


{'latg', 'cpag', 'uqat', 'cazs', 'hapt', 'dcabt', 'dcatt', 'cantw', 'lcvt', 'chdt', 'bagt', 'unt', 'lczat', 'cgap', 'cynt', 'cjagt', 'calty', 'carw', 'cbnat', 'seat', 'cdti', 'ccatd', 'caitt', 'czatt', 'dai', 'cxta', 'dcct', 'clatw', 'cawp', 'fatx', 'clay', 'catlw', 'jcdt', 'ccatt', 'jcdat', 'cjajt', 'cahq', 'cbmat', 'ecatn', 'camts', 'xcan', 'jcqt', 'cmo', 'wmt', 'trat', 'ucay', 'ncatt', 'mcaj', 'cathr', 'coatc', 'cavr', 'nabt', 'cxazt', 'cgvt', 'cxata', 'tato', 'cqi', 'atn', 'lcatb', 'clak', 'ceatp', 'cauz', 'daa', 'caqg', 'gatm', 'cdb', 'catxo', 'cvk', 'wyt', 'saty', 'qcaet', 'xaet', 'coart', 'ecft', 'cavd', 'catrd', 'catvh', 'cyart', 'caotc', 'catxk', 'caxzt', 'canc', 'cahtx', 'ocjt', 'clate', 'craa', 'icait', 'cxte', 'kcar', 'cdatz', 'cbtl', 'camut', 'rvcat', 'cafl', 'hnat', 'caaft', 'cxt', 'catja', 'pctt', 'ha', 'cacdt', 'pcmt', 'csut', 'caets', 'catwy', 'zap', 'caus', 'ceabt', 'iait', 'hrcat', 'zaf', 'ccpat', 'citk', 'bcwat', 'pcata', 'uaat', 'cxtc', 'cjait', 'qatv', 'za', 'gcat

In [None]:
def get_corrections(word, vocab, n=3):
    """Suggest up to ``n`` in-vocabulary corrections for ``word``.

    Candidates are gathered in stages: the word itself (if known) together
    with every known word one edit away; if that yields nothing, every known
    word two edits away; if that also yields nothing, the word itself with
    probability 0.

    Args:
        word: The (possibly misspelled) input word.
        vocab: Mapping from known word to its numeric probability.
        n: Maximum number of suggestions to return.

    Returns:
        A list of ``(word, probability)`` tuples sorted by descending
        probability, with ties broken alphabetically, truncated to ``n``
        entries. Each suggested word appears at most once.
    """
    def _known(candidates):
        # Keep only the candidates present in the vocabulary.
        return {cand: vocab[cand] for cand in candidates if cand in vocab}

    # A dict keyed by word guarantees uniqueness. The one-edit candidates can
    # contain `word` itself (a letter replaced by the same letter), which would
    # otherwise list an in-vocabulary word twice.
    best_words = {}
    if word in vocab:
        best_words[word] = vocab[word]
    best_words.update(_known(edit_one_letter(word)))

    if not best_words:
        best_words.update(_known(edit_two_letters(word)))
    if not best_words:
        best_words[word] = 0

    # Secondary alphabetical key makes the order of equal-probability
    # suggestions independent of set iteration order.
    ranked = sorted(best_words.items(), key=lambda item: (-item[1], item[0]))
    return ranked[:n]

# Example: a toy vocabulary mapping words to probabilities. 'dat' is not in the
# vocabulary, so suggestions come from known words one edit away; 'dog' needs
# two edits and is therefore not suggested despite its higher probability.
vocab = {
    'cat': 0.6,
    'dog': 0.8,
    'bat': 0.4,
    'rat': 0.5,
    'mat': 0.3
}

word = 'dat'
corrections = get_corrections(word, vocab)
print(corrections)


[('cat', 0.6), ('rat', 0.5), ('bat', 0.4)]


In [None]:
def min_distance(source, target):
    """Compute the edit distance between ``source`` and ``target``.

    Insertions, deletions and substitutions each cost 1; matching characters
    cost nothing.

    Args:
        source: The starting string.
        target: The string to transform ``source`` into.

    Returns:
        The minimum total cost, as an int.
    """
    # previous_row[j] is the distance between the source prefix processed so
    # far and target[:j]; it starts as the cost of building target[:j] from "".
    previous_row = list(range(len(target) + 1))
    for row_idx, src_char in enumerate(source, start=1):
        # Column 0: delete all row_idx characters of the source prefix.
        current_row = [row_idx]
        for col_idx, tgt_char in enumerate(target, start=1):
            if src_char == tgt_char:
                cell = previous_row[col_idx - 1]
            else:
                cell = 1 + min(
                    previous_row[col_idx],      # delete from source
                    current_row[col_idx - 1],   # insert into source
                    previous_row[col_idx - 1],  # substitute
                )
            current_row.append(cell)
        previous_row = current_row
    return previous_row[-1]

# Example: edit distance between 'kitten' and 'sitting'.
source = "kitten"
target = "sitting"
edit_distance = min_distance(source, target)
print(f"Edit distance between '{source}' and '{target}': {edit_distance}")


Edit distance between 'kitten' and 'sitting': 3
