# **Programming Assessment \#4**

Names: \<please supply your names\>

More information on the assessment is found in our Canvas course.

# **Load Data**

*While you don't have to separate your code into blocks, it might be easier if you separated loading your data from actually implementation of your code. Consider placing all loading of data into the code block below.*

In [5]:
!pip install editdistpy




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# **Noisy Channel Model Implementation**

*Again, you don't have to follow this directly, but consider placing your implementation of the model in the code block below.*

In [14]:
from collections import Counter
import pandas as pd
import numpy as np

from editdistpy import damerau_osa

# Finder
class Finder:
    def __init__(self, model):
        self.model = model

    def queryModel(self, tok):
        exists = tok in self.model
        val = self.model[tok] if exists else None
        return exists, val

    def getD(self, tok):
        candidates = []
        max_d = 1
        sentinel = 45 << 1
        while not (candidates or max_d > sentinel):
            candidates = [word for word in self.model.keys() if damerau_osa.distance(tok, word, max_d) > -1]
            max_d += 1
        return candidates

    def getCandidates(self, tok, n=5):
        exists, _ = self.queryModel(tok)
        candidates = []
        if exists:
            return [tok]
        else:
            candidates = self.getD(tok)
            p_c = [self.model[word] for word in candidates]
            weighted = list(zip(candidates, p_c))
            weighted.sort(key=lambda tu: tu[1], reverse=True)
            islong = len(weighted) >= n
            return weighted[::n] if islong else weighted

# Modeler
class Modeler:
    def __init__(self, corpus):
        if isinstance(corpus, str):

            words = corpus.split()
            counts = Counter(words)
            self.model = self.getP_C(counts)
        elif isinstance(corpus, Counter):
            self.model = self.getP_C(corpus)

    def getP_C(self, counts):
        total = sum(counts.values())
        r_t = 1 / total
        keys = counts.keys()
        vals = np.array(list(counts.values())) * r_t
        return dict(zip(keys, vals))



def calculate_edit_type_and_edit(input_word, candidate_word):
    if len(input_word) == len(candidate_word):
        # substitution
        diff_count = 0
        edit_type = "sub"
        edit = ""
        for i in range(len(input_word)):
            if input_word[i] != candidate_word[i]:
                diff_count += 1
                edit += candidate_word[i]
        if diff_count == 1:
            return edit_type, edit

    elif len(input_word) + 1 == len(candidate_word):
        # insertion
        edit_type = "ins"
        edit = ""
        i, j = 0, 0
        while i < len(input_word) and j < len(candidate_word):
            if input_word[i] != candidate_word[j]:
                edit += candidate_word[j]
                j += 1
            else:
                i += 1
                j += 1
        if j == len(candidate_word) - 1:
            return edit_type, edit

    elif len(input_word) - 1 == len(candidate_word):
        # deletion
        edit_type = "del"
        edit = ""
        i, j = 0, 0
        while i < len(input_word) and j < len(candidate_word):
            if input_word[i] != candidate_word[j]:
                edit += input_word[i]
                i += 1
            else:
                i += 1
                j += 1
        if i == len(input_word) - 1:
            return edit_type, edit

    # transposition
    if len(input_word) == len(candidate_word) and input_word != candidate_word:
        for i in range(len(input_word) - 1):
            if input_word[i] == candidate_word[i + 1] and input_word[i + 1] == candidate_word[i]:
                edit_type = "trans"
                edit = input_word[i:i+2]
                return edit_type, edit

    return None, None

# someone load the 
corpus = "This is a sample text for the spell correction model. Modeler is working fine."
model = Modeler(corpus)
print(model.model)

finder = Finder(model.model)

token = "afar"
candidates = finder.getCandidates(token)

print(candidates)
results = []
# fun_ny = lambda cand:calculate_edit_type_and_edit(token,cand)
# genny = map(fun_ny,candidates)
for candidate, probability in candidates:
    edit_type, edit = calculate_edit_type_and_edit(token, candidate)
    if edit_type:
        result = {
            "word": token,
            "candidate": candidate,
            "edit_type": edit_type,
            "edit": edit,
            "P(c)": probability,
            "P(w|c)": model.model[candidate],
            "P(c) x P(w|c)": probability * model.model[candidate]
        }
    else:
        result = "broken"
        results.append(result)

df = pd.DataFrame(results)

print(df)



{'This': 0.07142857142857142, 'is': 0.14285714285714285, 'a': 0.07142857142857142, 'sample': 0.07142857142857142, 'text': 0.07142857142857142, 'for': 0.07142857142857142, 'the': 0.07142857142857142, 'spell': 0.07142857142857142, 'correction': 0.07142857142857142, 'model.': 0.07142857142857142, 'Modeler': 0.07142857142857142, 'working': 0.07142857142857142, 'fine.': 0.07142857142857142}
[('for', 0.07142857142857142)]
        0
0  broken
