# The approach to solving the problem:

1. Run the default solution.
2. Compare the output to the expected output.
3. Iteratively optimise.

## 1. Running the default solution

In [1]:
from default import *
from io import StringIO

In [2]:
with StringIO("4\tit will put your maind into non-stop learning.") as f:
    for (locations, spellchk_sent) in spellchk(f):
        print("{locs}\t{sent}".format(
            locs=",".join([str(i) for i in locations]),
            sent=" ".join(spellchk_sent)
        ))

4	it will put your mind into non-stop learning.


## So far, looks like it does a good job
## Let's write functions to see sentences compare to expected output

In [3]:
def print_corrected_sentence(sentence):
    with StringIO(sentence) as f:
        for (locations, spellchk_sent) in spellchk(f):
            print("{locs}\t{sent}".format(
                locs=",".join([str(i) for i in locations]),
                sent=" ".join(spellchk_sent)
            ))

print_corrected_sentence("4\tit will put your maind into non-stop learning.")
print_corrected_sentence("8\tThere was no doubt that Herr Schaffner meant evey word of what he said .")
print_corrected_sentence("5,14\tJust before Myra left -- Sue was saying good-by to Cathy , and she didm't realize I was near '' .")

4	it will put your mind into non-stop learning.
8	There was no doubt that Herr Schaffner meant every word of what he said .
5,14	Just before Myra left -- cathy was saying good-by to Cathy , and she did realize I was near '' .


The code seems to work in some instances, however it is unexpectedly changing a few words that it should not be and thus needs to be further optomized to ensure a better success rate

## Let's optimise 

In [4]:
import Levenshtein

In [5]:
def select_correction_v2(typo, predict):
    # return the most likely prediction for the mask token
    predictions = [item['token_str'] for item in predict]
    most_similar = min(predictions, key=lambda x: Levenshtein.distance(x, typo))
    return most_similar

In [6]:
def spellchk_v2(fh):
    for (locations, sent) in get_typo_locations(fh):
        spellchk_sent = sent
        for i in locations:
            # predict top_k replacements only for the typo word at index i
            predict = fill_mask(
                " ".join([ sent[j] if j != i else mask for j in range(len(sent)) ]), 
                top_k=20
            )
            logging.info(predict)
            spellchk_sent[i] = select_correction_v2(sent[i], predict)
        yield(locations, spellchk_sent)

In [7]:
def print_corrected_sentence_v2(sentence):
    with StringIO(sentence) as f:
        for (locations, spellchk_sent) in spellchk_v2(f):
            print("{locs}\t{sent}".format(
                locs=",".join([str(i) for i in locations]),
                sent=" ".join(spellchk_sent)
            ))

In [8]:
print_corrected_sentence("4\tit will put your maind into non-stop learning.")
print_corrected_sentence("8\tThere was no doubt that Herr Schaffner meant evey word of what he said .")
print_corrected_sentence("5,14\tJust before Myra left -- Sue was saying good-by to Cathy , and she didm't realize I was near '' .")

4	it will put your mind into non-stop learning.
8	There was no doubt that Herr Schaffner meant every word of what he said .
5,14	Just before Myra left -- cathy was saying good-by to Cathy , and she did realize I was near '' .


Once again, further progress can be seen however there seems to be an interference in the code that doesn't allow for the acceptance of apostrophe seperated words. In this case, it is reading it as didm and t thus resulting in the creation of the word did and not didn't

In [9]:
print(spellchk_v2("5,14\tJust before Myra left -- Sue was saying good-by to Cathy , and she didm't realize I was near '' ."))

<generator object spellchk_v2 at 0x000001C204431300>


In [10]:
fh = "5,14\tJust before Myra left -- Sue was saying good-by to Cathy, and she didm't realize I was near '' ."
fh = StringIO(fh)  # Converts string to a file-like object

for locations, spellchk_sent in spellchk_v2(fh):
    print(f"Locations of typos in the sentence: {locations}")
    print(f"Corrected sentence: {' '.join(spellchk_sent)}")
    print("\n")  # Just for a clean separation between different sentences


Locations of typos in the sentence: [5, 14]
Corrected sentence: Just before Myra left -- she was saying good-by to Cathy, and she didm't realize I was near '' .




The code is able to identify some of the potential errors within the sentence however it is not adequately handling the errors and fixing them in a correct manner. This much be due to its probability score.

We must further train the model to better understand the desired words


In [21]:
print(select_correction_v2("maind", fill_mask("it will put your [MASK] into non-stop learning.", top_k=20)))

mind


In [12]:
print(select_correction_v2("cant'", fill_mask("I [MASK] do this tonight.", top_k=20)))

can


In [13]:
predictions = fill_mask("it will put your [MASK] into non-stop learning.", top_k=20)
for prediction in predictions:
    word = prediction['token_str']
    probability = prediction['score']
    print(f"Word: {word}, Probability: {probability:.4f}")


Word: mind, Probability: 0.1139
Word: talents, Probability: 0.0942
Word: skills, Probability: 0.0432
Word: brain, Probability: 0.0412
Word: creativity, Probability: 0.0248
Word: hands, Probability: 0.0232
Word: imagination, Probability: 0.0197
Word: brains, Probability: 0.0186
Word: insight, Probability: 0.0155
Word: lessons, Probability: 0.0138
Word: life, Probability: 0.0127
Word: skill, Probability: 0.0125
Word: soul, Probability: 0.0115
Word: fingers, Probability: 0.0114
Word: students, Probability: 0.0108
Word: talent, Probability: 0.0108
Word: education, Probability: 0.0104
Word: attention, Probability: 0.0099
Word: learning, Probability: 0.0099
Word: children, Probability: 0.0094


In [14]:
# Predict for the first [MASK]
predictions1 = fill_mask("Just before Myra left -- [MASK] was saying good-by to Cathy , and she didn't realize I was near", top_k=20)
print("Predictions for the first mask:")
for prediction in predictions1:
    word = prediction['token_str']
    probability = prediction['score']
    print(f"Word: {word}, Probability: {probability:.4f}")

# Predict for the second [MASK]
predictions2 = fill_mask("Just before Myra left -- she was saying good-by to Cathy , and she [MASK] realize I was near", top_k=20)
print("\nPredictions for the second mask:")
for prediction in predictions2:
    word = prediction['token_str']
    probability = prediction['score']
    print(f"Word: {word}, Probability: {probability:.4f}")

# Predict for the second [MASK]
predictions3 = fill_mask("I am innocent, I [MASK] [MASK] commit the crime", top_k=20)
print("\nPredictions for the third mask:")
for prediction in predictions2:
    word = prediction['token_str']
    probability = prediction['score']
    print(f"Word: {word}, Probability: {probability:.4f}")

Predictions for the first mask:
Word: cathy, Probability: 0.0883
Word: she, Probability: 0.0323
Word: myra, Probability: 0.0306
Word: liz, Probability: 0.0300
Word: janice, Probability: 0.0191
Word: tammy, Probability: 0.0147
Word: i, Probability: 0.0145
Word: carol, Probability: 0.0123
Word: mum, Probability: 0.0107
Word: becky, Probability: 0.0102
Word: laura, Probability: 0.0092
Word: sara, Probability: 0.0090
Word: josie, Probability: 0.0085
Word: steve, Probability: 0.0080
Word: melanie, Probability: 0.0078
Word: susie, Probability: 0.0073
Word: linda, Probability: 0.0067
Word: sharon, Probability: 0.0067
Word: beth, Probability: 0.0067
Word: mom, Probability: 0.0065

Predictions for the second mask:
Word: did, Probability: 0.8966
Word: would, Probability: 0.0343
Word: could, Probability: 0.0338
Word: might, Probability: 0.0170
Word: must, Probability: 0.0069
Word: should, Probability: 0.0046
Word: does, Probability: 0.0021
Word: had, Probability: 0.0007
Word: may, Probability: 0.

Clearly the code still seperates didm and t into seperate words and thus impacts the ability for it to select the correct replacement word in the case.

In [15]:
# Predict for the two [MASK]
predictions3 = fill_mask("I am innocent, I [MASK]'[MASK] commit the crime", top_k=20)
print("\nPredictions for the third mask:")
for prediction in predictions3:
    print(prediction[0]['token_str'])
    # word = prediction['token_str']
    # probability = prediction['score']
    # print(f"Word: {word}, Probability: {probability:.4f}")


Predictions for the third mask:
don
t


In [16]:
predictions4 = fill_mask("Just before Myra left -- she was saying good-by to Cathy , and she [MASK]'[MASK] realize I was near", top_k=20)
print("\nPredictions for the third mask:")
    
word = predictions4[0][0]['token_str'] + "'" + predictions4[1][0]['token_str']

print(word)


Predictions for the third mask:
didn't


In [17]:
import numpy as np
from numpy.linalg import norm

def get_word_vector(word):
    # Placeholder function - replace with actual model to get word vectors
    # For example, using Word2Vec: return word2vec_model[word]
    return np.random.rand(300)  # Dummy 300-dim vector

def sentence_to_vector(sentence):
    words = sentence.split()
    word_vectors = [get_word_vector(word) for word in words]
    sentence_vector = np.mean(word_vectors, axis=0)
    return sentence_vector

def cosine_similarity(vecA, vecB):
    return np.dot(vecA, vecB) / (norm(vecA) * norm(vecB))

original_sentence = "Just before Myra left -- Sue was saying good-by to Cathy, and she didn't realize I was near"
corrected_sentence_1 = "Just before Myra left -- cathy was saying goodbye to Cathy, and she did realize I was near"
corrected_sentence_2 = "Just before Myra left -- she was saying goodbye to Cathy, and she didn't realize I was near"

vec_original = sentence_to_vector(original_sentence)
vec_corrected_1 = sentence_to_vector(corrected_sentence_1)
vec_corrected_2 = sentence_to_vector(corrected_sentence_2)

similarity_1 = cosine_similarity(vec_original, vec_corrected_1)
similarity_2 = cosine_similarity(vec_original, vec_corrected_2)

print(f"Cosine similarity: {similarity_1}")
print(f"Cosine similarity: {similarity_2}")


Cosine similarity: 0.9831833293162868
Cosine similarity: 0.9829060353262221


In [18]:
%pip install bert_score

Note: you may need to restart the kernel to use updated packages.


In [19]:
from bert_score import score

P, R, F1 = score([corrected_sentence_1], [original_sentence], lang="en")
print(f"BERTScore: Precision: {P.mean()}, Recall: {R.mean()}, F1 Score: {F1.mean()}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: Precision: 0.9623104333877563, Recall: 0.9623662233352661, F1 Score: 0.9623383283615112


In [20]:
P, R, F1 = score([corrected_sentence_2], [original_sentence], lang="en")
print(f"BERTScore: Precision: {P.mean()}, Recall: {R.mean()}, F1 Score: {F1.mean()}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: Precision: 0.9794557094573975, Recall: 0.9716479182243347, F1 Score: 0.9755361676216125
