## Task 1

In [49]:
import re

def parse_file(filename):
    with open(filename, 'r') as f:
        text = f.readlines()

    data = []
    for lines in text:
        
        words = lines.strip().split()

        original = ' '.join(words)
        corrected = []
        indexes = []
        for i, word in enumerate(words):
            if re.search('[^a-zA-Z]+', word):
                corrected.append(word)
                indexes.append(i)
            else:
                corrected.append(word)
        data.append((original, ' '.join(corrected), indexes))

    return data

In [50]:
data = parse_file('Lab06.txt')
test_set = data[:100]
training_set = data[100:]

## Task 2

In [51]:
from collections import Counter

def get_frequencies(training_set):
    text = ' '.join([sentence[1] for sentence in training_set]).lower()

    words = text.split()
    bigrams = [f'{words[i]} {words[i+1]}' for i in range(len(words) - 1)]

    word_freq = Counter(words)
    bigram_freq = Counter(bigrams)

    return word_freq, bigram_freq


In [52]:
word_freq, bigram_freq = get_frequencies(training_set)
print(word_freq)
print(bigram_freq)

Counter({'the': 1326, 'and': 909, '.': 886, 'a': 611, 'to': 536, 'i': 505, 'in': 317, 'he': 293, ',': 288, 'they': 272, 'was': 268, 'of': 263, 'it': 257, 'on': 209, 'you': 194, 'went': 190, 'is': 175, 'my': 153, 'we': 144, 'for': 143, 'said': 143, 'she': 130, 'with': 127, 'got': 118, 'when': 117, 'had': 115, 'so': 112, 'have': 112, 'up': 109, 'one': 107, 'then': 107, 'but': 105, 'out': 105, 'not': 103, 'that': 101, 'at': 101, 'will': 100, 'go': 95, 'down': 93, 'all': 90, 'get': 86, 'me': 86, 'her': 82, 'there': 80, 'man': 80, 'were': 73, 'his': 72, 'are': 70, 'see': 69, 'old': 69, 'him': 66, 'do': 66, 'very': 63, 'like': 61, 'some': 60, 'am': 60, 'be': 59, '"': 58, 'about': 55, 'john': 55, 'day': 53, 'them': 52, 'as': 50, 'back': 50, 'come': 49, 'house': 47, 'came': 47, 'this': 45, 'home': 45, 'has': 43, 'two': 43, 'what': 42, 'jean': 42, 'if': 42, 'going': 41, 'work': 41, 'boy': 41, 'after': 40, 'well': 39, 'would': 39, 'or': 39, 'police': 39, 'did': 37, 'mother': 35, 'little': 34, 'c

## Task 3

In [54]:
from nltk.metrics import edit_distance

def get_similar_words(word, train):
    train_tokens = set(train)

    min_distance = float('inf')
    for token in train_tokens:
        distance = edit_distance(word, token)
        if distance < min_distance:
            min_distance = distance

    similar_words = set()
    for token in train_tokens:
        if edit_distance(word, token) == min_distance:
            similar_words.add(token)

    return similar_words

In [55]:
train = set([word.lower() for sentence in training_set for word in sentence[1].split()])
similar_words = get_similar_words('speling', train)
print(similar_words)

{'spelling'}


## Task 4

In [56]:
def correct_sentence(sentence, train_words, train_bigrams, word_freq, bigram_freq):
    corrected_sentence = []

    words = sentence.split()

    for i, word in enumerate(words):
        if word not in train_words:
            candidates = get_similar_words(word, train_words)
            candidate_probs = {}
            for candidate in candidates:
                if i == 0:
                    bigram_prob = bigram_freq[(candidate, words[i+1])] / word_freq[candidate]
                elif i == len(words) - 1:
                    bigram_prob = bigram_freq[(words[i-1], candidate)] / word_freq[candidate]
                else:
                    bigram_prob = bigram_freq[(words[i-1], candidate)] / word_freq[candidate] * bigram_freq[(candidate, words[i+1])] / word_freq[candidate]
                candidate_probs[candidate] = bigram_prob

           
            corrected_word = max(candidate_probs, key=candidate_probs.get)
            corrected_sentence.append(corrected_word)
        else:
            
            corrected_sentence.append(word)

   
    corrected_sentence = ' '.join(corrected_sentence)

    return corrected_sentence


In [61]:
train_words = set([word.lower() for sentence in training_set for word in sentence[1].split()])
train_bigrams = set([f'{sentence[1].split()} {sentence[1].split()}' for sentence in training_set for i in range(len(sentence[1].split()) - 1)])
word_freq, bigram_freq = get_frequencies(training_set)

sentence = 'this is an exampel of a misspeled sentence'
corrected_sentence = correct_sentence(sentence, train_words, train_bigrams, word_freq, bigram_freq)
print(corrected_sentence)

this is an examined of a kissed sixpence


## Task 5

In [69]:
nltk.download('punkt')
corrected_sentences = []

for sentence in test_set:
    corrected = correct_sentence(sentence[1], train_words, train_bigrams, word_freq, bigram_freq)
    corrected_sentences.append(corrected)

total_words = 0
substitutions = 0
deletions = 0
insertions = 0

for i in range(len(test_set)):
    reference_tokens = nltk.word_tokenize(test_set[i][1])
    corrected_tokens = nltk.word_tokenize(corrected_sentences[i])
    total_words += len(reference_tokens)
    edit_distance = nltk.edit_distance(reference_tokens, corrected_tokens)
    substitutions += edit_distance
    deletions += len(reference_tokens) - len(corrected_tokens) + edit_distance
    insertions += len(corrected_tokens) - len(reference_tokens) + edit_distance

WER = (substitutions + deletions + insertions) / total_words
print('Word error rate:', WER)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Word error rate: 0.8402107111501317
