In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

nltk.download('punkt')
nltk.download('gutenberg')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Drogias\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Drogias\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
from src.spell_correction import BigramSpellCorrector, TrigramSpellCorrector
from src.autocomplete import BigramModel, START_TOKEN, END_TOKEN, TrigramModel

In [3]:
gutenberg_corpus = nltk.corpus.gutenberg.fileids()                                 #Get all the files
gutenberg_corpus

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
combined_text = ""             
for file_id in gutenberg_corpus:                                        # Combine the text from all files
    combined_text += nltk.corpus.gutenberg.raw(file_id)

print(combined_text[:500])

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died t


In [5]:
combined_text = combined_text.lower()                              #Convert to lowercase

In [6]:
combined_text[:500]

"[emma by jane austen 1816]\n\nvolume i\n\nchapter i\n\n\nemma woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.\n\nshe was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.  her mother\nhad died t"

In [7]:
import re

def remove_special_chars(text):
   text = text.replace('[', '')
   text = text.replace(']', '')
   text = text.replace('\n', ' ')
   text = re.sub(r'[^a-zA-z.?!\']', ' ', text)                     #Remove these characters   

   return text

In [8]:
combined_text = remove_special_chars(combined_text)
combined_text[:500]

"emma by jane austen       volume i  chapter i   emma woodhouse  handsome  clever  and rich  with a comfortable home and happy disposition  seemed to unite some of the best blessings of existence  and had lived nearly twenty one years in the world with very little to distress or vex her.  she was the youngest of the two daughters of a most affectionate  indulgent father  and had  in consequence of her sister's marriage  been mistress of his house from a very early period.  her mother had died too"

In [9]:
len(combined_text.split())                     

2119883

In [10]:
len(combined_text)                             # How many characters

11793056

In [11]:
combined_text[11792000:]

"n the sea  the universe  the stars there in the     heavens   urging slowly  surely forward  forming endless  and waiting ever more  forever more behind.       good bye my fancy!  good bye my fancy! farewell dear mate  dear love! i'm going away  i know not where  or to what fortune  or whether i may ever see you again  so good bye my fancy.  now for my last  let me look back a moment  the slower fainter ticking of the clock is in me  exit  nightfall  and soon the heart thud stopping.  long have we lived  joy'd  caress'd together  delightful!  now separation  good bye my fancy.  yet let me not be too hasty  long indeed have we lived  slept  filter'd  become really blended     into one  then if we die we die together   yes  we'll remain one   if we go anywhere we'll go together to meet what happens  may be we'll be better off and blither  and learn something  may be it is yourself now really ushering me to the true songs   who     knows?  may be it is you the mortal knob really undoing 

In [12]:
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(''.join(text))                  #Get the sentences
    return sentences     

In [13]:
sentences = tokenize_sentences(combined_text) 
print(len(sentences))    
print(sentences[2])  
print(sentences[57649])

96282
her mother had died too long ago for her to have more than an indistinct remembrance of her caresses  and her place had been supplied by an excellent woman as governess  who had fallen little short of a mother in affection.
i hardly understand you   replied the scientist  with a cold intensity of manner.


In [14]:
def tokenize_words(text):
    words = nltk.word_tokenize(text)
    return words

In [15]:
words = tokenize_words(combined_text)
print(len(words))
print(words[3])
print(words[-2])

2235498
austen
fancy


In [16]:
words_list = []                                    #list of all the words of sentences
for f in sentences:
    words_list.append(tokenize_words(f))                   

In [17]:
len(words_list)

96282

In [18]:
for word in words_list[1]:                     # all the words of the second sentence
    print(word) 

she
was
the
youngest
of
the
two
daughters
of
a
most
affectionate
indulgent
father
and
had
in
consequence
of
her
sister
's
marriage
been
mistress
of
his
house
from
a
very
early
period
.


In [19]:
import random
import math


random.shuffle(words_list)
train_len = math.floor(0.6 * len(words_list))                      #Training set length(60%)
dev_len = math.floor(0.2 * len(words_list))                        #Development set length (20%)
test_len = math.floor(0.2 * len(words_list))                       #Test set length (20%)

training_set = []
development_set = []
test_set = []

for content in words_list[0:train_len]:
    training_set.append(content)
    
for content in words_list[train_len: train_len + dev_len]:
    development_set.append(content)

for content in words_list[train_len + dev_len:]:
    test_set.append(content)

In [20]:
import itertools
from collections import Counter

from nltk.util import ngrams


def _calc_ngrams(all_corpus: list[str], ngram: int) -> Counter:
    """
    Process a tokenized sentence into a list of ngrams.
    :param all_corpus: a list of all the corpus words
    :param ngram: whether the ngrams will be unigrams, bigrams etc
    :return: the counter of either unigram, bigram or trigram
    """
    unigram_counter = Counter()
    bigram_counter = Counter()
    trigram_counter = Counter()
     
    

    if ngram == 1 :
        for sentence in all_corpus:
             grams = [gram for gram in ngrams(sentence, ngram, pad_left=True, pad_right=True,
                                    left_pad_symbol=START_TOKEN, right_pad_symbol=END_TOKEN)]
             unigram_counter.update(grams)
        return unigram_counter
        
    elif ngram == 2:
        for sentence in all_corpus:
             grams = [gram for gram in ngrams(sentence, ngram, pad_left=True, pad_right=True,
                                    left_pad_symbol=START_TOKEN, right_pad_symbol=END_TOKEN)]
             bigram_counter.update(grams)
        return bigram_counter
        
    elif ngram == 3:
        for sentence in all_corpus:
             grams = [gram for gram in ngrams(sentence, ngram, pad_left=True, pad_right=True,
                                    left_pad_symbol=START_TOKEN, right_pad_symbol=END_TOKEN)]
             trigram_counter.update(grams)
        return trigram_counter
        
    return 0

In [21]:
def replace_OOV_words_train(all_corpus):
    unigram_counter = _calc_ngrams(all_corpus,1)
    OOV_words = {}

    for k, v in unigram_counter.items():
        if v < 10:
            key = k[0]
            OOV_words[key] = "UNK"                 #set the word to "UNK"

    replaced_corpus = []                          #the original corpus having the OOV words replaced by 'UNK'
    for sentence in all_corpus:
        clean_sentence = []
    
        for word in sentence:
            clean_sentence.append(OOV_words.get(word, word))
    
        replaced_corpus.append(clean_sentence)


    vocabulary = []

    for key in unigram_counter.keys():        #Iterate the unigram counter
        word = key[0]                         #get the word
        if word not in OOV_words:
            vocabulary.append(word)

    vocabulary = set(vocabulary)              #Keep unique words
    return vocabulary, replaced_corpus, OOV_words
    

In [22]:
vocabulary, new_corpus, OOV_words = replace_OOV_words_train(training_set)

In [23]:
def replace_OOV_words_test(all_corpus, vocabulary, oov_words):
    
    replaced_corpus = []
    for sentence in all_corpus:
        updated_sent = []

        for word in sentence:
            if (word not in vocabulary) or (word in oov_words):
                updated_sent.append('UNK')
            else:
                updated_sent.append(word)
                
    replaced_corpus.append(updated_sent)
    return replaced_corpus   

In [24]:
development_set = replace_OOV_words_test(development_set, vocabulary, OOV_words)
test_set = replace_OOV_words_test(test_set, vocabulary, OOV_words)

In [39]:
vocab_len = len(vocabulary)
print ("Vocabulary length: ", vocab_len)

print("Unigram's 20 most common words:")
unigram_top_20 = _calc_ngrams(new_corpus, 1).most_common(20)
for gram in unigram_top_20:
    print(gram)

print("\n")
print("Bigram's 20 most common words:")
bigram_top_20 = _calc_ngrams(new_corpus,2).most_common(20)
for gram in bigram_top_20:
    print(gram)


print("\n")
print("Trigram's 20 most common words:")
trigram_top_20 = _calc_ngrams(new_corpus,3).most_common(20)
for gram in trigram_top_20:
    print(gram)

Vocabulary length:  7347
Unigram's 20 most common words:
(('the',), 79954)
(('UNK',), 68516)
(('and',), 57292)
(('.',), 46357)
(('of',), 42578)
(('to',), 28919)
(('a',), 20399)
(('in',), 20194)
(('i',), 18039)
(('that',), 17237)
(('he',), 15634)
(('it',), 13349)
(('his',), 12869)
(('for',), 11845)
(('was',), 11293)
(('not',), 10842)
(('with',), 10450)
(('is',), 9976)
(('you',), 9732)
(('be',), 9691)


Bigram's 20 most common words:
(('.', '<end>'), 46124)
(('of', 'the'), 11330)
(('the', 'UNK'), 8149)
(('<start>', 'and'), 8071)
(('UNK', 'and'), 6263)
(('?', '<end>'), 6100)
(('in', 'the'), 6092)
(('and', 'the'), 5368)
(('UNK', 'UNK'), 5365)
(('!', '<end>'), 4976)
(('UNK', '.'), 4945)
(('and', 'UNK'), 4693)
(('the', 'lord'), 4214)
(('UNK', 'of'), 4115)
(('<start>', 'i'), 3456)
(('of', 'UNK'), 3398)
(('UNK', 'the'), 3267)
(('to', 'the'), 3233)
(('<start>', 'the'), 3046)
(('a', 'UNK'), 2569)


Trigram's 20 most common words:
(('.', '<end>', '<end>'), 46124)
(('<start>', '<start>', 'and'), 8

(ii). First step: Tune α (alpha)

##  Calculate bi-gram probability

### $ P(w_2|w_1) = \frac{C(w_1,w_2) + \alpha}{C(w_1) + \alpha \cdot|V|} $

* $ C(w_1,w_2) $ : bigram count
* $ C(w_1) $ : unigram count
* $ 0 \leq\alpha \leq1 $ :  smoothing hyper-parameter
* |V|: vocabulary size

In [26]:
#Find Bigram and trigram probabilities using Laplace and a-smoothing
uni_counter = _calc_ngrams(training_set, 1)                     #Unigram counter
bi_counter = _calc_ngrams(training_set, 2)                      #Bigram counter
tri_counter = _calc_ngrams(training_set, 3)                     #Trigram counter
def prob_bigram_model(w0,w1,alpha,vocabulary, uni_counter, bi_counter): 
    c_w = bi_counter[w0,w1]
    c = uni_counter[w0]
    prob = (c_w + alpha) / (c + alpha * len(vocabulary))                  #probability of bigram 
    return prob

def prob_trigram_model(w0,w1,w2,alpha,vocabulary, bi_counter, tri_counter):
    c_w = tri_counter[w0,w1,w2]
    c = bi_counter[w0,w1]
    prob = (c_w + alpha) / (c + alpha * len(vocabulary))                  #probability of trigram
    return prob

## Bi-gram LM Cross entropy & perplexity

* $ CrossEntropy = -\frac{1}{N}\sum^{bigrams}{log_2(P(w_2|w_1))} $
 * N: Number of bigrams
* $ Perplexity = 2^{H(p)} $

In [27]:
HC = []
perplexity = []

min_entropy = 10000
min_index = 0
alpha_values = np.linspace(0.001, 1)
best_alpha_bigram = 0
for i, alpha in enumerate(alpha_values):
    sum_prob = 0
    bi_count = 0
    for sentence in development_set:
        sentence = ['<s>']  + sentence + ['<e>']
        for i in range(1, len(sentence)):
            bi_prob = prob_bigram_model(sentence[i - 1] , sentence[i], alpha, vocabulary, uni_counter, bi_counter)
            sum_prob += math.log2(bi_prob)
            bi_count +=1
        
    HC.append(-sum_prob / bi_count)
    perplexity.append(math.pow(2, -sum_prob / bi_count))
    if ((-sum_prob / bi_count) < min_entropy ):
        min_entropy = -sum_prob / bi_count
        min_index = i
        best_alpha_bigram = alpha

print("Best alpha after tuning: ", best_alpha_bigram)
print("Cross Entropy: {0:.3f}".format(HC[min_index]))
print("perplexity: {0:.3f}".format(perplexity[min_index]))

Best alpha after tuning:  0.001
Cross Entropy: 7.247
perplexity: 151.889


## Tri-gram LM Cross entropy & perplexity

### $ P(w_3|w_1,w_2) = \frac{C(w_1,w_2,w_3) + \alpha}{C(w_1,w_2) + \alpha \cdot |V|} $

* $ C(w_1,w_2,w_3) $ : trigram count
* $ C(w_1,w_2) $ : bigram count
* $ 0 \leq\alpha \leq1 $ :  smoothing hyper-parameter
* |V|: vocabulary size

In [28]:
HC = []
perplexity = []

min_entropy = 10000
min_index = 0
alpha_values = np.linspace(0.001, 1)
best_alpha_trigram = 0
for i, alpha in enumerate(alpha_values):
    sum_prob = 0
    tri_count = 0
    for sentence in development_set:
        sentence = ['<s>']  + sentence + ['<e>']
        for i in range(2, len(sentence)):
            tri_prob = prob_trigram_model(sentence[i - 2] , sentence[i - 1], sentence[i], alpha, vocabulary, bi_counter, tri_counter)
            sum_prob += math.log2(tri_prob)
            tri_count +=1
        
    HC.append(-sum_prob / tri_count)
    perplexity.append(math.pow(2, -sum_prob / tri_count))
    if ((-sum_prob / tri_count) < min_entropy ):
        min_entropy = -sum_prob / tri_count
        min_index = i
        best_alpha_trigram = alpha

print("Best alpha after tuning: ", round(best_alpha_trigram,3))
print("Cross Entropy: {0:.3f}".format(HC[min_index]))
print("perplexity: {0:.3f}".format(perplexity[min_index]))

Best alpha after tuning:  0.001
Cross Entropy: 10.163
perplexity: 1146.705


Now, let's test the performance in the test set, after having defined the optimal alpha.

1. Bigram Model

In [29]:
lang_sequence = []
for sentence in test_set:
    lang_sequence += ['<s>']  + sentence + ['<e>']
sum_prob = 0
bi_count = 0
for i in range(1, len(lang_sequence)):
    if lang_sequence[i] != '<s>' :
        bi_prob = prob_bigram_model(lang_sequence[i - 1] , lang_sequence[i], best_alpha_bigram, vocabulary, uni_counter, bi_counter)
        sum_prob += math.log2(bi_prob)
        bi_count +=1
            
HC = -sum_prob / bi_count
perplexity = math.pow(2, -sum_prob / bi_count)
    
print("Language Cross Entropy: {0:.3f}".format(HC))
print("Language perplexity: {0:.3f}".format(perplexity))

Language Cross Entropy: 1.062
Language perplexity: 2.088


2. Trigram model

In [30]:
lang_sequence = []
for sentence in test_set:
    lang_sequence += ['<s>']  + sentence + ['<e>']
sum_prob = 0
tri_count = 0
for i in range(2, len(lang_sequence)):
    if lang_sequence[i] != '<s>' :
        tri_prob = prob_trigram_model(lang_sequence[i - 2] ,lang_sequence[i - 1], lang_sequence[i], best_alpha_trigram, vocabulary, bi_counter, tri_counter)
        sum_prob += math.log2(tri_prob)
        tri_count +=1
            
HC = -sum_prob / tri_count
perplexity = math.pow(2, -sum_prob / tri_count)
    
print("Language Cross Entropy: {0:.3f}".format(HC))
print("Language perplexity: {0:.3f}".format(perplexity))

Language Cross Entropy: 9.795
Language perplexity: 888.076


In [31]:
max_depth = 5
beam_width = 3

v. Create a fake dataset

In [32]:
import random

def corrupt_sentence(sentence, probability):
    corrupted_sentence = ""
    for char in sentence:
        if char != ' ' and random.random() < probability:
            
            corrupted_sentence += get_similar_char(char)                 #replace with a similar character
        else:
            corrupted_sentence += char
    return corrupted_sentence

def get_similar_char(char):
    
    similar_chars = {
        'a': 'e',
        'b': 'p',
        'c': 's',
        'd': 'b',
        'e': 'a',
        'f': 't',
        'g': 'j',
        'h': 'n',
        'i': 'l',
        'j': 'g',
        'k': 'x',
        'l': 'i',
        'm': 'n',
        'n': 'm',
        'o': 'u',
        'p': 'b',
        'q': 'g',
        'r': 't',
        's': 'c',
        't': 'f',
        'u': 'o',
        'v': 'w',
        'w': 'v',
        'x': 'k',
        'y': 'v',
        'z': 's',
    }

    
    return similar_chars.get(char, char)                            #return a randomly chosen character

'''
test_corpus = ["he plays football",
               "he plais footbal",
               "she enjoys good football",
               "she plays good music",
               "he prays to god",
               "please buy me the other ball",
               "he pleases the other players by playing good football",
               "he plys god ftball"]

'''
probability = 0.1                                         #probability of character replacement


corrupted_corpus = [corrupt_sentence(sentence, probability) for sentence in sentences[1:50]]          #generate the corrupted corpus


for original, corrupted in zip(sentences[1:50], corrupted_corpus):
    print(f"Original: {original}")
    print(f"Corrupted: {corrupted}")
    print("\n---\n")


Original: she was the youngest of the two daughters of a most affectionate  indulgent father  and had  in consequence of her sister's marriage  been mistress of his house from a very early period.
Corrupted: she was the yuungest of tne two deughters of a most affectlunate  lnbulgenf father  and had  in consequence uf her sictar's marriage  been mistress of hls house from a vary early period.

---

Original: her mother had died too long ago for her to have more than an indistinct remembrance of her caresses  and her place had been supplied by an excellent woman as governess  who had fallen little short of a mother in affection.
Corrupted: het mothet had diad tuo long ago for her to have mota then an indistinct remembrance of het caresces  and her plase had been sopblled by an ekcellenf woman as governass  who hab fallen little shotf of a mother ln affection.

---

Original: sixteen years had miss taylor been in mr. woodhouse's family  less as a governess than a friend  very fond of both

In [33]:
from nltk.tokenize import TweetTokenizer


tweet_wt = TweetTokenizer()
tokenized = [tweet_wt.tokenize(sentence) for sentence in sentences[1:50]]   #Get the first 50 sentences

model = BigramModel(alpha=0.01)
model.fit(tokenized)                                # model is fitted with the correct and tokenized words

corrupted_tokenized = [tweet_wt.tokenize(sentence) for sentence in corrupted_corpus]             # tokenize the corrupted sentences

In [34]:
corrected = []
corrector = BigramSpellCorrector(model, lamda1=0.5, lamda2=-0.5)
for sent in corrupted_tokenized:
  output_seq = corrector.spell_correct(original_tokenized_sentence=sent, max_depth = 5, beam_width = 3)  #give the corrupt sentences to spell correct
  corrected.append(output_seq)
    
print('Original sentences:', sentences[1:4])
print('\n')
print('Corrupted(wrong) sentences:', corrupted_corpus[:3])
print('\n')
print('Final result (corrected sentences):', corrected[:3])

Original sentences: ["she was the youngest of the two daughters of a most affectionate  indulgent father  and had  in consequence of her sister's marriage  been mistress of his house from a very early period.", 'her mother had died too long ago for her to have more than an indistinct remembrance of her caresses  and her place had been supplied by an excellent woman as governess  who had fallen little short of a mother in affection.', "sixteen years had miss taylor been in mr. woodhouse's family  less as a governess than a friend  very fond of both daughters  but particularly of emma."]


Corrupted(wrong) sentences: ["she was the yuungest of tne two deughters of a most affectlunate  lnbulgenf father  and had  in consequence uf her sictar's marriage  been mistress of hls house from a vary early period.", 'het mothet had diad tuo long ago for her to have mota then an indistinct remembrance of het caresces  and her plase had been sopblled by an ekcellenf woman as governass  who hab fallen 

In [35]:
from jiwer import cer, wer


count = 0
sum_cer = 0
sum_wer = 0

In [36]:
for corrected_sentence in corrected:
    for index in range(len(corrected_sentence)):
        token_k = START_TOKEN if index == 0 else tokenized[i][index - 1]
        token_j = corrected_sentence[index]
        sum_cer += cer(token_k, token_j)
        sum_wer += wer(token_k, token_j)
        count += 1
        
avg_cer = sum_cer/count
avg_wer = sum_wer/count

print(f'Avg cer = {avg_cer}')
print(f'Avg wer = {avg_wer}')

Avg cer = 0.9166666666666666
Avg wer = 0.6666666666666666
