In [1]:
import nltk
from nltk.lm import Laplace, MLE, StupidBackoff
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends

from collections import Counter

# Exercise 1 - Ngrams

In [65]:
sentence = """To Sherlock Holmes she is always ‘The Woman’. I have
seldom heard him mention her under any other name."""

In [66]:
# Tokenizing the sentence
sentence_token = nltk.word_tokenize(sentence)

In [67]:
# Computing the bigrams
bin_grams  = list(nltk.ngrams(sentence_token, 2))
bin_grams

[('To', 'Sherlock'),
 ('Sherlock', 'Holmes'),
 ('Holmes', 'she'),
 ('she', 'is'),
 ('is', 'always'),
 ('always', '‘'),
 ('‘', 'The'),
 ('The', 'Woman'),
 ('Woman', '’'),
 ('’', '.'),
 ('.', 'I'),
 ('I', 'have'),
 ('have', 'seldom'),
 ('seldom', 'heard'),
 ('heard', 'him'),
 ('him', 'mention'),
 ('mention', 'her'),
 ('her', 'under'),
 ('under', 'any'),
 ('any', 'other'),
 ('other', 'name'),
 ('name', '.')]

In [68]:
# Computing the trigrams
tri_grams = list(nltk.ngrams(sentence_token, 3))
tri_grams

[('To', 'Sherlock', 'Holmes'),
 ('Sherlock', 'Holmes', 'she'),
 ('Holmes', 'she', 'is'),
 ('she', 'is', 'always'),
 ('is', 'always', '‘'),
 ('always', '‘', 'The'),
 ('‘', 'The', 'Woman'),
 ('The', 'Woman', '’'),
 ('Woman', '’', '.'),
 ('’', '.', 'I'),
 ('.', 'I', 'have'),
 ('I', 'have', 'seldom'),
 ('have', 'seldom', 'heard'),
 ('seldom', 'heard', 'him'),
 ('heard', 'him', 'mention'),
 ('him', 'mention', 'her'),
 ('mention', 'her', 'under'),
 ('her', 'under', 'any'),
 ('under', 'any', 'other'),
 ('any', 'other', 'name'),
 ('other', 'name', '.')]

In [10]:
# Remove special tokens
import string
print("---- Tokens that we remove ----\n", (set(string.punctuation) | set(["'", '”', '“', "’", "‘"])))
sentence_token = [word for word in sentence_token if word not in (set(string.punctuation) | set(["'", '”', '“', "’", "‘"]))]
print("\n", " ".join(sentence_token))

---- Tokens that we remove ----
 {'(', '?', '$', '=', ';', '!', '{', '>', '"', '}', ')', '#', "'", ',', ':', '-', '_', '`', '|', '~', '&', '”', '\\', '+', '[', '*', '^', '%', '/', '.', '’', '<', ']', '@', '‘', '“'}

 To Sherlock Holmes she is always The Woman. I have seldom heard him mention her under any other name


In [11]:
# Computing the bigrams
bigrams = list(nltk.ngrams(sentence_token, 2))
# The frequency of each bigram
bigrams_count = Counter(bigrams)
print(bigrams_count.most_common(10))

[(('To', 'Sherlock'), 1), (('Sherlock', 'Holmes'), 1), (('Holmes', 'she'), 1), (('she', 'is'), 1), (('is', 'always'), 1), (('always', 'The'), 1), (('The', 'Woman.'), 1), (('Woman.', 'I'), 1), (('I', 'have'), 1), (('have', 'seldom'), 1)]


In [12]:
with open('moby_dick.txt', 'r') as f:
    moby = f.read()

def get_n_gram_statistics(n_gram:int = 2):
    moby_token = nltk.word_tokenize(moby)
    moby_token = [word for word in moby_token if word not in (set(string.punctuation) | set(["'", '”', '“', "’", "‘"]))]
    ngrams = list(nltk.ngrams(moby_token, n_gram))
    ngrams_count = Counter(ngrams)
    return ngrams_count.most_common(10)

print("The 10 most common 2-gram are: \n", get_n_gram_statistics(2))
print("The 10 most common 3-gram are: \n", get_n_gram_statistics(3))
print("The 10 most common 4-gram are: \n", get_n_gram_statistics(4))

The 10 most common 2-gram are: 
 [(('of', 'the'), 1895), (('in', 'the'), 1139), (('to', 'the'), 733), (('from', 'the'), 437), (('of', 'his'), 371), (('and', 'the'), 365), (('of', 'a'), 337), (('on', 'the'), 337), (('the', 'whale'), 325), (('with', 'the'), 325)]
The 10 most common 3-gram are: 
 [(('of', 'the', 'whale'), 88), (('the', 'Sperm', 'Whale'), 77), (('the', 'White', 'Whale'), 62), (('one', 'of', 'the'), 61), (('of', 'the', 'sea'), 57), (('out', 'of', 'the'), 54), (('part', 'of', 'the'), 53), (('the', 'ship', 's'), 52), (('a', 'sort', 'of'), 49), (('the', 'whale', 's'), 48)]
The 10 most common 4-gram are: 
 [(('of', 'the', 'Sperm', 'Whale'), 30), (('at', 'the', 'same', 'time'), 20), (('of', 'the', 'whale', 's'), 17), (('the', 'bottom', 'of', 'the'), 17), (('the', 'Sperm', 'Whale', 's'), 17), (('the', 'old', 'man', 's'), 15), (('Project', 'Gutenberg', 'Literary', 'Archive'), 13), (('in', 'the', 'act', 'of'), 12), (('Project', 'Gutenberg-tm', 'electronic', 'works'), 12), (('Gutenb

# Exercise 2 - Ngrams

#apuntar paraules predites i comparar amb reals, casos bons i dolents

In [14]:
# Loading Sherlock holmes book
with open('Adventures_Holmes.txt', 'r') as f:
    lines = f.readlines()


# Optaining the first paragraph for test data
lines = lines[28:]
test_paragraph = []
new_paragraph = []
for i in range(len(lines)- 1):
    new_paragraph.append(lines[i][:-1])
    if lines[i][-1] == '\n' and lines[i + 1] == '\n':
        test_paragraph.append(" ".join(new_paragraph))
        print("Line num: ", i)
        break

Line num:  16


In [105]:
n_grams_size = 4
# Prepare the training data

# Join all the lines together
train_data = " ".join(lines[16:])

# Separate the data into sentences
train_sentences = nltk.sent_tokenize(train_data)
print("The first sentence of the training data: \n", train_sentences[0])

# Tokenize the sentences
train_tokens = [nltk.word_tokenize(sentence) for sentence in train_sentences]

# Adding padding to the tokens and then computing the n-grams and the vocabulary
train, vocab = padded_everygram_pipeline(n_grams_size, train_tokens)

The first sentence of the training data: 
 Adler, of dubious and questionable memory.


In [106]:
# Create the lenguage model and train it
model = StupidBackoff(order=n_grams_size, alpha=0.4)
model.fit(train, vocabulary_text=vocab)

In [107]:
list_vocab = list(model.vocab)
print("The vocabulary size is: ", len(list_vocab))

The vocabulary size is:  10006


In [108]:
# Separate test paragraph into sentences
test_sentences = nltk.sent_tokenize(test_paragraph[0])

# Tokenizing each sentence in the test data and adding padding
test_tokens = [pad_both_ends(nltk.word_tokenize(sentence), n=n_grams_size) for sentence in test_sentences]

# Generating n-grams for each sentence in the test data
test_n_grams = [list(nltk.ngrams(sentence, n_grams_size)) for sentence in test_tokens]

# Flatten the list of n-grams
test_n_grams_2 = []
for sentence in test_n_grams:
        test_n_grams_2 += sentence
test_n_grams = test_n_grams_2

# Removing n-grams that contain '</s>' as we don't want to test the prediction of the end of the sentence
test_n_grams_2 = []
for n_gram in test_n_grams:
        if '</s>' not in n_gram:
                test_n_grams_2.append(n_gram)
test_n_grams = test_n_grams_2


#### Picking the most probable word

In [109]:
# Remove special tokens from the vocabulary list
if "<s>" in list_vocab:
    list_vocab.remove("<s>")
if "</s>" in list_vocab:
    list_vocab.remove("</s>")

correct = []
wrong = []
correct_count = 0
correct_count_top3 = 0
total_count = 0
all_predicted = []
# Prediting the next word for each n-gram in the test data
for n_gram in test_n_grams:
    # print(n_gram)
    # print(n_gram[:-1])

    # Iterating over the vocabulary to find the word with the highest probability
    # We also store the 3 words with the highest probability
    dict_prob = {}
    for word in list_vocab:
        prob = model.unmasked_score(word, context=n_gram[:-1])
        dict_prob[word] = prob
    
    top_3 = sorted(dict_prob, key=dict_prob.get, reverse=True)[:3]
    predicted_word = top_3[0]
    # print("The predicted word is: ", predicted_word)
    # print("The top 3 words are: ", top_3)
    all_predicted.append(predicted_word)
    if predicted_word == n_gram[-1]:
        correct.append(n_gram)
        correct_count += 1
    if n_gram[-1] in top_3:
        correct_count_top3 += 1
    else:
        wrong.append((n_gram, predicted_word))
    total_count += 1

print("The accuracy is: ", correct_count/total_count)
print("The top 3 accuracy is: ", correct_count_top3/total_count)

The accuracy is:  0.1608695652173913
The top 3 accuracy is:  0.26956521739130435


In [110]:
print("Some of the correct predictions are: \n", correct[:10])

Some of the correct predictions are: 
 [('<s>', 'To', 'Sherlock', 'Holmes'), ('is', 'always', '_the_', 'woman'), ('always', '_the_', 'woman', '.'), ('<s>', '<s>', 'I', 'have'), ('her', 'under', 'any', 'other'), ('<s>', '<s>', 'It', 'was'), ('any', 'emotion', 'akin', 'to'), ('love', 'for', 'Irene', 'Adler'), ('<s>', 'All', 'emotions', ','), ('All', 'emotions', ',', 'and')]


In [111]:
print("Some of the wrong predictions are:")
for i in range(10):
    n_gram, predicted_word = wrong[i]
    print("The n-gram is: ", n_gram)
    print("The word predicted: ", predicted_word)

Some of the wrong predictions are:
The n-gram is:  ('<s>', '<s>', '<s>', 'To')
The word predicted:  “
The n-gram is:  ('<s>', '<s>', 'To', 'Sherlock')
The word predicted:  me
The n-gram is:  ('To', 'Sherlock', 'Holmes', 'she')
The word predicted:  ,
The n-gram is:  ('Sherlock', 'Holmes', 'she', 'is')
The word predicted:  bade
The n-gram is:  ('Holmes', 'she', 'is', 'always')
The word predicted:  not
The n-gram is:  ('she', 'is', 'always', '_the_')
The word predicted:  of
The n-gram is:  ('<s>', 'I', 'have', 'seldom')
The word predicted:  no
The n-gram is:  ('I', 'have', 'seldom', 'heard')
The word predicted:  seen
The n-gram is:  ('have', 'seldom', 'heard', 'him')
The word predicted:  of
The n-gram is:  ('seldom', 'heard', 'him', 'mention')
The word predicted:  do


#### Ground Truth

In [117]:
text = test_paragraph[0]

printable_text = ""
for i in range(len(text)):
    if i % 82 == 0:
        printable_text += '\n'
    printable_text += text[i]

print(printable_text)


To Sherlock Holmes she is always _the_ woman. I have seldom heard him mention her 
under any other name. In his eyes she eclipses and predominates the whole of her s
ex. It was not that he felt any emotion akin to love for Irene Adler. All emotions
, and that one particularly, were abhorrent to his cold, precise but admirably bal
anced mind. He was, I take it, the most perfect reasoning and observing machine th
at the world has seen, but as a lover he would have placed himself in a false posi
tion. He never spoke of the softer passions, save with a gibe and a sneer. They we
re admirable things for the observer—excellent for drawing the veil from men’s mot
ives and actions. But for the trained reasoner to admit such intrusions into his o
wn delicate and finely adjusted temperament was to introduce a distracting factor 
which might throw a doubt upon all his mental results. Grit in a sensitive instrum
ent, or a crack in one of his own high-power lenses, would not be more disturbing 
tha

#### Predicted text with 4-grams

In [121]:
pred_text = " ".join(all_predicted).replace(" ,", ",").replace(" .", ".")
printable_text = ""
for i in range(len(pred_text)):
    if i % 80 == 0:
        printable_text += '\n'
    printable_text += pred_text[i]

print(printable_text)


“ me Holmes, bade not of woman. “ have no seen of do of, the other work, “ the s
ingular. had, I,, house that own, “ was a the of had that other. to bad him. the
 Adler, “ the, and I the of to and sufficient, the feet, Mr. as I done,, “ was a
 as have it, then leather extraordinary happiness, extraordinary the and I door.
 been very and I I rule ; had not been before to the perplexing alarm in “ was d
id, the nervous,, and that a very, I few. “ were all queen which some best, the 
of,, the, s heads were I. “ I the best as, the that a, the chair, for I, that, a
 be you little, in I have up little as that that strength,, “, the few,, and the
 means, the of the keen,,, and you have very than, the precious nature. the few 
of a I client “ now I was an one way, bear, and that the ’ standing name Ezekiah
 Adler, as dubious and questionable memory.
