In [22]:
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize

In [23]:
# Set path to the input dataset
filePath = open("./archive/Harry_Potter_all_books_preprocessed.txt")
inputText = filePath.read()

In [24]:
# Take the first 10,000 words from the input text
spaceCount = 0
totalWordsRequired = 10000
N = 0
while spaceCount < totalWordsRequired:
    if inputText[N] == ' ':
        spaceCount += 1
    N += 1

In [25]:
text = inputText[:N]
print(f'Number of characters in first {totalWordsRequired} words: {len(text)}')
print(f'Sample text: {text[:300]}')

Number of characters in first 10000 words: 53909
Sample text: THE BOY WHO LIVED Mr and Mrs Dursley of number four Privet Drive were proud to say that they were perfectly normal thank you very much .They were the last people youd expect to be involved in anything strange or mysterious because they just didnt hold with such nonsense .Mr Dursley was the director 


In [26]:
sentences_text = text.split(' .')
print(f'Number of sentences: {len(sentences_text)}')
print(f'Sample sentence: {sentences_text[35]}')

Number of sentences: 660
Sample sentence: yes that would be it


# Preprocessing

## Remove punctuations

In [27]:
nltk.download('punkt')
def remPunctuations(text):
    # table is a translation table for removing the punctuation marks from the words
    table = str.maketrans({key: None for key in string.punctuation})
    translated = text.translate(table)
    return translated

[nltk_data] Downloading package punkt to /home/taha_adeel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
punc_removed_sentences_text = []
for s in sentences_text:
    punc_removed_sentences_text.append(remPunctuations(s).lower())
print(f'Preprocessed sample sentence: {punc_removed_sentences_text[0]}')

Preprocessed sample sentence: the boy who lived mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal thank you very much


## Tokenize the text

In [29]:
from nltk.tokenize import word_tokenize

# Takes a string input and returns a list of tokens
def tokenize(text):
    return word_tokenize(text)

In [30]:
sentences_tokens = []
for s in punc_removed_sentences_text:
    sentences_tokens.append(tokenize(s))

print(f'Sample sentence tokens: {sentences_tokens[0]}')

Sample sentence tokens: ['the', 'boy', 'who', 'lived', 'mr', 'and', 'mrs', 'dursley', 'of', 'number', 'four', 'privet', 'drive', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', 'thank', 'you', 'very', 'much']


# Fitting bigram language models

In [31]:
from nltk.lm.preprocessing import padded_everygram_pipeline

ngram_order = 2
train_data, vocab_data = padded_everygram_pipeline(
                            ngram_order,
                            sentences_tokens)

## MLE model

In [32]:
from nltk.lm import MLE

mle_lm = MLE(ngram_order)
mle_lm.fit(train_data, vocab_data)

print(f'Vocabulary: {len(mle_lm.vocab)}')
print(f'Voabulary lookup sample: {mle_lm.vocab.lookup(sentences_tokens[2])}')

Vocabulary: 1905
Voabulary lookup sample: ('mr', 'dursley', 'was', 'the', 'director', 'of', 'a', 'firm', 'called', 'grunnings', 'which', 'made', 'drills')


### Example Outputs

In [33]:
predicted_sentence_1 = mle_lm.generate(20, text_seed=['harry', 'potter'])
predicted_sentence_1 = 'harry potter ' + ' '.join(predicted_sentence_1)

print(f'Predicted sentence for Harry Potter: \n{predicted_sentence_1}')

Predicted sentence for Harry Potter: 
harry potter was a grip on yourself should have to brazil here was in his bed </s> while dudley </s> going to


In [34]:
# mle_lm.generate(20, text_seed=['Dumbledore'])
predicted_sentence_2 = mle_lm.generate(20, text_seed=['dumbledore'])
predicted_sentence_2 = 'Dumbledore ' + ' '.join(predicted_sentence_2)

print(f'Predicted sentence for Dumbledore: \n{predicted_sentence_2}')

Predicted sentence for Dumbledore: 
Dumbledore bowed his last even worth just didnt seem to mention of privet drive </s> mancrushing pythons </s> bed and worried


## Kneser-Ney model

In [35]:
from nltk.lm.models import KneserNeyInterpolated

ngram_order = 2
train_data, vocab_data = padded_everygram_pipeline(
                            ngram_order,
                            sentences_tokens)

In [36]:
kn_lm = KneserNeyInterpolated(ngram_order)
kn_lm.fit(train_data, vocab_data)

print(f'Vocabulary: {len(kn_lm.vocab)}')

Vocabulary: 1905


### Example Outputs

In [37]:
predicted_sentence_1 = kn_lm.generate(20, text_seed=['harry', 'potter'])
predicted_sentence_1 = 'harry potter ' + ' '.join(predicted_sentence_1)

print(f'Predicted sentence for Harry Potter: \n{predicted_sentence_1}')

Predicted sentence for Harry Potter: 
harry potter day </s> whatever everyone knows youre here without it had said that when september came up their sleeping pattern </s>


In [38]:
predicted_sentence_2 = kn_lm.generate(20, text_seed=['dumbledore'])
predicted_sentence_2 = 'Dumbledore ' + ' '.join(predicted_sentence_2)

print(f'Predicted sentence for Dumbledore: \n{predicted_sentence_2}')

Predicted sentence for Dumbledore: 
Dumbledore bowed to be back down on the stairs was the bill and the dream before the kitchen the same heavy


# Beam Search

In [39]:
def top_k_words_generator(context, k, model=mle_lm):
    context = [context[-1]] # Coz bigrams
    score_word = [(model.score(word, context), word) for word in model.vocab]

    score_word.sort(reverse=True)
    return score_word[:k]

print(f'Top 5 words for context ["harry", "potter"]: {top_k_words_generator(context=["harry", "potter"], k=5, model=mle_lm)}')

Top 5 words for context ["harry", "potter"]: [(0.21428571428571427, 'the'), (0.14285714285714285, 'was'), (0.07142857142857142, 'who'), (0.07142857142857142, 'wasnt'), (0.07142857142857142, 'voldemorts')]


In [40]:
from collections import deque

def beam_search(context, k, max_depth, model, num_outputs, debug=False):
    q = deque()
    q.append([1.0, context, 1]) # [probability, context, depth]
    final_sentences = []
    while len(q) != 0:
        [prob, context, depth] = q.popleft()
        top_k_words = top_k_words_generator(context, k, model)
        for [word_prob, word] in top_k_words:
            new_context = context.copy()
            new_context.append(word)
            if depth < max_depth:
                if debug: 
                    print(f'Depth: {depth}, Context: {" ".join(new_context)}, p = {prob * word_prob:0.4f}')
                q.append([prob * word_prob, new_context, depth + 1])
            else:
                final_sentences.append([prob, new_context])

    final_sentences.sort(reverse=True)
    return final_sentences[:num_outputs]

### Example Outputs

In [41]:
predicted_sentences_1 = beam_search(["harry", "potter"], k=2, max_depth=10, model=mle_lm, num_outputs=5, debug=False)

print('Predicted sentences for context ["harry", "potter"]')
for i, [p, s] in enumerate(predicted_sentences_1):
    print(f'{i+1}) {" ".join(s)}  [{p=}]')

Predicted sentences for context ["harry", "potter"]
1) harry potter was a large pink beach ball wearing square glasses and  [p=4.052906145562826e-06]
2) harry potter was a large pink beach ball wearing square glasses </s>  [p=4.052906145562826e-06]
3) harry potter the dursleys had a large pink beach ball wearing square  [p=4.800762058886991e-07]
4) harry potter the dursleys had a large pink beach ball wearing a  [p=4.800762058886991e-07]
5) harry potter was a large pink beach ball wearing a large tawny  [p=3.699748348646563e-07]


In [42]:
predicted_sentences_2 = beam_search(["dumbledore"], k=2, max_depth=10, model=mle_lm, num_outputs=5, debug=False)

print('Predicted sentences for context ["dumbledore"]')
for i, [p, s] in enumerate(predicted_sentences_2):
    print(f'{i+1}) {" ".join(s)} [{p=}]')

Predicted sentences for context ["dumbledore"]
1) dumbledore you cant take him in his aunt petunia </s> ‘ [p=2.346560298042777e-08]
2) dumbledore you cant take him in his aunt petunia </s> zoo [p=2.346560298042777e-08]
3) dumbledore you cant take him in his aunt petunia had been [p=1.9554669150356473e-08]
4) dumbledore you cant take him in his aunt petunia had a [p=1.9554669150356473e-08]
5) dumbledore you know what looked like yourself should have been watching [p=1.9058354930107985e-08]
