In [1]:
import os
from utils import download, read_data, tokenize_sentence, batch_tokenize_sentences
        
download()

Already downloaded!


In [2]:
data = read_data()
data[:10]

['First Citizen:',
 'Before we proceed any further, hear me speak.',
 'All:',
 'Speak, speak.',
 'First Citizen:',
 'You are all resolved rather to die than to famish?',
 'All:',
 'Resolved. resolved.',
 'First Citizen:',
 'First, you know Caius Marcius is chief enemy to the people.']

In [3]:
tokenized_sents = batch_tokenize_sentences(data)
tokenized_sents[:1]

[['First', 'Citizen:']]

In [4]:
len(tokenized_sents)

32777

In [5]:
from tqdm.auto import tqdm

def create_vocab(tokenized_sentences=tokenized_sents):
    vocabulary = dict() # word to count mapping
    
    for _, sentence in tqdm(enumerate(tokenized_sentences), total=len(tokenized_sentences)):
        for token in sentence:
            if token in vocabulary.keys():
                vocabulary[token] += 1.0
            else:
                vocabulary[token] = 0.0
                

    vocabulary["[OOV]"] = 0.0
    return vocabulary
                
                
vocabulary = create_vocab()

  0%|          | 0/32777 [00:00<?, ?it/s]

In [6]:
vocab_size = len(list(vocabulary.keys()))
vocab_size

25672

In [7]:
import numpy as np
from functools import reduce

def get_total_word_count(vocabulary):
    total = reduce(lambda start, values: start + np.sum(values), vocabulary.values(), 0)
    return total

total_tokens = get_total_word_count(vocabulary)
print(total_tokens)

176998.0


In [8]:
from scipy.special import softmax

def unigram_probabilities(vocabulary, total_tokens, vocab_size, smoothing="laplace"):
    probabilities = dict() # unigram -> probability
    for k, v in vocabulary.items():
        if smoothing:
            probabilities[k] = (v + 1) / (float(total_tokens) + vocab_size)
        
    return probabilities


def normalize_probabilities(unigram_probs):
    words = list()
    probs = list()
    
    for k, v in unigram_probs.items():
        words.append(k)
        probs.append(v)
        
    normalised = softmax(probs, axis=-1).tolist()
    d = dict()
    for w, p in zip(words, normalised):
        d[w] = p
        
    return d
    
    

unigram_probs = unigram_probabilities(vocabulary, total_tokens, vocab_size)
unigram_probs = normalize_probabilities(unigram_probs)

In [9]:
unigram_probs["[OOV]"]

3.895161667619404e-05

In [10]:
def get_sent_probs(sentence, unigram_probs=unigram_probs) -> np.ndarray:
    if not isinstance(sentence, list):
        # tokenize
        tokens = tokenize_sentence(sentence)
    else:
        tokens = sentence
        
    
    sentence_probs = []
    for tok in tokens:
        if tok in unigram_probs.keys():
            sentence_probs.append(unigram_probs[tok])
        else:
            sentence_probs.append(unigram_probs["[OOV]"])
    
    return np.array(sentence_probs)

In [11]:
def perplexity(prob):
    product_prob = np.prod(prob)
    return np.power(product_prob, (-1 / prob.shape[0]))


In [12]:
test_inputs = [
    "First Citizen:",
    "Manners maketh a man.",
    "Where's the ghost, Othello?"
]

def evaluate(test_inputs=test_inputs) -> None:
    for _, ti in tqdm(enumerate(test_inputs), total=len(test_inputs)):
        probs = get_sent_probs(ti)
        p = perplexity(probs)
        
        print(f"Input : {ti}\nPerplexity : {p}\nProbabilities : {probs}\n")
        
evaluate(test_inputs)

  0%|          | 0/3 [00:00<?, ?it/s]

Input : First Citizen:
Perplexity : 25651.919488240488
Probabilities : [3.89966157e-05 3.89702638e-05]

Input : Manners maketh a man.
Perplexity : 25589.59636479137
Probabilities : [3.89516167e-05 3.89516167e-05 3.94562878e-05 3.89564218e-05]

Input : Where's the ghost, Othello?
Perplexity : 25500.830398334332
Probabilities : [3.89541153e-05 4.00105114e-05 3.89520011e-05 3.89516167e-05]



In [13]:
def generate(n_tokens, unigram_probs=unigram_probs, trials=50):
    words = list()
    probs = list()
    
    for k, v in unigram_probs.items():
        words.append(k)
        probs.append(v)
        
        
    assert len(words) == len(probs)
    
    logits = np.random.multinomial(trials, probs, n_tokens)
    
    logits = np.argmax(logits, axis=-1)
    
    indexes = logits.tolist()
    
    
    out  = " ".join(words[i] for i in indexes)
    return out

In [14]:
for _ in range(20):
    print(generate(15))
    print()

hire. And, honours. us, Under it; doors! than hunger may end: Thanks. even strike evil.

kind this They Marcius people Piercing Marcius altitude men be any price. Cominius' persuaded; court,

You rise, know your act sick, O' leg, First, yourselves. yet, live lady will; sir:

end: Come, Second lungs, strange might only moved, speak. am trumpeter. looked Will senate; then?

folly. country? sell time A citizens. came marriages; superfluity, veins bear curbs known citizens, tell

we with love of us first: people. send want answer. First thirst must is, moon.

did wives, surfeits aunt be if become know't, change seem'd, ere he's let lowly hand?

us! love revenge Strike he, statutes Till petty The general's appetite heaven needs matter? come.

body. them, you. bent: especially Second takes, that, unapt Citizen: was run, place idle sigh'd

Lartius, ne'er poison'd proud. know't. indeed fragments! citizens, hand? company bats grown Why, present, fathers

What thrives Above fit yourself remain Y