In [1]:
import os
from utils import download, read_data, tokenize_sentence, batch_tokenize_sentences
        
download()

Already downloaded!


In [2]:
data = read_data()
data[:10]

['First Citizen:',
 'Before we proceed any further, hear me speak.',
 'All:',
 'Speak, speak.',
 'First Citizen:',
 'You are all resolved rather to die than to famish?',
 'All:',
 'Resolved. resolved.',
 'First Citizen:',
 'First, you know Caius Marcius is chief enemy to the people.']

In [3]:
tokenized_sents = batch_tokenize_sentences(data)
tokenized_sents[:1]

[['First', 'Citizen:']]

In [4]:
len(tokenized_sents)

32777

In [5]:
from tqdm.auto import tqdm

def create_vocab(tokenized_sentences=tokenized_sents):
    vocabulary = dict() # word to count mapping
    
    for _, sentence in tqdm(enumerate(tokenized_sentences), total=len(tokenized_sentences)):
        for token in sentence:
            if token in vocabulary.keys():
                vocabulary[token] += 1.0
            else:
                vocabulary[token] = 0.0
                

    vocabulary["[OOV]"] = 0.0
    return vocabulary
                
                
vocabulary = create_vocab()

  0%|          | 0/32777 [00:00<?, ?it/s]

In [6]:
vocab_size = len(list(vocabulary.keys()))
vocab_size

25672

In [7]:
import numpy as np
from functools import reduce

def get_total_word_count(vocabulary):
    total = reduce(lambda start, values: start + np.sum(values), vocabulary.values(), 0)
    return total

total_tokens = get_total_word_count(vocabulary)
print(total_tokens)

176998.0


In [8]:
def unigram_probabilities(vocabulary, total_tokens, vocab_size, smoothing="laplace"):
    probabilities = dict() # unigram -> probability
    for k, v in vocabulary.items():
        if smoothing:
            probabilities[k] = (v + 1) / (float(total_tokens) + vocab_size)
        
    return probabilities


unigram_probs = unigram_probabilities(vocabulary, total_tokens, vocab_size)

In [9]:
unigram_probs["[OOV]"]

4.934129372872157e-06

In [10]:
def get_sent_probs(sentence, unigram_probs=unigram_probs) -> np.ndarray:
    if not isinstance(sentence, list):
        # tokenize
        tokens = tokenize_sentence(sentence)
    else:
        tokens = sentence
        
    
    sentence_probs = []
    for tok in tokens:
        if tok in unigram_probs.keys():
            sentence_probs.append(unigram_probs[tok])
        else:
            sentence_probs.append(unigram_probs["[OOV]"])
    
    return np.array(sentence_probs)

In [11]:
def perplexity(prob):
    product_prob = np.prod(prob)
    return np.power(product_prob, (-1 / prob.shape[0]))


In [12]:
test_inputs = [
    "First Citizen:",
    "Manners maketh a man.",
    "Where's the ghost, Othello?"
]

def evaluate(test_inputs=test_inputs) -> None:
    for _, ti in tqdm(enumerate(test_inputs), total=len(test_inputs)):
        probs = get_sent_probs(ti)
        p = perplexity(probs)
        
        print(f"Input : {ti}\nPerplexity : {p}\nProbabilities : {probs}\n")
        
evaluate(test_inputs)

  0%|          | 0/3 [00:00<?, ?it/s]

Input : First Citizen:
Perplexity : 1335.4957137943866
Probabilities : [0.00115952 0.00048354]

Input : Manners maketh a man.
Perplexity : 12557.003163608966
Probabilities : [4.93412937e-06 4.93412937e-06 1.28780777e-02 1.28287364e-04]

Input : Where's the ghost, Othello?
Perplexity : 9271.229071152722
Probabilities : [6.90778112e-05 2.68268614e-02 1.48023881e-05 4.93412937e-06]



In [13]:
from scipy.special import softmax

def generate(n_tokens, unigram_probs=unigram_probs, trials=50):
    words = list()
    probs = list()
    
    for k, v in unigram_probs.items():
        words.append(k)
        probs.append(v)
        
        
    assert len(words) == len(probs)
    
    dist = softmax(probs, axis=-1)
    logits = np.random.multinomial(trials, dist, n_tokens)
    
    logits = np.argmax(logits, axis=-1)
    
    indexes = logits.tolist()
    
    
    out  = " ".join(words[i] for i in indexes)
    return out

In [14]:
for _ in range(20):
    print(generate(15))
    print()

well send pikes, arms, instruction. There famously, Citizen: Marcius? ever But, virtue. pot, Worthy It

they let to Rome:' attends Ye're thirst much said than envying They'll me,-- business!' patricians

gaze wholesome, their enough: run cracking 'Though else then partly did accusers, help. fortune knees

arms Speak, Marcius? minute all speak could backs appetite already. citizens, wall, less this had;

friends,'--this three-and-twenty, shop would words shadow ne'er countrymen, 'tis heart, grave surplus, nor dog revenge.

clubs? folly. granted cannot, upon; you, sufferance are you Very yet then, once misery, finds

Nay, former sufferance the piercing already. All: so! ridiculous Whereby about shall to this rates;

prevail'd burn'd us, hares; commonalty. Some cared madam. XI: We For, puts very to barren

their sufferance must misery, an it stand might proceeds accusations; things country lord o' neighbours,

vile Rome:' Citizen: think rather well: did rates; altitude treads 'Though yiel