In [17]:
import os
from utils import download, read_data, tokenize_sentence, batch_tokenize_sentences
        
download()

Already downloaded!


In [18]:
data = read_data()
data[:10]

['First Citizen:',
 'Before we proceed any further, hear me speak.',
 'All:',
 'Speak, speak.',
 'First Citizen:',
 'You are all resolved rather to die than to famish?',
 'All:',
 'Resolved. resolved.',
 'First Citizen:',
 'First, you know Caius Marcius is chief enemy to the people.']

In [19]:
tokenized_sents = batch_tokenize_sentences(data)
tokenized_sents[:1]

[['First', 'Citizen:']]

In [20]:
len(tokenized_sents)

32777

In [21]:
from tqdm.auto import tqdm

def create_vocab(tokenized_sentences=tokenized_sents):
    vocabulary = dict() # word to count mapping
    
    for _, sentence in tqdm(enumerate(tokenized_sentences), total=len(tokenized_sentences)):
        for token in sentence:
            if token in vocabulary.keys():
                vocabulary[token] += 1.0
            else:
                vocabulary[token] = 0.0
                

    vocabulary["[OOV]"] = 0.0
    return vocabulary
                
                
vocabulary = create_vocab()

  0%|          | 0/32777 [00:00<?, ?it/s]

In [22]:
vocab_size = len(list(vocabulary.keys()))
vocab_size

25672

In [23]:
import numpy as np
from functools import reduce

def get_total_word_count(vocabulary):
    total = reduce(lambda start, values: start + np.sum(values), vocabulary.values(), 0)
    return total

total_tokens = get_total_word_count(vocabulary)
print(total_tokens)

176998.0


In [24]:
def unigram_probabilities(vocabulary, total_tokens, vocab_size, smoothing="laplace"):
    probabilities = dict() # unigram -> probability
    for k, v in vocabulary.items():
        if smoothing:
            probabilities[k] = (v + 1) / (float(total_tokens) + vocab_size)
        
    return probabilities


unigram_probs = unigram_probabilities(vocabulary, total_tokens, vocab_size)

In [25]:
unigram_probs["[OOV]"]

4.934129372872157e-06

In [26]:
def get_sent_probs(sentence, unigram_probs=unigram_probs) -> np.ndarray:
    if not isinstance(sentence, list):
        # tokenize
        tokens = tokenize_sentence(sentence)
    else:
        tokens = sentence
        
    
    sentence_probs = []
    for tok in tokens:
        if tok in unigram_probs.keys():
            sentence_probs.append(unigram_probs[tok])
        else:
            sentence_probs.append(unigram_probs["[OOV]"])
    
    return np.array(sentence_probs)

In [27]:
def perplexity(prob):
    product_prob = np.prod(prob)
    return np.power(product_prob, (-1 / prob.shape[0]))


In [28]:
test_inputs = [
    "First Citizen:",
    "Manners maketh a man.",
    "Where's the ghost, Othello?"
]

def evaluate(test_inputs=test_inputs) -> None:
    for _, ti in tqdm(enumerate(test_inputs), total=len(test_inputs)):
        probs = get_sent_probs(ti)
        p = perplexity(probs)
        
        print(f"Input : {ti}\nPerplexity : {p}\nProbabilities : {probs}\n")
        
evaluate(test_inputs)

  0%|          | 0/3 [00:00<?, ?it/s]

Input : First Citizen:
Perplexity : 1335.4957137943866
Probabilities : [0.00115952 0.00048354]

Input : Manners maketh a man.
Perplexity : 12557.003163608966
Probabilities : [4.93412937e-06 4.93412937e-06 1.28780777e-02 1.28287364e-04]

Input : Where's the ghost, Othello?
Perplexity : 9271.229071152722
Probabilities : [6.90778112e-05 2.68268614e-02 1.48023881e-05 4.93412937e-06]



In [29]:
from scipy.special import softmax

def generate(n_tokens, unigram_probs=unigram_probs, trials=50):
    words = list()
    probs = list()
    
    for k, v in unigram_probs.items():
        words.append(k)
        probs.append(v)
        
        
    assert len(words) == len(probs)
    
    dist = softmax(probs, axis=-1)
    logits = np.random.multinomial(trials, dist, n_tokens)
    
    logits = np.argmax(logits, axis=-1)
    
    indexes = logits.tolist()
    
    
    out  = " ".join(words[i] for i in indexes)
    return out



for _ in range(20):
    print(generate(15))
    print()

former devour out at Soft! Be-mock noble flour famish? Should hunt. highest! Not an-hungry; senate;

dismiss'd poorest, not--'Sdeath! please us, was Note matter, 'We giddy wholesome, wall, war the tauntingly

maliciously. maliciously. her maliciously. love minute own Against Against to Which rakes: profits, flatterers, mouths,

inkling Appear it's die account What! once, covetous. generosity, eat reputed comeliness Marcius, No bale.

about, us, store-houses were sprang inventory hear blood general SICINIUS: where cannot sun: more, a

Bolingbroke's answer'd: bear You mother virtue. work's, fob likely then? fight vantage. days one link

wholesome the troth, fought; What show senate, brain; need virtue. good. good us, All superfluity,

hereafter. proceeds folly. absence famously, guess in Lady food o'er, fob afflicts statutes kill awe,

chief help usest corn looked guard well. outweighs common, issue partly I'll darts, proud. appetite,

sink yield strongest you'll gaze rushes. sufferance 