In [1]:
def pad(tokens, n):
    """Takes an iterable of tokens and pads with sentence boundary symbols.
    
    Always adds sentence end symbols. 
    For unigram sequences, does not add sentence starts.
    
    Arguments
    ---------
    tokens : list, tuple, iterable
        sentence to be padded
    n : int
        the ngram order
    
    Returns
    -------
    tuple
        Input padded with sentence boundary symbols
    """
    start = "<s>"
    end = "</s>"    
    tokens = list(tokens)
    tokens = [start]*(n-1) + tokens + [end]
    # Always return tuples, we don't want to modify the input in-place.
    tokens = tuple(tokens)  
    # YOUR CODE HERE
    return tokens

In [2]:
def make_n_grams(tokens, n):
    """Takes in a tuple of tokens and forms n-grams
    
    Arguments
    ---------
    tokens : tuple
        Tokens to make ngrams from
    n : int
        The order of ngrams to make
    Returns
    -------
    list
        A list of tuples: all ngrams of the specified order.
    """
    if len(tokens) < n:
        print("N-grams order is too big")
        return []
    
    ngrams = []
    for i in range (len(tokens)-n+1):
        ngrams.append(tokens[i:i+n])
        
    return ngrams 

In [3]:
def allgrams_pipeline(data, max_n):
    """Produces ngrams of all orders up to max_n from data, with padding
    
    This uses the user defined pad() and make_n_grams() functions.
    It acts as an additional test for those.
    
    However, you must not change this. If there is some error, change 
    pad or make_n_grams instead."""
    for sentence in data:
        for n in range(1, max_n+1):
            padded = pad(sentence, n)
            yield from make_n_grams(padded, n)
    return

In [4]:
from collections import Counter, defaultdict

def get_counts(ngrams, max_n):
    """Counts ngrams in a dataset.
    
    Takes an iterable of ngrams, of variable order. Simply counts how many times each
    ngram is seen. The main idea is how the counts are organized.
    
    The input is an iterable, which might produce a stream such as:
    
    ('this',),
    ('is',), 
    ('the',), 
    ('first',), 
    ('sentence',),
    ('</s>',),    
    ('<s>', 'this'), 
    ('this', 'is'), 
    ('is', 'the'), 
    ('the', 'first'), 
    ('first', 'sentence'), 
    ('sentence', '</s>'),

    Note how the stream has a mix of unigrams and bigrams.

    The output is a triply nested dict.
    The first level is indexed by ngram order,
    the second level is indexed by the history,
    and the third level is indexed by the last token (the predicted token).
    Additionally, we recommend making the third level an extended type of dict: a Counter
    See https://docs.python.org/3/library/collections.html#collections.Counter
    Example of the output structure:
    {
        1: {
            (,): 
                Counter({
                    '<s>': 21,
                    'this': 43,
                    'most': 31,
                    'is': 50,
                })
        2: {
            ('<s>', ): 
                Counter({
                    'this': 21,
                }),
            ('the',):
                Counter({
                    'most': 31,
                    'least': 14,
                }),
        3: {
            ('<s>', 'this'): 
                Counter({
                    'is': 12,
                    'has': 8,
                    '</s>': 1,
                }),
            ('the', 'most'):
                Counter({
                    'beautiful': 8,
                    'intelligent': 10,
                    'funny': 3,
                }),
    }
    This structure is useful, since each history will also get its own 
    conditional probability distribution.
    Note that when n==1, the ngram history simply becomes 
    the empty tuple, (,). This is fine.
  
    Arguments
    ---------
    sentences : iterable (such as list)
        An iterable over ngrams.
    max_n : int
        The maximum ngram order.
        
    Returns
    -------
    dict
        Triply nested dict, from ngram order to n_gram history parts, 
        to a dictionary of all continuations and their counts, e.g.
        {2: {('a',): {'b': 3 'c': 4}}}
    
    """
    
    n_gram_dict = {order: defaultdict(Counter) for order in range(1,max_n+1)}
    # The line above creates the triply nested dict.
    # The second and third layers are special: defaultdict and Counter
    # See their documentation:
    # https://docs.python.org/3/library/collections.html#collections.defaultdict
    # https://docs.python.org/3/library/collections.html#collections.Counter    
    for ngram in iter(ngrams):
        order = len(ngram)
        if ngram[-1] not in n_gram_dict[order][ngram[:-1]].keys():
            n_gram_dict[order][ngram[:-1]][ngram[-1]] = 1
        else:
            n_gram_dict[order][ngram[:-1]][ngram[-1]] += 1
    
    # Lastly, make the defaultdicts into normal dicts, 
    # so that defaultdict doesn't bite us later (it can hide some bugs)        
    return {n: dict(counts) for n, counts in n_gram_dict.items()}

In [5]:
"""
This cell has a utility function, which you need to use down the line.
The function is already provided here because it is also needed for the
sanity checks in the visible tests for the next task.
"""

def logsumexp2(*logs):
    """Linear-scale addition in log-scale
    
    https://en.wikipedia.org/wiki/LogSumExp#log-sum-exp_trick_for_log-domain_calculations"""
    x_star = max(logs)
    return x_star + log2(sum(pow(2, x-x_star) for x in logs))

In [6]:
from math import log2
NEGINF = -float('inf')


# Look at logprob_abs_discount first, to understand the full picture.
# Then, start by implementing logprob_discounted
# It has its own tests below; see that you can pass them first.
# Next, implement log_interp_weight.
# It also has its own tests.
# Finally, fill in the missing parts in logprob_abs_discount

def logprob_discounted(counts, context, token, delta):
    """The discounted log probability
        
    Remember to discount to 0 at most, max(count-delta, 0).
    If discounted count becomes 0, the discounted log prob becomes -inf.
    And the same concerns as with logprob_mle apply.
    
    This is the left side of the sum in the probability equations
    (the log version of it).
    """
    n = len(context) + 1  # N-gram order
    token_count = counts[n][context][token]
    
    context_count = sum(counts[n][context].values())
    if (max(token_count-delta,0) == 0) or (context_count==0):
        return NEGINF
    else:        
        score = log2(max(token_count-delta,0)) - log2(context_count)
        return score
    
def log_interp_weight(counts, context, delta):
    """The interpolation weight, as determined by the discount.
    
    You will need to figure out the total sum of discount applied
    for this context.
    
    This is the lambda in the equations (log version of it).
    """
    n = len(context) + 1  # N-gram order
    discount_sum = 0
    
    for token in counts[n][context]:
        token_count = counts[n][context][token]
        discount_sum += token_count - max(token_count-delta,0)    
    
    context_count = sum(counts[n][context].values())
    if (discount_sum == 0) or (context_count==0):
        return NEGINF
    else:        
        lamda = log2(discount_sum) - log2(context_count)
        return lamda    
    

def logprob_abs_discount(counts, context, token, delta=0.2):
    """Produces smoothed estimate of log(P(token | context))
    
    Now we will use absolute discounting and interpolation to lower
    orders.
    
    There are four main challenges to compute here:
    1. The discounted count for the token
        - Remember to discount to 0 at most, 
          max(count-delta, 0)
        - If discounted count becomes 0, the discounted log prob becomes -inf.
          And the same concerns as with logprob_mle apply.
    2. The interpolation weight, as determined by the discount.
        - You will need to figure out the total sum of discount applied
          for this context.
    3. The log probability to interpolate with.
        - This is easy: use recursion. So just call:
          logprob_abs_discount(counts, context[1:], token, delta)
        - Unigrams are the special case: they interpolate with the uniform
          distribution P(x) = 1 / vocab size.
    4. Interpolation in the log domain.
        - So you can get log(P_delta(token|context)) and 
          log(P_interp(token|context[1:])) without problems. But then you need
          the logarithmic equivalent of a sum.
        - For that, use the logsumexp2 function defined above. 
    
    The ngram counts are as produced by get_counts, same format as 
    with logprob_mle.
    
    Arguments
    ---------
    counts : dict
        Triply nested dict as shown above.
    context : tuple
        The context to predict on as tuple, e.g. ('<s>',)
    token : str
        The token to predict.
    delta : float
        The value to discount by.
    """
    n = len(context) + 1  # N-gram order
    vocab = set(counts[1][tuple()])
    V = len(vocab)  # Vocabulary size
    
    # Check that word is in the intended vocabulary,
    # i.e. at least seen once in the data (as unigram).
    # If the word is never seen in the data, we cannot expect it.
    if token not in vocab:
        return NEGINF

    # Find an order where context has been seen:     
    if n not in counts or context not in counts[n]:
        if n == 1:
            raise ValueError("Invalid counts-dict, needs to have all lower order counts.")
        return logprob_abs_discount(counts, context[1:], token)
    
    # 1. Discounted prob (computed by separate function above)
    lp_discounted = logprob_discounted(counts, context, token, delta)
    
    # 2. Log interpolation weight (computed by separate function above):
    log_lambda = log_interp_weight(counts, context, delta)
    
    # 3. Log lower order probability:
    if n == 1:
        # Stopping recursion at the unigram level, by interpolating with
        # unigram distribution:
        lp_lower = - log2(V)
    else:  # Recursion
        lp_lower = logprob_abs_discount(counts, context[1:], token)
    
    # 4. Putting it all together:
    log_sum = [lp_discounted, log_lambda + lp_lower]
    result = logsumexp2(*log_sum)

    return result

In [7]:
def perplexity(test_data, model_counts, logprob_func, **lp_kwargs):
    """
    Computes perplexity on the given test data with the given language model
    (as specified by the counts and the logprob function).
    
    Arguments
    ---------
    test_data : list
        List of lists of tokenized sentences.
    model_counts : dict
        Triply nested dict of ngram counts, as returned by get_counts()
    logprob_func : function
        Function with signature (counts, context, token), which returns the
        log-probability of the token given the context.
    **lp_kwargs : kwargs
        Log prob key word arguments, passed to logprob_func
    
    Returns
    -------
    float
        The perplexity of the model on the test data.
    """
    max_n = max(model_counts.keys())
    total_log_prob = 0.
    num_tokens = 0
    for sentence in test_data:
        padded = pad(sentence, max_n)
        ngrams = make_n_grams(padded, max_n)
        for *context, token in ngrams:
            total_log_prob += logprob_func(model_counts, tuple(context), token, **lp_kwargs)
            num_tokens += 1
    ppl = pow(2, -total_log_prob / num_tokens)
    return ppl

In [8]:
import random

def generate_text(model_counts, logprob_func, seed_text=None, **lp_kwargs):
    """Generates text from an N-gram model.
    
    Arguments
    ---------
    model_counts : dict
        N-gram counts as returned by get_counts()
    logprob_func : callable
        Function with signature (counts, context, token), which returns the
        log-probability of the token given the context.
    seed_text : list, optional
        Text to start generating from. If None, will start from the
        appropriate amount of sentence-start symbols (N-1).
    **lp_kwargs : kwargs
        Log prob key word arguments, passed to logprob_func
        
    Returns
    -------
    tuple
        Sentence generated by model as a list of tokens. If
        seed_text was given, will include it. Padding is stripped.
    """
    max_n = max(model_counts.keys())
    vocab = list(model_counts[1][tuple()])
    if seed_text is None:
        seed_text = ('<s>',) * (max_n-1)
    end = '</s>'
    output = list(seed_text)
    while output[-1] != end and len(output) < 200:  # Also guard against infinite loops
        context = output[-max_n+1:] if max_n > 1 else []
        token_logprobs = [2**logprob_func(model_counts, tuple(context), token, **lp_kwargs) 
                          for token in vocab]
        next_part = random.choices(vocab, token_logprobs)
        output.extend(next_part)  # next_part is a list: [token]
    return tuple(token for token in output if token not in ['<s>', '</s>'])

In [9]:
def detokenize(seq, full_sentence=True):
    """A simple rule-based detokenizer for this assignment"""
    last_seen_start = full_sentence
    formatted_tokens = []
    for token in seq:
        if token == "<s>":
            last_seen_start = True
            continue
        if token == "</s>":
            continue
        if last_seen_start:
            token = token.capitalize()
            last_seen_start = False
        if token == "i":
            token = "I"
        if token in ".!?,;" or token == "'s":
            formatted_tokens.append(token)
        elif formatted_tokens:
            formatted_tokens.append(" " + token)
        else:
            formatted_tokens.append(token)
    return "".join(formatted_tokens)        

# Import our data here

In [10]:
import pickle
# Load result:
with open("data.txt", "rb") as fp:   # Unpickling
    sentences = pickle.load(fp)

print(sentences[22:35])

['We know you love Chewy.', "We know you're here.", "We know you know the Chewy-RyanCohen-GameStop connection, but it wasn't real enough for you yet.", "Well, I don't have to tell you, because you're not stupid, but I will anyway:  it's gotten really real enough for you now.", 'The Chewy executive triumvirate joining the GameStop board of directors is your signal, friend.', 'You may start pumping GME to your boomer audience.', 'Now.', "I don't believe reddit has been too kind to you in the past, but worry not, follow through with this and you'll have lots of friends here and we'll have your back forever.", 'Well, definitely not forever, but at least for a while.', 'What better time to start than today?', 'With love, brother.', 'P.S.', "- don't be afraid to use the rocket 🚀, it feels good."]


In [11]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm

janeausten_tokenized = []
for sentence in tqdm(sentences):
    
    # We split the sentences into a list to do the next step
    sentence_list = word_tokenize(sentence)    
    if sentence_list:        
        janeausten_tokenized.append(sentence_list) 

100%|████████████████████████████████████████████████████████████████████████| 967968/967968 [02:12<00:00, 7332.20it/s]


In [12]:
test_split_index = round(0.9 * len(janeausten_tokenized))
janeausten_train = janeausten_tokenized[:test_split_index]
janeausten_test = janeausten_tokenized[test_split_index:]

print("The training set has", len(janeausten_train), "sentences, and the test set", len(janeausten_test), "sentences")

The training set has 871171 sentences, and the test set 96797 sentences


In [61]:
from collections import Counter
import itertools
janeausten_unigram_counts = Counter(itertools.chain.from_iterable(janeausten_train)).most_common()

print("The 10 most common tokens:")
print("\n".join(word for word, freq in janeausten_unigram_counts[:10]))
print()
print("The 2490-2500 most common tokens:")
print("\n".join(word for word, freq in janeausten_unigram_counts[2490:2500]))


#######################################################################################
#######################################################################################
#######################################################################################
janeausten_vocab_filt = set(word for word, freq in janeausten_unigram_counts[:2500]) | {"</s>", "<unk>"}


The 10 most common tokens:
.
the
,
to
I
a
and
is
of
you

The 2490-2500 most common tokens:
Bill
offering
exp
manager
28
bond
GREEN
prior
pumps
club


In [62]:
def replace_oovs(vocab, data, unk="<unk>"):
    """Replace OOV words with unknown-token
    
    Arguments
    ---------
    vocab : set
        The set of tokens that are in-vocabulary.
        token not in vocab => token is out-of-vocabulary.
    data : list of iterables
        List of sentences, which are lists (or other iterables) of tokens.
    unk : str
        Token to replace tokens which are not in the vocabulary
    
    Returns
    -------
    list
        list of lists, (list of sentences in data, sentences are lists of tokens)
        The data with out-of-vocabulary tokens replaced with the unknown token.
        Does NOT modify in-place.
    
    """
    # NOTE: Do not modify input in-place.
    data_oovs_replaced = [list(sentence) for sentence in data]
    for i, sentence in enumerate(data):
        for j, token in enumerate(sentence):
            if token not in vocab:
                data_oovs_replaced[i][j] = unk
        
    return data_oovs_replaced

In [63]:
janeausten_train_filt = replace_oovs(janeausten_vocab_filt, janeausten_train)
janeausten_test_filt = replace_oovs(janeausten_vocab_filt, janeausten_test)

print("Let's see an example:")
print(detokenize(janeausten_train_filt[2000]))

Let's see an example:
This is a company that is being <unk> very well given the <unk>.


In [64]:
# Produce counts from the real world data:
janeausten_counts = get_counts(allgrams_pipeline(janeausten_train_filt, 5), 5)

In [65]:
class LIWCache:
    """Very simple cache for log_interp_weight for speeding up querys
    
    The log_interp_weight function gets called many times with the same
    arguments. The normal Python LRU Cache decorator however cannot handle
    the counts argument, as it is unhashable. 
    """
    def __init__(self, func):
        self.func = func
        self.cache = {}
        self._caching = False
        self.hits = 0
        self.misses = 0
        
    def __call__(self, counts, context, delta):
        if not self._caching:
            return self.func(counts, context, delta)
        key = (context, delta)
        if key not in self.cache:
            self.cache[key] = self.func(counts, context, delta)
            self.misses += 1
        else:
            self.hits += 1
        return self.cache[key]
        
    @property
    def caching(self):
        return self._caching

    @caching.setter
    def caching(self, value):
        self._caching = value
        if not value:
            self.cache = {}  # Empty
            self.hits = 0
            self.misses = 0

if not isinstance(log_interp_weight, LIWCache):
    log_interp_weight = LIWCache(log_interp_weight)

In [66]:
seed_text = ['it', 'is', 'a', 'truth', 'universally', 'acknowledged', ',', 'that',]

In [67]:
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(janeausten_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Sentences from a smoothed model:


 25%|█████████████████████                                                               | 1/4 [00:15<00:46, 15.63s/it]

It is a truth universally acknowledged, that kind of money on this like my own brother.


 50%|██████████████████████████████████████████                                          | 2/4 [00:28<00:27, 13.90s/it]

It is a truth universally acknowledged, that would cause me to crash the stock.


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [01:19<00:30, 30.75s/it]

It is a truth universally acknowledged, that's right .... * to the fucking retards on this earth that would make sense from a bunch of <unk> <unk> <unk>, <unk> <unk> ) and August <unk> on Monday, much love.


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:26<00:00, 21.75s/it]

It is a truth universally acknowledged, that means drop is coming.





In [68]:
for sentence in text_results:
    text = sentence.split()
    text[-1] = text[-1][:-1]
    print(text)
    print('---------------------------------------------------')    
    print('Score {}'.format(perplexity([text], janeausten_counts, logprob_abs_discount)))
    print('===================================================')

['It', 'is', 'a', 'truth', 'universally', 'acknowledged,', 'that', 'kind', 'of', 'money', 'on', 'this', 'like', 'my', 'own', 'brother']
---------------------------------------------------
Score inf
['It', 'is', 'a', 'truth', 'universally', 'acknowledged,', 'that', 'would', 'cause', 'me', 'to', 'crash', 'the', 'stock']
---------------------------------------------------
Score inf
['It', 'is', 'a', 'truth', 'universally', 'acknowledged,', "that's", 'right', '....', '*', 'to', 'the', 'fucking', 'retards', 'on', 'this', 'earth', 'that', 'would', 'make', 'sense', 'from', 'a', 'bunch', 'of', '<unk>', '<unk>', '<unk>,', '<unk>', '<unk>', ')', 'and', 'August', '<unk>', 'on', 'Monday,', 'much', 'love']
---------------------------------------------------
Score inf
['It', 'is', 'a', 'truth', 'universally', 'acknowledged,', 'that', 'means', 'drop', 'is', 'coming']
---------------------------------------------------
Score inf


In [69]:
seed_text = ['Elon', 'Musk',]
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(janeausten_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False

Sentences from a smoothed model:

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]




 25%|█████████████████████                                                               | 1/4 [00:04<00:14,  4.72s/it]

Elon Musk and <unk>.


 50%|██████████████████████████████████████████                                          | 2/4 [00:18<00:20, 10.22s/it]

Elon Musk <unk> to <unk> an <unk> on this gambling subreddit.


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:42<00:16, 16.29s/it]

Elon Musk's tweet ] ( https : <unk>? <unk> & amp; <unk> & amp; <unk>


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:46<00:00, 26.65s/it]

Elon Musk is <unk> <unk> ( <unk> to <unk> bought it that long and are already worried about it <unk> really got “ I ’ d have a <unk> my friend to options so I can short <unk> plus <unk> are totally not a thing <unk> <unk>!





In [70]:
for sentence in text_results:
    text = sentence.split()
    text[-1] = text[-1][:-1]
    print(text)
    print('---------------------------------------------------')    
    print('Score {}'.format(perplexity([text], janeausten_counts, logprob_abs_discount)))
    print('===================================================')

['Elon', 'Musk', 'and', '<unk>']
---------------------------------------------------
Score 299.87244152083383
['Elon', 'Musk', '<unk>', 'to', '<unk>', 'an', '<unk>', 'on', 'this', 'gambling', 'subreddit']
---------------------------------------------------
Score 69.26147146592136
['Elon', "Musk's", 'tweet', ']', '(', 'https', ':', '<unk>?', '<unk>', '&', 'amp;', '<unk>', '&', 'amp;', '<unk']
---------------------------------------------------
Score inf
['Elon', 'Musk', 'is', '<unk>', '<unk>', '(', '<unk>', 'to', '<unk>', 'bought', 'it', 'that', 'long', 'and', 'are', 'already', 'worried', 'about', 'it', '<unk>', 'really', 'got', '“', 'I', '’', 'd', 'have', 'a', '<unk>', 'my', 'friend', 'to', 'options', 'so', 'I', 'can', 'short', '<unk>', 'plus', '<unk>', 'are', 'totally', 'not', 'a', 'thing', '<unk>', '<unk>']
---------------------------------------------------
Score 19.78849991375468


In [71]:
seed_text = ['GME', 'to',]
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(janeausten_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Sentences from a smoothed model:


 25%|█████████████████████                                                               | 1/4 [00:36<01:49, 36.45s/it]

Gme to do that but you 're still bullish, SPY to <unk> $ <unk> % <unk> <unk> <unk> FOR A <unk> <unk> TO <unk> AT ALL.


 50%|██████████████████████████████████████████                                          | 2/4 [00:41<00:35, 17.74s/it]

Gme to the moon!


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:49<00:13, 13.63s/it]

Gme to <unk> miss the next <unk> <unk>


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:04<00:00, 16.22s/it]

Gme to $ <unk> since the <unk> <unk> is working well.





In [72]:
for sentence in text_results:
    text = sentence.split()
    text[-1] = text[-1][:-1]
    print(text)
    print('---------------------------------------------------')    
    print('Score {}'.format(perplexity([text], janeausten_counts, logprob_abs_discount)))
    print('===================================================')

['Gme', 'to', 'do', 'that', 'but', 'you', "'re", 'still', 'bullish,', 'SPY', 'to', '<unk>', '$', '<unk>', '%', '<unk>', '<unk>', '<unk>', 'FOR', 'A', '<unk>', '<unk>', 'TO', '<unk>', 'AT', 'ALL']
---------------------------------------------------
Score inf
['Gme', 'to', 'the', 'moon']
---------------------------------------------------
Score inf
['Gme', 'to', '<unk>', 'miss', 'the', 'next', '<unk>', '<unk']
---------------------------------------------------
Score inf
['Gme', 'to', '$', '<unk>', 'since', 'the', '<unk>', '<unk>', 'is', 'working', 'well']
---------------------------------------------------
Score inf


In [73]:
seed_text = ['Daddy',]
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(janeausten_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Sentences from a smoothed model:


 25%|█████████████████████                                                               | 1/4 [00:06<00:19,  6.60s/it]

Daddy You fuck around too much


 50%|██████████████████████████████████████████                                          | 2/4 [00:07<00:06,  3.08s/it]

Daddy


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:11<00:03,  3.44s/it]

Daddy my upvote.


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:12<00:00,  3.13s/it]

Daddy.





In [74]:
for sentence in text_results:
    text = sentence.split()
    text[-1] = text[-1][:-1]
    print(text)
    print('---------------------------------------------------')    
    print('Score {}'.format(perplexity([text], janeausten_counts, logprob_abs_discount)))
    print('===================================================')

['Daddy', 'You', 'fuck', 'around', 'too', 'muc']
---------------------------------------------------
Score inf
['Dadd']
---------------------------------------------------
Score inf
['Daddy', 'my', 'upvote']
---------------------------------------------------
Score inf
['Daddy']
---------------------------------------------------
Score inf


In [75]:
seed_text = ['My', 'wife',]
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(janeausten_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Sentences from a smoothed model:


 25%|█████████████████████                                                               | 1/4 [00:21<01:03, 21.14s/it]

My wife ’ s boyfriend is the <unk> <unk> of the United States has to do.


 50%|██████████████████████████████████████████                                          | 2/4 [00:50<00:51, 25.83s/it]

My wife ’ s boyfriend said if SPY did n't even see any fucking positions or strike prices in here OP BAN


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:54<00:16, 16.13s/it]

My wife ’ s boyfriend


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:03<00:00, 15.82s/it]

My wife ’ s boyfriend ’ comments.





In [77]:
for sentence in text_results:
    text = sentence.split()
    text[-1] = text[-1][:-1]
    print(text)
    print('---------------------------------------------------')    
    print('Score {}'.format(perplexity([text], janeausten_counts, logprob_abs_discount)))
    print('===================================================')

['My', 'wife', '’', 's', 'boyfriend', 'is', 'the', '<unk>', '<unk>', 'of', 'the', 'United', 'States', 'has', 'to', 'do']
---------------------------------------------------
Score 11.595135423904363
['My', 'wife', '’', 's', 'boyfriend', 'said', 'if', 'SPY', 'did', "n't", 'even', 'see', 'any', 'fucking', 'positions', 'or', 'strike', 'prices', 'in', 'here', 'OP', 'BA']
---------------------------------------------------
Score 15.468115285824581
['My', 'wife', '’', 's', 'boyfrien']
---------------------------------------------------
Score inf
['My', 'wife', '’', 's', 'boyfriend', '’', 'comments']
---------------------------------------------------
Score 25.27509886143185


In [None]:
seed_text = ['More', 'rocket', 'please',]
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(janeausten_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False