In [107]:
import random
import itertools
import pickle
from math import log2
from tqdm.notebook import tqdm
from collections import Counter, defaultdict
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#Import our data here
filepath = r'/content/drive/My Drive/RetardBot/'
filename = r"clean_data.txt"

# Load result:
with open(filepath+filename, "rb") as fp:   # Unpickling
    sentences = pickle.load(fp)

print(sentences[22:35])

["we know you know the chewy-ryancohen-gamestop connection, but it wasn't real enough for you yet.", "well, i don't have to tell you, because you're not stupid, but i will anyway: it's gotten really real enough for you now.", 'the chewy executive triumvirate joining the gamestop board of directors is your signal, friend.', 'you may start pumping gme to your boomer audience.', "i don't believe reddit has been too kind to you in the past, but worry not, follow through with this and you'll have lots of friends here and we'll have your back forever.", 'well, definitely not forever, but at least for a while.', 'what better time to start than today?', 'with love, brother.', "- don't be afraid to use the rocket 🚀, it feels good.", 'gme yolo update — dec 17 2020', '8x my annual income on this toss.', 'when i try to convince my broker for more margin', ' december 18, 2020']


## Here are functions we took from the assignment notebook

In [4]:
#function to pad the sentences with <s> and </s> from SNLP assignment
def pad(tokens, n):
    """Takes an iterable of tokens and pads with sentence boundary symbols.
    
    Always adds sentence end symbols. 
    For unigram sequences, does not add sentence starts.
    
    Arguments
    ---------
    tokens : list, tuple, iterable
        sentence to be padded
    n : int
        the ngram order
    
    Returns
    -------
    tuple
        Input padded with sentence boundary symbols
    """
    start = "<s>"
    end = "</s>"    
    tokens = list(tokens)
    tokens = [start]*(n-1) + tokens + [end]
    # Always return tuples, we don't want to modify the input in-place.
    tokens = tuple(tokens)  
    # YOUR CODE HERE
    return tokens

In [5]:
#function to create n-grams from SNLP assignment
def make_n_grams(tokens, n):
    """Takes in a tuple of tokens and forms n-grams
    
    Arguments
    ---------
    tokens : tuple
        Tokens to make ngrams from
    n : int
        The order of ngrams to make
    Returns
    -------
    list
        A list of tuples: all ngrams of the specified order.
    """
    if len(tokens) < n:
        print("N-grams order is too big")
        return []
    
    ngrams = []
    for i in range (len(tokens)-n+1):
        ngrams.append(tokens[i:i+n])
        
    return ngrams 

In [6]:
#from SNLP assignment
#this is probably needed for smoothing, because it requires all lower order n-grams
def allgrams_pipeline(data, max_n):
    """Produces ngrams of all orders up to max_n from data, with padding
    
    This uses the user defined pad() and make_n_grams() functions.
    It acts as an additional test for those.
    
    However, you must not change this. If there is some error, change 
    pad or make_n_grams instead."""
    for sentence in data:
        for n in range(1, max_n+1):
            padded = pad(sentence, n)
            yield from make_n_grams(padded, n)
    return

In [7]:
#from SNLP assignment
#the n-gram counter

def get_counts(ngrams, max_n):
    """Counts ngrams in a dataset.
    
    Takes an iterable of ngrams, of variable order. Simply counts how many times each
    ngram is seen. The main idea is how the counts are organized.
    
    The input is an iterable, which might produce a stream such as:
    
    ('this',),
    ('is',), 
    ('the',), 
    ('first',), 
    ('sentence',),
    ('</s>',),    
    ('<s>', 'this'), 
    ('this', 'is'), 
    ('is', 'the'), 
    ('the', 'first'), 
    ('first', 'sentence'), 
    ('sentence', '</s>'),

    Note how the stream has a mix of unigrams and bigrams.

    The output is a triply nested dict.
    The first level is indexed by ngram order,
    the second level is indexed by the history,
    and the third level is indexed by the last token (the predicted token).
    Additionally, we recommend making the third level an extended type of dict: a Counter
    See https://docs.python.org/3/library/collections.html#collections.Counter
    Example of the output structure:
    {
        1: {
            (,): 
                Counter({
                    '<s>': 21,
                    'this': 43,
                    'most': 31,
                    'is': 50,
                })
        2: {
            ('<s>', ): 
                Counter({
                    'this': 21,
                }),
            ('the',):
                Counter({
                    'most': 31,
                    'least': 14,
                }),
        3: {
            ('<s>', 'this'): 
                Counter({
                    'is': 12,
                    'has': 8,
                    '</s>': 1,
                }),
            ('the', 'most'):
                Counter({
                    'beautiful': 8,
                    'intelligent': 10,
                    'funny': 3,
                }),
    }
    This structure is useful, since each history will also get its own 
    conditional probability distribution.
    Note that when n==1, the ngram history simply becomes 
    the empty tuple, (,). This is fine.
  
    Arguments
    ---------
    sentences : iterable (such as list)
        An iterable over ngrams.
    max_n : int
        The maximum ngram order.
        
    Returns
    -------
    dict
        Triply nested dict, from ngram order to n_gram history parts, 
        to a dictionary of all continuations and their counts, e.g.
        {2: {('a',): {'b': 3 'c': 4}}}
    
    """
    
    n_gram_dict = {order: defaultdict(Counter) for order in range(1,max_n+1)}
    # The line above creates the triply nested dict.
    # The second and third layers are special: defaultdict and Counter
    # See their documentation:
    # https://docs.python.org/3/library/collections.html#collections.defaultdict
    # https://docs.python.org/3/library/collections.html#collections.Counter    
    for ngram in iter(ngrams):
        order = len(ngram)
        if ngram[-1] not in n_gram_dict[order][ngram[:-1]].keys():
            n_gram_dict[order][ngram[:-1]][ngram[-1]] = 1
        else:
            n_gram_dict[order][ngram[:-1]][ngram[-1]] += 1
    
    # Lastly, make the defaultdicts into normal dicts, 
    # so that defaultdict doesn't bite us later (it can hide some bugs)        
    return {n: dict(counts) for n, counts in n_gram_dict.items()}

In [8]:
#from SNLP assignment
#used for absolute discounting and calculating perplexity
#(we use the perplexity with base 2)
"""
This cell has a utility function, which you need to use down the line.
The function is already provided here because it is also needed for the
sanity checks in the visible tests for the next task.
"""

def logsumexp2(*logs):
    """Linear-scale addition in log-scale
    
    https://en.wikipedia.org/wiki/LogSumExp#log-sum-exp_trick_for_log-domain_calculations"""
    x_star = max(logs)
    return x_star + log2(sum(pow(2, x-x_star) for x in logs))

In [9]:
#absolute smoothing from SNLP assignment
NEGINF = -float('inf')


# Look at logprob_abs_discount first, to understand the full picture.
# Then, start by implementing logprob_discounted
# It has its own tests below; see that you can pass them first.
# Next, implement log_interp_weight.
# It also has its own tests.
# Finally, fill in the missing parts in logprob_abs_discount

def logprob_discounted(counts, context, token, delta):
    """The discounted log probability
        
    Remember to discount to 0 at most, max(count-delta, 0).
    If discounted count becomes 0, the discounted log prob becomes -inf.
    And the same concerns as with logprob_mle apply.
    
    This is the left side of the sum in the probability equations
    (the log version of it).
    """
    n = len(context) + 1  # N-gram order
    token_count = counts[n][context][token]
    
    context_count = sum(counts[n][context].values())
    if (max(token_count-delta,0) == 0) or (context_count==0):
        return NEGINF
    else:        
        score = log2(max(token_count-delta,0)) - log2(context_count)
        return score
    
def log_interp_weight(counts, context, delta):
    """The interpolation weight, as determined by the discount.
    
    You will need to figure out the total sum of discount applied
    for this context.
    
    This is the lambda in the equations (log version of it).
    """
    n = len(context) + 1  # N-gram order
    discount_sum = 0
    
    for token in counts[n][context]:
        token_count = counts[n][context][token]
        discount_sum += token_count - max(token_count-delta,0)    
    
    context_count = sum(counts[n][context].values())
    if (discount_sum == 0) or (context_count==0):
        return NEGINF
    else:        
        lamda = log2(discount_sum) - log2(context_count)
        return lamda    
    

def logprob_abs_discount(counts, context, token, delta=0.2):
    """Produces smoothed estimate of log(P(token | context))
    
    Now we will use absolute discounting and interpolation to lower
    orders.
    
    There are four main challenges to compute here:
    1. The discounted count for the token
        - Remember to discount to 0 at most, 
          max(count-delta, 0)
        - If discounted count becomes 0, the discounted log prob becomes -inf.
          And the same concerns as with logprob_mle apply.
    2. The interpolation weight, as determined by the discount.
        - You will need to figure out the total sum of discount applied
          for this context.
    3. The log probability to interpolate with.
        - This is easy: use recursion. So just call:
          logprob_abs_discount(counts, context[1:], token, delta)
        - Unigrams are the special case: they interpolate with the uniform
          distribution P(x) = 1 / vocab size.
    4. Interpolation in the log domain.
        - So you can get log(P_delta(token|context)) and 
          log(P_interp(token|context[1:])) without problems. But then you need
          the logarithmic equivalent of a sum.
        - For that, use the logsumexp2 function defined above. 
    
    The ngram counts are as produced by get_counts, same format as 
    with logprob_mle.
    
    Arguments
    ---------
    counts : dict
        Triply nested dict as shown above.
    context : tuple
        The context to predict on as tuple, e.g. ('<s>',)
    token : str
        The token to predict.
    delta : float
        The value to discount by.
    """
    n = len(context) + 1  # N-gram order
    vocab = set(counts[1][tuple()])
    V = len(vocab)  # Vocabulary size
    
    # Check that word is in the intended vocabulary,
    # i.e. at least seen once in the data (as unigram).
    # If the word is never seen in the data, we cannot expect it.
    if token not in vocab:
        return NEGINF

    # Find an order where context has been seen:     
    if n not in counts or context not in counts[n]:
        if n == 1:
            raise ValueError("Invalid counts-dict, needs to have all lower order counts.")
        return logprob_abs_discount(counts, context[1:], token)
    
    # 1. Discounted prob (computed by separate function above)
    lp_discounted = logprob_discounted(counts, context, token, delta)
    
    # 2. Log interpolation weight (computed by separate function above):
    log_lambda = log_interp_weight(counts, context, delta)
    
    # 3. Log lower order probability:
    if n == 1:
        # Stopping recursion at the unigram level, by interpolating with
        # unigram distribution:
        lp_lower = - log2(V)
    else:  # Recursion
        lp_lower = logprob_abs_discount(counts, context[1:], token)
    
    # 4. Putting it all together:
    log_sum = [lp_discounted, log_lambda + lp_lower]
    result = logsumexp2(*log_sum)

    return result

In [10]:
#perplexity function from SNLP assignment
def perplexity(test_data, model_counts, logprob_func, **lp_kwargs):
    """
    Computes perplexity on the given test data with the given language model
    (as specified by the counts and the logprob function).
    
    Arguments
    ---------
    test_data : list
        List of lists of tokenized sentences.
    model_counts : dict
        Triply nested dict of ngram counts, as returned by get_counts()
    logprob_func : function
        Function with signature (counts, context, token), which returns the
        log-probability of the token given the context.
    **lp_kwargs : kwargs
        Log prob key word arguments, passed to logprob_func
    
    Returns
    -------
    float
        The perplexity of the model on the test data.
    """
    max_n = max(model_counts.keys())
    total_log_prob = 0.
    num_tokens = 0
    for sentence in test_data:
        padded = pad(sentence, max_n)
        ngrams = make_n_grams(padded, max_n)
        for *context, token in ngrams:
            total_log_prob += logprob_func(model_counts, tuple(context), token, **lp_kwargs)
            num_tokens += 1
    ppl = pow(2, -total_log_prob / num_tokens)
    return ppl

In [11]:
#text generator from SNLP assignment

def generate_text(model_counts, logprob_func, seed_text=None, **lp_kwargs):
    """Generates text from an N-gram model.
    
    Arguments
    ---------
    model_counts : dict
        N-gram counts as returned by get_counts()
    logprob_func : callable
        Function with signature (counts, context, token), which returns the
        log-probability of the token given the context.
    seed_text : list, optional
        Text to start generating from. If None, will start from the
        appropriate amount of sentence-start symbols (N-1).
    **lp_kwargs : kwargs
        Log prob key word arguments, passed to logprob_func
        
    Returns
    -------
    tuple
        Sentence generated by model as a list of tokens. If
        seed_text was given, will include it. Padding is stripped.
    """
    max_n = max(model_counts.keys())
    vocab = list(model_counts[1][tuple()])
    if seed_text is None:
        seed_text = ('<s>',) * (max_n-1)
    end = '</s>'
    output = list(seed_text)
    while output[-1] != end and len(output) < 200:  # Also guard against infinite loops
        context = output[-max_n+1:] if max_n > 1 else []
        token_logprobs = [2**logprob_func(model_counts, tuple(context), token, **lp_kwargs) 
                          for token in vocab]
        next_part = random.choices(vocab, token_logprobs)
        output.extend(next_part)  # next_part is a list: [token]
    return tuple(token for token in output if token not in ['<s>', '</s>'])

In [12]:
#deetokenizer from SNLP assignment
def detokenize(seq, full_sentence=True):
    """A simple rule-based detokenizer for this assignment"""
    last_seen_start = full_sentence
    formatted_tokens = []
    for token in seq:
        if token == "<s>":
            last_seen_start = True
            continue
        if token == "</s>":
            continue
        if last_seen_start:
            token = token.capitalize()
            last_seen_start = False
        if token == "i":
            token = "I"
        if token in ".!?,;" or token == "'s":
            formatted_tokens.append(token)
        elif formatted_tokens:
            formatted_tokens.append(" " + token)
        else:
            formatted_tokens.append(token)
    return "".join(formatted_tokens)        

In [13]:
#from SNLP assignment
#the function to replace any word out of the vocabulary with <unk>
def replace_oovs(vocab, data, unk="<unk>"):
    """Replace OOV words with unknown-token
    
    Arguments
    ---------
    vocab : set
        The set of tokens that are in-vocabulary.
        token not in vocab => token is out-of-vocabulary.
    data : list of iterables
        List of sentences, which are lists (or other iterables) of tokens.
    unk : str
        Token to replace tokens which are not in the vocabulary
    
    Returns
    -------
    list
        list of lists, (list of sentences in data, sentences are lists of tokens)
        The data with out-of-vocabulary tokens replaced with the unknown token.
        Does NOT modify in-place.
    
    """
    # NOTE: Do not modify input in-place.
    data_oovs_replaced = [list(sentence) for sentence in data]
    for i, sentence in enumerate(data):
        for j, token in enumerate(sentence):
            if token not in vocab:
                data_oovs_replaced[i][j] = unk
        
    return data_oovs_replaced

In [14]:
#the cache thing from the SNLP assignment
class LIWCache:
    """Very simple cache for log_interp_weight for speeding up querys
    
    The log_interp_weight function gets called many times with the same
    arguments. The normal Python LRU Cache decorator however cannot handle
    the counts argument, as it is unhashable. 
    """
    def __init__(self, func):
        self.func = func
        self.cache = {}
        self._caching = False
        self.hits = 0
        self.misses = 0
        
    def __call__(self, counts, context, delta):
        if not self._caching:
            return self.func(counts, context, delta)
        key = (context, delta)
        if key not in self.cache:
            self.cache[key] = self.func(counts, context, delta)
            self.misses += 1
        else:
            self.hits += 1
        return self.cache[key]
        
    @property
    def caching(self):
        return self._caching

    @caching.setter
    def caching(self, value):
        self._caching = value
        if not value:
            self.cache = {}  # Empty
            self.hits = 0
            self.misses = 0

if not isinstance(log_interp_weight, LIWCache):
    log_interp_weight = LIWCache(log_interp_weight)

# Start our text generation here

In [122]:
#tokenize the WSB sentences here
wsb_tokenized = []
for sentence in tqdm(sentences):
    
    # We split the sentences into a list to do the next step
    sentence_list = word_tokenize(sentence)    
    if sentence_list:        
        wsb_tokenized.append(sentence_list) 

HBox(children=(FloatProgress(value=0.0, max=851344.0), HTML(value='')))




In [123]:
#create training and test sets here
#test_split_index = round(0.9 * len(wsb_tokenized)) #original
test_split_index = round(0.99 * len(wsb_tokenized))
wsb_train = wsb_tokenized[:test_split_index]
wsb_test = wsb_tokenized[test_split_index:]

print("The training set has", len(wsb_train), "sentences, and the test set", len(wsb_test), "sentences")

The training set has 842831 sentences, and the test set 8513 sentences


## 5,000 word vocabulary

In [150]:
#here is where we set our vocab and the discount delta
#only the top vocab_size words will be kept as they are and the rest will be replaced by <unk>
vocab_size = 5000
delta = 5.2 

In [151]:
#here is where we set the vocabulary and put <unk>s in
wsb_unigram_counts = Counter(itertools.chain.from_iterable(wsb_train)).most_common()

print("The 10 most common tokens:")
print("\n".join(word for word, freq in wsb_unigram_counts[:10]))
print()
print("The least common tokens in vocabulary:")
print("\n".join(word for word, freq in wsb_unigram_counts[vocab_size-10:vocab_size]))


#######################################################################################
#######################################################################################
#######################################################################################
wsb_vocab_filt = set(word for word, freq in wsb_unigram_counts[:vocab_size]) | {"</s>", "<unk>"} #this is the vocabulary

#from here we replace anything out of the vocabulary with <unk>
wsb_train_filt = replace_oovs(wsb_vocab_filt, wsb_train)
wsb_test_filt = replace_oovs(wsb_vocab_filt, wsb_test)

print("\n Example of a sentence with limited vocab and <unk>:")
print(detokenize(wsb_train_filt[2000]))

The 10 most common tokens:
.
the
,
to
i
a
and
you
is
of

The least common tokens in vocabulary:
severely
phrase
haircut
420.69
signals
collapsing
fuckery
vagina
cmon
withdrew

 Example of a sentence with limited vocab and <unk>:
<unk> election discussion thread


In [152]:
# Produce counts from the real world data:
#we're using 5-grams
wsb_counts = get_counts(allgrams_pipeline(wsb_train_filt, 5), 5)

In [127]:
def eval_ngram(wcounts, smoothing_func, seed_text, delta):
  log_interp_weight.caching = True
  text_results = []
  for i in tqdm(range(4)):
    text_result = detokenize(generate_text(wcounts, smoothing_func, seed_text, delta=delta))
    text_results.append(text_result)
    print(text_results[i])

  for sentence in text_results:
    text = sentence.split()
    text = [each_string.lower() for each_string in text]
    #text[-1] = text[-1][:-1]
    perplex = perplexity(text, wsb_counts, logprob_abs_discount)
    print(text)
    print('---------------------------------------------------')
    print('Score {}'.format(perplex))
    print('===================================================')
  log_interp_weight.caching = False


In [128]:
seed_text = ['gme',]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Gme ✅ oil war is all about memes and making the stocks where they need to take in september this year.
Gme, socialism with their dying breath.
Gme.
Gme to be the place told to come here on a saturday, sounds legit.

['gme', '✅', 'oil', 'war', 'is', 'all', 'about', 'memes', 'and', 'making', 'the', 'stocks', 'where', 'they', 'need', 'to', 'take', 'in', 'september', 'this', 'year.']
---------------------------------------------------
Score 3196.56331573966
['gme,', 'socialism', 'with', 'their', 'dying', 'breath.']
---------------------------------------------------
Score 5250.2048852117405
['gme.']
---------------------------------------------------
Score 5830.973639729579
['gme', 'to', 'be', 'the', 'place', 'told', 'to', 'come', 'here', 'on', 'a', 'saturday,', 'sounds', 'legit.']
---------------------------------------------------
Score 2971.829493571773


In [129]:
seed_text = ["to", "the", "moon",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

To the moon
To the moon
To the moon with no <unk> technology and they 're all invited to my yacht ”
To the moon.

['to', 'the', 'moon']
---------------------------------------------------
Score 2485.8101111854708
['to', 'the', 'moon']
---------------------------------------------------
Score 2485.8101111854708
['to', 'the', 'moon', 'with', 'no', '<unk>', 'technology', 'and', 'they', "'re", 'all', 'invited', 'to', 'my', 'yacht', '”']
---------------------------------------------------
Score inf
['to', 'the', 'moon.']
---------------------------------------------------
Score 2231.0866086027136


In [130]:
seed_text = ["elon", "mush",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Elon mush the article : * powell is legit bad at this and you just wan na <unk> this existence.
Elon mush mistake
Elon mush an iphone know exactly what it looked like apple was.
Elon mush earnings.

['elon', 'mush', 'the', 'article', ':', '*', 'powell', 'is', 'legit', 'bad', 'at', 'this', 'and', 'you', 'just', 'wan', 'na', '<unk>', 'this', 'existence.']
---------------------------------------------------
Score inf
['elon', 'mush', 'mistake']
---------------------------------------------------
Score 2998.757196406375
['elon', 'mush', 'an', 'iphone', 'know', 'exactly', 'what', 'it', 'looked', 'like', 'apple', 'was.']
---------------------------------------------------
Score 3782.3414506397544
['elon', 'mush', 'earnings.']
---------------------------------------------------
Score 191.88196661913108


In [131]:
seed_text = ["pltr",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Pltr has no unique tech, a negative <unk>, and I mean like options?
Pltr to be like you have a nice cock <unk>
Pltr, I wish <unk> was public <unk>
Pltr, peter thiel is <unk> fucking real.

['pltr', 'has', 'no', 'unique', 'tech,', 'a', 'negative', '<unk>,', 'and', 'i', 'mean', 'like', 'options?']
---------------------------------------------------
Score inf
['pltr', 'to', 'be', 'like', 'you', 'have', 'a', 'nice', 'cock', '<unk>']
---------------------------------------------------
Score inf
['pltr,', 'i', 'wish', '<unk>', 'was', 'public', '<unk>']
---------------------------------------------------
Score inf
['pltr,', 'peter', 'thiel', 'is', '<unk>', 'fucking', 'real.']
---------------------------------------------------
Score inf


In [132]:
seed_text = ["stonk",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Stonk I invested in them and what they said about my pltr <unk>?
Stonk town.
Stonk go - $ 2k immediately at open?
Stonk goes on a twitter <unk>, someone broke into my boss's boss, <unk>, <unk>, and <unk> page about to get crazy high, at least take someone from r/investing?

['stonk', 'i', 'invested', 'in', 'them', 'and', 'what', 'they', 'said', 'about', 'my', 'pltr', '<unk>?']
---------------------------------------------------
Score inf
['stonk', 'town.']
---------------------------------------------------
Score 263.44420763285945
['stonk', 'go', '-', '$', '2k', 'immediately', 'at', 'open?']
---------------------------------------------------
Score 2249.6264579600006
['stonk', 'goes', 'on', 'a', 'twitter', '<unk>,', 'someone', 'broke', 'into', 'my', "boss's", 'boss,', '<unk>,', '<unk>,', 'and', '<unk>', 'page', 'about', 'to', 'get', 'crazy', 'high,', 'at', 'least', 'take', 'someone', 'from', 'r/investing?']
---------------------------------------------------
Score inf


In [133]:
seed_text = ["stock","market",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Stock market is <unk> right now.
Stock market is still free enough to realize that gold is nowhere near <unk>.
Stock market not react in a `` in negotiations for economic growth?
Stock market you have to say if you mind your own business and just keep making all the wrong moves and losing money 🍻 boys

['stock', 'market', 'is', '<unk>', 'right', 'now.']
---------------------------------------------------
Score inf
['stock', 'market', 'is', 'still', 'free', 'enough', 'to', 'realize', 'that', 'gold', 'is', 'nowhere', 'near', '<unk>.']
---------------------------------------------------
Score inf
['stock', 'market', 'not', 'react', 'in', 'a', '``', 'in', 'negotiations', 'for', 'economic', 'growth?']
---------------------------------------------------
Score 717.297535830128
['stock', 'market', 'you', 'have', 'to', 'say', 'if', 'you', 'mind', 'your', 'own', 'business', 'and', 'just', 'keep', 'making', 'all', 'the', 'wrong', 'moves', 'and', 'losing', 'money', '🍻', 'boys']
-------------------

In [134]:
seed_text = ["invest","in",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Invest in.
Invest in european stocks said no one ever <unk> for.
Invest in gme
Invest in it.

['invest', 'in.']
---------------------------------------------------
Score 718.3458750609933
['invest', 'in', 'european', 'stocks', 'said', 'no', 'one', 'ever', '<unk>', 'for.']
---------------------------------------------------
Score inf
['invest', 'in', 'gme']
---------------------------------------------------
Score 1456.5348318779718
['invest', 'in', 'it.']
---------------------------------------------------
Score 1709.2951830803709


In [135]:
seed_text = ["short","term",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Short term in ... <unk> extremely low territory, even for a day or two.
Short term options will get the better of me `` <unk>, <unk>, <unk> stories.
Short term gains tax, you ’ re <unk> to <unk> big <unk> and <unk> on my balls
Short term.

['short', 'term', 'in', '...', '<unk>', 'extremely', 'low', 'territory,', 'even', 'for', 'a', 'day', 'or', 'two.']
---------------------------------------------------
Score inf
['short', 'term', 'options', 'will', 'get', 'the', 'better', 'of', 'me', '``', '<unk>,', '<unk>,', '<unk>', 'stories.']
---------------------------------------------------
Score inf
['short', 'term', 'gains', 'tax,', 'you', '’', 're', '<unk>', 'to', '<unk>', 'big', '<unk>', 'and', '<unk>', 'on', 'my', 'balls']
---------------------------------------------------
Score inf
['short', 'term.']
---------------------------------------------------
Score 4077.618305498825


In [153]:
log_interp_weight.caching = True
ppx_5000 = perplexity(wsb_test_filt, wsb_counts, logprob_abs_discount)
log_interp_weight.caching = False
print("Perplexity for test set: ",ppx_5000)

Perplexity for test set:  69.38473968998498


## 2,500 word vocabulary

In [136]:
vocab_size = 2500
delta = 5.2

In [137]:
#here is where we set the vocabulary and put <unk>s in
wsb_unigram_counts = Counter(itertools.chain.from_iterable(wsb_train)).most_common()

print("The 10 most common tokens:")
print("\n".join(word for word, freq in wsb_unigram_counts[:10]))
print()
print("The least common tokens in vocabulary:")
print("\n".join(word for word, freq in wsb_unigram_counts[vocab_size-10:vocab_size]))


#######################################################################################
#######################################################################################
#######################################################################################
wsb_vocab_filt = set(word for word, freq in wsb_unigram_counts[:vocab_size]) | {"</s>", "<unk>"} #this is the vocabulary

#from here we replace anything out of the vocabulary with <unk>
wsb_train_filt = replace_oovs(wsb_vocab_filt, wsb_train)
wsb_test_filt = replace_oovs(wsb_vocab_filt, wsb_test)

print("\n Example of a sentence with limited vocab and <unk>:")
print(detokenize(wsb_train_filt[2000]))

The 10 most common tokens:
.
the
,
to
i
a
and
you
is
of

The least common tokens in vocabulary:
sheet
staring
iq
peace
mattress
valuable
deliver
expires
ups
pathetic

 Example of a sentence with limited vocab and <unk>:
<unk> election discussion thread


In [138]:
# Produce counts from the real world data:
#we're using 5-grams
wsb_counts = get_counts(allgrams_pipeline(wsb_train_filt, 5), 5)

In [139]:
seed_text = ['gme',]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Gme
Gme was a <unk> to no cut next week and not getting it.
Gme happen and is <unk> <unk> <unk>
Gme <unk>.

['gme']
---------------------------------------------------
Score inf
['gme', 'was', 'a', '<unk>', 'to', 'no', 'cut', 'next', 'week', 'and', 'not', 'getting', 'it.']
---------------------------------------------------
Score inf
['gme', 'happen', 'and', 'is', '<unk>', '<unk>', '<unk>']
---------------------------------------------------
Score inf
['gme', '<unk>.']
---------------------------------------------------
Score inf


In [140]:
seed_text = ["to", "the", "moon",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

To the moon, <unk> long on rkt with 2 year leaps
To the moon.
To the moon 🚀
To the moon.

['to', 'the', 'moon,', '<unk>', 'long', 'on', 'rkt', 'with', '2', 'year', 'leaps']
---------------------------------------------------
Score inf
['to', 'the', 'moon.']
---------------------------------------------------
Score inf
['to', 'the', 'moon', '🚀']
---------------------------------------------------
Score inf
['to', 'the', 'moon.']
---------------------------------------------------
Score inf


In [141]:
seed_text = ["elon", "mush",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Elon mush book.
Elon mush that spce <unk> back up.
Elon mush a <unk>.
Elon mush <unk> some, *i <unk>, are you totally retarded

['elon', 'mush', 'book.']
---------------------------------------------------
Score inf
['elon', 'mush', 'that', 'spce', '<unk>', 'back', 'up.']
---------------------------------------------------
Score inf
['elon', 'mush', 'a', '<unk>.']
---------------------------------------------------
Score inf
['elon', 'mush', '<unk>', 'some,', '*i', '<unk>,', 'are', 'you', 'totally', 'retarded']
---------------------------------------------------
Score inf


In [142]:
seed_text = ["pltr",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Pltr rocket.
Pltr.
Pltr mooning '' over <unk> moves is out of the most obvious bubble that's ever <unk>.
Pltr cap so early 2022

['pltr', 'rocket.']
---------------------------------------------------
Score inf
['pltr.']
---------------------------------------------------
Score inf
['pltr', 'mooning', "''", 'over', '<unk>', 'moves', 'is', 'out', 'of', 'the', 'most', 'obvious', 'bubble', "that's", 'ever', '<unk>.']
---------------------------------------------------
Score inf
['pltr', 'cap', 'so', 'early', '2022']
---------------------------------------------------
Score inf


In [143]:
seed_text = ["stonk",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Stonk market <unk> of our <unk>
Stonk go up.
Stonk
Stonk both <unk> their own service

['stonk', 'market', '<unk>', 'of', 'our', '<unk>']
---------------------------------------------------
Score inf
['stonk', 'go', 'up.']
---------------------------------------------------
Score inf
['stonk']
---------------------------------------------------
Score 39.73412507785317
['stonk', 'both', '<unk>', 'their', 'own', 'service']
---------------------------------------------------
Score inf


In [144]:
seed_text = ["stock","market",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Stock market back here in 30 minutes ... .. green by open and then drill aren ’ t selling and the average of what is said every friday on this sub <unk> me
Stock market
Stock market go <unk>.
Stock market has decided they like that trade off more than paying <unk>.

['stock', 'market', 'back', 'here', 'in', '30', 'minutes', '...', '..', 'green', 'by', 'open', 'and', 'then', 'drill', 'aren', '’', 't', 'selling', 'and', 'the', 'average', 'of', 'what', 'is', 'said', 'every', 'friday', 'on', 'this', 'sub', '<unk>', 'me']
---------------------------------------------------
Score inf
['stock', 'market']
---------------------------------------------------
Score 36.18961100188603
['stock', 'market', 'go', '<unk>.']
---------------------------------------------------
Score inf
['stock', 'market', 'has', 'decided', 'they', 'like', 'that', 'trade', 'off', 'more', 'than', 'paying', '<unk>.']
---------------------------------------------------
Score inf


In [145]:
seed_text = ["invest","in",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Invest in a <unk> <unk>.
Invest in a <unk> hot stock tip stories I 've doing this summer ( asking if I had just left <unk> <unk> <unk> and the sex isnt great.
Invest in <unk> bears blowing guys in 2 days.
Invest in a month

['invest', 'in', 'a', '<unk>', '<unk>.']
---------------------------------------------------
Score inf
['invest', 'in', 'a', '<unk>', 'hot', 'stock', 'tip', 'stories', 'i', "'ve", 'doing', 'this', 'summer', '(', 'asking', 'if', 'i', 'had', 'just', 'left', '<unk>', '<unk>', '<unk>', 'and', 'the', 'sex', 'isnt', 'great.']
---------------------------------------------------
Score inf
['invest', 'in', '<unk>', 'bears', 'blowing', 'guys', 'in', '2', 'days.']
---------------------------------------------------
Score inf
['invest', 'in', 'a', 'month']
---------------------------------------------------
Score inf


In [146]:
seed_text = ["short","term",]
eval_ngram(wsb_counts, logprob_abs_discount, seed_text, delta)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Short term play.
Short term fucked long term <unk> of policy changes, <unk> money is coming from ... we 're going to be in the <unk>.
Short term <unk> with an <unk> at <unk>, based on the fuckin virus and unemployment is <unk> to all the bulls screaming : the market is n't <unk> them cause of their <unk> is to write cnbc <unk> you <unk> in on the ground and the great american story about how <unk> a casino and gamble in <unk>, pltr, and <unk> their <unk> a little bit, me having to pay this back next year.
Short term right into off the wall shit, don ’ t know what is one <unk> % day <unk> with spx I ’ ve been laid off in the summer.

['short', 'term', 'play.']
---------------------------------------------------
Score inf
['short', 'term', 'fucked', 'long', 'term', '<unk>', 'of', 'policy', 'changes,', '<unk>', 'money', 'is', 'coming', 'from', '...', 'we', "'re", 'going', 'to', 'be', 'in', 'the', '<unk>.']
---------------------------------------------------
Score inf
['short', 'term', '<u

In [147]:
log_interp_weight.caching = True
ppx_2000 = perplexity(wsb_test_filt, wsb_counts, logprob_abs_discount)
log_interp_weight.caching = False
print("Perplexity for test set: ",ppx_2000)

Perplexity for test set:  57.32341502302042


### Original experimental codes

In [None]:
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(wsb_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False


  0%|          | 0/4 [00:00<?, ?it/s][A

Sentences from a smoothed model:



 25%|██▌       | 1/4 [10:58<32:55, 658.38s/it][A

Gme I had set up a national testing supply chain ( who knew ) and we now have an entire diversified meme portfolio printing tendies at record highs, unprecedented fed <unk> needed to stop credit market <unk>, these are the same china market.



 50%|█████     | 2/4 [14:50<17:40, 530.39s/it][A

Gme would know, I knew this was up over missing out or selling too early, again.



 75%|███████▌  | 3/4 [16:04<06:33, 393.58s/it][A

Gme my mouth are permanently <unk>?



100%|██████████| 4/4 [16:27<00:00, 246.85s/it]

Gme <unk>.





In [None]:
for sentence in text_results:
    text = sentence.split()
    text = [each_string.lower() for each_string in text]
    text[-1] = text[-1][:-1]
    print(text)
    print('---------------------------------------------------')
    print('Score {}'.format(perplexity([text], wsb_counts, logprob_abs_discount)))
    print('===================================================')

['gme', 'i', 'had', 'set', 'up', 'a', 'national', 'testing', 'supply', 'chain', '(', 'who', 'knew', ')', 'and', 'we', 'now', 'have', 'an', 'entire', 'diversified', 'meme', 'portfolio', 'printing', 'tendies', 'at', 'record', 'highs,', 'unprecedented', 'fed', '<unk>', 'needed', 'to', 'stop', 'credit', 'market', '<unk>,', 'these', 'are', 'the', 'same', 'china', 'market']
---------------------------------------------------
Score inf
['gme', 'would', 'know,', 'i', 'knew', 'this', 'was', 'up', 'over', 'missing', 'out', 'or', 'selling', 'too', 'early,', 'again']
---------------------------------------------------
Score inf
['gme', 'my', 'mouth', 'are', 'permanently', '<unk>']
---------------------------------------------------
Score 73.52216756741151
['gme', '<unk>']
---------------------------------------------------
Score 358.7804250492436


In [None]:
perplexity(['gme', 'i', 'had', 'set', 'up', 'a', 'national', 'testing', 'supply', 'chain', '(', 'who', 'knew', ')', 'and', 'we', 'now', 'have', 'an', 'entire', 'diversified', 'meme', 'portfolio', 'printing', 'tendies', 'at', 'record', 'highs,', 'unprecedented', 'fed', 'needed', 'to', 'stop', 'credit', 'market', 'these', 'are', 'the', 'same', 'china', 'market','.'],wsb_counts, logprob_abs_discount)

2495.6783340283496

In [None]:
perplexity(['gme', 'would', 'know,', 'i', 'knew', 'this', 'was', 'up', 'over', 'missing', 'out', 'or', 'selling', 'too', 'early,', 'again','.'], wsb_counts, logprob_abs_discount)

3715.0339165799496

In [None]:
seed_text = ["to", "the", "moon",]
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(wsb_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False


  0%|          | 0/4 [00:00<?, ?it/s][A

Sentences from a smoothed model:



 25%|██▌       | 1/4 [00:25<01:16, 25.52s/it][A

To the moon tomorrow



 50%|█████     | 2/4 [00:51<00:51, 25.56s/it][A

To the moon!



 75%|███████▌  | 3/4 [01:02<00:21, 21.20s/it][A

To the moon



100%|██████████| 4/4 [01:27<00:00, 21.85s/it]

To the moon.





In [None]:
for sentence in text_results:
    text = sentence.split()
    text = [each_string.lower() for each_string in text]
    text[-1] = text[-1][:-1]
    
    print(text)
    print('---------------------------------------------------')    
    print('Score {}'.format(perplexity([text], wsb_counts, logprob_abs_discount)))
    print('===================================================')

['to', 'the', 'moon', 'tomorro']
---------------------------------------------------
Score inf
['to', 'the', 'moon']
---------------------------------------------------
Score 34.799007445288815
['to', 'the', 'moo']
---------------------------------------------------
Score inf
['to', 'the', 'moon']
---------------------------------------------------
Score 34.799007445288815


In [None]:
perplexity(['to', 'the', 'moon', 'tomorrow'], wsb_counts, logprob_abs_discount)

3189.329977624238

In [None]:
perplexity(['to', 'the', 'moon', '!'], wsb_counts, logprob_abs_discount)

6535.755967479491

In [None]:
seed_text = ["elon", "mush",]
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(wsb_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False



  0%|          | 0/4 [00:00<?, ?it/s][A[A

Sentences from a smoothed model:




 25%|██▌       | 1/4 [01:05<03:16, 65.67s/it][A[A

Elon mush there is rona.




 50%|█████     | 2/4 [01:13<01:36, 48.40s/it][A[A

Elon mush




 75%|███████▌  | 3/4 [01:33<00:39, 39.72s/it][A[A

Elon mush ''




100%|██████████| 4/4 [01:41<00:00, 25.25s/it]

Elon mush





In [None]:
for sentence in text_results:
    text = sentence.split()
    text = [each_string.lower() for each_string in text]
    #text[-1] = text[-1][:-1]
    text[-1] = text[-1]
    print(text)
    print('---------------------------------------------------')    
    print('Score {}'.format(perplexity(text, wsb_counts, logprob_abs_discount)))
    print('===================================================')

['elon', 'mush', 'there', 'is', 'rona.']
---------------------------------------------------
Score 1654.690388507937
['elon', 'mush']
---------------------------------------------------
Score 1049.1177840846137
['elon', 'mush', "''"]
---------------------------------------------------
Score 2038.7328570488178
['elon', 'mush']
---------------------------------------------------
Score 1049.1177840846137


In [None]:
perplexity(['elon', 'mush', 'there', 'is', 'rona.'], wsb_counts, logprob_abs_discount)

1654.690388507937

In [21]:
seed_text = ['Daddy',]
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(wsb_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False

  0%|          | 0/4 [00:00<?, ?it/s]

Sentences from a smoothed model:


 25%|██▌       | 1/4 [00:23<01:10, 23.64s/it]

Daddy mango.


 50%|█████     | 2/4 [01:54<01:27, 43.84s/it]

Daddy we will <unk> these as well?


 75%|███████▌  | 3/4 [05:13<01:30, 90.25s/it]

Daddy exactly the <unk> face in about 3 weeks ago lmao pt $ 80 jan 2021


KeyboardInterrupt: ignored

In [None]:
for sentence in text_results:
    text = sentence.split()
    text[-1] = text[-1][:-1]
    print(text)
    print('---------------------------------------------------')    
    print('Score {}'.format(perplexity([text], wsb_counts, logprob_abs_discount)))
    print('===================================================')

['Daddy', 'You', 'fuck', 'around', 'too', 'muc']
---------------------------------------------------
Score inf
['Dadd']
---------------------------------------------------
Score inf
['Daddy', 'my', 'upvote']
---------------------------------------------------
Score inf
['Daddy']
---------------------------------------------------
Score inf


In [None]:
seed_text = ['My', 'wife',]
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(wsb_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Sentences from a smoothed model:


 25%|█████████████████████                                                               | 1/4 [00:21<01:03, 21.14s/it]

My wife ’ s boyfriend is the <unk> <unk> of the United States has to do.


 50%|██████████████████████████████████████████                                          | 2/4 [00:50<00:51, 25.83s/it]

My wife ’ s boyfriend said if SPY did n't even see any fucking positions or strike prices in here OP BAN


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:54<00:16, 16.13s/it]

My wife ’ s boyfriend


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:03<00:00, 15.82s/it]

My wife ’ s boyfriend ’ comments.





In [None]:
for sentence in text_results:
    text = sentence.split()
    text[-1] = text[-1][:-1]
    print(text)
    print('---------------------------------------------------')    
    print('Score {}'.format(perplexity([text], wsb_counts, logprob_abs_discount)))
    print('===================================================')

['My', 'wife', '’', 's', 'boyfriend', 'is', 'the', '<unk>', '<unk>', 'of', 'the', 'United', 'States', 'has', 'to', 'do']
---------------------------------------------------
Score 11.595135423904363
['My', 'wife', '’', 's', 'boyfriend', 'said', 'if', 'SPY', 'did', "n't", 'even', 'see', 'any', 'fucking', 'positions', 'or', 'strike', 'prices', 'in', 'here', 'OP', 'BA']
---------------------------------------------------
Score 15.468115285824581
['My', 'wife', '’', 's', 'boyfrien']
---------------------------------------------------
Score inf
['My', 'wife', '’', 's', 'boyfriend', '’', 'comments']
---------------------------------------------------
Score 25.27509886143185


In [None]:
seed_text = ['More', 'rocket', 'please',]
print("Sentences from a smoothed model:")
log_interp_weight.caching = True
text_results = []
for i in tqdm(range(4)):
    text_result = detokenize(generate_text(wsb_counts, logprob_abs_discount, seed_text, delta=5.2))
    text_results.append(text_result)
    print(text_results[i])
    
log_interp_weight.caching = False

In [None]:
log_interp_weight.caching = True
perplexity(wsb_test_filt, wsb_counts, logprob_abs_discount)
log_interp_weight.caching = False