In [7]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers.legacy import RMSprop
from sklearn.decomposition import IncrementalPCA

import textwrap
from tqdm import tqdm
from pprint import pprint

In [8]:
df = pd.read_csv("rJokesData/data/preprocessed.csv")
df = df.dropna(axis=0)
df.sample(5)

Unnamed: 0,joke,body,punchline,score,date
428292,Do all the voices in a schizophrenics head sou...,Asking for a... friend.,Do all the voices in a schizophrenics head sou...,2.0,1539959000.0
11216,A very thirsty man was wandering the desert .....,... when suddenly he spotted a well. With the ...,A very thirsty man was wandering the desert ...,44.0,1345755000.0
513239,I must be great at sex The last girl I was wit...,"The last girl I was with liked it so much, she...",I must be great at sex,3.0,1561805000.0
530464,Why did the students eat their homework? Becau...,Because the teacher said that it was a piece o...,Why did the students eat their homework?,53.0,1566330000.0
483438,My relationship with the time traveling girl w...,It was over even before it began.,My relationship with the time traveling girl w...,18.0,1554489000.0


In [9]:
n_context = 10

In [10]:
text = (" # " * n_context).join(list(df.loc[:, "joke"]))
print(len(text))
print(textwrap.fill(text[:300], 80))

154768303
I hope you're all getting your Walter Cronkite jokes in order. He's next.
Here's mine.   Ed McMahon, David Carradine, Farrah Fawcett, Michael Jackson,
Billy Mays, and Walter Cronkite walk into a bar.   And die.   Your turn.  #  #
#  #  #  #  #  #  #  # What is the only thing a woman can say that w


In [67]:
tokenizer = RegexpTokenizer(r'\w+|#')
tokens = tokenizer.tokenize(text.lower())
print(tokens[20:50])
print(len(tokens))
print(len(set(tokens)))

['david', 'carradine', 'farrah', 'fawcett', 'michael', 'jackson', 'billy', 'mays', 'and', 'walter', 'cronkite', 'walk', 'into', 'a', 'bar', 'and', 'die', 'your', 'turn', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', 'what']
32093292
134157


In [12]:
tokens = tokens[:10000] #NOTE: for test purposes

In [32]:
unique_tokens = set(tokens)
tokens_dict = {word: i for i, word in enumerate(unique_tokens)}
reverse_tokens_dict = {i: word for i, word in enumerate(unique_tokens)}
print(len(unique_tokens))

2026


In [14]:
input_words = []
next_words = []
for i in tqdm(range(len(tokens) - n_context)):
    input_words.append(tokens[i:i+n_context])
    next_words.append(tokens[i+n_context])

100%|██████████| 9990/9990 [00:00<00:00, 1460324.71it/s]


In [15]:
X = np.zeros(shape=(len(input_words), n_context, len(tokens_dict)), dtype=bool)
y = np.zeros(shape=(len(input_words), len(tokens_dict)), dtype=bool)

for i, words in enumerate(tqdm(input_words)):
    for j, word in enumerate(words):
        X[i, j, tokens_dict[words[j]]] = 1
    y[i, tokens_dict[next_words[i]]] = 1

100%|██████████| 9990/9990 [00:00<00:00, 177262.35it/s]


In [16]:
n_components_pca = 100
pca_batch_size = 10000

In [20]:
pca = IncrementalPCA(n_components=n_components_pca)
X = X.reshape(-1, X.shape[-1])

#fit X
for i in tqdm(range(X.shape[0] // pca_batch_size + 1)):
    chunk = X[i * pca_batch_size : (i + 1) * pca_batch_size]
    pca.partial_fit(chunk)

100%|██████████| 10/10 [01:36<00:00,  9.67s/it]


In [21]:
#transform X
X_transformed = None
for i in tqdm(range(X.shape[0] // pca_batch_size + 1)):
    chunk = X[i * pca_batch_size : (i + 1) * pca_batch_size]
    chunk = pca.transform(chunk)
    if X_transformed is None:
        X_transformed = chunk
    else:
        X_transformed = np.vstack((X_transformed, chunk))
        
X = X_transformed
X = X.reshape(-1, n_context, X.shape[-1])

100%|██████████| 10/10 [00:01<00:00,  5.86it/s]


In [22]:
print(X.shape)

(9990, 10, 100)


In [25]:
model = Sequential()
model.add(LSTM(128, input_shape=(n_context, n_components_pca), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation("softmax"))

In [27]:
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X, y, batch_size=100, epochs=30, shuffle=True).history

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [52]:
def predict_next_word(input_text, n_best):
    input_text = input_text.lower()
    X = np.zeros((1, n_context, len(unique_tokens)))
    for i, word in enumerate(input_text.split()):
        X[0, i, tokens_dict[word]] = 1
    X = X.reshape(-1, X.shape[-1])
    X = pca.transform(X)
    X = X.reshape(-1, n_context, X.shape[-1])
        
    predictions = model.predict(X)[0] @ pca.components_.T
    return np.argpartition(predictions, -n_best)[-n_best:]


In [65]:
possible = predict_next_word("Why did a cow", 5)
for idx in possible:
    print(reverse_tokens_dict[idx])

pitiful
everything
moment
look
cautioned


In [60]:
def generate_text(input_text, n_words, creativity=3):
    word_sequence = input_text.split()
    current = 0
    for _ in range(n_words):
        sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current+n_words])
        try:
            choice = reverse_tokens_dict[random.choice(predict_next_word(sub_sequence, creativity))]
        except:
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
        current += 1
    return " ".join(word_sequence)

In [66]:
generate_text("A snail walks into a bar and orders", 15, 10)



'A snail walks into a bar and orders pacific fell death ll dailylaughter dies eternal narwhals both doing both doing narwhals grab pace'