# Language model

In [112]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers.legacy import RMSprop, SGD
from sklearn.decomposition import IncrementalPCA

import textwrap
from tqdm import tqdm
from pprint import pprint

## Preprocessing

First, we will read the data from the file and drop NaNs. 

In [113]:
df = pd.read_csv("rJokesData/data/preprocessed.csv")
df = df.dropna(axis=0)
df.sample(5)

Unnamed: 0,joke,body,punchline,score,date
20508,Did you hear about the guy from Prague wearing...,The Czech's in the mail.,Did you hear about the guy from Prague wearing...,64.0,1361149000.0
77428,Cannibal Someone who is fed up with people,Someone who is fed up with people,Cannibal,0.0,1424818000.0
316192,I went to hell and I was playing video games S...,Sucked since it was just never ending loading ...,I went to hell and I was playing video games,0.0,1504932000.0
84211,Blowjobs did not live up to my expectations. T...,They suck.,Blowjobs did not live up to my expectations.,5.0,1427739000.0
264880,There's a 86.7 percent chance of a Zombie outb...,Hopefully the first zombie is black.,There's a 86.7 percent chance of a Zombie outb...,2.0,1489582000.0


`n_context` will be out parameter which tells the model how many words to check to predict the next word

In [114]:
n_context = 10

Now we will split the text into tokens. We will separate the jokes using `#` symbols and see if the model will predict it as the end of the line

In [115]:
text = (" # " * n_context).join(list(df.loc[:, "joke"]))
print(len(text))
print(textwrap.fill(text[:300], 80))

154768303
I hope you're all getting your Walter Cronkite jokes in order. He's next.
Here's mine.   Ed McMahon, David Carradine, Farrah Fawcett, Michael Jackson,
Billy Mays, and Walter Cronkite walk into a bar.   And die.   Your turn.  #  #
#  #  #  #  #  #  #  # What is the only thing a woman can say that w


In [116]:
tokenizer = RegexpTokenizer(r'\w+|#')
tokens = tokenizer.tokenize(text.lower())
print(tokens[20:50])
print(len(tokens))
print(len(set(tokens)))

['david', 'carradine', 'farrah', 'fawcett', 'michael', 'jackson', 'billy', 'mays', 'and', 'walter', 'cronkite', 'walk', 'into', 'a', 'bar', 'and', 'die', 'your', 'turn', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', 'what']
32093292
134157


See that we have over 32 million words in total and over 130 000 unique words

Let us limit the number of tokens so that the model learns faster and the kernel does not crash. This will negatively affect the quality though

In [117]:
tokens = tokens[:80000] #NOTE: for test purposes

In [118]:
unique_tokens = set(tokens)
tokens_dict = {word: i for i, word in enumerate(unique_tokens)}
reverse_tokens_dict = {i: word for i, word in enumerate(unique_tokens)}
print(len(unique_tokens))

7501


### Model 1: bag_of_words + PCA + LSTM Neural Network

Now let's create `X` and `y` datasets using the bag-of-words approach

In [119]:
input_words = []
next_words = []
for i in tqdm(range(len(tokens) - n_context)):
    input_words.append(tokens[i:i+n_context])
    next_words.append(tokens[i+n_context])

100%|██████████| 79990/79990 [00:00<00:00, 524867.89it/s]


In [120]:
X = np.zeros(shape=(len(input_words), n_context, len(tokens_dict)), dtype=bool)
y = np.zeros(shape=(len(input_words), len(tokens_dict)), dtype=bool)

for i, words in enumerate(tqdm(input_words)):
    for j, word in enumerate(words):
        X[i, j, tokens_dict[words[j]]] = 1
    y[i, tokens_dict[next_words[i]]] = 1

100%|██████████| 79990/79990 [00:00<00:00, 83837.66it/s]


Now we will set the number of PCA components. Also we will perform Incremental PCA as out dataset is quite large, so we also set the batch size.

In [121]:
n_components_pca = 100
pca_batch_size = 10000

In [122]:
pca = IncrementalPCA(n_components=n_components_pca)
X = X.reshape(-1, X.shape[-1])

#fit X
for i in tqdm(range(X.shape[0] // pca_batch_size + 1)):
    chunk = X[i * pca_batch_size : (i + 1) * pca_batch_size]
    pca.partial_fit(chunk)

  0%|          | 0/80 [00:00<?, ?it/s]

  1%|▏         | 1/80 [02:31<3:20:03, 151.95s/it]

In [12]:
#transform X
X_transformed = None
for i in tqdm(range(X.shape[0] // pca_batch_size + 1)):
    chunk = X[i * pca_batch_size : (i + 1) * pca_batch_size]
    chunk = pca.transform(chunk)
    if X_transformed is None:
        X_transformed = chunk
    else:
        X_transformed = np.vstack((X_transformed, chunk))
        
X = X_transformed
X = X.reshape(-1, n_context, X.shape[-1])

100%|██████████| 10/10 [00:01<00:00,  6.25it/s]


In [13]:
print(X.shape)

(9990, 10, 100)


Now out transformed dataset has less features and out model will train faster

We will use simple model: two LSTM layers + linear + activation

In [14]:
model = Sequential()
model.add(LSTM(128, input_shape=(n_context, n_components_pca), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation("softmax"))

In [13]:
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X, y, batch_size=100, epochs=50, shuffle=True).history

NameError: name 'model' is not defined

The model trains quite fast and the quality is great too. Now let us generate some text and see if it is comprehensible.

In [16]:
def predict_next_word(input_text, n_best):
    input_text = input_text.lower()
    X = np.zeros((1, n_context, len(unique_tokens)))
    for i, word in enumerate(input_text.split()):
        X[0, i, tokens_dict[word]] = 1
    X = X.reshape(-1, X.shape[-1])
    X = pca.transform(X)
    X = X.reshape(-1, n_context, X.shape[-1])
        
    predictions = model.predict(X)[0] @ pca.components_.T
    return np.argpartition(predictions, -n_best)[-n_best:]


In [18]:
possible = predict_next_word("A snail walks into a bar and", 5)
for idx in possible:
    print(reverse_tokens_dict[idx])



deficient
sounds
gump
rose
admitted


See that the next word predictions here are quite bad

In [19]:
def generate_text(input_text, n_words, creativity=3):
    word_sequence = input_text.split()
    current = 0
    for _ in range(n_words):
        sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current+n_words])
        try:
            choice = reverse_tokens_dict[random.choice(predict_next_word(sub_sequence, creativity))]
        except:
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
        current += 1
    return " ".join(word_sequence)

In [20]:
generate_text("A snail walks into a bar and orders", 15, 10)



'A snail walks into a bar and orders rose 4 suffering buying dinner severed envy admitted sounds gump assistant etc admitted everything assistant'

The text is quite bad also

### Model 2: pre-trained word embedding + LSTM

We will use a large word embedding from https://github.com/stanfordnlp/GloVe?tab=readme-ov-file. It has around 1.9 million words.

In [6]:
pretrained_dict = dict()

with open("glove.42B.300d.txt", "r") as f:
    for line in tqdm(f):
        word_and_vec = line.split()
        word = word_and_vec[0]
        vec = np.array(list(map(float, word_and_vec[1:])))
        pretrained_dict[word] = vec

print(len(pretrained_dict))

1917495it [02:03, 15516.33it/s]

1917495





Let's bring back all tokens and see how many words we do not know

In [7]:
tokenizer = RegexpTokenizer(r'\w+|#')
tokens = tokenizer.tokenize(text.lower())
print(tokens[20:50])
print(len(tokens))
print(len(set(tokens)))

['david', 'carradine', 'farrah', 'fawcett', 'michael', 'jackson', 'billy', 'mays', 'and', 'walter', 'cronkite', 'walk', 'into', 'a', 'bar', 'and', 'die', 'your', 'turn', '#', '#', '#', '#', '#', '#', '#', '#', '#', '#', 'what']
32093292
134157


In [8]:
tokens = tokens[:100000]

In [9]:
unique_tokens = set(tokens)
tokens_dict = {word: i for i, word in enumerate(unique_tokens)}
reverse_tokens_dict = {i: word for i, word in enumerate(unique_tokens)}
print(len(unique_tokens))

8512


In [10]:
print(len(unique_tokens.difference(set(pretrained_dict.keys()))))

188


The word embedding does not have around 30% of our unique words, but probably they do not appear much so it is not that bad.

In [11]:
input_words = []
next_words = []
for i in tqdm(range(len(tokens) - n_context)):
    input_words.append(tokens[i:i+n_context])
    next_words.append(tokens[i+n_context])

X = np.zeros(shape=(len(input_words), n_context, 300), dtype='float32')
y = np.zeros(shape=(len(input_words), 300), dtype='float32')

for i, words in enumerate(tqdm(input_words)):
    for j, word in enumerate(words):
        X[i, j] = pretrained_dict.setdefault(word, np.zeros(300))
    y[i] = pretrained_dict.setdefault(next_words[i], np.zeros(300))

100%|██████████| 99990/99990 [00:00<00:00, 137418.76it/s]
100%|██████████| 99990/99990 [00:02<00:00, 40972.11it/s]


In [16]:
print(X.shape, y.shape)

(99990, 10, 300) (99990, 300)


In [105]:
model = Sequential()
model.add(LSTM(128, input_shape=(n_context, 300), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(300))

In [106]:
optimizer = SGD(learning_rate=0.05)
model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=optimizer)
history = model.fit(X, y, batch_size=100, epochs=3, shuffle=True).history

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [107]:
def predict_next_word(input_text):
    input_text = input_text.lower().split()
    X = np.zeros((1, n_context, 300))
    for i in range(min(n_context, len(input_text))):
        word = input_text[-i-1]
        X[0, i] = pretrained_dict.setdefault(word, np.zeros(300))
        
    predictions = model.predict(X)[0]
    return predictions

In [108]:
next_word_vec = predict_next_word("A snail walks into a bar and says once")
min_dist = -1
closest_word = ""
for word, vec in pretrained_dict.items():
    dist = np.linalg.norm(vec - next_word_vec)
    if min_dist == -1 or dist < min_dist:
        closest_word = word
        min_dist = dist

print(closest_word)

even


In [109]:
def generate_text(input_text, n_words):
    word_sequence = input_text.split()
    current = 0
    for _ in tqdm(range(n_words)):
        sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current+n_words])
        next_word_vec = predict_next_word(input_text)
        min_dist = -1
        closest_word = ""
        for word, vec in pretrained_dict.items():
            dist = np.linalg.norm(vec - next_word_vec)
            if min_dist == -1 or dist < min_dist:
                closest_word = word
                min_dist = dist

        word_sequence.append(closest_word)
        current += 1
    return " ".join(word_sequence)

In [111]:
generate_text("Why did the chicken cross the road", 5)

  0%|          | 0/5 [00:00<?, ?it/s]



 20%|██        | 1/5 [00:25<01:42, 25.50s/it]



 40%|████      | 2/5 [00:57<01:27, 29.26s/it]



 60%|██████    | 3/5 [01:32<01:03, 31.74s/it]



 80%|████████  | 4/5 [02:04<00:32, 32.11s/it]



100%|██████████| 5/5 [02:40<00:00, 32.08s/it]


'Why did the chicken cross the road once once once once once'