# A Text Generation RNN

## Brian Chen | 2020

A character-level RNN that generates new Cards Against Humanity (CAH) answer cards.

*Based on https://www.tensorflow.org/tutorials/text/text_generation*

In [3]:
#imports

import tensorflow as tf
import numpy as np
import os
import time

In [2]:
# Read the corpus inside a context manager so the file handle is closed promptly.
# The encoding is explicit because the text contains non-ASCII characters (e.g. ’).
with open("/content/drive/My Drive/TextGen RNN/Datasets/cah-answers.txt", "r", encoding="utf-8") as corpus_file:
    in_text = corpus_file.read()

#repeat for more data
# Equivalent to doubling the text 10 times in a row: the result holds
# 2**10 = 1024 back-to-back copies of the file, built in a single step.
in_text = in_text * (2 ** 10)


In [4]:
# Vocabulary: every distinct character in the corpus, in sorted order.
unique_chars = list(set(in_text))
unique_chars.sort()
# Show the vocabulary size followed by the characters themselves.
print(len(unique_chars), unique_chars)

84 ['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '’']


In [5]:
# Lookup tables between characters and integer ids; ids follow the order of unique_chars.
char_to_index = dict(zip(unique_chars, range(len(unique_chars))))
index_to_char = np.array(unique_chars)

# Encode the whole corpus as an array of ids, one id per character.
text_as_int = np.array([char_to_index[ch] for ch in in_text])

In [6]:
#parameters
# Length, in characters, of each input sequence fed to the model.
max_input_length = 100
# Total number of characters in the corpus text.
examples_per_epoch = len(in_text)

# Dataset built from the encoded corpus, sliced along its first axis (one id per element).
dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [7]:
# Group the character stream into chunks of max_input_length + 1 ids.
# drop_remainder=True discards a final chunk that would be shorter than that.
seqs = dataset.batch(max_input_length+1, drop_remainder=True)

In [8]:
# Preview the first five chunks as text to sanity-check the slicing.
for chunk in seqs.take(5):
  decoded = ''.join(index_to_char[chunk.numpy()])
  print(repr(decoded))

"Flying sex snakes.\nMichelle Obama's arms.\nGerman dungeon porn.\nWhite people.\nGetting so angry that yo"
'u pop a boner.\nTasteful sideboob.\nPraying the gay away.\nTwo midgets shitting into a bucket.\nMechaHitl'
'er.\nBeing a motherfucking sorcerer.\nA disappointing birthday party.\nPuppies!\nA windmill full of corps'
"es.\nGuys who don't call.\nRacially-biased SAT questions.\nDying.\nSteven Hawking talking dirty.\nBeing on"
' fire.\nA lifetime of sadness.\nAn erection that lasts longer than four hours.\nAIDS\nSame-sex ice dancin'


In [9]:
#duplicate input to create target & input text
def create_target_input(sequence):
  """Split one chunk into an (input, target) pair offset by one position.

  For example:
    Text:   ABCDE
    Input:  ABCD
    Target: BCDE
  Thus, for input A the expected output is B, for input B it is C, and so on.

  :param sequence: any sliceable sequence (e.g. a tensor of character ids)
  :return: tuple ``(sequence[:-1], sequence[1:])``
  """
  # Descriptive local names; `input` would shadow the Python builtin.
  input_seq = sequence[:-1]
  target_seq = sequence[1:]
  return input_seq, target_seq

In [10]:
# Turn every chunk into an (input, target) pair. Note that this rebinds `dataset`,
# which previously held the raw character-id stream.
dataset = seqs.map(create_target_input)

In [11]:
# Number of sequences per training batch.
BATCH_SIZE = 64
# Size of the buffer passed to Dataset.shuffle.
BUFFER_SIZE =  10000

# Shuffle the dataset elements, then group them into fixed-size batches;
# drop_remainder=True discards a final partial batch.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)


In [12]:
# Number of distinct characters in the corpus.
vocab_size = len(unique_chars)
# Dimensionality of the character embedding vectors.
embedding_size = 256
# Number of units in each recurrent (GRU) layer.
rnn_units = 1024

In [13]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the network: embedding -> GRU -> GRU -> dense.

  Sequential model with an embedding layer for input, GRUs as the RNN
  (LSTM could be used instead) and a dense layer for output.

  :param vocab_size: embedding input size and width of the dense output layer
  :param embedding_dim: size of each embedding vector
  :param rnn_units: number of units in each GRU layer
  :param batch_size: fixed batch size baked into the input shape
  :return: an uncompiled tf.keras.Sequential model
  """
  layers = tf.keras.layers

  def stateful_gru():
    # Both recurrent layers share exactly the same configuration.
    return layers.GRU(rnn_units,
                      return_sequences=True,
                      stateful=True,
                      recurrent_initializer='glorot_uniform')

  # Layers are instantiated in stacking order, matching the order in which
  # they are handed to the Sequential container.
  embedding = layers.Embedding(vocab_size, embedding_dim,
                               batch_input_shape=[batch_size, None])
  first_gru = stateful_gru()
  second_gru = stateful_gru()
  char_scores = layers.Dense(vocab_size)

  return tf.keras.Sequential([embedding, first_gru, second_gru, char_scores])

In [14]:
# Build the training model with the training batch size and print its layer summary.
shakspeard = build_model(vocab_size, embedding_size, rnn_units, BATCH_SIZE)
shakspeard.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           21504     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
gru_1 (GRU)                  (64, None, 1024)          6297600   
_________________________________________________________________
dense (Dense)                (64, None, 84)            86100     
Total params: 10,343,508
Trainable params: 10,343,508
Non-trainable params: 0
_________________________________________________________________


In [15]:
#Train Model
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw model outputs.

  from_logits=True tells Keras that `logits` are unnormalized scores rather
  than probabilities, so the softmax is applied inside the loss.

  :param labels: integer class ids (the target characters)
  :param logits: unnormalized per-class scores produced by the model
  :return: the loss tensor computed by Keras
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)

In [16]:
# Compile the training model with the Adam optimizer and the `loss` function.
shakspeard.compile(optimizer="adam", loss=loss) #adam is usually a good default choice

In [17]:
# Directory where training checkpoints are written.
check_dir = "/content/drive/My Drive/TextGen RNN/Checkpoints_CAH-answers"
# Checkpoint filename template; the {epoch} placeholder is filled in by the callback.
check_file_dir = os.path.join(check_dir, "checkpoint_{epoch}")

# Callback that saves only the model weights (save_weights_only=True), not the full model.
checkpoints = tf.keras.callbacks.ModelCheckpoint(filepath = check_file_dir, save_weights_only=True)

In [18]:
# Number of training epochs.
EPOCHS = 20
# Number of batches drawn per epoch (passed as steps_per_epoch).
STEPS= 100
# Train the model with the checkpoint callback attached; the value returned by fit() is kept in `history`.
history = shakspeard.fit(dataset, epochs = EPOCHS, steps_per_epoch = STEPS, callbacks = [checkpoints])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [19]:
# Display the path of the most recent checkpoint found in check_dir.
tf.train.latest_checkpoint(check_dir) #get latest chkpoint

'/content/drive/My Drive/TextGen RNN/Checkpoints_CAH-answers/checkpoint_20'

In [87]:
# Rebuild the same architecture with batch_size = 1 for text generation.
prediktor = build_model(vocab_size, embedding_dim=embedding_size, rnn_units=rnn_units, batch_size = 1)
# NOTE(review): the latest checkpoint is bypassed in favour of the hard-coded
# epoch-15 weights below; the reason is not recorded in this notebook — confirm.
# prediktor.load_weights(tf.train.latest_checkpoint(check_dir))
prediktor.load_weights('/content/drive/My Drive/TextGen RNN/Checkpoints_CAH-answers/checkpoint_15')
# Build the model for input shape [1, None], then print its layer summary.
prediktor.build(tf.TensorShape([1, None]))
prediktor.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (1, None, 256)            21504     
_________________________________________________________________
gru_10 (GRU)                 (1, None, 1024)           3938304   
_________________________________________________________________
gru_11 (GRU)                 (1, None, 1024)           6297600   
_________________________________________________________________
dense_5 (Dense)              (1, None, 84)             86100     
Total params: 10,343,508
Trainable params: 10,343,508
Non-trainable params: 0
_________________________________________________________________


In [45]:
def gen_text(model, seed, num_generate, temp):
  """
  Generate text one character at a time, starting from a seed string.

  :param model: tf/keras model
  :param seed: first line of text to build off of; must be non-empty and
               contain only characters present in char_to_index
  :param num_generate: amount of chars to predict
  :param temp: how much variance is allowed in predictions: higher = more varied, and vice versa;
               must be greater than 0 because the logits are divided by it
  :return: the seed followed by the generated characters
  :raises ValueError: if seed is empty or temp is not positive
  :raises KeyError: if seed contains a character missing from char_to_index
  """
  # Fail fast with a clear message instead of an opaque model error or NaN logits.
  if not seed:
    raise ValueError("seed must contain at least one character")
  if temp <= 0:
    raise ValueError("temp must be greater than 0")

  # Encode the seed and add a leading batch dimension of 1.
  gen_input = [char_to_index[ch] for ch in seed]
  gen_input = tf.expand_dims(gen_input, 0)
  out = []

  # Start from a clean recurrent state so earlier calls do not leak into this one.
  model.reset_states()
  for _ in range(num_generate):
    pred = model(gen_input)
    # Drop the batch dimension: one row of logits per input timestep.
    pred = tf.squeeze(pred, 0)
    # Temperature scaling of the logits before sampling.
    pred = pred / temp
    # Sample one id per timestep and keep the one for the last timestep.
    pred_id = tf.random.categorical(pred, num_samples=1)[-1, 0].numpy()
    # Feed only the sampled character back in as the next input.
    gen_input = tf.expand_dims([pred_id], 0)
    out.append(index_to_char[pred_id])
  return seed + "".join(out)


In [56]:
# NOTE(review): `r` is not referenced by any other cell visible here — confirm it is
# still needed; if so, consider moving this import to the imports cell at the top.
from random_word import RandomWords
r = RandomWords()

In [90]:
# Generate 1000 characters from the seed "The " at temperature 1 and print the result.
ex_out = gen_text(prediktor, "The ", 1000, 1)
print(ex_out)

The Wild Samoan.
Peanut Butter and Baby sandwicheing sex.
Vietnam flashbacks.
Running naked through a mall, pissing and shitting ever.
Getting your dick stuck in a Chinese finger trap with another dick.
Fishing.
A pyansord..
Lovingly animated bouncing boobs.
Dragon Balls.
Zangief's chest hair.
DeviantArt.
Giant fucking robots.
Crossplay.
Moeblob.
Carl Macek's rotting corpse.
My waifu.
Voice actress Megumi Hayashibara.
Lynn Minmei.
Panty shots.
Love and Justing.
The thin veneer of situational causality shots.
Love and Justice.
Consensual tentacle rape.
Gundam.
Capting and blowing.
The bullet with your name on it.
The entire rest of eternity, spent in fucking Brueges Dreemping.
Goats eating cans.
The KKK.
Kamikaze pilots.
Horrifying laser hair removal accidents.
Adderall&trade;.
A look-see.
Doing the right thing.
The taint; the grundle; the fleshy fun-bridge.
Lactation.
Pabst Blue Ribbon.
Powerful thighs.
Saxophone solos.
The gays.
A middle-aged man on roller skates.
A foul mouth.
The th

In [None]:
# Print the most recently generated sample again.
print(ex_out)