In [None]:
import random
import re
from collections import Counter
from functools import lru_cache

import numpy as np
import pandas as pd
import tensorflow as tf


# Query the TensorFlow build/device information once and report it.
# Note: the first and last lines both report the same flag (whether this
# TensorFlow build was compiled with CUDA support).
cuda_build = tf.test.is_built_with_cuda()
gpu_devices = tf.config.list_physical_devices('GPU')

print("Cuda Availability: ", cuda_build)
print("Version of Tensorflow: ", tf.__version__)
print("GPU  Availability: ", gpu_devices)
print("Num GPUs Available: ", len(gpu_devices))
print("Built with CUDA: ", cuda_build)

In [None]:
INPUT = "data/quotes.csv"
# INPUT = "/kaggle/input/quotes-500k/quotes.csv"

# Everything except lowercase letters, digits and whitespace is removed from
# each quote. The pattern is a raw string because "\s" inside a plain string
# literal is an invalid escape sequence, and it is compiled once instead of
# being re-parsed for every quote.
DISALLOWED_CHARS = re.compile(r"[^a-z0-9\s]+")

ds = pd.read_csv(INPUT)

# Keep the first column that remains once the metadata columns are discarded.
ds = ds.drop(columns=["author", "category"]).iloc[:, 0]

# Drop missing entries before stringifying; otherwise every NaN would be
# converted to the literal text "nan" and end up in the corpus as a quote.
ds = ds.dropna().to_numpy().astype(str)

# ds = ds[0:1000]

ds = np.char.lower(ds)

ds = np.array([DISALLOWED_CHARS.sub("", quote) for quote in ds])

print(ds.shape)
print(ds[0:3])

In [None]:
# Size of the corpus before filtering.
print(ds.shape)

# Upper bound (exclusive) on the length of a kept quote.
MAX_LENGTH = 150


def length_check(x):
    """Return True when `x` has fewer than MAX_LENGTH items."""
    return MAX_LENGTH > len(x)


# Keep only the entries that pass the length check, then report the new size.
ds = np.array([quote for quote in ds if length_check(quote)])
print(ds.shape)

In [None]:
# Disabled experiment: list of frequent English words that could be stripped
# from the corpus before building the vocabulary.
# fifty_most_common_words = [ "the", "be", "of", "and", "a", "to", "in", "he", "have", "it", "that", "for", "they", "I", "with", "as", "not", "on", "she", "at", "by", "this", "we", "you", "do", "but", "from", "or", "which", "one", "would", "all", "will", "there", "say", "who", "make", "when", "can", "more", "if", "no", "man", "out", "other", "so", "what", "time", "up", "go"]

# Concatenate every string in `ds` into one space-separated corpus string.
text = " ".join(ds)
# NOTE(review): the disabled helper below uses str.replace, which matches
# substrings; re-enabling it as written would also cut these letter sequences
# out of longer words (e.g. "the" inside "other").
# def remove_fifty_most_common_words_from_text(text):
#     for word in fifty_most_common_words:
#         text = text.replace(word, "")
#     return text

# text = remove_fifty_most_common_words_from_text(text)

In [None]:
max_tokens = 20000
max_len = 400


def letters(input):
    """Return `input` with every non-alphabetic character removed."""
    return ''.join(filter(str.isalpha, input))


@lru_cache(maxsize=1)
def _corpus_word_counts(corpus):
    """Count how often each whitespace-delimited token occurs in `corpus`."""
    return Counter(corpus.split())


def more_than_once(input, min_count=6, corpus=None):
    """Return True when the word `input` occurs more than `min_count` times.

    Occurrences are counted as whole whitespace-delimited tokens. The previous
    implementation used ``str.count``, which counts substrings (so "he" was
    credited with every "the", "then", "other", ...) and rescanned the entire
    corpus for every candidate word.

    Args:
        input: Candidate vocabulary word.
        min_count: Exclusive lower bound on the number of occurrences.
        corpus: Text to count in; defaults to the notebook-level `text`.

    Returns:
        False for an empty word, otherwise whether the count exceeds
        `min_count`.
    """
    if len(input) < 1:
        return False
    # The token counts are cached per corpus string, so filtering the whole
    # candidate list costs a single pass over the text.
    counts = _corpus_word_counts(text if corpus is None else corpus)
    return counts[input] > min_count
# Unique raw tokens obtained by splitting the corpus on single spaces.
words = list(set(text.split(" ")))

# Strip non-letters from every token, discard tokens left empty,
# de-duplicate the results and sort them.
vocab = sorted({cleaned for cleaned in map(letters, words) if len(cleaned) > 0})

# Keep only the candidates accepted by the frequency filter.
vocab = [candidate for candidate in vocab if more_than_once(candidate)]

print(len(vocab))
print(vocab[:100])
print(vocab[-10:])

In [None]:

# Word -> integer id layer built from the curated vocabulary; every output
# row is padded or cut to `max_len` ids.
vectorize_layer = tf.keras.layers.TextVectorization(
    vocabulary=vocab,
    max_tokens=None,
    output_mode="int",
    output_sequence_length=max_len,
)

# Read the vocabulary back from the layer and overwrite its two leading
# entries with "[UNK]" and "" (in that order).
vocab = vectorize_layer.get_vocabulary()
vocab[:2] = ["[UNK]", ""]
vocab_size = len(vocab)

print(vocab_size)
print(vocab[:10])

In [None]:
# Inverse of `vectorize_layer`. The table is built from the vectorizer's own
# vocabulary with matching reserved tokens, so that id 0 decodes to "" (the
# padding token) and id 1 decodes to "[UNK]" -- the same assignment the
# vectorizer uses when encoding. Building it from a list whose first two
# entries are swapped, with mask_token=None, decodes those two ids the wrong
# way round.
lookup = tf.keras.layers.StringLookup(
    vocabulary=vectorize_layer.get_vocabulary(),
    mask_token="",
    oov_token="[UNK]",
    invert=True,
)


def undo_vectorize(text):
    """Map a tensor of token ids back to their string tokens."""
    return lookup(text)

In [None]:
def pad(x):
    """Pad or cut every sequence in `x` at its end to MAX_LENGTH entries."""
    return tf.keras.preprocessing.sequence.pad_sequences(
        x,
        maxlen=MAX_LENGTH,
        padding="post",
        truncating="post",
    )

def split_input_sequence(x):
    """Return `(x without its last item, x without its first item)`.

    The two slices are the input and the one-step-shifted target used for
    next-token training.
    """
    return x[:-1], x[1:]

BUFFER_SIZE = 1000
BATCH_SIZE = 128

# Strings -> token ids, then pad/cut every row to the training length.
dataset = vectorize_layer(ds)
dataset = pad(dataset)

# One (input, target) pair per row, shuffled, batched and prefetched.
dataset = (
    tf.data.Dataset.from_tensor_slices(dataset)
    .map(split_input_sequence)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# Show one batch and the shape of the stacked (input, target) pair.
for x in dataset.take(1):
    print(x)
    print(np.array(x).shape)

In [None]:
class QuotesModel(tf.keras.Model):
    """Next-token model: Embedding -> GRU -> Dense logits over the vocabulary.

    Token id 0 is the padding value appended to every training sequence. The
    embedding is created with ``mask_zero=True`` so that Keras attaches a
    padding mask to its output; the GRU then skips padded timesteps (carrying
    its state and last output through them) instead of treating the pad tail
    as real tokens. Without the mask most positions of every sequence are
    padding, which dominates both the loss and the reported accuracy.
    NOTE(review): under Keras 2 `fit` the propagated mask should also exclude
    padded positions from the compiled loss/metrics -- confirm on the installed
    version.

    Args:
        embedding_dim: Size of each token embedding vector.
        rnn_units: Number of GRU units.
    """

    def __init__(self, embedding_dim, rnn_units):
        super().__init__()

        # `vocab_size` is read from the notebook namespace.
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim, mask_zero=True
        )

        self.rnn = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)

        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of token ids.

        Args:
            inputs: Integer token ids.
            states: Optional GRU state to resume from; a fresh initial state
                is created when None.
            return_state: When True, also return the final GRU state.
            training: Forwarded to the sub-layers.

        Returns:
            Per-timestep logits, or `(logits, states)` when `return_state` is
            True.
        """
        x = self.embedding(inputs, training=training)
        if states is None:
            states = self.rnn.get_initial_state(x)
        x, states = self.rnn(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        return x

# Model size hyperparameters.
embedding_dim = 1024
rnn_units = 2048

model = QuotesModel(embedding_dim=embedding_dim, rnn_units=rnn_units)

# The final Dense layer has no activation, so the loss is told to expect logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(loss=loss, optimizer="adam", metrics=["accuracy"])

In [None]:
# Number of full passes over the batched dataset.
EPOCHS = 10
model.fit(x=dataset, epochs=EPOCHS)

In [None]:
# Persist the trained weights under the checkpoint prefix "qgt-2".
model.save_weights("qgt-2")

# To restore previously saved weights instead of retraining, call
# `model.load_weights("qgt-2")` on an already-constructed model.
# NOTE(review): do not assign the result back to `model` -- load_weights is
# documented as restoring the weights in place, so `model = model.load_weights(...)`
# would replace the model with whatever that call returns. Confirm against the
# Keras docs for the installed version.
# model.load_weights("qgt-2")

# Print the layer/parameter overview of the built model.
model.summary()

In [None]:
# Shape sanity check on a single batch: inputs, full logits, and the logits
# of the final timestep.
for x, _targets in dataset.take(1):
    x = x.numpy()
    print(x.shape)
    y = model.predict(x)
    print(y.shape)
    pred = y[:, -1, :]
    print(pred.shape)

In [None]:


class OneStepModel(tf.keras.Model):
    """Wrap a trained model to sample one next token per call.

    Args:
        model: Model called as ``model(inputs=ids, states=..., return_state=True)``
            and returning ``(logits, states)``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.chars_from_ids = undo_vectorize
        self.ids_from_chars = vectorize_layer

        # Ids that must never be sampled. TextVectorization in "int" mode
        # reserves id 0 for padding and id 1 for out-of-vocabulary tokens;
        # masking only id 0 leaves the unknown token free to be generated.
        skip_ids = [[0], [1]]

        # Dense vector with -inf at the skipped ids and 0 elsewhere; adding it
        # to the logits gives those ids zero sampling probability.
        sparse_mask = tf.SparseTensor(
            values=[-float("inf")] * len(skip_ids),
            indices=skip_ids,
            dense_shape=[len(self.ids_from_chars.get_vocabulary())],
        )
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the token that follows `inputs`.

        Args:
            inputs: String tensor of shape (batch,) holding the text to
                continue from.
            states: Recurrent state returned by the previous call, or None to
                start fresh.

        Returns:
            `(predicted_tokens, states)`: the sampled token strings and the
            updated recurrent state.
        """
        input_ids = self.ids_from_chars(inputs)

        # The vectorizer right-pads every row with zeros up to its fixed
        # output length. Cut the pad tail off (keeping at least one step) so
        # the last timestep -- whose logits and state are used below -- is the
        # last real token rather than hundreds of padding steps.
        # For a batch with differing token counts the shorter rows still end
        # in padding; generation below feeds a single row at a time.
        token_counts = tf.math.count_nonzero(input_ids, axis=-1, dtype=tf.int32)
        longest = tf.maximum(tf.reduce_max(token_counts), 1)
        input_ids = input_ids[:, :longest]

        predicted_logits, states = self.model(
            inputs=input_ids, states=states, return_state=True
        )
        # Only the prediction after the final timestep is needed.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits + self.prediction_mask

        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)
        predicted_chars = self.chars_from_ids(predicted_ids)
        return predicted_chars, states


one_step_model = OneStepModel(model)

def generate_text(model, start_string, num_generate=1000):
    """Generate text by repeatedly sampling from a one-step model.

    Args:
        model: Object exposing `generate_one_step(inputs, states=...)` that
            returns `(next_token, states)`.
        start_string: Text the generation starts from.
        num_generate: Number of sampling steps to run.

    Returns:
        The start string followed by every sampled token, joined with single
        spaces, as a Python `str`.
    """
    token = tf.constant([start_string])
    pieces = [token]
    states = None

    for _ in range(num_generate):
        token, states = model.generate_one_step(token, states=states)  # type: ignore
        pieces.append(token)

    joined = tf.strings.join(pieces, separator=" ")
    return joined[0].numpy().decode("utf-8")

# Prompt every sample starts from, number of samples to print, and number of
# tokens generated per sample. A random vocabulary word used to be drawn on
# every iteration but was never passed to the generator; that dead draw is
# removed. To seed with a random word instead, pass
# `random.choice(vocab[2:])` as `start_string` -- the first two vocabulary
# entries are the reserved "[UNK]" / "" tokens.
START_STRING = "chess"
NUM_SAMPLES = 10
TOKENS_PER_SAMPLE = 15

for _sample in range(NUM_SAMPLES):
    print(
        generate_text(one_step_model, start_string=START_STRING, num_generate=TOKENS_PER_SAMPLE),
        end="\n\n",
    )


<h1>Version 1.0 (QGT-1)</h1>
131728 data points (less than 75 characters)<br>
7961 vocab size (6 occurrences and up)<br>
10 Epochs<br>
Acc: .89

Noteworthy Quotes: <br>
"Chess is a game of grace, and life is a game of chess; it is never too late to have grace, now or never"<br>
"The meaning of life may only be to cope with the lies spread by nerves, but cats seem to have it figured out."<br>
"Guilt without glory isn't great, but my only purpose now is to do anything I can to make up for it."<br>
"Cats may be buried, but they are not gone forever. They are somehow still alive and ignited within us, even if we cannot see them."<br>
"Bliss may be more like the eternal life of trees, rather than the blood that flows through us. Not all of us are only awake and in pain."<br>
"Innocence and guilt breathe life into our choices, carefully projecting the will to move forward. In the end, even John may be abandoned."<br>
"The moon brings us to where we need to be, even if it isn't until afterward. Cats may not need leadership, but we do, and it is never too late to start again."<br>
"Humanity may be facing reality now, but the method to their madness is forever a mystery, even to the cats who seem to have it all figured out."<br>
"Bliss is a myth, more like more trees with eternal blood, though not all awake in pain"<br>

<h1>Version 2.0 (QGT-2)</h1>
352141 data points (less than 200 characters)<br>
30988 vocab size (50 most common words removed, 4 occurrences and up)<br>
10 Epochs<br>
Acc: .91

Noteworthy Quotes: <br>

