In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re

In [2]:
ds = pd.read_csv("/kaggle/input/quotes-500k/quotes.csv")

# Keep only the quote text: drop the metadata columns and take the first
# remaining column. Missing entries are removed here so the str conversion
# below cannot turn them into the literal text "nan".
ds = ds.drop(columns=["author", "category"]).iloc[:, 0].dropna()

ds = ds.to_numpy().astype(str)

# ds = ds[0:10000]

ds = np.char.lower(ds)

# Strip everything that is not a lowercase letter, a digit or whitespace.
# The pattern is a raw string: "\s" inside a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
non_alnum_pattern = re.compile(r"[^a-z0-9\s]+")
ds = np.array([non_alnum_pattern.sub("", quote) for quote in ds])

print(ds.shape)
print(ds[0:3])

(499709,)
['im selfish impatient and a little insecure i make mistakes i am out of control and at times hard to handle but if you cant handle me at my worst then you sure as hell dont deserve me at my best'
 'youve gotta dance like theres nobody watchinglove like youll never be hurtsing like theres nobody listeningand live like its heaven on earth'
 'you know youre in love when you cant fall asleep because reality is finally better than your dreams']


In [6]:
# Corpus size before the length filter below is applied.
print(ds.shape)

# Exclusive upper bound on the length of a quote that is kept.
MAX_LENGTH = 75


def length_check(x):
    """Return True when `x` is strictly shorter than MAX_LENGTH."""
    return MAX_LENGTH > len(x)

# Keep only the quotes accepted by length_check, report how many survive,
# then rebind ds to the filtered array.
da = np.array([quote for quote in ds if length_check(quote)])
print(da.shape)

ds = da

(499709,)
(131728,)


In [7]:
# Length of every quote in the filtered corpus.
lens = np.array([len(quote) for quote in ds])

# Both statistics are stored as int32 scalars; max_length is later used as the
# padded sequence length.
median_length = np.median(lens).astype(np.int32)
max_length = lens.max().astype(np.int32)

print(median_length, max_length)

# ds[291945]

53 74


In [8]:
max_tokens = 10000

# Every distinct character that occurs in the corpus, in sorted order.
vocab = sorted({ch for quote in ds for ch in quote})
# One more than the number of characters: the lookup layer places the
# characters starting at id 1 (see the ids printed for "hello" below).
vocab_size = 1 + len(vocab)

# Character -> integer id.
string_lookup = tf.keras.layers.StringLookup(
    vocabulary=vocab, max_tokens=max_tokens, mask_token=None
)
# Integer id -> character, built from the forward layer's vocabulary.
ids_lookup = tf.keras.layers.StringLookup(
    vocabulary=string_lookup.get_vocabulary(), mask_token=None, invert=True
)

print(len(vocab))
print(vocab[0:10])

# Round-trip check: characters -> ids -> characters.
string = np.array(list("hello"))
ids = string_lookup(string)
chars = ids_lookup(ids)

print(ids.numpy())
print(chars.numpy())

43
['\t', ' ', '0', '1', '2', '3', '4', '5', '6', '7']
[20 17 24 24 27]
[b'h' b'e' b'l' b'l' b'o']


2022-12-24 20:22:00.003696: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [10]:
def tokenize(x):
    """Split `x` into UTF-8 characters and return their lookup ids as NumPy data."""
    characters = tf.strings.unicode_split(x, "UTF-8")
    return string_lookup(characters).numpy()

def pad(x):
    """Pad or truncate every sequence in `x` at its end to `max_length` entries."""
    return tf.keras.preprocessing.sequence.pad_sequences(
        x,
        maxlen=max_length,
        padding="post",
        truncating="post",
    )

def split_input_sequence(x):
    """Return `(x without its last item, x without its first item)`.

    The second element is the first shifted one step ahead, so position i of
    the input is paired with position i + 1 of the original sequence.
    """
    return x[:-1], x[1:]

# Tokenize the whole corpus with a single vectorized call instead of one eager
# TensorFlow call per quote. The result is ragged (one row of character ids
# per quote, rows of different lengths), so it is handed to pad() as nested
# lists rather than wrapped in np.array(), which rejects ragged input on
# recent NumPy releases.
token_ids = string_lookup(tf.strings.unicode_split(ds, "UTF-8"))

# Fixed-width integer matrix: one row per quote, max_length columns.
dataset = pad(token_ids.to_list())

dataset = tf.data.Dataset.from_tensor_slices(dataset)

# Each row becomes an (input, target) pair, the target shifted by one step.
dataset = dataset.map(split_input_sequence)


BUFFER_SIZE = 1000  # number of examples held in the shuffle buffer
BATCH_SIZE = 128    # examples per batch; the incomplete last batch is dropped

dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

# Peek at one batch: the (input, target) pair and its stacked shape.
for x in dataset.take(1):
    print(x)
    print(np.array(x).shape)

  


(<tf.Tensor: shape=(128, 73), dtype=int32, numpy=
array([[35, 20, 17, ...,  0,  0,  0],
       [21,  2, 20, ...,  0,  0,  0],
       [28, 30, 21, ...,  0,  0,  0],
       ...,
       [24, 27, 34, ...,  0,  0,  0],
       [27, 26, 17, ...,  0,  0,  0],
       [21, 18,  2, ...,  0,  0,  0]], dtype=int32)>, <tf.Tensor: shape=(128, 73), dtype=int32, numpy=
array([[20, 17, 26, ...,  0,  0,  0],
       [ 2, 20, 13, ...,  0,  0,  0],
       [30, 21, 16, ...,  0,  0,  0],
       ...,
       [27, 34, 17, ...,  0,  0,  0],
       [26, 17,  2, ...,  0,  0,  0],
       [18,  2, 37, ...,  0,  0,  0]], dtype=int32)>)
(2, 128, 73)


2022-12-24 20:26:37.092141: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [11]:
class QuotesModel(tf.keras.Model):
    """Character-level sequence model: Embedding -> GRU -> Dense.

    The embedding input size and the width of the output layer are taken from
    the module-level `vocab_size`. The Dense layer has no activation, so the
    model outputs raw logits.

    Args:
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the GRU layer.
    """

    def __init__(self, embedding_dim, rnn_units):
        super().__init__()

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # Returns both the per-step outputs and the final state so that
        # generation can carry the state from one call to the next.
        self.rnn = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)

        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of character ids.

        Args:
            inputs: Batch of integer id sequences.
            states: Optional GRU state to start from; None starts from the
                layer's default initial state.
            return_state: When True, also return the final GRU state.
            training: Forwarded to every layer.

        Returns:
            The per-step logits, or `(logits, states)` when `return_state`
            is True.
        """
        x = self.embedding(inputs, training=training)
        # initial_state=None makes the GRU use its own default (zero) initial
        # state. This replaces the explicit get_initial_state(x) call, whose
        # argument differs between Keras versions.
        x, states = self.rnn(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        return x

# Model hyperparameters.
embedding_dim = 256
rnn_units = 2048

model = QuotesModel(embedding_dim=embedding_dim, rnn_units=rnn_units)

# from_logits=True because the model's final Dense layer has no activation.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(loss=loss, optimizer="adam", metrics=["accuracy"])

In [30]:
# Shape walkthrough: push one batch through each layer by hand and compare
# the result with what model.predict returns for the same inputs.
for x in dataset.take(1):
    print(np.array(x).shape)
    inputs = np.array(x[0])
    print("Inputs: ", inputs.shape)
    x = model.embedding(inputs)
    print("Embedding: ", x.shape)
    # The GRU was built with return_state=True, so it returns the sequence
    # output followed by the final state; only the sequence output is kept.
    x = model.rnn(x)[0]
    x = model.dense(x)
    print("Dense: ", x.shape)

    y = model.predict(inputs)
    print("Y: ", y.shape)
    

(2, 64, 114)
Inputs:  (64, 114)
Embedding:  (64, 114, 256)
Dense:  (64, 114, 41)
Y:  (64, 114, 41)


In [None]:
# Number of passes over the batched dataset.
EPOCHS = 10
model.fit(x=dataset, epochs=EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

In [None]:
# Inspect the model's output for the first sequence of one batch.
for x in dataset.take(1):
    # Slice with [:1] rather than indexing with [0] so the batch axis is kept.
    # A 1-D array would be treated by predict() as many one-token samples
    # instead of a single sequence.
    x = np.array(x[0][:1])
    y = model.predict(x)
    print(y.shape)
    print(y)
    # Logits at the last time step of each sequence.
    pred = y[:, -1, :]
    print(pred)

In [None]:
class OneStepModel(tf.keras.Model):
    """Wraps a trained model to sample one character at a time.

    Args:
        model: Model called as `model(inputs=..., states=..., return_state=True)`
            and returning `(logits, states)`.
        chars_from_ids: Lookup layer mapping integer ids back to characters.
        ids_from_chars: Lookup layer mapping characters to integer ids.
        temperature: Positive divisor applied to the logits before sampling.
            The default of 1.0 leaves the logits unchanged; smaller values
            make sampling more conservative, larger values more random.

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        if temperature <= 0:
            raise ValueError("temperature must be > 0")
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Build a vector with -inf at the id of "[UNK]" and 0 everywhere else.
        # Adding it to the logits prevents "[UNK]" from ever being sampled.
        skip_ids = self.ids_from_chars(["[UNK]"])[:, None]

        sparse_mask = tf.SparseTensor(
            values=[-float("inf")] * len(skip_ids),
            indices=skip_ids,
            dense_shape=[len(ids_from_chars.get_vocabulary())],
        )
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for each string in `inputs`.

        Args:
            inputs: Batch of strings to continue.
            states: Model state returned by the previous call, or None on the
                first call.

        Returns:
            `(predicted_chars, states)`: one sampled character per input
            string and the model state to pass to the next call.
        """
        # Strings -> ragged character ids -> dense id tensor.
        input_chars = tf.strings.unicode_split(inputs, "UTF-8")
        input_ids = self.ids_from_chars(input_chars).to_tensor()

        predicted_logits, states = self.model(
            inputs=input_ids, states=states, return_state=True
        )

        # Only the prediction at the last time step is needed.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits / self.temperature

        # Rule out "[UNK]" before sampling.
        predicted_logits = predicted_logits + self.prediction_mask

        # Sample from the distribution instead of taking the argmax.
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)
        predicted_chars = self.chars_from_ids(predicted_ids)

        return predicted_chars, states
    
    
one_step_model = OneStepModel(model, ids_lookup, string_lookup)

# Number of characters to sample after the seed.
NUM_GENERATED_CHARS = 1000

states = None

# Seed text; each step feeds the previously sampled character back in together
# with the carried-over model state.
next_char = tf.constant(["i"])

result = [next_char]

for _ in range(NUM_GENERATED_CHARS):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

# Concatenate the seed and all sampled characters into one string per batch row.
result = tf.strings.join(result)

# Print the generated text itself rather than the tensor's repr.
print(result[0].numpy().decode("utf-8"))

