In [2]:
# Import dependencies
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from keras.activations import tanh
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [24]:
# Fetch our text data
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file('shakespeare.txt', shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [25]:
# Define the tokenizer, this tokenizes the input text at the character level instead of at the word level
tokenizer = keras.preprocessing.text.Tokenizer(char_level = True)
# Fit the tokenizer on the text
tokenizer.fit_on_texts([shakespeare_text])

In [26]:
# How many different characters that we have tokenized
max_id = len(tokenizer.word_index)
# How many documents we have tokenized, in this case we only have 1
dataset_size = tokenizer.document_count

In [27]:
# Creates a list of the tokenized characters
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

In [28]:
shape = encoded.shape[0]
print(shape)

1115394


In [29]:
# Defines our training size at 90%
train_size = shape * 90//100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [30]:
train_size

1003854

In [31]:
# Set the number of steps and the window length to use
n_steps = 100
window_length = n_steps + 1
# Create a dataset of sliding windows of window_length, each overlapping with the previous one by 1 character
dataset = dataset.window(window_length, shift = 1, drop_remainder = True)

In [32]:
# Applies the lambda function to each element in the dataset and flattens the nested structure into single dataset
# The lambda window function takes window as an input that takes window.batch() as the input
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [33]:
print(shakespeare_text[:175])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.




In [34]:
batch_size = 32
# Shuffle the dataset with a buffer of 10,000
dataset = dataset.shuffle(10000).batch(batch_size)
# Map each batch of the dataset to a new dataset that consisting of input sequence and target sequence
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [35]:
# Reminder that categorical features should be one-hot encoded
# This is what we do to our flat mapped dataset
dataset = dataset.map(lambda X_batch, Y_batch : (tf.one_hot(X_batch, depth = max_id), Y_batch))

# Prefetching allows the model to load new batches while still processing the current batch
dataset = dataset.prefetch(3)

In [36]:
model = keras.models.Sequential([
    # Using GRU layers
    keras.layers.GRU(128, return_sequences = True, input_shape = [None, max_id], dropout = 0.2, 
                     activation = tanh),
    keras.layers.GRU(128, return_sequences = True, dropout = 0.2, 
                     activation = tanh),
    # This layer predicts the probability distribution of the next character for each time step
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation = 'softmax'))
])

In [37]:
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam')

In [39]:
model.fit(dataset, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2183dab9730>

In [48]:
# def preprocess(strings):
#     # Tokenize each string into a sequence of individual characters
#     sequences = []
#     for string in strings:
#         sequence = []
#         for char in string:
#             sequence.append(char)
#         sequences.append(sequence)
#     # Pad the sequences to a fixed length
#     padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
#     return padded_sequences

In [61]:
# def generate_letter(model, tokenizer, sequence):
#     # Encode the sequence
#     X_new = tokenizer.texts_to_sequences([sequence])[0]
#     # Pad the sequence to a fixed length
#     X_new = tf.keras.preprocessing.sequence.pad_sequences([X_new], maxlen=None, padding='post', truncating='post')
#     X_new = tf.one_hot(X_new, depth=max_id)
#     # Reshape the input tensor to add a batch and sequence length dimension
#     X_new = tf.expand_dims(X_new, axis=0)
#     X_new = tf.expand_dims(X_new, axis=1)
#     # Predict the probability distribution over the vocabulary of individual characters
#     y_proba = model.predict(X_new)[0]
#     # Sample from the distribution to get the next letter index
#     letter_index = np.random.choice(len(y_proba), p=y_proba)
#     # Convert the index to a letter and return it
#     return tokenizer.index_word[letter_index]

In [68]:
num_features = model.input_shape[2]

def string_to_input(string, tokenizer):
    # Convert string to lowercase
    string = string.lower()
    # Tokenize string
    tokens = tokenizer.texts_to_sequences([string])[0]
    # Convert tokens to array and add batch dimension
    tokens_array = np.array([tokens])
    # Convert tokens to one-hot encoding
    num_tokens = tokens_array.shape[1]
    one_hot = np.zeros((1, num_tokens, num_features))
    for i in range(num_tokens):
        one_hot[0, i, tokens_array[0, i]-1] = 1
    return one_hot

In [73]:
def generate_sentence(start_string):
    # initialize the input sequence with the starting string
    input_sequence = string_to_input(start_string, tokenizer)
    # initialize the generated sentence with the starting string
    generated_sentence = start_string.lower().strip()
    # set a flag to indicate when a period is reached
    period_reached = False
    # generate the next character until a period is reached
    while not period_reached:
        # predict the probability distribution for the next character
        output_probabilities = model.predict(input_sequence)[0][-1]
        # convert probabilities to character indices
        char_index = np.argmax(output_probabilities)
        # convert character index to character
        char = tokenizer.index_word[char_index+1]  # add one to index to account for padding
        # add character to generated sentence
        generated_sentence += char
        # update input sequence with the new character
        input_sequence = np.append(input_sequence, np.zeros((1, 1, input_sequence.shape[2])), axis=1)
        input_sequence[0, -1, char_index] = 1
        # check if period is reached
        if char == '.':
            period_reached = True
    return generated_sentence

generated_sentence = generate_sentence(sequence)



In [74]:
print(generated_sentence)

how are you all the matter for your hands, i will contrive the rest:
what have you not her father, that i will contrive the rest was so,
that i may be so believe me with a score and me as for my hands.


In [79]:
sequence = 'How are you?'
generate_sentence(sequence)



'how are you?\n\ngremio:\no this gentleman, i will contrive the rest was so,\nthat i may be so believe me with a score and me as for my hands.'

In [80]:
print(generated_sentence)

how are you all the matter for your hands, i will contrive the rest:
what have you not her father, that i will contrive the rest was so,
that i may be so believe me with a score and me as for my hands.


# That is quite interesting, however let's alter the config so that it instead looks to predict a response, as if it were a normal conversation

In [117]:
def generate_sentence(start_string):
    # initialize the input sequence with the starting string
    input_sequence = string_to_input(start_string, tokenizer)
    # initialize the generated sentence to an empty string
    generated_sentence = ''
    # set a flag to indicate when a period is reached
    period_reached = False
    # generate the next character until a period is reached
    while not period_reached:
        # predict the probability distribution for the next character
        output_probabilities = model.predict(input_sequence)[0][-1]
        # convert probabilities to character indices
        char_index = np.argmax(output_probabilities)
        # convert character index to character
        char = tokenizer.index_word[char_index+1]  # add one to index to account for padding
        # add character to generated sentence
        generated_sentence += char
        # update input sequence with the new character
        input_sequence = np.append(input_sequence, np.zeros((1, 1, input_sequence.shape[2])), axis=1)
        input_sequence[0, -1, char_index] = 1
        # check if period is reached
        if char == '.':
            period_reached = True
    return generated_sentence

In [118]:
def simulate_conversation(max_turns):
    conversation = ""
    turn_count = 0
    user_prompt = input('User: ')
    while turn_count < max_turns:
        # generate model's response to user's prompt
        model_response = generate_sentence(user_prompt)
        # append user's prompt and model's response to the conversation
        conversation += "User: " + user_prompt + "\n"
        conversation += "Model: " + model_response + "\n"
        # prompt user for next prompt
        user_prompt = input("User: ")
        turn_count += 1
    return conversation

In [123]:
conversation = simulate_conversation(max_turns = 1)
print(conversation)

User:  what say thee man?




User:  avast


User: what say thee man?
Model: 

petruchio:
sir, you may stand you to the choice love to her.



In [121]:
generate_sentence('how are you?')



'\n\ngremio:\no this gentleman, i will contrive the rest was so,\nthat i may be so believe me with a score and me as for my hands.'

In [122]:
print(generate_sentence)

<function generate_sentence at 0x000002183DA93C10>
