In [1]:
# Importing necessary libraries
import tensorflow as tf
import numpy as np
import os
import pickle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from string import punctuation

In [3]:
import requests
content = requests.get("http://www.gutenberg.org/cache/epub/11/pg11.txt").text
open("data/wonderland.txt", "w", encoding="utf-8").write(content)

167516

In [34]:
sequence_length = 100
BATCH_SIZE = 128
EPOCHS = 1

# Dataset file path

FILE_PATH = "data/wonderland.txt"
BASENAME = os.path.basename(FILE_PATH)

# Read the data
text = open(FILE_PATH, encoding="utf-8").read()

#Remove caps, leave this if uppercase is required
text = text.lower()

# Remove punctuation
text = text.translate(str.maketrans("", "", punctuation))

In [35]:
# Print some stats

n_chars = len(text)
vocab = ''.join(sorted(set(text)))
print("unique_chars:", vocab)
n_unique_chars = len(vocab)
print("Number of characters : ", n_chars)
print("Number of unique characters : ", n_unique_chars)

unique_chars: 
 0123456789abcdefghijklmnopqrstuvwxyz﻿
Number of characters :  158596
Number of unique characters :  39


In [36]:
# Dictionary that converts characters to integers
char2int = {c: i for i, c in enumerate(vocab)}

# Dictionary that converts integers to characters

int2char = {i: c for i, c in enumerate(vocab)}

In [37]:
# Save these dictionaries for later generation
pickle.dump(char2int, open(f"{BASENAME}-char2int.pickle", "wb"))
pickle.dump(int2char, open(f"{BASENAME}-int2char.pickle", "wb"))

In [38]:
# Convert all text into integers
encoded_text = np.array([char2int[c] for c in text])

In [39]:
# Construct tf.data.Dataset object
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [40]:
# Printing first five characters
for char in char_dataset.take(8):
    print(char.numpy(), int2char[char.numpy()])

38 ﻿
27 p
29 r
26 o
21 j
16 e
14 c
31 t


In [41]:
# Build sequences by batching
sequences = char_dataset.batch(2*sequence_length + 1, drop_remainder=True)

# Print sequences
for sequence in sequences.take(2):
    print(''.join([int2char[i] for i in sequence.numpy()]))

﻿project gutenbergs alices adventures in wonderland by lewis carroll



this ebook is for the use of anyone anywhere at no cost and with

almost no restrictions whatsoever  you may copy it give it away
 or

reuse it under the terms of the project gutenberg license included

with this ebook or online at wwwgutenbergorg





title alices adventures in wonderland



author lewis carroll



posting date 


In [42]:
def split_sample(sample):
    ds = tf.data.Dataset.from_tensors((sample[:sequence_length], sample[sequence_length]))
    for i in range(1, (len(sample)-1) // 2):
        input_ = sample[i: i+sequence_length]
        target = sample[i+sequence_length]
        
        # extend dataset with samples by concatenate() method
        
        other_ds = tf.data.Dataset.from_tensors((input_, target))
        ds = ds.concatenate(other_ds)
        
    return ds

# Prepare inputs and targets

dataset = sequences.flat_map(split_sample)

In [43]:
def one_hot_samples(input_, target):
    # Onehot encode both inputs and targets
    return tf.one_hot(input_, n_unique_chars), tf.one_hot(target, n_unique_chars)

dataset = dataset.map(one_hot_samples)

In [44]:
# Printing first two samples
for element in dataset.take(2):
    print("Input:", ''.join([int2char[np.argmax(char_vector)] for char_vector in element[0].numpy()]))
    print("Target:", int2char[np.argmax(element[1].numpy())])
    print("Input Shape:", element[0].shape)
    print("Target.shape:", element[1].shape)
    print("="*50, "\n")

Input: ﻿project gutenbergs alices adventures in wonderland by lewis carroll



this ebook is for the use of
Target:  
Input Shape: (100, 39)
Target.shape: (39,)

Input: project gutenbergs alices adventures in wonderland by lewis carroll



this ebook is for the use of 
Target: a
Input Shape: (100, 39)
Target.shape: (39,)



In [45]:
# Repeat, shuffle and batch the dataset
ds = dataset.repeat().shuffle(1024).batch(BATCH_SIZE, drop_remainder=True)

In [46]:
# Building model

model = Sequential([
    LSTM(256, input_shape=(sequence_length, n_unique_chars), return_sequences=True),
    Dropout(0.3),
    LSTM(256),
    Dense(n_unique_chars, activation="softmax"),
])

In [47]:
if not os.path.isdir("results"):
    os.mkdir("results")

# Compile and train the model
model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(ds, steps_per_epoch=(len(encoded_text) - sequence_length) // BATCH_SIZE, epochs=EPOCHS)
# save the model
model.save(f"results/{BASENAME}-{sequence_length}.h5")

Train for 1238 steps


In [48]:
sequence_length = 100

# Dataset file path
FILE_PATH = "data/wonderland.txt"

# FILE_PATH = "data/python_code.py"

BASENAME = os.path.basename(FILE_PATH)

In [49]:
seed = "chapter xiii"

In [50]:
# Load vocab dictionaries
char2int = pickle.load(open(f"{BASENAME}-char2int.pickle", "rb"))
int2char = pickle.load(open(f"{BASENAME}-int2char.pickle", "rb"))
vocab_size = len(char2int)

In [51]:
# Building model again
model = Sequential([
    LSTM(256, input_shape=(sequence_length, vocab_size), return_sequences=True),
    Dropout(0.3),
    LSTM(256),
    Dense(vocab_size, activation="softmax"),
])

In [52]:
# Load the optimal weights 
model.load_weights(f"results/{BASENAME}-{sequence_length}.h5")

In [60]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [62]:
s = seed
n_chars = 400
# Generate 400 characters
generated = ""
for i in tqdm(range(n_chars), "Generating text"):
    # Make the input sequence
    X = np.zeros((1, sequence_length, vocab_size))
    for t, char in enumerate(seed):
        X[0, (sequence_length - len(seed)) + t, char2int[char]] = 1
    # predict the next character
    predicted = model.predict(X, verbose=0)[0]
    # converting the vector to an integer
    next_index = np.argmax(predicted)
    # converting the integer to a character
    next_char = int2char[next_index]
    # add the character to results
    generated += next_char
    # shift seed and the predicted character
    seed = seed[1:] + next_char

print("Seed:", s)
print("Generated text:")
print(generated)

Generating text: 100%|██████████| 400/400 [00:34<00:00, 23.27it/s]


Seed: chapter xiii
Generated text:
ng of the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the pore the por
