In [21]:
import tensorflow as tf
import numpy as np
import os
import pickle
import requests
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from string import punctuation

In [22]:
# Download the plain-text corpus and cache it on disk for the cells below.
response = requests.get("http://www.gutenberg.org/cache/epub/11/pg11.txt", timeout=30)
# Fail loudly on an HTTP error instead of silently caching an error page as the corpus.
response.raise_for_status()
content = response.text

# The target directory may not exist on a fresh checkout.
os.makedirs("data", exist_ok=True)
# The context manager guarantees the file is flushed and closed before it is read back.
with open("data/wonderland.txt", "w", encoding="utf-8") as corpus_file:
    n_written = corpus_file.write(content)

# Number of characters written (kept as the cell's displayed result).
n_written

167516

In [23]:
# Hyper-parameters: length of each input window (in characters), batch size, epoch count.
sequence_length = 100
BATCH_SIZE = 256
EPOCHS = 30

FILE_PATH = "data/wonderland.txt"
# File name without its directory; used as a prefix for the artifacts written later.
BASENAME = os.path.basename(FILE_PATH)

# Read the data; the context manager closes the handle as soon as the text is in memory.
# NOTE(review): a leading byte-order mark in the file is kept and becomes a vocabulary
# character; reading with encoding="utf-8-sig" would drop it.
with open(FILE_PATH, encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Normalise: lower-case everything, then delete every character in string.punctuation.
text = text.lower()
text = text.translate(str.maketrans("", "", punctuation))

In [24]:
# The vocabulary is the sorted set of distinct characters in the corpus, kept as one string
# so that a character's position in it can serve as its integer id.
vocab = ''.join(sorted(set(text)))
n_chars, n_unique_chars = len(text), len(vocab)

print("unique_chars:", vocab)
print("Number of characters:", n_chars)
print("Number of unique characters:", n_unique_chars)

unique_chars: 
 0123456789abcdefghijklmnopqrstuvwxyz﻿
Number of characters: 158596
Number of unique characters: 39


In [25]:
# Lookup tables between characters and integer ids; an id is the character's
# index in the sorted `vocab` string.
int2char = dict(enumerate(vocab))
# Invert the table above; `vocab` holds distinct characters, so this is one-to-one.
char2int = {char: idx for idx, char in int2char.items()}

In [26]:
# Persist both lookup tables next to the notebook, named after the corpus file.
# Context managers ensure each pickle is fully flushed and its handle closed.
with open(f"{BASENAME}-char2int.pickle", "wb") as char2int_file:
    pickle.dump(char2int, char2int_file)
with open(f"{BASENAME}-int2char.pickle", "wb") as int2char_file:
    pickle.dump(int2char, int2char_file)

In [27]:
# Translate the whole corpus, character by character, into its integer ids.
encoded_text = np.array(list(map(char2int.__getitem__, text)))

In [28]:
# Wrap the encoded corpus in a tf.data.Dataset that yields one character id per element.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [29]:
# Sanity check: show the first eight ids alongside the characters they decode to.
for encoded_char in char_dataset.take(8):
    char_id = encoded_char.numpy()
    print(char_id, int2char[char_id])

38 ﻿
27 p
29 r
26 o
21 j
16 e
14 c
31 t


In [30]:
# Cut the id stream into fixed-size chunks of 2*sequence_length + 1 ids each;
# a trailing chunk that is too short is dropped.
chunk_size = 2*sequence_length + 1
sequences = char_dataset.batch(chunk_size, drop_remainder=True)

# Decode and print the first two chunks as text.
for sequence in sequences.take(2):
    decoded_chunk = ''.join(int2char[char_id] for char_id in sequence.numpy())
    print(decoded_chunk)

﻿project gutenbergs alices adventures in wonderland by lewis carroll



this ebook is for the use of anyone anywhere at no cost and with

almost no restrictions whatsoever  you may copy it give it away
 or

reuse it under the terms of the project gutenberg license included

with this ebook or online at wwwgutenbergorg





title alices adventures in wonderland



author lewis carroll



posting date 


In [31]:
def split_sample(sample):
    """Expand one chunk of character ids into a dataset of (input, target) pairs.

    A window of `sequence_length` ids slides over `sample` one position at a
    time; each window is an input and the id right after it is its target.

    Example with sequence_length = 10 and sample = "python is a great pro"
    (21 characters, shown decoded):
        ('python is ', 'a'), ('ython is a', ' '), ('thon is a ', 'g'), ...,
        ('a great pr', 'o')

    Args:
        sample: 1-D sequence of encoded characters, longer than `sequence_length`.

    Returns:
        A tf.data.Dataset with len(sample) - sequence_length (input, target) elements.
    """
    # Every start offset whose window AND following target both fit inside the sample.
    # The previous bound, (len(sample) - 1) // 2, stopped one window early, so the
    # final id of every chunk was never used as a target.
    n_windows = len(sample) - sequence_length
    ds = tf.data.Dataset.from_tensors((sample[:sequence_length], sample[sequence_length]))
    for i in range(1, n_windows):
        input_ = sample[i: i+sequence_length]
        target = sample[i+sequence_length]
        # extend the dataset with this pair via concatenate()
        other_ds = tf.data.Dataset.from_tensors((input_, target))
        ds = ds.concatenate(other_ds)
    return ds

# Prepare inputs and targets: run split_sample on every chunk in `sequences` and
# flatten the per-chunk datasets into a single stream of (input, target) pairs.
dataset = sequences.flat_map(split_sample)

In [32]:
def one_hot_samples(input_, target):
    """One-hot encode an (input, target) pair of character ids.

    Each id becomes a vector of length `n_unique_chars` with a single 1 at the
    id's index. For instance, with n_unique_chars = 5 the id 3 turns into
    [0, 0, 0, 1, 0].

    Returns:
        Tuple (encoded_input, encoded_target) of one-hot tensors.
    """
    encoded_input = tf.one_hot(input_, n_unique_chars)
    encoded_target = tf.one_hot(target, n_unique_chars)
    return encoded_input, encoded_target

# One-hot encode every (input, target) pair.
# NOTE: this rebinds `dataset`, so re-running the cell without re-running the one that
# builds `dataset` would apply the encoding a second time.
dataset = dataset.map(one_hot_samples)

In [33]:
# Decode the first two one-hot encoded samples back to text and report their shapes.
for element in dataset.take(2):
    input_vectors, target_vector = element[0], element[1]
    # argmax along the one-hot axis recovers the character id of every position
    input_ids = np.argmax(input_vectors.numpy(), axis=1)
    decoded_input = ''.join(int2char[char_id] for char_id in input_ids)
    decoded_target = int2char[np.argmax(target_vector.numpy())]

    print("Input:", decoded_input)
    print("Target:", decoded_target)
    print("Input shape:", input_vectors.shape)
    print("Target shape:", target_vector.shape)
    print("="*50, "\n")

Input: ﻿project gutenbergs alices adventures in wonderland by lewis carroll



this ebook is for the use of
Target:  
Input shape: (100, 39)
Target shape: (39,)

Input: project gutenbergs alices adventures in wonderland by lewis carroll



this ebook is for the use of 
Target: a
Input shape: (100, 39)
Target shape: (39,)



In [34]:
# Training input pipeline: repeat the samples indefinitely, shuffle through a
# 1024-element buffer, then emit batches of exactly BATCH_SIZE samples
# (drop_remainder=True discards any incomplete batch).
ds = dataset.repeat().shuffle(1024).batch(BATCH_SIZE, drop_remainder=True)

In [35]:
# Two stacked 256-unit LSTM layers with dropout in between, followed by a softmax
# over the vocabulary.
model = Sequential()
# First LSTM returns its output at every timestep so the second LSTM receives a sequence.
model.add(LSTM(256, input_shape=(sequence_length, n_unique_chars), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(256))
# One output unit per vocabulary character.
model.add(Dense(n_unique_chars, activation="softmax"))

In [36]:
# Destination file for the trained model, named after the corpus and the window length.
model_weights_path = f"results/{BASENAME}-{sequence_length}.h5"

# Print the layer-by-layer summary, then configure training.
model.summary()
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 100, 256)          303104    
                                                                 
 dropout_1 (Dropout)         (None, 100, 256)          0         
                                                                 
 lstm_3 (LSTM)               (None, 256)               525312    
                                                                 
 dense_1 (Dense)             (None, 39)                10023     
                                                                 
Total params: 838,439
Trainable params: 838,439
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Make sure the output directory exists; exist_ok=True replaces the separate
# isdir() check + mkdir() call and is safe if the directory appears in between.
os.makedirs("results", exist_ok=True)
# train the model
# `ds` repeats indefinitely, so steps_per_epoch is what delimits one epoch here.
model.fit(ds, steps_per_epoch=(len(encoded_text) - sequence_length) // BATCH_SIZE, epochs=EPOCHS)
# save the trained model to model_weights_path
model.save(model_weights_path)

