In [1]:
from __future__ import print_function

import os
import pickle
import random
import re
import sys

import numpy as np
from nltk.corpus import gutenberg

from keras.models import Sequential
from keras.layers import Dense, Bidirectional, Dropout
from keras.layers import SimpleRNN, GRU, BatchNormalization

from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Ordered sequence of the word tokens in Hamlet, read from NLTK's Gutenberg corpus
hamlet = gutenberg.words('shakespeare-hamlet.txt')

# Show the first ten tokens as a quick check of what was loaded
print(hamlet[:10])

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']


In [3]:
# Build the training corpus as one lower-cased string.
# Every token is followed by a single space (the last one included), so the
# result is character-for-character what the former `+=` loop produced, but it
# is assembled in one pass by str.join instead of by repeated concatenation.
text = ''.join(str(word).lower() + ' ' for word in hamlet)

print('Corpus length, Hamlet only:', len(text))

Corpus length, Hamlet only: 166765


In [4]:
'''
Break text into :

Features  -    Character-level sequences of fixed length        
Labels    -    The next character in sequence     

'''

# Length of every input window and the step between the starts of consecutive windows
seq_len, stride = 35, 1

# Start offsets of all windows that still have one character following them in `text`
window_starts = range(0, len(text) - seq_len, stride)

# Features: the fixed-length character windows
training_sequences = [text[start: start + seq_len] for start in window_starts]

# Labels: the single character that comes right after each window
next_chars = [text[start + seq_len] for start in window_starts]

In [5]:
# Show the first two windows next to their labels to verify the slicing

print('Number of sequences:', len(training_sequences))

for heading, position in (('First sequences:', 0), ('Second sequences:', 1)):
    # One-element slices are printed (not bare items) so the output keeps its list form
    print(heading, training_sequences[position: position + 1])
    print('Next characters in sequence:', next_chars[position: position + 1])

Number of sequences: 166730
First sequences: ['[ the tragedie of hamlet by william']
Next characters in sequence: [' ']
Second sequences: [' the tragedie of hamlet by william ']
Next characters in sequence: ['s']


In [6]:
# Vocabulary: every distinct character occurring in the corpus, in sorted order

characters = sorted(set(text))
print('Total characters:', len(characters))

Total characters: 43


In [7]:
# Lookup tables between each unique character and its integer position in `characters`

char_indices = {char: index for index, char in enumerate(characters)}   # character -> integer
indices_char = {index: char for index, char in enumerate(characters)}   # integer -> character

print(char_indices)

{' ': 0, '!': 1, '&': 2, "'": 3, '(': 4, ')': 5, ',': 6, '-': 7, '.': 8, '1': 9, '5': 10, '9': 11, ':': 12, ';': 13, '?': 14, '[': 15, ']': 16, 'a': 17, 'b': 18, 'c': 19, 'd': 20, 'e': 21, 'f': 22, 'g': 23, 'h': 24, 'i': 25, 'j': 26, 'k': 27, 'l': 28, 'm': 29, 'n': 30, 'o': 31, 'p': 32, 'q': 33, 'r': 34, 's': 35, 't': 36, 'u': 37, 'v': 38, 'w': 39, 'x': 40, 'y': 41, 'z': 42}


In [8]:
# One-hot encode the training data into boolean arrays.
#   x : (number of sequences, seq_len, number of unique characters)
#   y : (number of sequences, number of unique characters)
# The builtin `bool` is passed as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.

x = np.zeros((len(training_sequences), seq_len, len(characters)), dtype=bool)
y = np.zeros((len(training_sequences), len(characters)), dtype=bool)


for index, sequence in enumerate(training_sequences):     # Iterate over training sequences

    for sub_index, char in enumerate(sequence):           # Iterate over the characters of one sequence

        x[index, sub_index, char_indices[char]] = 1       # Mark this character at this time step

    y[index, char_indices[next_chars[index]]] = 1         # Mark the character that follows the sequence

In [9]:
# Report the dimensions of the encoded feature and label arrays
print('Data vectorization completed.')

for description, array in (('Feature vectors shape', x), ('Label vectors shape', y)):
    print(description, array.shape)

Data vectorization completed.
Feature vectors shape (166730, 35, 43)
Label vectors shape (166730, 43)


In [10]:
'''Fun part: Construct a bunch of functions returning different kinds of RNNs, from simple to more complex'''

# Simple 1-layered RNN with 128 neurons

def SimpleRNN_model():
    """Return an uncompiled model: one 128-unit SimpleRNN layer feeding a softmax over the characters.

    The input shape is (seq_len, len(characters)); both names are read from
    the notebook's global scope at call time.
    """
    vocabulary_size = len(characters)

    return Sequential([
        SimpleRNN(128, input_shape=(seq_len, vocabulary_size)),
        Dense(vocabulary_size, activation='softmax'),
    ])

In [11]:
# Two stacked RNN layers, both with 128 neurons

def SimpleRNN_stacked_model():
    """Return an uncompiled model: two stacked 128-unit SimpleRNN layers and a softmax output.

    The first recurrent layer returns its full output sequence so the second
    one receives an input for every time step.
    """
    vocabulary_size = len(characters)

    return Sequential([
        SimpleRNN(128, input_shape=(seq_len, vocabulary_size), return_sequences=True),
        SimpleRNN(128),
        Dense(vocabulary_size, activation='softmax'),
    ])

In [12]:
# Two stacked GRU layers with 128 neurons each

def GRU_stacked_model():
    """Return an uncompiled model: two stacked 128-unit GRU layers and a softmax output.

    The first GRU returns its full output sequence so the second one receives
    an input for every time step.
    """
    vocabulary_size = len(characters)

    return Sequential([
        GRU(128, input_shape=(seq_len, vocabulary_size), return_sequences=True),
        GRU(128),
        Dense(vocabulary_size, activation='softmax'),
    ])

In [13]:
# Two stacked bi-directional layers with 128 neurons each

def Bi_directional_GRU():
    """Return an uncompiled model: two stacked bidirectional 128-unit GRU layers and a softmax output.

    The input shape is declared on the first Bidirectional wrapper, whose inner
    GRU returns its full output sequence for the second wrapper to consume.
    """
    vocabulary_size = len(characters)

    return Sequential([
        Bidirectional(GRU(128, return_sequences=True), input_shape=(seq_len, vocabulary_size)),
        Bidirectional(GRU(128)),
        Dense(vocabulary_size, activation='softmax'),
    ])

In [14]:
# Large GRU model with 3 GRU layers and one densely connected hidden layer, using double dropout strategy

def larger_GRU():
    """Return an uncompiled model: three stacked 128-unit GRU layers, a 128-unit ReLU layer and a softmax output.

    Each GRU layer is created with dropout=0.2 and recurrent_dropout=0.2. The
    first two GRU layers return their full output sequences so that the layer
    stacked on top receives an input for every time step.
    """
    vocabulary_size = len(characters)

    # Dropout settings shared by all three recurrent layers
    gru_dropout = dict(dropout=0.2, recurrent_dropout=0.2)

    return Sequential([
        GRU(128, input_shape=(seq_len, vocabulary_size), return_sequences=True, **gru_dropout),
        GRU(128, return_sequences=True, **gru_dropout),
        GRU(128, **gru_dropout),
        Dense(128, activation='relu'),
        Dense(vocabulary_size, activation='softmax'),
    ])



In [15]:
# Every model-building function defined above, listed exactly once.
# (Bi_directional_GRU used to appear twice, which trained the identical
# architecture a second time and overwrote the first run's saved history.)

all_models = [SimpleRNN_model,
              SimpleRNN_stacked_model,
              GRU_stacked_model,
              Bi_directional_GRU,
              larger_GRU]

In [16]:
# Sampling a character index from a probability array
    
    
def sample(softmax_predictions, sample_threshold=1.0):
    """Draw one index from a probability vector after rescaling it by a temperature.

    The probabilities are moved to log space, divided by `sample_threshold`,
    exponentiated and renormalised; one index is then drawn from the resulting
    distribution with NumPy's global random state. Values below 1.0 sharpen the
    distribution towards its most likely entry, values above 1.0 flatten it.

    Args:
        softmax_predictions: 1-D array-like of non-negative probabilities.
            Zero entries are allowed and are never drawn.
        sample_threshold: positive temperature applied in log space.

    Returns:
        The drawn index (a NumPy integer), not the probability at that index.

    Raises:
        ValueError: if `sample_threshold` is not positive, or if the
            predictions contain no positive, finite probability.
    """
    if sample_threshold <= 0:
        raise ValueError('sample_threshold must be positive, got %r' % (sample_threshold,))

    # float64 keeps the renormalised vector within multinomial's sum-to-one tolerance
    softmax_preds = np.asarray(softmax_predictions).astype('float64')

    # log(0) is a legitimate -inf here (that class simply cannot be drawn),
    # so silence the divide-by-zero warning instead of letting it spam the output
    with np.errstate(divide='ignore'):
        log_preds = np.log(softmax_preds) / sample_threshold

    peak = np.max(log_preds)
    if not np.isfinite(peak):
        raise ValueError('softmax_predictions must contain at least one positive, finite probability')

    # Shifting by the maximum leaves the normalised result mathematically unchanged but
    # stops every term underflowing to zero (0/0 = NaN) when the temperature is small
    exp_preds = np.exp(log_preds - peak)

    norm_preds = exp_preds / np.sum(exp_preds)

    # A single multinomial trial yields a one-hot row; its argmax is the drawn index
    prob = np.random.multinomial(1, norm_preds, 1)

    return np.argmax(prob)

In [17]:
# Function executed epoch end, generates Prints 

def on_epoch_end(epoch, _):
    """Print text generated by the current global `model` at the end of an epoch.

    One random window of `seq_len` characters is taken from `text` and used as
    the seed for every sampling threshold. For each threshold, 400 characters
    are generated one at a time: the current window is one-hot encoded, the
    model predicts the next-character distribution, `sample` draws from it, and
    the window slides forward by one character.

    Args:
        epoch: zero-based epoch index supplied by Keras.
        _: the logs dictionary supplied by Keras (unused).
    """
    global model

    print('----- Generating text after Epoch: %d' % epoch)

    # random.randint is inclusive at both ends, so the window always fits inside `text`
    start_index = random.randint(0, len(text) - seq_len - 1)
    seed_text = text[start_index: start_index + seq_len]

    sampling_range = [0.3, 0.5, 0.7, 1.0, 1.2]                  # Sampling thresholds (temperatures) to try

    for threshold in sampling_range:
        print('----- *Sampling Threshold* :', threshold)

        sentence = seed_text                                    # Every threshold restarts from the same seed

        print('Input sequence to generate from : "' + sentence + '"')

        sys.stdout.write(sentence)                              # Stream output instead of waiting until the end

        for i in range(400):                                    # Generate the next 400 characters

            x_pred = np.zeros((1, seq_len, len(characters)))    # One-hot encoding of the current window

            for n, char in enumerate(sentence):
                x_pred[0, n, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]         # Next-character distribution

            next_index = sample(preds, threshold)               # Draw an index from that distribution
            next_char = indices_char[next_index]                # Map the index back to its character

            sentence = sentence[1:] + next_char                 # Slide the window forward by one character

            sys.stdout.write(next_char)
            sys.stdout.flush()

        # End the generated sample with a newline; without it the next threshold's
        # header was printed on the same line as the generated text
        sys.stdout.write('\n')
        sys.stdout.flush()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [18]:
def test_models(list, epochs=10, output_dir="C:/Users/npurk/Desktop/Ch5RNN/all_models"):
    """Build, compile and train each model in turn, saving checkpoints and training history.

    For every builder function the model is created, compiled with categorical
    cross-entropy and Adam, and fitted on the global `x` / `y` arrays with the
    text-generation callback and a best-loss checkpoint. The history dictionary
    returned by training is pickled afterwards.

    The current model and its name are published as the globals `model` and
    `model_name`, because `on_epoch_end` reads `model` from the global scope.

    Args:
        list: iterable of zero-argument callables, each returning an
            uncompiled Keras model. (The name shadows the builtin; it is kept
            so existing keyword callers keep working.)
        epochs: number of epochs to train each model for.
        output_dir: base directory for outputs. Checkpoints are written to
            `<output_dir>/versions` and history pickles to
            `<output_dir>/history`; both are created if they do not exist.
    """
    global model, model_name

    versions_dir = '%s/versions' % output_dir
    history_dir = '%s/history' % output_dir

    # Create the output directories up front so a missing folder cannot
    # abort the run after an epoch of training has already been spent
    for directory in (versions_dir, history_dir):
        if not os.path.isdir(directory):
            os.makedirs(directory)

    for network in list:
        print('Initiating compilation...')

        # Initialize model
        model = network()

        # Name of the builder function; fall back to the type name for callables without __name__
        model_name = getattr(network, '__name__', type(network).__name__)

        # Checkpoint file name carries the model name, epoch and loss;
        # the {epoch}/{loss} placeholders are filled in by ModelCheckpoint
        filepath = '%s/%s_epoch-{epoch:02d}-loss-{loss:.4f}.h5' % (versions_dir, model_name)

        # Save the model only when the training loss improves
        checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=0, save_best_only=True, mode='min')

        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        print('Compiled:', str(model_name))

        # Train; the result is kept under its own name instead of overwriting the loop variable
        history = model.fit(x, y,
                            batch_size=100,
                            epochs=epochs,
                            callbacks=[print_callback, checkpoint])

        # Print model configuration
        model.summary()

        # Save the history dictionary for later analysis
        with open('%s/%s.pkl' % (history_dir, model_name), 'wb') as file_pi:
            pickle.dump(history.history, file_pi)
            

In [19]:
# Train every model in `all_models` for 5 epochs each
test_models(all_models, epochs=5)

Initiating compilation...
Compiled: SimpleRNN_model
Epoch 1/5
----- Generating text after Epoch: 0
----- *Sampling Threshold* : 0.3
Input sequence to generate from : "iudge ' twixt you and me ; if by di"
iudge ' twixt you and me ; if by dist , the be thes , the there , my lord , the thenger . lo the the pore , that wher mast the sing . the seathe , the thes so the come the the comest of that in thet in whel the mather , the thee thet the blath . the theas . be thes thes me the the fore the ke ther weath ald this beather , the brestere , the thou houe mare the wir ther , the there , tho . hou . and the seree the gother . hour somest----- *Sampling Threshold* : 0.5
Input sequence to generate from : "iudge ' twixt you and me ; if by di"
iudge ' twixt you and me ; if by dit , the aramerill which . whor hardese , the mertetis bo the come mand , bed io mo the merowey , wougste somy , and be fore the comy that whele thes the ke thet ere . thouer . and the de beree , and this , at in mere hime

KeyboardInterrupt: 

<keras.engine.sequential.Sequential at 0x1a051d7a208>