In [2]:
# Lab 7 - Text generation with LSTM
#
# Step 1 (not assessed): build and train a model to generate text in the style of a corpus.
#
# Based on the Keras text generation example (https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py)
#
# Step 2: build a model to distinguish genuine from fake sentences.

In [96]:
# Import essential modules
import pickle
import random
import sys
import time
import keras

import numpy as np
from sklearn.model_selection import train_test_split

from keras.layers import Input, LSTM, GRU, Dense, Activation, Flatten, Dropout, Embedding, Conv1D
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import RMSprop, Adam, Nadam, SGD
from keras.models import Model, Sequential
from keras.models import save_model
from keras.utils.data_utils import get_file
from keras import initializers

In [4]:
# Helper function to sample an index from an array of predictions.
#
# The input array 'preds' should be the output of a text generation model.
# The elements contain the values of the units in the final layer.
# Each unit corresponds to a character in the text alphabet.
# The final layer should have SoftMax activation, and thus the
# value corresponds to the 'strength of prediction' of that character
# as the next output value---so the maximum value indicates which character
# is most strongly predicted (considered most likely) as the next one.
#
def sample(preds, temperature=1.0):
    """Draw one symbol index from `preds` after reshaping it by `temperature`.

    Args:
        preds: array-like of non-negative prediction strengths, one per symbol
            (they are renormalized here, so they need not sum exactly to 1).
        temperature: exponent control; each value is effectively raised to the
            power 1/temperature before renormalizing.  A temperature of exactly
            0 is treated as its limiting case: the index of the largest
            prediction is returned without any random draw.

    Returns:
        The index of the sampled symbol (as returned by np.argmax).
    """
    # Convert to high-precision datatype (we are going to be manipulating some
    # very small values in this function)
    preds = np.asarray(preds).astype('float64')

    # T == 0 would divide by zero below.  The limit of the reshaped distribution
    # as T -> 0 puts all of the mass on the largest value, so return that index.
    if temperature == 0:
        return np.argmax(preds)

    # The next lines have the effect of raising each prediction value to the power 1/T.
    # It's done using logs to improve numerical precision.  This is a kind of value-dependent
    # scaling: for T < 1.0 (1/T > 1.0), small values are made smaller (proportionally) than
    # large values (unlike a linear scaling, such as multiplication by 0.9, which scales all values
    # the same).
    #
    # Example: Consider that we have only two symbols (letters) in our alphabet, and our
    # probabilities are [0.2, 0.8].  A temperature of 1.0 means 'do not adjust the
    # probabilities at all', so in this case there will be a 20% chance that the
    # function will return 'symbol 0' and an 80% chance that it will return 'symbol 1'.
    # Note that symbol 1 is 4x more likely than symbol 0.
    #
    # Now: if we supply a temperature of 0.5, our probabilities will be raised to the
    # power 1/0.5 = 2, becoming [0.04, 0.64].  These will then be normalized to sum to 1,
    # but anyway it is clear that symbol 1 is here 16x (the square of 4x) more likely than
    # symbol 0.
    #
    # Conversely, for a temperature of 2, our probabilities will be raised to 0.5 (square-rooted),
    # becoming [.4472, 0.8944] - and so here symbol 1 is only 2x (sqrt of 4x) more likely than
    # symbol 0.
    #
    # So: low temperatures make the distribution peakier, exaggerating the difference between
    # values.  High temperatures flatten the distribution, reducing the difference between values.
    #
    # As the return value is a sample of the manipulated distribution, manipulating it to
    # be peakier (by supplying a low temperature) makes the sample more conservative, i.e.
    # more likely to pick the highest-probability symbol.
    #
    # Making the distribution flatter (by supplying a high temperature) causes the
    # sample to be less conservative, i.e. more likely to pick some lower-likelihood
    # symbol.
    #
    # Phew!
    with np.errstate(divide='ignore'):  # log(0) = -inf is fine: it maps back to probability 0
        scaled_logs = np.log(preds) / temperature

    # Shift so the largest exponent is 0 before calling exp().  The shift cancels out in
    # the normalization below, but without it a very small temperature makes every term
    # underflow to 0, and the normalization then becomes 0/0 = NaN.
    scaled_logs = scaled_logs - np.max(scaled_logs)
    preds = np.exp(scaled_logs)

    preds = preds / np.sum(preds)  # ensure that probs sum to 1
    probas = np.random.multinomial(1, preds, 1)  # take 1 sample from the distribution
    return np.argmax(probas)

In [164]:
# Decide how much data to use for training.
# You might want to reduce this to ~100k for faster experimentation, and then bring it back
# to 600k when you're happy with your network architecture.
# IMPORTANT: make sure you end up with a 57-symbol alphabet after reducing the corpus size!
# If the number of symbols (shown in the next cell) gets smaller than it was with the full
# corpus, bring your sample size back up.  This is necessary because the encoding used for
# training must match that used for assessment.
#desired_num_chars = 480*1000  # Max: 600893
desired_num_chars = 480139  # Max: 600893

random.seed(43)  # Fix random seed for repeatable results.

# Slurp down all of Nietzsche from Amazon.
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

# Read through a context manager so the file handle is closed as soon as the
# text has been loaded (the bare open(path).read() left it to the garbage collector).
# NOTE(review): no explicit encoding is passed, so decoding follows the platform
# default; changing that may alter the alphabet size -- confirm before touching it.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
print('original corpus length:', len(text))

# Keep one contiguous window of desired_num_chars characters, chosen by the seeded RNG.
start_index = random.randint(0, len(text) - desired_num_chars - 1)
text = text[start_index:start_index + desired_num_chars]
print('length for training:', len(text))

original corpus length: 600901
length for training: 480139


In [165]:
# Let's have a quick look at a random excerpt.
#
# Caution: Nietzsche might drive you mad: dare you behold more than 1000 of his terrible chars..? 
sample_length = 1000

# Seeding with None draws the seed from a system source, so every evaluation
# of this cell shows a different excerpt.
random.seed(None)

start_index = random.randint(0, len(text) - sample_length - 1)
excerpt = text[start_index:start_index + sample_length]
print(excerpt)

ne is such a liar
as the indignant man.

27. it is difficult to be understood, especially when one thinks and
lives gangasrotogati [footnote: like the river ganges: presto.] among
those only who think and live otherwise--namely, kurmagati [footnote:
like the tortoise: lento.], or at best "froglike," mandeikagati
[footnote: like the frog: staccato.] (i do everything to be "difficultly
understood" myself!)--and one should be heartily grateful for the
good will to some refinement of interpretation. as regards "the good
friends," however, who are always too easy-going, and think that as
friends they have a right to ease, one does well at the very first to
grant them a play-ground and romping-place for misunderstanding--one can
thus laugh still; or get rid of them altogether, these good friends--and
laugh then also!

28. what is most difficult to render from one language into another
is the tempo of its style, which has its basis in the character of the
race, or to speak more physiologicall

In [230]:
# Establish the alphabet (set of symbols) we are going to use.
# Every distinct character of the corpus, in sorted order; a character's
# position in this list is its symbol index.
chars = sorted(set(text))
print('total chars:', len(chars))
print(chars)

# Forward lookup: character -> symbol index.
char_indices = {c: i for i, c in enumerate(chars)}
# Reverse lookup: symbol index -> character.
indices_char = dict(enumerate(chars))

total chars: 57
['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¤', '¦', '«', 'ã']


In [231]:
# Establish a training set of semi-redundant (i.e. overlapping) sequences of maxlen characters.
maxlen = 40
step = 3

# Start offsets of the overlapping windows: every `step` characters, stopping
# early enough that each window still has a following character in the corpus.
window_starts = range(0, len(text) - maxlen, step)

# Not syntactic sentences, just sequences of maxlen chars pulled from the corpus.
sentences = [text[start:start + maxlen] for start in window_starts]
# next_chars[n] stores the character which followed sentences[n].
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 160033


In [232]:
# Convert the data to one-hot encoding.
# 'x' will contain the one-hot encoding of the training 'sentences'.
# 'y' will contain the one-hot encoding of the 'next char' for each sentence.
#
# 
# Let's consider that we have N sentences of length L:
#
# The 'native' encoding is an NxL matrix where element [n][l]
# is the symbol index for character at index (l) of sentence (n)
# (e.g., say, 5, corresponding to 'e').
#
# The one-hot encoding is an NxLxS matrix, where S is the 
# number of symbols in the alphabet, such that element [n][l][s]
# is 1 if the character at index (l) in sentence (n) has the
# symbol index (s), and 0 otherwise.
def onehot_encode(sentence, maxlen):
    """One-hot encode `sentence` into a (maxlen, len(chars)) boolean matrix.

    Row t has a single True in the column given by char_indices for the
    character at position t.  Rows beyond the end of a sentence shorter than
    `maxlen` stay all-False.  Reads the module-level `chars` and `char_indices`.
    """
    # The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
    # in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    x = np.zeros((maxlen, len(chars)), dtype=bool)
    for t, char in enumerate(sentence):
        x[t, char_indices[char]] = 1
    return x

# Pre-allocate the one-hot tensors.  The builtin `bool` is used as the dtype because
# the `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    x[i, :, :] = onehot_encode(sentence, maxlen)
    # A single character encodes to a (1, S) matrix; take its only row for y.
    y[i, :] = onehot_encode(next_chars[i], 1)[0]

print(x.shape)
print(y.shape)

(160033, 40, 57)
(160033, 57)


In [233]:
# Build the generator model: a single GRU layer with 128 cells, followed by a
# dense layer with one unit per symbol and a softmax over those units.
generator_model = Sequential([
    GRU(128, input_shape=(maxlen, len(chars))),
    #LSTM(256, input_shape=(maxlen, len(chars))),  # alternative recurrent layer
    Dense(len(chars)),
    Activation('softmax'),
])

# You could experiment with NAdam instead of RMSProp.
optimizer = RMSprop(lr=0.01)
generator_model.compile(optimizer=optimizer, loss='categorical_crossentropy')
trained_epochs = 0

In [234]:
def generate_sentence_list(seed_list, length=400, temperature=0.25):
    """Extend every seed in `seed_list` by `length` generated characters.

    All seeds are advanced together: each step one-hot encodes the current
    input window of every seed, runs a single batched prediction on the
    module-level `generator_model`, and samples one next character per seed
    with `sample(..., temperature)`.

    Args:
        seed_list: sequence of seed strings made of characters in `char_indices`.
            Seeds longer than `maxlen` are accepted; only their last `maxlen`
            characters are fed to the model.
        length: number of characters to generate per seed.
        temperature: sampling temperature passed through to `sample`.

    Returns:
        A new list holding each seed followed by its generated characters.
        `seed_list` itself is not modified.
    """
    n = len(seed_list)
    if n == 0:
        # Nothing to generate; avoid asking the model to predict on an empty batch.
        return []

    # Full output strings (seed + generated text), copied so the caller's list is untouched.
    generated_list = [seed[:] for seed in seed_list]
    # Sliding model-input windows.  Clamp to the trailing maxlen characters so an
    # over-long seed does not index past the end of the input tensor.
    sentence_list = [seed[-maxlen:] for seed in seed_list]

    for i in range(length):

        workdone = (i + 1) * 1.0 / length
        sys.stdout.write("\rgenerating sentences: [{0:20s}] {1:.1f}%".format('#' * int(workdone * 20), workdone*100))
        sys.stdout.flush()

        # One-hot encode the current window of every seed into a single batch.
        x_pred_list = np.zeros((n, maxlen, len(chars)))
        for j, sentence in enumerate(sentence_list):
            for t, char in enumerate(sentence):
                x_pred_list[j, t, char_indices[char]] = 1.

        pred_list = generator_model.predict(x_pred_list, verbose=0)

        for j in range(n):
            next_index = sample(pred_list[j, :], temperature)
            next_char = indices_char[next_index]
            generated_list[j] += next_char
            # Slide the window: drop the oldest character, append the new one.
            sentence_list[j] = sentence_list[j][1:] + next_char

    sys.stdout.write(' - done\n')
    sys.stdout.flush()

    return generated_list

def print_sentences(seeds, sentences):
    """Print each sentence with its seed part in green and the rest in blue.

    `seeds[k]` is expected to be the prefix of `sentences[k]`; the first
    len(seed) characters are written in one ANSI colour and everything after
    them in another, followed by a colour reset and a newline.
    """
    for seed, sentence in zip(seeds, sentences):
        print('-'*5)
        sys.stdout.write('\x1b[32m')  # green: the genuine seed text
        sys.stdout.write(sentence[0:len(seed)])
        sys.stdout.write('\x1b[34m')  # blue: the generated continuation
        # Print through to the end of the string: the previous `:-1` upper bound
        # silently dropped the last generated character of every sentence.
        sys.stdout.write(sentence[len(seed):])
        sys.stdout.write('\x1b[m')    # reset colours
        sys.stdout.write('\n')
        sys.stdout.flush()
        
def pick_sentences(n, maxlen):
    """Return `n` substrings of length `maxlen` taken from random corpus positions.

    Start offsets are drawn with np.random.randint from
    [0, len(text) - maxlen - 1), reading the module-level `text`.
    """
    offsets = np.random.randint(len(text) - maxlen - 1, size=(1, n)).ravel().tolist()
    return [text[offset:offset + maxlen] for offset in offsets]

In [235]:
# Pick 3 fixed seeds up front so that the previews printed after each round
# of training are directly comparable with one another.
preview_seeds = pick_sentences(3, maxlen=40)

# Ten training rounds of four epochs each; after every round, generate text
# from the preview seeds and print it.
for iteration in range(1, 11):
    print('\n' + '-' * 50)
    print('Iteration', iteration)
    generator_model.fit(x, y, epochs=4, batch_size=1024)

    generated_sentences = generate_sentence_list(preview_seeds)
    print_sentences(preview_seeds, generated_sentences)


--------------------------------------------------
Iteration 1
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
generating sentences: [####################] 100.0% - done
-----
[32mther aside. for it is, to all appearance[34m, from the are in the strength of the for the stranges the of the schooly there is a persigness the strungly the problems of the strength the individual the for the indiscives striggle for the has the the significal the strength the strength of the mast in the strength of the whole in striggle the call of the schoother the sare and sensity of the schoother the has the his problems of the his pass[m
-----
[32miment que la religion est un produit de [34min the indesirn the schooly the find a mast significal the school of the schooly of the sense there is strength in the schooly the schooly his plansions, they he has the has the strength of the say the schole the his perhaps the soul and in the self-consigness of the his passive the significations of the strength in the 

-----
[32miment que la religion est un produit de [34mthe and an and has been an any more hand of the inconsisted and a person should be the an and has been an art been the the the final uncenture of the of the and a consequences, the the historical sense of the inclue the the incensively and more the been the uncenture and a strenuth is a sense of the inclue the the uncless, the the the and the precisely the for the and the beart and a philosophy o[m
-----
[32mf surrender, of sacrifice for
one's neig[34mht of the an intent of the the explaned the believe in the here are the and an any objection, the the present to hear the beart of the for the instincts, the the sense of the and an any handsence of the ancient and the delick of the beart and not be understood: the the and the heart he the any hands of the ance the the have to be uncenture and man to be the uncenture and all the hands of the the [m

--------------------------------------------------
Iteration 6
Epoch 1/4
Epoch 2/


--------------------------------------------------
Iteration 10
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
generating sentences: [####################] 100.0% - done
-----
[32mther aside. for it is, to all appearance[34m of such a state of the soul and something constantation, which has been such a state the germans of the saint, and the serve of the most maints, the senses, with which we knowledge of the senses, this construits of the senses of the said to be the signify of the senses of the strives of this conscience, which we know that a states and the spirits of the senses, which still of the senses, that th[m
-----
[32miment que la religion est un produit de [34mthe saint, that is a soul that his presence of this construits of the spirits of think will be sourts of the saint of this conscience, which was the conscience, and with which still still to be the conscience of think will be the present think of the said to say has been and compressed that the regard of the soul of thi

In [236]:
# For a more complete inspection, print out a load of sentences:
#
num_sentences = 100             # how many to generate
sentence_length = 40            # 100--400 is good
#sample_temperature = 0.25       # see discussion of temperature up near the top
sample_temperature = 0.15       # see discussion of temperature up near the top

# Reuse the shared helper rather than repeating its body here: it draws the
# random start offsets and slices out the maxlen-character seeds.
preview_seeds = pick_sentences(num_sentences, maxlen)

generated_sentences = generate_sentence_list(preview_seeds, length=sentence_length, temperature=sample_temperature)
print_sentences(preview_seeds, generated_sentences)

generating sentences: [####################] 100.0% - done
-----
[32me psychologist
who has discovered this r[34mecontrared and strong than of this cons[m
-----
[32ms that to knowledge the highest utility [34mof the strives of the strives of the so[m
-----
[32m easier to go over to a
really emancipat[34med and think well that in the saint, an[m
-----
[32mfering, to sit still, to exercise patien[34mce, which well that the religious serve[m
-----
[32mned jargon is: psychological
observation[34m of the soul, with which well that the [m
-----
[32meen the
specialties of science and philo[34msophers of the senses, which well with [m
-----
[32m the climax, the attained climax of
mank[34mind will be sourt of the saint, that is[m
-----
[32m christian period of european
history, a[34mnd the senses, which well that it is so[m
-----
[32mower, a strange, still unconquered
enemy[34m with the strive will to be of the sain[m
-----
[32mt to speak of syllables) of a page--he 

-----
[32m whether he who experiences
them is not [34mthat the senses, we knowledge of the se[m
-----
[32mg of another.

242. whether we call it "[34mgod and strives of the saint, that is a[m
-----
[32mkind
consists simply in the fact that th[34me signify of this construits of the sai[m
-----
[32msinterestedly." there have been philosop[34mher with which we knowledge of the sain[m
-----
[32mstic affinities is very hard to find; st[34mill still still still be soul, which we[m
-----
[32ma garden--or as music on the waters at e[34mvery one should be still still still of[m
-----
[32mams
carry us back to the earlier stages [34mof the soul, with which we knowledge of[m
-----
[32mnuation and spiritualization by the symb[34modism that it will be the senses of the[m
-----
[32mt of historical justice in
a determined [34mto the senses of the same strivings, th[m
-----
[32m again, from
time to time, that is, his [34mprocess of the strives of the strives o[m
-----
[32

In [237]:
# Checkpoint: write the generator to an HDF5 file so it can be downloaded,
# re-uploaded, or added to git.
generator_model.save('./generator_model.h5')

In [238]:
# Generating the training fake sentences for the Discriminator network
#
# These are saved to the file 'fake.pkl' -- you could download this to your
# user drive and re-upload it in a subsequent session, to save regenerating
# it again (in which case you don't need to evaluate this cell).

training_seeds = pick_sentences(3000, maxlen=40)
training_generated_sentences = generate_sentence_list(training_seeds, length=40)
# Strip out the initial 40 chars (the seed sequence, which is genuine data from the corpus).
training_generated_sentences = [sentence[40:40+40] for sentence in training_generated_sentences]

# Two objects are pickled back to back into the same file: first the seeds,
# then the generated sentences.  The context manager closes (and flushes) the
# file even if one of the dumps raises.
with open('fake.pkl', 'wb') as output:
    pickle.dump(training_seeds, output)
    pickle.dump(training_generated_sentences, output)

generating sentences: [####################] 100.0% - done


In [239]:
# Load the training set from the file.  The two objects are read back in the
# order they were dumped: seeds first, then generated sentences.
# NOTE: unpickling can execute arbitrary code, so only load a 'fake.pkl' that
# you produced yourself (or otherwise trust).
with open('fake.pkl', 'rb') as pkl_file:
    training_seeds = pickle.load(pkl_file)
    training_generated_sentences = pickle.load(pkl_file)

In [240]:
# Make a 50:50 set of 'fake' (generated) and genuine sentences.
# The generated sentences come first in the combined list, the genuine ones after.
num_generated = len(training_generated_sentences)
training_real_sentences = pick_sentences(num_generated, maxlen=40)

all_training_sentences = training_generated_sentences + training_real_sentences
n = len(all_training_sentences)

# Inputs: one one-hot matrix per sentence.
x = np.zeros((n, 40, len(chars)))
for i, sentence in enumerate(all_training_sentences):
    x[i] = onehot_encode(sentence, maxlen=40)

# Labels: 0 for generated sentences, 1 for real ones (every index from num_generated onwards).
y = np.zeros((n, 1))
y[num_generated:, 0] = 1


In [357]:
print('Build model...')

print(x.shape)

# Discriminator: one GRU layer over the one-hot sentence, feeding a single
# sigmoid unit (labels: 0 = generated, 1 = real).
# Alternatives tried here earlier and left out: LSTM(256) / stacked LSTM(128)
# layers, a Conv1D(64, 5) front end, and Dense + LeakyReLU + Dropout heads.
discriminator_model = Sequential([
    GRU(1024,
        dropout=0.2,
        recurrent_dropout=0.2,
        return_sequences=False,
        input_shape=(maxlen, len(chars))),
    Dense(1, activation='sigmoid'),
])

opt = RMSprop(lr=0.001)

# Setup the optimisation strategy.
discriminator_model.compile(loss='binary_crossentropy',
                            optimizer=opt,
                            metrics=['accuracy'])

print('compiled.')
discriminator_model.summary()


Build model...
(6000, 40, 57)
compiled.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_13 (GRU)                 (None, 1024)              3323904   
_________________________________________________________________
dense_64 (Dense)             (None, 1)                 1025      
Total params: 3,324,929
Trainable params: 3,324,929
Non-trainable params: 0
_________________________________________________________________


In [358]:
# Hold out a third of the sentences; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
discriminator_model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
)

Train on 4020 samples, validate on 1980 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20d170c8ef0>

In [359]:
# Once you're happy with your discriminator model, evaluate this cell to save it:
discriminator_model.save('./discriminator_model.h5')
# Run these commands in the terminal to submit your model for assessment.
# git add lab-07/discriminator_model.h5
# git commit -m "Add/update discriminator model."
# git push
# submit-lab 7

# Show the shapes of the train and test splits, inputs before labels.
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(4020, 40, 57)
(4020, 1)
(1980, 40, 57)
(1980, 1)


In [360]:
# evaluate() returns the loss followed by the compiled metrics (here: accuracy).
score, acc = discriminator_model.evaluate(x_test, y_test, batch_size=64, verbose=2)
print("Score: %.2f" % score)
print("Validation Accuracy: %.2f%%" % (acc * 100))

Score: 0.10
Validation Accuracy: 96.62%
