# Load data

In [1]:
import keras
import numpy as np
# Fetch the Nietzsche corpus through keras.utils.get_file, which returns a local file path.
path = keras.utils.get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read via a context manager so the file handle is closed deterministically,
# and pin the encoding so decoding does not depend on the platform's locale default.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()  # the whole corpus, lower-cased
print('Corpus length:', len(text))

Using TensorFlow backend.


Corpus length: 600901


To control the amount of stochasticity in the sampling process, we’ll introduce a parameter called the *softmax temperature*, which characterizes the entropy of the
probability distribution used for sampling: it determines how surprising or predictable the choice of the next character will be. Given a temperature value, a new probability distribution is computed from the original one (the softmax output of the
model) by reweighting it in the following way:

# Reweighting a probability distribution to a different temperature
The higher the temperature, the more random (higher-entropy) the sampling distribution is; the lower the temperature, the more predictable it is.

In [2]:
import numpy as np

def reweight_distribution(original_distribution, temperature=0.5):
    """Reweight a probability distribution to a given softmax temperature.

    Computes ``exp(log(p) / temperature)`` and renormalizes the result so it
    sums to 1 over the whole input.

    Args:
        original_distribution: array-like of probabilities.
        temperature: positive scaling factor applied to the log-probabilities.

    Returns:
        A NumPy array with the same shape as the input that sums to 1.

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive, got %r' % (temperature,))

    probabilities = np.asarray(original_distribution)
    # log(0) is -inf, which exp() maps back to 0; silence the harmless warning.
    with np.errstate(divide='ignore'):
        scaled_logits = np.log(probabilities) / temperature
    if scaled_logits.size:
        # Shift by the max before exponentiating: the normalized result is
        # unchanged, but exp() can no longer underflow every entry to 0
        # (which would give 0 / 0 = NaN at low temperatures).
        scaled_logits = scaled_logits - np.max(scaled_logits)
    weights = np.exp(scaled_logits)
    return weights / np.sum(weights)  # divide by the sum to renormalize

# Vectorize the sequences of characters
Extract partially overlapping sequences of length `maxlen`, one-hot encode
them, and pack them in a 3D NumPy array `x` of shape `(sequences, maxlen,
unique_characters)`. Simultaneously, prepare an array `y` containing the corresponding targets: the one-hot-encoded characters that come after each extracted
sequence.

In [4]:
maxlen = 60  # number of characters in each extracted window
step = 3     # distance between the starts of consecutive windows

# Start offsets of every window that is still followed by at least one character.
window_starts = range(0, len(text) - maxlen, step)

# Input windows and, for each one, the character that immediately follows it.
sentences = [text[start:start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]

print('number of sequences: ', len(sentences))

number of sequences:  200281


In [5]:
# Distinct characters of the corpus, in sorted order.
chars = sorted(set(text))
print('number of unique chars ', len(chars))

number of unique chars  59


In [7]:
# Map each character to its position in `chars`. enumerate() builds the table
# in a single pass instead of a linear chars.index() scan per character.
chars_indices = {char: index for index, char in enumerate(chars)}

In [8]:
# One-hot encode the windows into x (sequence, position, character index) and
# the following characters into y (sequence, character index).
# Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, chars_indices[char]] = 1      # mark the character seen at position t
    y[i, chars_indices[next_chars[i]]] = 1    # mark the target character for window i

In [9]:
# Show the shapes of the input and target arrays, one per line.
print(x.shape, y.shape, sep='\n')

(200281, 60, 59)
(200281, 59)


# Building the model

In [11]:
from keras import layers

# Single-layer LSTM over one-hot character windows, followed by a dense
# classifier with one output per character.
model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
# The output must be a probability distribution over characters: it is trained
# with categorical cross-entropy and later sampled from, so use softmax
# (relu outputs are unnormalized and can be exactly 0).
model.add(layers.Dense(len(chars), activation='softmax'))

In [12]:
# RMSprop with a learning rate of 0.01; targets are one-hot encoded, hence
# categorical cross-entropy as the training loss.
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

# TRAINING THE LANGUAGE MODEL AND SAMPLING FROM IT
Given a trained model and a seed text snippet, you can generate new text by doing the
following repeatedly:
1. Draw from the model a probability distribution for the next character, given the
generated text available so far.
2. Reweight the distribution to a certain temperature.
3. Sample the next character at random according to the reweighted distribution.
4. Add the new character at the end of the available text.

This is the code you use to reweight the original probability distribution coming out
of the model and draw a character index from it (the sampling function):

## Function to sample the next character given the model's predictions

In [None]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    
    