# Generate text using LSTM

In [48]:
#importing libraries
import io
import os
import random
import sys

import numpy as np
from keras.layers import LSTM, Activation, Dense
from keras.models import Sequential
from keras.optimizers import RMSprop

## Reading the file

In [23]:
# Load the whole corpus and lowercase it so upper/lower case share one symbol
with open('./dataset/sherlock_holmes.txt', 'r') as corpus_file:
    text = corpus_file.read().lower()
print('text length:', len(text))

text length: 561833


In [6]:
# vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
print('total chars', len(chars))

total chars 55


In [28]:
# lookup table: character -> integer position in `chars`
char_indices = {ch: idx for idx, ch in enumerate(chars)}
print(char_indices)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '&': 4, "'": 5, '(': 6, ')': 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, ';': 23, '?': 24, 'a': 25, 'b': 26, 'c': 27, 'd': 28, 'e': 29, 'f': 30, 'g': 31, 'h': 32, 'i': 33, 'j': 34, 'k': 35, 'l': 36, 'm': 37, 'n': 38, 'o': 39, 'p': 40, 'q': 41, 'r': 42, 's': 43, 't': 44, 'u': 45, 'v': 46, 'w': 47, 'x': 48, 'y': 49, 'z': 50, 'à': 51, 'â': 52, 'è': 53, 'é': 54}


In [12]:
# reverse lookup table: integer position in `chars` -> character
indices_char = dict(enumerate(chars))
print(indices_char)

{0: '\n', 1: ' ', 2: '!', 3: '"', 4: '&', 5: "'", 6: '(', 7: ')', 8: ',', 9: '-', 10: '.', 11: '/', 12: '0', 13: '1', 14: '2', 15: '3', 16: '4', 17: '5', 18: '6', 19: '7', 20: '8', 21: '9', 22: ':', 23: ';', 24: '?', 25: 'a', 26: 'b', 27: 'c', 28: 'd', 29: 'e', 30: 'f', 31: 'g', 32: 'h', 33: 'i', 34: 'j', 35: 'k', 36: 'l', 37: 'm', 38: 'n', 39: 'o', 40: 'p', 41: 'q', 42: 'r', 43: 's', 44: 't', 45: 'u', 46: 'v', 47: 'w', 48: 'x', 49: 'y', 50: 'z', 51: 'à', 52: 'â', 53: 'è', 54: 'é'}


To get data we can use to train our model, we split the text into overlapping subsequences of 40 characters, each paired with the character that follows it. We then transform these subsequences into boolean (one-hot) arrays.

In [13]:
maxlen = 40  # number of characters in each input window
step = 3     # distance between the starts of consecutive windows

# Slide a window over the text: each window is one training input and the
# character immediately after it is the target to predict.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]

In [40]:
# both lists must have the same length: one target character per window
print(len(sentences), len(next_chars), sep='\n')

187265
187265


In [24]:
# create x: one-hot inputs, indexed as [window, position in window, character]
# (builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24)
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)

In [25]:
# sanity check: expect (number of windows, maxlen, number of distinct chars)
x.shape

(187265, 40, 55)

In [27]:
# create y: one-hot targets, indexed as [window, character]
# (builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
y.shape

(187265, 55)

In [29]:
# One-hot encode every window into x and its following character into y
for row, (sentence, target_char) in enumerate(zip(sentences, next_chars)):
    for pos, ch in enumerate(sentence):
        x[row, pos, char_indices[ch]] = 1
    y[row, char_indices[target_char]] = 1

## Build the model

In [31]:
# Single LSTM layer over the one-hot windows, followed by a dense layer with
# one unit per character and a softmax to turn the scores into probabilities.
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras releases.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

## Helper functions from Keras

In [38]:
from keras.callbacks import LambdaCallback, ModelCheckpoint, ReduceLROnPlateau
import sys
import io
def sample(preds, temperature=1.0):
    """Draw one index at random from a probability array.

    The probabilities are re-weighted by `temperature` before drawing:
    their logarithms are divided by it and the result is re-normalised.

    Args:
        preds: array-like of probabilities.
        temperature: divisor applied to the log-probabilities.

    Returns:
        The index selected by a single multinomial draw.
    """
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    # one trial, one experiment -> a one-hot row; argmax recovers the drawn index
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)


def on_epoch_end(epoch, _):
    """Print text generated by the current model; meant to run after each epoch.

    One random seed window is taken from `text` and reused for every
    diversity value; 400 characters are generated and streamed to stdout
    for each of them.

    Args:
        epoch: epoch number, shown in the header line.
        _: second callback argument, unused.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    vocab_size = len(chars)
    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[seed_start: seed_start + maxlen]

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)

        # every diversity starts again from the same seed window
        window = seed_text
        print('----- Generating with seed: "' + window + '"')
        sys.stdout.write(window)

        for _step in range(400):
            # one-hot encode the current window as a batch of one
            encoded = np.zeros((1, maxlen, vocab_size))
            for pos, ch in enumerate(window):
                encoded[0, pos, char_indices[ch]] = 1.

            probabilities = model.predict(encoded, verbose=0)[0]
            next_char = indices_char[sample(probabilities, diversity)]

            # slide the window: drop the oldest character, append the new one
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Wrap on_epoch_end in a LambdaCallback so it can go into the callbacks list passed to fit
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [34]:
# Other callbacks: weight checkpointing and learning-rate scheduling.
# The weights go to a relative, OS-independent path so the notebook runs on
# any machine; the directory is created up front so saving cannot fail on it.
weights_dir = 'model_weights'
os.makedirs(weights_dir, exist_ok=True)
filepath = os.path.join(weights_dir, 'weights_text_generation.hdf5')

# Keep only the weights with the lowest training loss seen so far
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                             verbose=1, save_best_only=True,
                             mode='min')

# Scale the learning rate by 0.2 once the loss stops improving for an epoch,
# but never let it drop below 0.001
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2,
                              patience=1, min_lr=0.001)

callbacks = [print_callback, checkpoint, reduce_lr]


## Fit the model

In [39]:
import random

# Train the model; the callbacks print sample text, checkpoint the weights
# and adjust the learning rate as training progresses.
history = model.fit(
    x,
    y,
    batch_size=512,
    epochs=5,
    callbacks=callbacks,
)

Epoch 1/5

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "e and some very
bulky boxes driving rapi"
e and some very
bulky boxes driving raping the contrary and a state of the paper when i was a man when i shall be and some condined and a morning of the concers of the discreced to the company with a colone of the colone of the concert that i am a little the track and the matter and some and was a little the contrary of the colone of the strange of the comple and we had a colone of the contrary to the concertion with a contrary of the c
----- diversity: 0.5
----- Generating with seed: "e and some very
bulky boxes driving rapi"
e and some very
bulky boxes driving raping my convince and came a monol, and we have a more really and little him that the right to be a police was in
the corner and the little the front of a bridely and was wall to the proves a little bener of a strengt than i coild a discrock down in a givened and stood little dight of a st

"you have no tell to my side, i shall cas eyes and in a smophing to the ready to the face, he was in the red to the exise in the house of the really could was absolutely andress in the house, and the lowed in the stro
----- diversity: 1.0
----- Generating with seed: " but it was dreadful hard before
his mot"
 but it was dreadful hard before
his motheled his
wood-do
dilctomed then waiting over. yours inever, sill i am i det imaninal."

"little knew the manal, shows, that i realle, and that we shall gee teast destranger, and he would not given where hud.'

"'that."

"in the silk,' he
extrimed a miders, looking
swergul in 180ow
adouming terriffeciso finished.

"what, how crung it up in foctograg-walked i may leave, i was extertion to the
footl
----- diversity: 1.2
----- Generating with seed: " but it was dreadful hard before
his mot"
 but it was dreadful hard before
his moti well, only cleared, "how fow wet!"e
fillodges upones."

"therefully."

repalled, his
mying least her."i rungular to

## Generate text ourselves

In [46]:
# Variant of the epoch-end helper that returns the text instead of printing it
def generate_text(length, diversity):
    """Generate text with the trained model from a random seed taken from `text`.

    Args:
        length: number of characters to generate after the seed.
        diversity: temperature passed to `sample` for every character.

    Returns:
        The seed window followed by the `length` generated characters.
    """
    seed_start = random.randint(0, len(text) - maxlen - 1)
    window = text[seed_start: seed_start + maxlen]
    pieces = [window]

    for _ in range(length):
        # one-hot encode the current window as a batch of one
        encoded = np.zeros((1, maxlen, len(chars)))
        for pos, ch in enumerate(window):
            encoded[0, pos, char_indices[ch]] = 1.

        probabilities = model.predict(encoded, verbose=0)[0]
        predicted_char = indices_char[sample(probabilities, diversity)]

        pieces.append(predicted_char)
        # slide the window: drop the oldest character, append the new one
        window = window[1:] + predicted_char
    return ''.join(pieces)

In [47]:
# Generate 500 characters at diversity 0.2 and print them (the returned string starts with the seed)
print(generate_text(500, 0.2))

yself clear?"

"i am to remain neutral, and the police and the country of the street and the street and the street that the much any commitse of the street and the problem of the country of the street and the country of the country of the street and some things of the street and the country of the street of the country of the street and the sign of the concless from his chair of the country of the street and the country of the street of the street and the things and look and the considered that the country was the matter to the most a


## Ways to improve the model

There are many things you can change about the model to get better output. A few of them are:

- Using a more sophisticated network structure (more LSTM and Dense layers)
- Training for more epochs
- Experimenting with the batch_size

# That's the end of the code