# 1. Generative Models for Text

In [1]:
import numpy as np
import pandas as pd
from __future__ import print_function
import keras
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
import numpy as np
import random
import sys
import io
np.random.seed(7)

Using TensorFlow backend.


In [2]:
book1 = io.open('book1.txt', 'r', encoding="utf8").read().lower()
book2 = io.open('book2.txt', 'r', encoding="utf8").read().lower()
book3 = io.open('book3.txt', 'r', encoding="utf8").read().lower()
book4 = io.open('book4.txt', 'r', encoding="utf8").read().lower()

In [3]:
text = book1 + '\n' + book2 + '\n' + book3 + '\n' + book4
print('Text Length', len(text))

Text Length 1593556


### Map Characters to Integers

In [4]:
chars = sorted(list(set(text)))
print('Total Unique Characters', len(chars))

Total Unique Characters 99


In [5]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

### Split up into subsequences

In [6]:
maxlen = 99
step = 1
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
    next_chars.append(text[i + maxlen])
print('No. of Sequences', len(sentences))

No. of Sequences 1593457


In [7]:
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

### Building Model

In [8]:
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

In [9]:
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

### Helper Functions

In [10]:
def sample(preds, temperature):
    #Helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [11]:
def on_epoch_end(epoch, logs):
    #Function invoked at the end of each epoch. Prints generated Text.
    #print('\n----- Generating Text after Epoch: %d'%epoch)
    
    start_index = random.randint(0, len(text) - maxlen - 1)
    
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        #print('----- Diversity:', diversity)
        
        generated = ''
        sentence = text[start_index : start_index + maxlen]
        generated += sentence
        #print('----- Generating with seed:"'+sentence+'"')
        #sys.stdout.write(generated)
        
        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1
            
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            
            generated += next_char
            sentence = sentence[1:] + next_char
            
            #sys.stdout.write(next_char)
            #sys.stdout.flush()
        #print('\n')
print_callback = LambdaCallback(on_epoch_end = on_epoch_end)

In [12]:
from keras.callbacks import ModelCheckpoint

filepath = 'weights.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                            verbose=1, save_best_only=True, 
                            mode='min')

In [15]:
def generate_text(length, diversity):
    #Get random starting Text
    #start_index = random.randint(0, len(text)-maxlen-1)
    generated = ''
    sentence = 'There are those who take mental phenomena naively, just as they would physical phenomena. This scho'.lower()
    #sentence = text[start_index: start_index + maxlen]
    generated += sentence
    
    for i in range(length):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.
        
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        
        generated += next_char
        sentence = sentence[1:] + next_char
    return generated

In [13]:
from keras.callbacks import ReduceLROnPlateau

reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2,
                             patience=1, min_lr=0.001)

In [14]:
callbacks = [print_callback, checkpoint, reduce_lr]

In [None]:
model.fit(x, y, batch_size=20, epochs=30, callbacks=callbacks)

Epoch 1/30

Epoch 00001: loss did not improve from inf
Epoch 2/30

Epoch 00002: loss did not improve from inf
Epoch 3/30

Epoch 00003: loss did not improve from inf
Epoch 4/30

Epoch 00004: loss did not improve from inf
Epoch 5/30

Epoch 00005: loss did not improve from inf
Epoch 6/30

Epoch 00006: loss did not improve from inf
Epoch 7/30

Epoch 00007: loss did not improve from inf
Epoch 8/30
 260920/1593457 [===>..........................] - ETA: 1:27:35 - loss: nan

In [None]:
print(generate_text(1000, 0.2))