# Text Generator Using Keras LSTM

## Import Libraries

In [1]:
# Import Keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils
# Import Other Libraries
import numpy as np
import pandas as pd

Using TensorFlow backend.


## Load Data

In [2]:
# Read the whole novel into memory. The context manager closes the file handle
# deterministically, and the explicit encoding keeps the non-ASCII characters in
# the corpus (curly quotes, accents, em dash) decoding the same on every platform
# instead of depending on the locale default.
with open("PrideAndPrejudice.txt", encoding="utf-8") as text_file:
    df_text = text_file.read()

# Lower-case everything so upper- and lower-case letters share one vocabulary entry.
df_text = df_text.lower()

In [3]:
# Report the total number of characters in the lower-cased text.
print(len(df_text))

754870


In [4]:
# Preview the first 500 characters of the text.
print(df_text[:500])

chapter 1

      it is a truth universally acknowledged, that a single man in
      possession of a good fortune, must be in want of a wife.

      however little known the feelings or views of such a man may be
      on his first entering a neighbourhood, this truth is so well
      fixed in the minds of the surrounding families, that he is
      considered the rightful property of some one or other of their
      daughters.

      “my dear mr. bennet,” said his lady to him one day, “have you
 


## Create Character-Number Mapping

In [5]:
# Vocabulary: every distinct character that occurs in the text, in sorted order
# so that the character -> index assignment is reproducible between runs.
unique_characters = set(df_text)
characters = sorted(unique_characters)
print(characters)
print("Total Characters: ", len(characters))

['\n', ' ', '!', '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'é', 'ê', '—', '‘', '’', '“', '”']
Total Characters:  56


In [6]:
# Map each character to its position in the sorted vocabulary.
char_to_n = dict(zip(characters, range(len(characters))))

## Data Preprocessing

In [7]:
seq_length = 100
length = len(df_text)

# Encode the text once up front; the windows below are then plain list slices,
# which avoids repeating a dictionary lookup for every character of every window.
encoded_text = [char_to_n[char] for char in df_text]

# X[i] holds the seq_length character indices starting at position i, and
# Y[i] is the index of the character that immediately follows that window.
X = [encoded_text[i:i + seq_length] for i in range(length - seq_length)]
Y = encoded_text[seq_length:]

In [8]:
# Number of input windows and number of labels, one per line.
print(len(X), len(Y), sep="\n")

754770
754770


In [9]:
# Show the first input sequence as a list of character indices.
print(X[0])

[24, 29, 22, 37, 41, 26, 39, 1, 9, 0, 0, 1, 1, 1, 1, 1, 1, 30, 41, 1, 30, 40, 1, 22, 1, 41, 39, 42, 41, 29, 1, 42, 35, 30, 43, 26, 39, 40, 22, 33, 33, 46, 1, 22, 24, 32, 35, 36, 44, 33, 26, 25, 28, 26, 25, 5, 1, 41, 29, 22, 41, 1, 22, 1, 40, 30, 35, 28, 33, 26, 1, 34, 22, 35, 1, 30, 35, 0, 1, 1, 1, 1, 1, 1, 37, 36, 40, 40, 26, 40, 40, 30, 36, 35, 1, 36, 27, 1, 22, 1]


In [10]:
# Arrange the inputs as (number of sequences, seq_length, 1): one value per time step.
X_modified = np.reshape(X, (len(X), seq_length, 1))
# Scale the character indices by the vocabulary size so inputs lie in [0, 1).
X_modified = X_modified / float(len(characters))
# One-hot encode the labels. num_classes is passed explicitly so the encoding
# always has one column per vocabulary character; without it the width is
# inferred from the largest label present and could silently come out smaller.
Y_modified = np_utils.to_categorical(Y, num_classes=len(characters))
print("X: ",X_modified[0])
print("Y: ",Y_modified[0])

X:  [[0.42857143]
 [0.51785714]
 [0.39285714]
 [0.66071429]
 [0.73214286]
 [0.46428571]
 [0.69642857]
 [0.01785714]
 [0.16071429]
 [0.        ]
 [0.        ]
 [0.01785714]
 [0.01785714]
 [0.01785714]
 [0.01785714]
 [0.01785714]
 [0.01785714]
 [0.53571429]
 [0.73214286]
 [0.01785714]
 [0.53571429]
 [0.71428571]
 [0.01785714]
 [0.39285714]
 [0.01785714]
 [0.73214286]
 [0.69642857]
 [0.75      ]
 [0.73214286]
 [0.51785714]
 [0.01785714]
 [0.75      ]
 [0.625     ]
 [0.53571429]
 [0.76785714]
 [0.46428571]
 [0.69642857]
 [0.71428571]
 [0.39285714]
 [0.58928571]
 [0.58928571]
 [0.82142857]
 [0.01785714]
 [0.39285714]
 [0.42857143]
 [0.57142857]
 [0.625     ]
 [0.64285714]
 [0.78571429]
 [0.58928571]
 [0.46428571]
 [0.44642857]
 [0.5       ]
 [0.46428571]
 [0.44642857]
 [0.08928571]
 [0.01785714]
 [0.73214286]
 [0.51785714]
 [0.39285714]
 [0.73214286]
 [0.01785714]
 [0.39285714]
 [0.01785714]
 [0.71428571]
 [0.53571429]
 [0.625     ]
 [0.5       ]
 [0.58928571]
 [0.46428571]
 [0.01785714]
 [

## LSTM Basic Model

In [12]:
# Two stacked 200-unit LSTM layers, each followed by 20% dropout, topped by a
# softmax layer with one output per label column.
timesteps, n_features = X_modified.shape[1], X_modified.shape[2]
n_classes = Y_modified.shape[1]

model = Sequential([
    # return_sequences=True passes the full output sequence on to the second LSTM
    LSTM(200, input_shape=(timesteps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(200),
    Dropout(0.2),
    Dense(n_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Train on all prepared sequences for a single epoch, in mini-batches of 100.
model.fit(X_modified, Y_modified, epochs=1, batch_size=100)

## Save/Load the Model

In [None]:
# Save the model's weights to the .h5 file below.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the project so the notebook can run on other machines.
model.save_weights('/Users/oindrilasen/WORK_AREA/Data Science/Projects/LSTM_Text_Generator/models/text_generator_basic_model.h5')

In [13]:
# Load previously saved weights into the model defined above.
# NOTE(review): this absolute, machine-specific path must stay in sync with the
# path passed to save_weights; consider a project-relative path instead.
model.load_weights('/Users/oindrilasen/WORK_AREA/Data Science/Projects/LSTM_Text_Generator/models/text_generator_basic_model.h5')

## Generate Text

In [14]:
# Reverse lookup: vocabulary index -> character.
n_to_char = dict(enumerate(characters))

In [16]:
# Seed generation with a *copy* of the first training sequence. Binding the name
# directly to X[0] would alias the training data, so any later in-place edit of
# the seed (e.g. an append while sliding the window) would corrupt X[0].
string_mapped = list(X[0])
# Decoded seed characters; generated characters get appended to this list.
full_string = [n_to_char[value] for value in string_mapped]
full_string

['c',
 'h',
 'a',
 'p',
 't',
 'e',
 'r',
 ' ',
 '1',
 '\n',
 '\n',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 'i',
 't',
 ' ',
 'i',
 's',
 ' ',
 'a',
 ' ',
 't',
 'r',
 'u',
 't',
 'h',
 ' ',
 'u',
 'n',
 'i',
 'v',
 'e',
 'r',
 's',
 'a',
 'l',
 'l',
 'y',
 ' ',
 'a',
 'c',
 'k',
 'n',
 'o',
 'w',
 'l',
 'e',
 'd',
 'g',
 'e',
 'd',
 ',',
 ' ',
 't',
 'h',
 'a',
 't',
 ' ',
 'a',
 ' ',
 's',
 'i',
 'n',
 'g',
 'l',
 'e',
 ' ',
 'm',
 'a',
 'n',
 ' ',
 'i',
 'n',
 '\n',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 'p',
 'o',
 's',
 's',
 'e',
 's',
 's',
 'i',
 'o',
 'n',
 ' ',
 'o',
 'f',
 ' ',
 'a',
 ' ']

In [17]:
# generating characters
# Greedy decoding: feed the current window to the model, take the most likely
# next character, record it, then slide the window forward by one position.
vocab_size = float(len(characters))  # loop-invariant scaling factor
for i in range(400):
    x = np.reshape(string_mapped, (1, len(string_mapped), 1))
    x = x / vocab_size

    pred_index = int(np.argmax(model.predict(x, verbose=0)))
    full_string.append(n_to_char[pred_index])

    # Build a NEW list for the next window instead of appending in place:
    # string_mapped may still be the very list stored in X, and an in-place
    # append would leave that training sequence one element too long.
    string_mapped = list(string_mapped[1:]) + [pred_index]

In [22]:
# Print the first 200 characters of the original text
# (presumably to compare against the generated output).
print(df_text[:200])

chapter 1

      it is a truth universally acknowledged, that a single man in
      possession of a good fortune, must be in want of a wife.

      however little known the feelings or views of such a


In [18]:
#combining text
# str.join assembles the result in a single pass; concatenating inside a loop
# re-copies the growing string on every iteration.
txt = "".join(full_string)
txt

'chapter 1\n\n      it is a truth universally acknowledged, that a single man in\n      possession of a lere the sase the sase the sase the sas to the\n      she sooe the sase the sase the sase the sase the sas to the       pere the sas to the sase the sase the sase the sase the was        “i sas to the sase the sase the sase the sase the sas to the\n      pere the sas to the sase the sase the sase the sase the was        “i sas to the sase the sase the sase the sase the sas to the\n      pere the sas '

In [23]:

# Generate 1000 characters with the LSTM, starting from a randomly chosen
# training sequence, and stream them to stdout as they are predicted.
import sys
# pick a random seed; randint's upper bound is exclusive, so passing len(X)
# allows every sequence (including the last one) to be chosen
start = np.random.randint(0, len(X))
print(start)
# copy the seed so that sliding the window below never mutates the data in X
pattern = list(X[start])
n_vocab = len(characters)
print("Seed:")

print( "\"", ''.join([n_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # greedy choice: the index with the highest predicted probability
    index = int(np.argmax(prediction))
    result = n_to_char[index]
    sys.stdout.write(result)
    # slide the window: drop the oldest character and add the prediction
    pattern = pattern[1:] + [index]
print ("\nDone")

267836
Seed:
"       receiving her brother and his wife, who came as usual to spend
      the christmas at longbour "
 and the sase the sase the sas to the
      pere the sas to the sase the sase the sase the sase the was        “i sas to the sase the sase the sase the sase the sas to the
      pere the sas to the sase the sase the sase the sase the was        “i sas to the sase the sase the sase the sase the sas to the
      pere the sas to the sase the sase the sase the sase the was        “i sas to the sase the sase the sase the sase the sas to the
      pere the sas to the sase the sase the sase the sase the was        “i sas to the sase the sase the sase the sase the sas to the
      pere the sas to the sase the sase the sase the sase the was        “i sas to the sase the sase the sase the sase the sas to the
      pere the sas to the sase the sase the sase the sase the was        “i sas to the sase the sase the sase the sase the sas to the
      pere the sas to the sase the sase 