In [1]:
# Import the core dependencies used throughout the notebook.
import numpy
import sys
import nltk
# Download the NLTK 'stopwords' corpus so that stopwords.words('english')
# can be read when the text is tokenized.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [3]:
#Load data
# Read the whole training corpus into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
with open('textgenerator.txt') as corpus_file:
    file = corpus_file.read()

In [5]:
#Tokenization
#Standardization
def tokenize_words(input):
    """Lower-case the text, split it into word tokens and drop English stop words.

    Args:
        input: Raw text to normalise. The name shadows the built-in ``input``
            but is kept so existing keyword callers keep working.

    Returns:
        One string holding the surviving tokens concatenated with no separator.
    """
    input = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    # Build the stop-word collection once, as a set: the original re-evaluated
    # stopwords.words('english') and did a linear list lookup for every token.
    stop_words = set(stopwords.words('english'))
    filtered = (token for token in tokens if token not in stop_words)
    # NOTE(review): tokens are joined with an empty separator, so word
    # boundaries are lost in the returned string. Kept as-is because the rest
    # of the notebook (vocabulary size, pattern count) was produced this way;
    # confirm whether " ".join was intended.
    return "".join(filtered)

# Normalise the loaded corpus; every later cell works on this string.
processed_inputs = tokenize_words(file)

In [6]:
#Map every distinct character of the processed text to an integer id
chars = sorted(set(processed_inputs))
chars_to_num = {char: idx for idx, char in enumerate(chars)}

In [7]:
#Sanity check: report the size of the processed text and of the character vocabulary
vocab_len = len(chars)
inp_len = len(processed_inputs)
print("Total number of characters = ", inp_len)
print("Total vocab= ", vocab_len)

Total number of characters =  140572
Total vocab=  40


In [9]:
#Number of characters in each input window
seq_len = 100
#Encode the whole text once, then cut the training windows out of the encoded list
encoded = [chars_to_num[char] for char in processed_inputs]
window_starts = range(inp_len - seq_len)
#Each sample is seq_len consecutive character ids ...
x_data = [encoded[start: start + seq_len] for start in window_starts]
#... and its label is the id of the character immediately after that window
y_data = [encoded[start + seq_len] for start in window_starts]
n_patterns = len(x_data)
print("Total Patterns = ", n_patterns)

Total Patterns =  140472


In [10]:
#Shape the samples as (n_patterns, seq_len, 1) and scale the ids by the vocabulary size
X = numpy.reshape(x_data, (n_patterns, seq_len, 1)) / float(vocab_len)

In [11]:
#One hot encoding our label data
# Pin the one-hot width to the vocabulary size. Left to itself, to_categorical
# sizes the output from the largest label present, so the output layer would
# come out narrower than the vocabulary whenever the highest-numbered character
# never occurs as a label.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [12]:
#Creating a sequential model
# Three stacked LSTM layers, each followed by dropout, feeding a softmax layer
# with one unit per one-hot label column.
model = Sequential()
# return_sequences=True hands the full sequence to the next LSTM layer.
model.add(LSTM(256, input_shape = (X.shape[1], X.shape[2]),return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
# Was Dropout(2): a dropout rate is a fraction in [0, 1), so 2 is invalid.
# Use 0.2 to match the other two dropout layers.
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

W0522 16:20:20.873853 140050461259584 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0522 16:20:21.044076 140050461259584 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0522 16:20:21.116841 140050461259584 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0522 16:20:21.798897 140050461259584 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.

In [13]:
#Configure training: Adam optimizer with a categorical cross-entropy loss
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

W0522 16:21:40.273869 140050461259584 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0522 16:21:40.300360 140050461259584 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



In [15]:
#Checkpoint callback: write the model to disk whenever the training loss hits a new minimum
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    mode='min',
    monitor='loss',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [16]:
#Train for 4 epochs in batches of 256, with the checkpoint callback attached
model.fit(
    X,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

W0522 16:28:29.521744 140050461259584 deprecation.py:323] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/4

Epoch 00001: loss improved from inf to 3.01595, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 3.01595 to 2.97901, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.97901 to 2.92757, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.92757 to 2.88397, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7f5f802d36a0>

In [17]:
#Restore the checkpointed weights, then compile the model again
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [20]:
#Output of model back to characters
num_to_char = dict(enumerate(chars))
#Random seed to help generate
# randint's upper bound is exclusive, so passing len(x_data) makes every
# pattern, including the last one, eligible as the seed.
start = numpy.random.randint(0, len(x_data))
# Work on a copy: later in-place changes to the seed window must not alter the
# training row stored in x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" loorknewtremendousamountsugarstockhandswashingtonnovicesspeculationothersboughthighpricessugardroppi "


In [22]:
#Generate the text
# Predict 1000 characters one at a time, always taking the most probable next
# character and sliding the input window forward by one position.
for i in range(1000):
    # Same shape and scaling as the training input: (1, window length, 1), ids / vocab size.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x/float(vocab_len)
    pred = model.predict(x, verbose=0)
    # Convert the NumPy scalar to a plain int so the window stays a list of Python ints.
    index = int(numpy.argmax(pred))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new window rather than appending in place, so the list that
    # `pattern` originally referred to (possibly a row of x_data) is never mutated.
    pattern = pattern[1:] + [index]

eeteareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoeeareoee