# Develop a Neural Language Model for Text Generation
## 2. Train language model

1. Load doc into memory 
2. Integer encode sequences of words  
(The word embedding layer expects input sequences to be composed of integers,
so use the Tokenizer class in the Keras API to encode them.)
3. Define the model and fit it
4. Save the model to file
5. Save the tokenizer


In [None]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [None]:
# load doc into memory
def load_doc(filename, encoding=None):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the text file to read.
        encoding: Optional text encoding passed to ``open``. The default
            ``None`` keeps the platform default encoding, matching the
            previous behaviour of this function.

    Returns:
        The full contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # the context manager closes the file even when read() raises
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [None]:
# load
# Read the sequences file in one go and split it into one entry per line.
# NOTE(review): if the file ends with a newline, split('\n') leaves a final
# empty string in `lines` — confirm the file has no trailing newline.
doc = load_doc('republic_sequences.txt')
lines = doc.split('\n')

In [None]:
# peek at the first three lines that were loaded
lines[:3]

In [None]:
# integer encode sequences of words
# Fit a Keras Tokenizer on all lines, then use the same tokenizer to turn
# every line into its integer sequence.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [None]:
# sanity check: one encoded sequence per input line, plus the container types
print(len(lines), len(sequences))
print(type(lines), type(sequences))

In [None]:
# first line of text as read from the file
print(lines[0])

In [None]:
# the same first line after integer encoding by the tokenizer
print(sequences[0])

In [None]:
# display the word -> integer mapping learned by the tokenizer
tokenizer.word_index

In [None]:
# look up the integer assigned to one word; .get returns None if it is absent
tokenizer.word_index.get('was')

In [None]:
# number of distinct words known to the tokenizer
len(tokenizer.word_index)

In [None]:
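# Preview of the training step. Kept commented out here because define_model,
# vocab_size, seq_length, X and y are only defined in later cells.
# # define model
# model = define_model(vocab_size, seq_length)
# # fit model
# model.fit(X, y, batch_size=128, epochs=100)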

In [None]:
# vocabulary size
# One more than the number of words in word_index.
# NOTE(review): presumably the +1 accounts for index 0, which the Keras
# Tokenizer does not assign to any word — confirm against the Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

In [None]:
# separate into input and output
# A plain Python list cannot be indexed with a (rows, columns) tuple
# ("list indices must be integers or slices, not tuple"), so convert the
# encoded sequences to a NumPy array before slicing by column.
sequences = array(sequences)
print(type(sequences), sequences.shape)

# every column except the last is the model input ...
X = sequences[:, :-1]
# ... and the last column is the word to predict
y = sequences[:, -1]
print(X.shape, y.shape)

In [None]:
# compare one full encoded sequence with its input part (all but the last
# element) and its target (the last element)
print(sequences[0])
print(X[0])
print(y[0])

In [None]:
# one-hot encode the target words over vocab_size classes
y = to_categorical(y, num_classes=vocab_size)
# number of input words per sample (columns of X); passed to define_model later
seq_length = X.shape[1]
print(seq_length)

In [None]:
# inspect the one-hot vector of the first target
print(y[0])
# NOTE(review): 57 is hard-coded — presumably the integer id of the first
# target word in the author's run; confirm against the earlier print of y[0].
print(y[0][57])

In [None]:
# define the model
def define_model(vocab_size, seq_length):
    """Build, compile and summarize the word-level language model.

    The network stacks an Embedding layer, two LSTM layers, a ReLU Dense
    layer and a softmax Dense layer with one output per vocabulary entry.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.
        seq_length: Number of input words per sample.

    Returns:
        The compiled Keras ``Sequential`` model.

    Side effects:
        Prints the model summary and writes a diagram to ``model.png``.
    """
    layers = [
        # 50-dimensional embedding for every vocabulary index
        Embedding(vocab_size, 50, input_length=seq_length),
        # first LSTM returns the full sequence so a second LSTM can follow it
        LSTM(100, return_sequences=True),
        LSTM(100),
        Dense(100, activation='relu'),
        # one output per vocabulary entry
        Dense(vocab_size, activation='softmax'),
    ]
    network = Sequential(layers)
    # compile network
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    # summarize defined model
    network.summary()
    plot_model(network, to_file='model.png', show_shapes=True)
    return network

In [None]:
### You will get different results,
### but perhaps an accuracy of just over 50% of predicting the next word in the sequence, which is
### not bad. We are not aiming for 100% accuracy (e.g. a model that memorized the text), but
### rather a model that captures the essence of the text.

In [None]:
# define model
model = define_model(vocab_size, seq_length)
# fit model
model.fit(X, y, batch_size=128, epochs=100)
# save the model to file
model.save('model.h5')
# save the tokenizer; the context manager closes the pickle file so its
# contents are flushed to disk instead of leaving an open handle behind
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)