<a href="https://colab.research.google.com/github/Nik8x/Deep_writing_generating_text/blob/master/Word_Based_Neural_Language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding

import numpy as np

In [0]:
# Toy training corpus: the "Jack and Jill" nursery rhyme.
# The blank lines and tab indentation between verses are part of the string
# value, so they are spelled out as explicit escapes here.
data = (
    ' Jack and Jill went up the hill\n\n'
    '\t\tTo fetch a pail of water\n\n'
    '\t\tJack fell down and broke his crown\n\n'
    '\t\tAnd Jill came tumbling after\n '
)

In [0]:
# Learn the word -> integer index mapping from the corpus, then convert the
# corpus into its sequence of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# A single text goes in, so exactly one encoded sequence comes back.
(encoded,) = tokenizer.texts_to_sequences([data])

In [5]:
# Show the raw corpus string, including its embedded newline and tab characters.
data

' Jack and Jill went up the hill\n\n\t\tTo fetch a pail of water\n\n\t\tJack fell down and broke his crown\n\n\t\tAnd Jill came tumbling after\n '

In [9]:
# Show the corpus as a list of integer word indices, one per word, in text order.
encoded

[2,
 1,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 2,
 14,
 15,
 1,
 16,
 17,
 18,
 1,
 3,
 19,
 20,
 21]

In [10]:
# Vocabulary size: word indices start at 1, so the largest index equals
# len(word_index); anything indexed by word id needs one extra slot for 0.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 22


In [14]:
# Show the word -> integer index mapping learned by the tokenizer.
tokenizer.word_index

{'a': 10,
 'after': 21,
 'and': 1,
 'broke': 16,
 'came': 19,
 'crown': 18,
 'down': 15,
 'fell': 14,
 'fetch': 9,
 'hill': 7,
 'his': 17,
 'jack': 2,
 'jill': 3,
 'of': 12,
 'pail': 11,
 'the': 6,
 'to': 8,
 'tumbling': 20,
 'up': 5,
 'water': 13,
 'went': 4}

In [24]:
# Preview one adjacent-word pair from the encoded corpus: the elements at
# positions 1 and 2 (the slice end is exclusive).
encoded[2-1 : 2+1]

[1, 3]

In [20]:
# Build the training pairs: every two consecutive word indices in the corpus
# form one [current word, next word] sequence.
sequences = [encoded[start:start + 2] for start in range(len(encoded) - 1)]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 24


In [21]:
# Show the list of [current word, next word] index pairs.
sequences

[[2, 1],
 [1, 3],
 [3, 4],
 [4, 5],
 [5, 6],
 [6, 7],
 [7, 8],
 [8, 9],
 [9, 10],
 [10, 11],
 [11, 12],
 [12, 13],
 [13, 2],
 [2, 14],
 [14, 15],
 [15, 1],
 [1, 16],
 [16, 17],
 [17, 18],
 [18, 1],
 [1, 3],
 [3, 19],
 [19, 20],
 [20, 21]]

In [0]:
# Turn the pair list into a 2-column array, then separate the columns:
# column 0 is the model input (current word), column 1 the target (next word).
sequences = np.array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

In [30]:
# Show the model inputs: the first word index of every pair.
X

array([ 2,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  2, 14, 15,  1,
       16, 17, 18,  1,  3, 19, 20])

In [32]:
# Show the integer targets: the second word index of every pair.
y

array([ 1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  2, 14, 15,  1, 16,
       17, 18,  1,  3, 19, 20, 21])

In [0]:
# one hot encode outputs
# Encode from the second column of `sequences` rather than from `y` itself so
# this cell is idempotent: re-running `y = to_categorical(y, ...)` would
# one-hot encode the already one-hot matrix.
y = to_categorical(np.asarray(sequences)[:, 1], num_classes=vocab_size)

In [37]:
# Show the one-hot encoded targets: one row per pair, one column per vocabulary index.
y

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,

In [40]:
# define model
# Architecture: one word index in -> 10-dimensional embedding -> LSTM with
# 50 units -> softmax distribution over the whole vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             220       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________
None


In [41]:
# Configure training: multi-class cross-entropy against the one-hot targets,
# Adam optimizer, and accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train on the (current word -> next word) pairs; verbose=2 logs one line per epoch.
model.fit(X, y, epochs=500, verbose=2)

Instructions for updating:
Use tf.cast instead.
Epoch 1/500
 - 3s - loss: 3.0911 - acc: 0.0000e+00
Epoch 2/500
 - 0s - loss: 3.0903 - acc: 0.0000e+00
Epoch 3/500
 - 0s - loss: 3.0895 - acc: 0.0000e+00
Epoch 4/500
 - 0s - loss: 3.0888 - acc: 0.0833
Epoch 5/500
 - 0s - loss: 3.0880 - acc: 0.0833
Epoch 6/500
 - 0s - loss: 3.0872 - acc: 0.0833
Epoch 7/500
 - 0s - loss: 3.0864 - acc: 0.1250
Epoch 8/500
 - 0s - loss: 3.0856 - acc: 0.1250
Epoch 9/500
 - 0s - loss: 3.0848 - acc: 0.1250
Epoch 10/500
 - 0s - loss: 3.0839 - acc: 0.1250
Epoch 11/500
 - 0s - loss: 3.0831 - acc: 0.1250
Epoch 12/500
 - 0s - loss: 3.0823 - acc: 0.1667
Epoch 13/500
 - 0s - loss: 3.0814 - acc: 0.1667
Epoch 14/500
 - 0s - loss: 3.0806 - acc: 0.1667
Epoch 15/500
 - 0s - loss: 3.0797 - acc: 0.2083
Epoch 16/500
 - 0s - loss: 3.0788 - acc: 0.2083
Epoch 17/500
 - 0s - loss: 3.0779 - acc: 0.2083
Epoch 18/500
 - 0s - loss: 3.0769 - acc: 0.2083
Epoch 19/500
 - 0s - loss: 3.0760 - acc: 0.2083
Epoch 20/500
 - 0s - loss: 3.0750 - a

<keras.callbacks.History at 0x7f2d9c3f54e0>

In [45]:
# evaluate: predict the word most likely to follow a single seed word
in_text = 'Jack'
print(in_text)
# Keep the seed encoding under its own name: reusing `encoded` would overwrite
# the corpus encoding that the sequence-building cell reads.
# Shape (n, 1) matches the model's input_length=1 explicitly.
seed_encoded = np.array(tokenizer.texts_to_sequences([in_text])[0]).reshape(-1, 1)
# Sequential.predict_classes was removed in TensorFlow 2.6; taking the argmax
# of the softmax output yields the same class index.
yhat = np.argmax(model.predict(seed_encoded, verbose=0), axis=-1)
# Invert word_index once instead of scanning it for the predicted index.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
predicted_word = index_to_word.get(int(yhat[-1]))
# Index 0 has no word assigned; print nothing in that case.
if predicted_word is not None:
	print(predicted_word)

Jack
and


In [0]:
def generate_seq(model, tokenizer, seed_text, n_words):
	"""Generate text by repeatedly predicting the next word from the previous one.

	Args:
		model: trained model; ``model.predict`` is called with an integer array
			of shape (1, 1) holding one word index and must return one row of
			per-class scores.
		tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
			``word_index`` (word -> integer index).
		seed_text: text to start from. Only its last in-vocabulary word is fed
			to the model, because the model looks at a single word.
		n_words: maximum number of words to generate.

	Returns:
		``seed_text`` followed by the generated words, separated by single
		spaces. Generation stops early if the current text contains no known
		word or the model predicts an index that has no word assigned.
	"""
	# Invert the vocabulary once instead of scanning word_index on every step.
	index_to_word = {index: word for word, index in tokenizer.word_index.items()}
	in_text, result = seed_text, seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		if not encoded:
			# nothing the model knows about to condition on
			break
		# Feed only the last word, with an explicit (batch=1, length=1) shape.
		model_input = np.array(encoded[-1:]).reshape(1, 1)
		# predict_classes was removed in TensorFlow 2.6; argmax over the
		# predicted distribution gives the same class index.
		probabilities = model.predict(model_input, verbose=0)
		yhat = int(np.argmax(probabilities, axis=-1)[0])
		# map predicted word index to word
		out_word = index_to_word.get(yhat)
		if out_word is None:
			break
		# the predicted word becomes the next input
		in_text, result = out_word, result + ' ' + out_word
	return result

In [55]:
# Generate six words following the seed word 'Jack' with the trained model.
print(generate_seq(model, tokenizer, 'Jack', 6))

Jack and jill went up the hill
