In [2]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
# Toy training corpus: three short sentences, one per line.
# The physical line breaks of the triple-quoted string already separate the
# sentences; adding explicit newline escapes as well would insert blank lines
# between them.
text = """He is sick
That is so sick
The sick cat is dying
"""

In [4]:
# Fit a word-level tokenizer on the corpus. The word index starts at 1, so one
# extra slot is reserved for index 0 when sizing the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
vocab_size = 1 + len(tokenizer.word_index)
print("Size of vocabulary : %d" % vocab_size)

Size of vocabulary : 9


In [5]:
# Inspect the word -> integer index mapping learned by the tokenizer.
print(tokenizer.word_index)

{'is': 1, 'sick': 2, 'he': 3, 'that': 4, 'so': 5, 'the': 6, 'cat': 7, 'dying': 8}


In [6]:
# Build the training samples: for every sentence, emit each prefix of length
# >= 2 (e.g. [3, 1] then [3, 1, 2]). The last token of a prefix later becomes
# the label and the preceding tokens become the model input.
sequences = list()
for line in text.split('\n'):
    if not line.strip():
        # Blank lines carry no tokens and therefore contribute no samples.
        continue
    # texts_to_sequences takes a list of texts (one entry here), not a set.
    encoded = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

In [7]:
# Show the generated prefix sequences and how many samples they yield.
print(sequences,"\nCount of sample : ",len(sequences))

[[3, 1], [3, 1, 2], [4, 1], [4, 1, 5], [4, 1, 5, 2], [6, 2], [6, 2, 7], [6, 2, 7, 1], [6, 2, 7, 1, 8]] 
Count of sample :  9


In [8]:
# The longest prefix determines the common length every sample is padded to.
max_len = max(map(len, sequences))
print("Maximum length of sample : {}".format(max_len))

Maximum length of sample : 5


In [9]:
# Left-pad every sample with zeros up to max_len, so the label (the last
# token of each prefix) always ends up in the final column.
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [10]:
# Inspect the padded samples.
print(sequences)

[[0 0 0 3 1]
 [0 0 3 1 2]
 [0 0 0 4 1]
 [0 0 4 1 5]
 [0 4 1 5 2]
 [0 0 0 6 2]
 [0 0 6 2 7]
 [0 6 2 7 1]
 [6 2 7 1 8]]


In [11]:
# Split each padded sample into the model input (every column but the last)
# and the label (the last column).
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

In [12]:
# Inspect the inputs and their labels.
print(X,"\n")
print(y) # y holds the label (index of the next word) for each row of X

[[0 0 0 3]
 [0 0 3 1]
 [0 0 0 4]
 [0 0 4 1]
 [0 4 1 5]
 [0 0 0 6]
 [0 0 6 2]
 [0 6 2 7]
 [6 2 7 1]] 

[1 2 1 5 2 2 7 1 8]


In [13]:
# One-hot encode the labels with one column per vocabulary slot
# (vocab_size columns, which includes index 0).
y = to_categorical(y, num_classes=vocab_size)

In [14]:
# Inspect the one-hot encoded labels.
print(y)

[[0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [16]:
# Model hyper-parameters.
embedding_dim = 10  # length of each word-embedding vector
hidden_units = 32   # number of units in the SimpleRNN layer

# Embedding -> SimpleRNN -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    SimpleRNN(hidden_units),
    Dense(vocab_size, activation='softmax'),
])

In [17]:
# The labels are one-hot vectors over the vocabulary, hence categorical
# cross-entropy; accuracy is tracked for the per-epoch log.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
1/1 - 3s - loss: 2.2485 - accuracy: 0.0000e+00 - 3s/epoch - 3s/step
Epoch 2/200
1/1 - 0s - loss: 2.2319 - accuracy: 0.0000e+00 - 21ms/epoch - 21ms/step
Epoch 3/200
1/1 - 0s - loss: 2.2160 - accuracy: 0.0000e+00 - 20ms/epoch - 20ms/step
Epoch 4/200
1/1 - 0s - loss: 2.2006 - accuracy: 0.1111 - 14ms/epoch - 14ms/step
Epoch 5/200
1/1 - 0s - loss: 2.1857 - accuracy: 0.1111 - 15ms/epoch - 15ms/step
Epoch 6/200
1/1 - 0s - loss: 2.1712 - accuracy: 0.3333 - 15ms/epoch - 15ms/step
Epoch 7/200
1/1 - 0s - loss: 2.1569 - accuracy: 0.4444 - 14ms/epoch - 14ms/step
Epoch 8/200
1/1 - 0s - loss: 2.1427 - accuracy: 0.4444 - 14ms/epoch - 14ms/step
Epoch 9/200
1/1 - 0s - loss: 2.1284 - accuracy: 0.4444 - 21ms/epoch - 21ms/step
Epoch 10/200
1/1 - 0s - loss: 2.1140 - accuracy: 0.4444 - 19ms/epoch - 19ms/step
Epoch 11/200
1/1 - 0s - loss: 2.0993 - accuracy: 0.5556 - 20ms/epoch - 20ms/step
Epoch 12/200
1/1 - 0s - loss: 2.0842 - accuracy: 0.5556 - 20ms/epoch - 20ms/step
Epoch 13/200
1/1 - 0s - loss:

1/1 - 0s - loss: 0.5604 - accuracy: 0.8889 - 20ms/epoch - 20ms/step
Epoch 103/200
1/1 - 0s - loss: 0.5496 - accuracy: 0.8889 - 21ms/epoch - 21ms/step
Epoch 104/200
1/1 - 0s - loss: 0.5389 - accuracy: 0.8889 - 20ms/epoch - 20ms/step
Epoch 105/200
1/1 - 0s - loss: 0.5283 - accuracy: 0.8889 - 19ms/epoch - 19ms/step
Epoch 106/200
1/1 - 0s - loss: 0.5178 - accuracy: 0.8889 - 19ms/epoch - 19ms/step
Epoch 107/200
1/1 - 0s - loss: 0.5074 - accuracy: 0.8889 - 20ms/epoch - 20ms/step
Epoch 108/200
1/1 - 0s - loss: 0.4973 - accuracy: 0.8889 - 18ms/epoch - 18ms/step
Epoch 109/200
1/1 - 0s - loss: 0.4872 - accuracy: 0.8889 - 20ms/epoch - 20ms/step
Epoch 110/200
1/1 - 0s - loss: 0.4772 - accuracy: 1.0000 - 20ms/epoch - 20ms/step
Epoch 111/200
1/1 - 0s - loss: 0.4672 - accuracy: 1.0000 - 21ms/epoch - 21ms/step
Epoch 112/200
1/1 - 0s - loss: 0.4575 - accuracy: 1.0000 - 21ms/epoch - 21ms/step
Epoch 113/200
1/1 - 0s - loss: 0.4478 - accuracy: 1.0000 - 17ms/epoch - 17ms/step
Epoch 114/200
1/1 - 0s - loss:

<keras.callbacks.History at 0x31257dc0>

In [18]:
def sentence_generation(model, tokenizer, current_word, n, max_len=5, verbose=True):
    """Greedily extend ``current_word`` with up to ``n`` predicted words.

    At each step the text generated so far is encoded with ``tokenizer``,
    left-padded to ``max_len``, fed to ``model.predict`` and the word with
    the highest predicted probability is appended.

    Args:
        model: Object with ``predict(encoded, verbose=0)`` returning an array
            of shape (1, vocabulary size) with one score per word index.
        tokenizer: Object exposing ``texts_to_sequences`` and ``word_index``
            (word -> integer index).
        current_word: Seed text the generated words are appended to.
        n: Maximum number of words to generate.
        max_len: Length the encoded text is padded/truncated to before
            prediction. Defaults to 5, the value that was previously
            hard-coded.
            NOTE(review): the training input ``X`` in this notebook has 4
            columns (max_len - 1) while this default pads to 5; the recorded
            runs work with 5, but confirm which length is intended.
        verbose: When True (default), print the raw prediction array of every
            step, as the function has always done.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Generation stops early if the predicted index has no word in
        the vocabulary (for example the padding index 0) instead of appending
        an arbitrary word.
    """
    # Invert the vocabulary once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    words = [current_word]
    for _ in range(n):
        # Encoding & padding of everything generated so far.
        encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
        encoded = pad_sequences([encoded], maxlen=max_len, padding='pre')

        result = model.predict(encoded, verbose=0)
        if verbose:
            print(result, "\n")

        # A batch of one is predicted, so take the single argmax as a plain int.
        predicted_index = int(np.argmax(result, axis=1)[0])
        word = index_to_word.get(predicted_index)
        if word is None:
            # No vocabulary entry for this index: stop rather than guess.
            break
        words.append(word)

    return ' '.join(words)

In [20]:
# Generate 3 words following the seed "That".
sentence_generation(model, tokenizer, "That", 3)

[[7.4991811e-04 9.5172209e-01 4.1163312e-03 2.0920450e-03 1.3026928e-03
  3.0452847e-02 1.3999537e-03 5.2934848e-03 2.8705851e-03]] 

[[4.8426865e-03 3.9151944e-02 1.4619547e-02 3.5878629e-03 5.8914307e-03
  8.7554902e-01 1.6338956e-03 3.2414953e-04 5.4399390e-02]] 

[[7.5530676e-05 2.0256302e-04 9.9451232e-01 1.1349673e-04 1.1266001e-03
  4.3063282e-04 7.2951909e-05 1.5072075e-03 1.9586843e-03]] 



'That is so sick'

In [21]:
# Generate 5 words following the seed "The". The training sentence starting
# with "The" has only 4 further words, so the 5th prediction goes beyond
# anything seen in training.
sentence_generation(model, tokenizer, "The", 5)

[[0.00628905 0.28523317 0.52123606 0.00439475 0.00496216 0.10513956
  0.00396619 0.01373723 0.05504186]] 

[[0.00214279 0.27497113 0.07058426 0.00212181 0.00387782 0.00376626
  0.00474772 0.6353277  0.00246052]] 

[[2.5653986e-03 9.7502851e-01 3.2470524e-04 3.7379065e-03 2.0932181e-04
  1.4645806e-02 9.5541956e-04 1.0463791e-03 1.4866423e-03]] 

[[3.7548605e-03 5.3566549e-04 5.1840851e-03 1.8490843e-03 4.0181293e-03
  1.9605119e-02 2.1620439e-03 1.8798329e-04 9.6270299e-01]] 

[[0.00304753 0.00380214 0.3053477  0.00726334 0.01690933 0.0009032
  0.00636685 0.65458554 0.0017744 ]] 



'The sick cat is dying cat'