In [45]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [46]:
data = [
    ' Jack and Jill went up the hill'
    	' To fetch a pail of water'
    	' Jack fell down and broke his crown'
	   ' And Jill came tumbling after'
]

In [47]:
data

[' Jack and Jill went up the hill To fetch a pail of water Jack fell down and broke his crown And Jill came tumbling after']

In [48]:
tokenizer = Tokenizer(100)
tokenizer.fit_on_texts(data)
text_to_numbers = tokenizer.texts_to_sequences(data)
word_ids = tokenizer.word_index
print(word_ids)

{'and': 1, 'jack': 2, 'jill': 3, 'went': 4, 'up': 5, 'the': 6, 'hill': 7, 'to': 8, 'fetch': 9, 'a': 10, 'pail': 11, 'of': 12, 'water': 13, 'fell': 14, 'down': 15, 'broke': 16, 'his': 17, 'crown': 18, 'came': 19, 'tumbling': 20, 'after': 21}


In [49]:
print(text_to_numbers)

[[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]]


In [50]:
lengths_of_strings = [len(s) for s in text_to_numbers]
print(max(lengths_of_strings))

25


In [51]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_num_text_data = pad_sequences(text_to_numbers, padding = 'post', maxlen = max(lengths_of_strings), truncating = 'post')
print(padded_num_text_data)

[[ 2  1  3  4  5  6  7  8  9 10 11 12 13  2 14 15  1 16 17 18  1  3 19 20
  21]]


# Find the vocabulary size

In [52]:
vocabulary_size = len(tokenizer.word_index) + 1
print(vocabulary_size)

22


# Prepare input and output data for training
### Today is a sunny[input] ___? (day)[output]

In [53]:
X_train = padded_num_text_data[:,:-1]
X_train

array([[ 2,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  2, 14, 15,
         1, 16, 17, 18,  1,  3, 19, 20]])

In [54]:
y_train = padded_num_text_data[:,-1]
print(y_train)

[21]


In [55]:
padded_num_text_data = pad_sequences(text_to_numbers, maxlen = max(lengths_of_strings))
print(padded_num_text_data)

[[ 2  1  3  4  5  6  7  8  9 10 11 12 13  2 14 15  1 16 17 18  1  3 19 20
  21]]


In [56]:
vocabulary_size = len(tokenizer.word_index) + 1
print(vocabulary_size)

22


In [57]:
X_train = padded_num_text_data[:,:-1]
X_train

array([[ 2,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  2, 14, 15,
         1, 16, 17, 18,  1,  3, 19, 20]])

In [58]:
y_train = padded_num_text_data[:,-1]
print(y_train)

[21]


In [59]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN
from tensorflow.keras.layers import Embedding

In [60]:
y_train = to_categorical(y_train, num_classes=vocabulary_size)
print(y_train)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [61]:
model = Sequential()
model.add(Embedding(vocabulary_size,10,
                    input_length=max(lengths_of_strings)-1))
model.add(Flatten())
model.add(Dense(vocabulary_size, activation = 'softmax'))

In [62]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 24, 10)            220       
                                                                 
 flatten_2 (Flatten)         (None, 240)               0         
                                                                 
 dense_2 (Dense)             (None, 22)                5302      
                                                                 
Total params: 5522 (21.57 KB)
Trainable params: 5522 (21.57 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [63]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [64]:
model.fit(X_train, y_train, epochs = 500)

Epoch 1/500


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 7

<keras.src.callbacks.History at 0x293c5c84bd0>

In [65]:
max_sequence_length = max(lengths_of_strings)

In [70]:
# Example prediction
import numpy as np
input_sequence = ["jack and jill went up the "]
input_sequence = tokenizer.texts_to_sequences(input_sequence)
input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length-1)

predicted_probs = model.predict(input_sequence)[0]
predicted_class_index = np.argmax(predicted_probs)

# Convert predicted class index back to word
predicted_word = tokenizer.index_word[predicted_class_index]

print("Predicted word:", predicted_word)

Predicted word: came


# Predict on the following Text
## data3 = """ Jack and Jill went up the hill\n
##		To fetch a pail of water\n
##		Jack fell down and broke his crown\n
##		And Jill came tumbling after\n """