In [1]:
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [2]:
corpus = [
	"The sun is shining",
	"The weather is sweet",
	"Go to the forest",
	"And play baseketball"
 	"deep learning is powerful",
    "natural language processing is fun",
    "machine learning is amazing",
    "i enjoy learning new things"]

In [3]:
# tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

total_words = len(tokenizer.word_index) + 1 

print("Vocab size", total_words)
print("word index ", tokenizer.word_index)

Vocab size 25
word index  {'is': 1, 'the': 2, 'learning': 3, 'sun': 4, 'shining': 5, 'weather': 6, 'sweet': 7, 'go': 8, 'to': 9, 'forest': 10, 'and': 11, 'play': 12, 'baseketballdeep': 13, 'powerful': 14, 'natural': 15, 'language': 16, 'processing': 17, 'fun': 18, 'machine': 19, 'amazing': 20, 'i': 21, 'enjoy': 22, 'new': 23, 'things': 24}


In [4]:
# sequence
input_sequence = []

for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequence.append(n_gram_sequence)

In [5]:
input_sequence

[[2, 4],
 [2, 4, 1],
 [2, 4, 1, 5],
 [2, 6],
 [2, 6, 1],
 [2, 6, 1, 7],
 [8, 9],
 [8, 9, 2],
 [8, 9, 2, 10],
 [11, 12],
 [11, 12, 13],
 [11, 12, 13, 3],
 [11, 12, 13, 3, 1],
 [11, 12, 13, 3, 1, 14],
 [15, 16],
 [15, 16, 17],
 [15, 16, 17, 1],
 [15, 16, 17, 1, 18],
 [19, 3],
 [19, 3, 1],
 [19, 3, 1, 20],
 [21, 22],
 [21, 22, 3],
 [21, 22, 3, 23],
 [21, 22, 3, 23, 24]]

In [6]:
max_seq_len = max([len(x) for x in input_sequence])
max_seq_len

6

In [7]:
input_sequence = pad_sequences(input_sequence, maxlen=max_seq_len, padding='pre')

In [8]:
input_sequence

array([[ 0,  0,  0,  0,  2,  4],
       [ 0,  0,  0,  2,  4,  1],
       [ 0,  0,  2,  4,  1,  5],
       [ 0,  0,  0,  0,  2,  6],
       [ 0,  0,  0,  2,  6,  1],
       [ 0,  0,  2,  6,  1,  7],
       [ 0,  0,  0,  0,  8,  9],
       [ 0,  0,  0,  8,  9,  2],
       [ 0,  0,  8,  9,  2, 10],
       [ 0,  0,  0,  0, 11, 12],
       [ 0,  0,  0, 11, 12, 13],
       [ 0,  0, 11, 12, 13,  3],
       [ 0, 11, 12, 13,  3,  1],
       [11, 12, 13,  3,  1, 14],
       [ 0,  0,  0,  0, 15, 16],
       [ 0,  0,  0, 15, 16, 17],
       [ 0,  0, 15, 16, 17,  1],
       [ 0, 15, 16, 17,  1, 18],
       [ 0,  0,  0,  0, 19,  3],
       [ 0,  0,  0, 19,  3,  1],
       [ 0,  0, 19,  3,  1, 20],
       [ 0,  0,  0,  0, 21, 22],
       [ 0,  0,  0, 21, 22,  3],
       [ 0,  0, 21, 22,  3, 23],
       [ 0, 21, 22,  3, 23, 24]], dtype=int32)

In [9]:
X, y = input_sequence[:, :-1], input_sequence[:,-1]

In [10]:
y = to_categorical(y, num_classes=total_words)

In [11]:
print("Input shape : ", X.shape)
print("Label shape : ", y.shape)

Input shape :  (25, 5)
Label shape :  (25, 25)


In [12]:
model = models.Sequential([
	layers.Embedding(input_dim=total_words, output_dim=100, input_length=max_seq_len-1),
	layers.SimpleRNN(150, activation='tanh'),
	layers.Dense(total_words, activation='softmax') # 25
])

model.compile(optimizer=optimizers.Adam(0.01), loss='categorical_crossentropy', metrics=['accuracy'])



In [22]:
model.summary()

In [14]:
model.fit(X, y, epochs=100, verbose=1, validation_batch_size=0.2)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0000e+00 - loss: 3.2213
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.2400 - loss: 2.6059
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.2000 - loss: 3.0127
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.4000 - loss: 2.1181
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.8000 - loss: 1.7421
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.9200 - loss: 1.3176
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.8400 - loss: 0.9412
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.9200 - loss: 0.6877
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x17715ec4040>

In [15]:
loss, acc = model.evaluate(X, y)
print("Loss : ", loss)
print("Accuracy : ", acc)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 208ms/step - accuracy: 0.9600 - loss: 0.0556
Loss :  0.05556227266788483
Accuracy :  0.9599999785423279


In [16]:
def generate_text(seed_text, n_words=5):
    for _ in range(n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=1)
    
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text.title()

In [17]:
print(corpus)

['The sun is shining', 'The weather is sweet', 'Go to the forest', 'And play baseketballdeep learning is powerful', 'natural language processing is fun', 'machine learning is amazing', 'i enjoy learning new things']


In [20]:
print(generate_text("The sun", n_words=3))
print(generate_text("And play", n_words=5))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
The Sun Is Shining Language
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
And Play Baseketballdeep Learning Is Powerful Baseketballdeep


In [21]:
print(generate_text("Go", n_words=3))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Go To The Forest
