<a href="https://colab.research.google.com/github/SHEHAN-120/gru-sequence-model-nlp/blob/main/Sequence_Prediction_with_GRU_Networks_in_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## NLP GRU


In [1]:
text="""Technology is changing the way we live and work.
Every day, new devices and applications are introduced to make our lives easier.
People use smartphones to communicate, search for information, and complete daily tasks.
With the help of artificial intelligence, machines can now learn and make decisions.
This transformation is creating new opportunities and challenges for society.
"""

In [2]:
import tensorflow as tf
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,GRU,Dense


In [3]:
tokenizer=Tokenizer()

In [4]:
tokenizer.fit_on_texts([text])

In [5]:
len(tokenizer.word_index)

47

In [6]:
for sentence in text.split('\n'):
  print(sentence)

Technology is changing the way we live and work. 
Every day, new devices and applications are introduced to make our lives easier. 
People use smartphones to communicate, search for information, and complete daily tasks. 
With the help of artificial intelligence, machines can now learn and make decisions. 
This transformation is creating new opportunities and challenges for society.



In [7]:
for sentence in text.split('\n'):
  print(tokenizer.texts_to_sequences([sentence])[0])

[8, 2, 9, 3, 10, 11, 12, 1, 13]
[14, 15, 4, 16, 1, 17, 18, 19, 5, 6, 20, 21, 22]
[23, 24, 25, 5, 26, 27, 7, 28, 1, 29, 30, 31]
[32, 3, 33, 34, 35, 36, 37, 38, 39, 40, 1, 6, 41]
[42, 43, 2, 44, 4, 45, 1, 46, 7, 47]
[]


In [8]:
input_sequence=[]
for sentence in text.split('\n'):
  tokenized_sentence=tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(tokenized_sentence)):
    input_sequence.append(tokenized_sentence[:i+1])

In [9]:
input_sequence

[[8, 2],
 [8, 2, 9],
 [8, 2, 9, 3],
 [8, 2, 9, 3, 10],
 [8, 2, 9, 3, 10, 11],
 [8, 2, 9, 3, 10, 11, 12],
 [8, 2, 9, 3, 10, 11, 12, 1],
 [8, 2, 9, 3, 10, 11, 12, 1, 13],
 [14, 15],
 [14, 15, 4],
 [14, 15, 4, 16],
 [14, 15, 4, 16, 1],
 [14, 15, 4, 16, 1, 17],
 [14, 15, 4, 16, 1, 17, 18],
 [14, 15, 4, 16, 1, 17, 18, 19],
 [14, 15, 4, 16, 1, 17, 18, 19, 5],
 [14, 15, 4, 16, 1, 17, 18, 19, 5, 6],
 [14, 15, 4, 16, 1, 17, 18, 19, 5, 6, 20],
 [14, 15, 4, 16, 1, 17, 18, 19, 5, 6, 20, 21],
 [14, 15, 4, 16, 1, 17, 18, 19, 5, 6, 20, 21, 22],
 [23, 24],
 [23, 24, 25],
 [23, 24, 25, 5],
 [23, 24, 25, 5, 26],
 [23, 24, 25, 5, 26, 27],
 [23, 24, 25, 5, 26, 27, 7],
 [23, 24, 25, 5, 26, 27, 7, 28],
 [23, 24, 25, 5, 26, 27, 7, 28, 1],
 [23, 24, 25, 5, 26, 27, 7, 28, 1, 29],
 [23, 24, 25, 5, 26, 27, 7, 28, 1, 29, 30],
 [23, 24, 25, 5, 26, 27, 7, 28, 1, 29, 30, 31],
 [32, 3],
 [32, 3, 33],
 [32, 3, 33, 34],
 [32, 3, 33, 34, 35],
 [32, 3, 33, 34, 35, 36],
 [32, 3, 33, 34, 35, 36, 37],
 [32, 3, 33, 34, 35, 3

In [10]:
max_len=max([len(x) for x in input_sequence])

In [11]:
max_len

13

In [12]:
padded_input_sequences=pad_sequences(input_sequence,maxlen=max_len,padding='pre')

In [13]:
padded_input_sequences

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  2,  9],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  2,  9,  3],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  8,  2,  9,  3, 10],
       [ 0,  0,  0,  0,  0,  0,  0,  8,  2,  9,  3, 10, 11],
       [ 0,  0,  0,  0,  0,  0,  8,  2,  9,  3, 10, 11, 12],
       [ 0,  0,  0,  0,  0,  8,  2,  9,  3, 10, 11, 12,  1],
       [ 0,  0,  0,  0,  8,  2,  9,  3, 10, 11, 12,  1, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  4],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  4, 16],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  4, 16,  1],
       [ 0,  0,  0,  0,  0,  0,  0, 14, 15,  4, 16,  1, 17],
       [ 0,  0,  0,  0,  0,  0, 14, 15,  4, 16,  1, 17, 18],
       [ 0,  0,  0,  0,  0, 14, 15,  4, 16,  1, 17, 18, 19],
       [ 0,  0,  0,  0, 14, 15,  4, 16,  1, 17, 18, 19,  5],
       [ 0,  0,  0, 14, 

In [14]:
X=padded_input_sequences[:,:-1]
y=padded_input_sequences[:,-1]

In [15]:
X

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  2,  9],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  8,  2,  9,  3],
       [ 0,  0,  0,  0,  0,  0,  0,  8,  2,  9,  3, 10],
       [ 0,  0,  0,  0,  0,  0,  8,  2,  9,  3, 10, 11],
       [ 0,  0,  0,  0,  0,  8,  2,  9,  3, 10, 11, 12],
       [ 0,  0,  0,  0,  8,  2,  9,  3, 10, 11, 12,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  4],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 14, 15,  4, 16],
       [ 0,  0,  0,  0,  0,  0,  0, 14, 15,  4, 16,  1],
       [ 0,  0,  0,  0,  0,  0, 14, 15,  4, 16,  1, 17],
       [ 0,  0,  0,  0,  0, 14, 15,  4, 16,  1, 17, 18],
       [ 0,  0,  0,  0, 14, 15,  4, 16,  1, 17, 18, 19],
       [ 0,  0,  0, 14, 15,  4, 16,  1, 17, 18, 19,  5],
       [ 0,  0, 14, 15,  4, 16,

In [16]:
y

array([ 2,  9,  3, 10, 11, 12,  1, 13, 15,  4, 16,  1, 17, 18, 19,  5,  6,
       20, 21, 22, 24, 25,  5, 26, 27,  7, 28,  1, 29, 30, 31,  3, 33, 34,
       35, 36, 37, 38, 39, 40,  1,  6, 41, 43,  2, 44,  4, 45,  1, 46,  7,
       47], dtype=int32)

In [17]:
tokenizer.word_index

{'and': 1,
 'is': 2,
 'the': 3,
 'new': 4,
 'to': 5,
 'make': 6,
 'for': 7,
 'technology': 8,
 'changing': 9,
 'way': 10,
 'we': 11,
 'live': 12,
 'work': 13,
 'every': 14,
 'day': 15,
 'devices': 16,
 'applications': 17,
 'are': 18,
 'introduced': 19,
 'our': 20,
 'lives': 21,
 'easier': 22,
 'people': 23,
 'use': 24,
 'smartphones': 25,
 'communicate': 26,
 'search': 27,
 'information': 28,
 'complete': 29,
 'daily': 30,
 'tasks': 31,
 'with': 32,
 'help': 33,
 'of': 34,
 'artificial': 35,
 'intelligence': 36,
 'machines': 37,
 'can': 38,
 'now': 39,
 'learn': 40,
 'decisions': 41,
 'this': 42,
 'transformation': 43,
 'creating': 44,
 'opportunities': 45,
 'challenges': 46,
 'society': 47}

In [18]:
y=to_categorical(y,num_classes=88)

In [19]:
y

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
y.shape

(52, 88)

In [21]:
X.shape

(52, 12)

## Model Building

In [22]:
model = Sequential()
model.add(Embedding(88,100,input_length=33))
model.add(GRU(150))
model.add(Dense(88,activation='softmax'))



In [23]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [24]:
model.summary()

In [25]:
X.shape

(52, 12)

In [26]:
y.shape

(52, 88)

In [27]:
model.fit(X,y,epochs=100)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 38ms/step - accuracy: 0.0000e+00 - loss: 4.4749
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1314 - loss: 4.4550
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1883 - loss: 4.4351
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.1651 - loss: 4.4114
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.1442 - loss: 4.3804 
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.1210 - loss: 4.3401
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1418 - loss: 4.2741
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1186 - loss: 4.1629
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x7d2a20c4ce10>

## Testing the model

In [28]:
text_1="Data"

token_text=tokenizer.texts_to_sequences([text_1])[0]
padded_text=pad_sequences([token_text],maxlen=33,padding='pre')
model.predict(padded_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 225ms/step


array([[2.41664065e-06, 3.44378827e-06, 8.14603493e-02, 6.77650645e-02,
        8.41101937e-05, 5.07615950e-05, 1.30162891e-06, 4.95365020e-05,
        7.62228592e-06, 2.81594880e-03, 8.54184473e-05, 6.08264854e-05,
        2.27660785e-05, 7.29703534e-05, 9.15820328e-06, 4.71340772e-03,
        3.99293131e-05, 1.29906002e-05, 1.15122275e-05, 2.65440099e-06,
        1.57671127e-06, 4.36668472e-07, 1.00046861e-07, 6.72513625e-06,
        1.04642203e-02, 4.65449790e-04, 1.44347914e-05, 6.58416320e-06,
        1.15320014e-04, 5.86706337e-05, 1.16079982e-05, 4.15315799e-06,
        4.56909265e-06, 1.46671350e-03, 9.17626603e-04, 1.42899540e-03,
        2.12908912e-04, 2.54814840e-05, 1.10866757e-04, 7.96891584e-07,
        1.67374583e-06, 2.12958253e-06, 2.68113909e-05, 8.26106608e-01,
        8.54203245e-04, 9.51807669e-06, 3.62708597e-05, 1.79662384e-05,
        9.53455674e-06, 6.42772284e-06, 1.17146710e-05, 2.22880681e-05,
        5.31128580e-06, 3.84025543e-06, 4.75959405e-06, 9.545383

In [29]:
pos=np.argmax(model.predict(padded_text))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


In [30]:
tokenizer.word_index

{'and': 1,
 'is': 2,
 'the': 3,
 'new': 4,
 'to': 5,
 'make': 6,
 'for': 7,
 'technology': 8,
 'changing': 9,
 'way': 10,
 'we': 11,
 'live': 12,
 'work': 13,
 'every': 14,
 'day': 15,
 'devices': 16,
 'applications': 17,
 'are': 18,
 'introduced': 19,
 'our': 20,
 'lives': 21,
 'easier': 22,
 'people': 23,
 'use': 24,
 'smartphones': 25,
 'communicate': 26,
 'search': 27,
 'information': 28,
 'complete': 29,
 'daily': 30,
 'tasks': 31,
 'with': 32,
 'help': 33,
 'of': 34,
 'artificial': 35,
 'intelligence': 36,
 'machines': 37,
 'can': 38,
 'now': 39,
 'learn': 40,
 'decisions': 41,
 'this': 42,
 'transformation': 43,
 'creating': 44,
 'opportunities': 45,
 'challenges': 46,
 'society': 47}

In [31]:
for word,index in tokenizer.word_index.items():
  if index==pos:
    print(word)

transformation


In [34]:
text_2="It is"

token_text_2=tokenizer.texts_to_sequences([text_2])[0]
padded_text_2=pad_sequences([token_text_2],maxlen=33,padding='pre')
model.predict(padded_text_2)

pos_2=np.argmax(model.predict(padded_text_2))

for word,index in tokenizer.word_index.items():
  if index==pos_2:
    print(word)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
changing
