## NEXT WORD PREDICTION USING RNN

In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.layers import Dense, LSTM ,Embedding
from keras.models import Sequential


In [2]:
# Source text: a short passage about data science. This single string is the
# whole corpus that the tokenizer is fitted on and the word pairs are built from.
data='''Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems 
to extract knowledge and insights from noisy, structured and unstructured data, and apply knowledge from data across
a broad range of application domains. Data science is related to data mining, machine learning and big data.
Data science is a concept to unify statistics, data analysis, informatics, and their related methods in order to 
understand and analyse actual phenomena with data.It uses techniques and theories drawn from many fields within 
the context of mathematics, statistics, computer science, information science, and domain knowledge.'''

In [3]:
# Integer-encode the text: fit a word-level tokenizer on the corpus, then
# replace every word by its integer id. A single text goes in, so exactly one
# id sequence comes back and can be unpacked directly.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
[encoded_data] = tokenizer.texts_to_sequences([data])
encoded_data

[1,
 3,
 5,
 14,
 15,
 16,
 17,
 8,
 18,
 9,
 19,
 20,
 2,
 21,
 4,
 22,
 6,
 2,
 23,
 7,
 24,
 25,
 2,
 26,
 1,
 2,
 27,
 6,
 7,
 1,
 28,
 10,
 29,
 30,
 11,
 31,
 32,
 1,
 3,
 5,
 12,
 4,
 1,
 33,
 34,
 35,
 2,
 36,
 1,
 1,
 3,
 5,
 10,
 37,
 4,
 38,
 13,
 1,
 39,
 40,
 2,
 41,
 12,
 9,
 42,
 43,
 4,
 44,
 2,
 45,
 46,
 47,
 48,
 1,
 49,
 8,
 50,
 2,
 51,
 52,
 7,
 53,
 54,
 55,
 56,
 57,
 11,
 58,
 13,
 59,
 3,
 60,
 3,
 2,
 61,
 6]

In [4]:
# Vocabulary size: one slot per distinct word plus one extra, because the word
# ids produced above start at 1 and index 0 therefore needs a slot as well.
vocab_size = 1 + len(tokenizer.word_index)
print("Vocabulary Size is {}".format(vocab_size))

Vocabulary Size is 62


In [5]:
# Create word -> word training pairs: the model is fitted with one word as
# input and the word that follows it as output, so every pair of adjacent ids
# in the encoded text becomes one [current_id, next_id] sequence.
sequences = [
    encoded_data[start:start + 2]
    for start in range(len(encoded_data) - 1)
]

print('Total Sequences: {}'.format(len(sequences)))

Total Sequences: 95


In [6]:
# Input/output pairs: each entry is [input word id, id of the word that follows it]
sequences

[[1, 3],
 [3, 5],
 [5, 14],
 [14, 15],
 [15, 16],
 [16, 17],
 [17, 8],
 [8, 18],
 [18, 9],
 [9, 19],
 [19, 20],
 [20, 2],
 [2, 21],
 [21, 4],
 [4, 22],
 [22, 6],
 [6, 2],
 [2, 23],
 [23, 7],
 [7, 24],
 [24, 25],
 [25, 2],
 [2, 26],
 [26, 1],
 [1, 2],
 [2, 27],
 [27, 6],
 [6, 7],
 [7, 1],
 [1, 28],
 [28, 10],
 [10, 29],
 [29, 30],
 [30, 11],
 [11, 31],
 [31, 32],
 [32, 1],
 [1, 3],
 [3, 5],
 [5, 12],
 [12, 4],
 [4, 1],
 [1, 33],
 [33, 34],
 [34, 35],
 [35, 2],
 [2, 36],
 [36, 1],
 [1, 1],
 [1, 3],
 [3, 5],
 [5, 10],
 [10, 37],
 [37, 4],
 [4, 38],
 [38, 13],
 [13, 1],
 [1, 39],
 [39, 40],
 [40, 2],
 [2, 41],
 [41, 12],
 [12, 9],
 [9, 42],
 [42, 43],
 [43, 4],
 [4, 44],
 [44, 2],
 [2, 45],
 [45, 46],
 [46, 47],
 [47, 48],
 [48, 1],
 [1, 49],
 [49, 8],
 [8, 50],
 [50, 2],
 [2, 51],
 [51, 52],
 [52, 7],
 [7, 53],
 [53, 54],
 [54, 55],
 [55, 56],
 [56, 57],
 [57, 11],
 [11, 58],
 [58, 13],
 [13, 59],
 [59, 3],
 [3, 60],
 [60, 3],
 [3, 2],
 [2, 61],
 [61, 6]]

In [7]:
# Split the pairs column-wise: the first id of each pair is the model input X,
# the second id is the target y.
sequences = np.asarray(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

In [8]:
# Peek at the first five input word ids
X[:5]

array([ 1,  3,  5, 14, 15])

In [9]:
# Peek at the first five target (next-word) ids
y[:5]

array([ 3,  5, 14, 15, 16])

In [10]:
# One-hot encode the target word ids (one column per vocabulary entry).
# The ids are read from the second column of `sequences` instead of from `y`
# itself, so re-running this cell always starts from the integer ids and never
# one-hot encodes an array that is already one-hot encoded.
y = np_utils.to_categorical(np.asarray(sequences)[:, 1], num_classes=vocab_size)
y[:5]

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0

In [11]:
# Model building: word id -> embedding vector -> LSTM -> softmax over the vocabulary
EMBEDDING_DIM = 10  # length of the learned vector for each word
LSTM_UNITS = 50     # number of units in the recurrent layer

model = Sequential()
# input_length=1 because every training sample is a single word id
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=1))
model.add(LSTM(LSTM_UNITS))
# One output probability per vocabulary entry
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that only appends a stray "None" line to the output).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             620       
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 62)                3162      
                                                                 
Total params: 15,982
Trainable params: 15,982
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
# Compile the network: categorical cross-entropy against the one-hot targets,
# Adam as optimizer, and accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Train on the (input id, one-hot next-word) pairs for 100 passes over the data
model.fit(x=X, y=y, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x228866e3340>

In [14]:
#generate a sequence from the model
def generate_seq(model,tokenizer,enter_text,n_predict):
    """Generate up to ``n_predict`` words that follow ``enter_text``.

    The model is queried one word at a time: the most recent word is encoded
    to its integer id, the model predicts a distribution over the vocabulary,
    and the most probable word is appended and becomes the next query.

    Parameters
    ----------
    model : object with ``predict(batch, verbose=...)`` returning an array of
        shape (batch, vocabulary size).
    tokenizer : object exposing ``word_index`` (word -> id mapping) and
        ``texts_to_sequences(list_of_texts)``.
    enter_text : str
        Seed text. If it contains several words, only the last word known to
        the tokenizer is used as the first query, since the model takes a
        single word as input.
    n_predict : int
        Maximum number of words to generate.

    Returns
    -------
    str
        ``enter_text`` followed by the generated words, separated by spaces.
        Generation stops early (returning what was produced so far) when the
        current word is not in the vocabulary or the model predicts an id
        that has no word attached to it.
    """
    # Build the id -> word lookup once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # `current_text` is the text fed to the model next (avoids shadowing the
    # built-in input()); `result` accumulates the full generated sentence.
    current_text = result = enter_text

    #generate a fixed number of words
    for _ in range(n_predict):
        #encode the text as integers
        encoded = tokenizer.texts_to_sequences([current_text])[0]
        if len(encoded) == 0:
            # Nothing the tokenizer knows about: there is no valid model input.
            break

        # Feed only the last word, as one sample containing one id, so the
        # prediction always has exactly one row.
        model_input = np.asarray(encoded[-1:]).reshape(1, 1)

        #predict a word in vocabulary
        probabilities = model.predict(model_input, verbose=0)
        predicted_index = int(np.argmax(probabilities[0]))

        #map predicted word index to word
        out_word = index_to_word.get(predicted_index)
        if out_word is None:
            # The predicted id has no word (e.g. id 0, which word_index never uses).
            break

        #append to result and use the new word as the next input
        result = result + " " + out_word
        current_text = out_word

    return result
        
        

In [15]:
#evaluating 
print(generate_seq(model, tokenizer, 'statistics', 6))

statistics data data data data data data


In [16]:
#evaluating 
print(generate_seq(model, tokenizer, 'informatics', 6))

informatics and statistics data data data data
