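# Recurrent neural networks

Next-word prediction with a SimpleRNN language model on top of pre-trained GloVe word embeddings.

- GloVe pre-trained word vectors: https://nlp.stanford.edu/projects/glove/
- Background article on next-word prediction: https://towardsdatascience.com/a-deep-learning-approach-in-predicting-the-next-word-s-7b0ee9341bfe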

In [2]:
import csv
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as k
from keras.preprocessing.text import text_to_word_sequence
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer


In [3]:
# Load the corpus: the Project Gutenberg e-book "A Short History of the World" by H. G. Wells.

# NOTE(review): no explicit encoding is passed, so the platform default is used --
# confirm the encoding of history.txt before pinning one.
with open('history.txt') as corpus_file:
  raw_text = corpus_file.read()

# Normalise the text in one pass:
#   split()/join -> every run of whitespace (newlines, tabs, repeated spaces) becomes one space
#   strip()      -> no leading/trailing whitespace
#   lower()      -> case-insensitive vocabulary
content = " ".join(raw_text.split()).strip().lower()

# content = content[10000:400000]  # uncomment to limit the size of the text

print(len(content))

722922


In [4]:
# Tokenize the corpus two ways: a flat list of words and a list of rough "sentences".

# text_to_word_sequence splits the text into a list of word tokens
# (with the Keras defaults it also filters punctuation).
words = text_to_word_sequence(content)
print(words[:20])   # preview only -- the full list has one entry per word of the book
print(len(words))

# Approximate sentence segmentation: split at '.' (full stop), since sentences
# generally end with it.
elems = content.split('.')
print(elems[:3])    # preview only

121915


In [5]:
# Build the vocabulary with a Keras Tokenizer.

# Tokenization divides the text into smaller fragments (tokens); fitting the
# tokenizer on the sentence list creates the word -> index dictionary.
token = Tokenizer()
token.fit_on_texts(elems)

# word_index assigns a unique integer index to each word present in the text
word_index = token.word_index
print(list(word_index.items())[:10])   # preview only -- the full dict has one entry per distinct word

# Number of distinct words. Keras reserves index 0 (it is never assigned to a word),
# which is why later cells size things as max_words + 1.
max_words = len(word_index)
print(max_words)


11538


In [6]:
# Encode the corpus: look up the integer index of every token in word_index
arr = [word_index[token_text] for token_text in words]

# Pack the indices into a 1-D numpy array (one entry per token of the corpus)
x = np.array(arr)
print(x.shape)

(121915,)


In [7]:
# Build (context window -> next word) training pairs with a sliding window.
window_size = 20

n_windows = len(x) - window_size
assert n_windows > 0, "corpus must contain more than window_size words"

# positions[i, j] == i + j, so indexing x with it yields every window
# x[i:i+window_size] at once instead of filling the rows in a Python loop.
# float64 is kept because the arrays were previously allocated with np.zeros.
positions = np.arange(n_windows)[:, None] + np.arange(window_size)[None, :]
in_data = x[positions].astype(np.float64)

# The target of window i is the word right after it, i.e. x[i + window_size].
out_datax = x[window_size:].reshape(-1, 1).astype(np.float64)

# One-hot encode the targets; num_classes is max_words+1 because index 0 is unused by word_index.
# NOTE(review): this matrix has n_windows x (max_words+1) entries, which is very large for
# this corpus. Integer targets with sparse_categorical_crossentropy would avoid it, but that
# also requires changing the loss passed to model.compile().
out_data = tf.keras.utils.to_categorical(out_datax, num_classes=max_words+1)

assert in_data.shape == (n_windows, window_size)
assert out_data.shape == (n_windows, max_words + 1)

# Hold out 20% of the windows; random_state fixes the shuffle for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(in_data, out_data, test_size=0.2, random_state=123)
assert len(x_train) + len(x_test) == n_windows

# Free the un-split copies; only the train/test arrays are used from here on.
del in_data
del out_data

In [8]:
# Load the pre-trained GloVe vectors into a dict: word -> float32 vector of length max_embed.

max_embed = 50
path = 'glove.6B.50d.txt'

embeddings_index = {}
# GloVe text files are UTF-8 and contain non-ASCII tokens. Without an explicit
# encoding, open() uses the platform default and can raise UnicodeDecodeError
# (e.g. on Windows).
with open(path, encoding='utf-8') as f:
  for line in f:
    # Each line is: <word> <coeff_1> ... <coeff_n>, separated by whitespace
    values = line.split()
    word = values[0]
    coeffs = np.array(values[1:max_embed+1], dtype='float32')
    embeddings_index[word] = coeffs

# Show the first two entries as a sanity check
dict(list(embeddings_index.items())[0:2])

{'the': array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
        -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
         2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
         1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
        -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
        -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
         4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
         7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
        -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
         1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
       dtype=float32),
 ',': array([ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
        -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
        -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078639,
        -0.4

In [9]:
# Create the matrix of embeddings for our vocabulary.
# From the big GloVe dict we extract only the words we use; the result is the
# weight matrix of the Embedding layer (row i = vector of the word with index i).

# Rows stay all-zero for index 0 (not assigned to any word) and for words
# that have no GloVe vector.
embeddings_matrix = np.zeros((max_words+1, max_embed))
for word, i in word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embeddings_matrix[i] = embedding_vector
print(embeddings_matrix.shape)

# Free the full GloVe dict. NOTE: after this, re-running this cell requires
# re-running the GloVe loading cell first.
del embeddings_index

(11539, 50)


In [10]:
# Next-word model: frozen GloVe embeddings -> two stacked SimpleRNN layers -> softmax over the vocabulary.
model = tf.keras.Sequential([
    # Embedding weights are taken from embeddings_matrix and are not updated during training.
    tf.keras.layers.Embedding(input_dim = max_words+1, output_dim = max_embed, weights = [embeddings_matrix], input_length = window_size, trainable = False),
    # return_sequences=True: the first RNN passes its output at every time step to the second RNN.
    tf.keras.layers.SimpleRNN(256, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    # The second RNN returns only its final output, which summarises the whole window.
    tf.keras.layers.SimpleRNN(256),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation='relu'),
    # One output unit per vocabulary index (0..max_words), matching the one-hot targets.
    tf.keras.layers.Dense(max_words+1, activation='softmax')
    ])
model.summary()
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics = ['accuracy'])


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 50)            576950    
                                                                 
 simple_rnn (SimpleRNN)      (None, 20, 256)           78592     
                                                                 
 dropout (Dropout)           (None, 20, 256)           0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 256)               131328    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense (Dense)               (None, 256)               65792     
                                                                 
 dense_1 (Dense)             (None, 11539)             2

In [11]:
# Train the model; validation_split holds out a 0.2 fraction of the training data for validation.
fit_options = dict(epochs=10, batch_size=256, validation_split=0.2, verbose=1)
history = model.fit(x_train, y_train, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
def prediction(in_phase):
    """Predict the word that follows `in_phase` using the trained `model`.

    The normalised phrase is echoed to stdout, tokenized with
    `text_to_word_sequence`, mapped to indices through `word_index` and fed
    to `model.predict` as a single row of `window_size` indices.

    Args:
        in_phase: Input phrase (str). Only its last `window_size` words are
            used; shorter phrases are left-padded with index 0. Words that
            are not in `word_index` are encoded as 0 as well.

    Returns:
        A list holding the predicted word, or an empty list when the index
        with the highest probability has no entry in `word_index`
        (for example index 0).
    """
    in_phase = in_phase.strip().lower()
    print(in_phase)
    in_words = text_to_word_sequence(in_phase)

    # Use the *last* window_size words: the model predicts what comes after
    # the end of the phrase, so the most recent context is the relevant one.
    # .get(w, 0) keeps out-of-vocabulary words from raising KeyError.
    context = [word_index.get(w, 0) for w in in_words[-window_size:]]

    # Left-pad with zeros so the row always has exactly window_size entries.
    x = np.zeros((1, window_size))
    if context:
        x[0, -len(context):] = context

    p = model.predict(x, verbose=0)
    # argmax over the vocabulary axis; [0] picks the single row of the batch.
    predicted = int(np.argmax(p, axis=-1)[0])
    return [word for word, idx in word_index.items() if idx == predicted]

# Greedy generation: predict the next word, append it to the phrase, repeat 10 times.
in_text = "I understand that they could see"

for _ in range(10):
    pr = prediction(in_text)
    print(pr)
    in_text = " ".join((in_text, pr[0]))

i understand that they could see
['to']
i understand that they could see to
['the']
i understand that they could see to the
['world']
i understand that they could see to the world
['of']
i understand that they could see to the world of
['the']
i understand that they could see to the world of the
['world']
i understand that they could see to the world of the world
['of']
i understand that they could see to the world of the world of
['the']
i understand that they could see to the world of the world of the
['world']
i understand that they could see to the world of the world of the world
['of']
