In [32]:
import tensorflow as tf
import string
import requests


In [33]:
# Download the plain-text edition of "Metamorphosis" from Project Gutenberg.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# feeding an error page into the rest of the pipeline.
response = requests.get("http://www.gutenberg.org/cache/epub/5200/pg5200.txt", timeout=30)
response.raise_for_status()

In [34]:
response.text[:1500]


'\ufeffThe Project Gutenberg EBook of Metamorphosis, by Franz Kafka\r\nTranslated by David Wyllie.\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give it away or\r\nre-use it under the terms of the Project Gutenberg License included\r\nwith this eBook or online at www.gutenberg.org\r\n\r\n** This is a COPYRIGHTED Project Gutenberg eBook, Details Below **\r\n**     Please follow the copyright guidelines in this file.     **\r\n\r\n\r\nTitle: Metamorphosis\r\n\r\nAuthor: Franz Kafka\r\n\r\nTranslator: David Wyllie\r\n\r\nRelease Date: August 16, 2005 [EBook #5200]\r\nFirst posted: May 13, 2002\r\nLast updated: May 20, 2012\r\n\r\nLanguage: English\r\n\r\n\r\n*** START OF THIS PROJECT GUTENBERG EBOOK METAMORPHOSIS ***\r\n\r\n\r\n\r\n\r\nCopyright (C) 2002 David Wyllie.\r\n\r\n\r\n\r\n\r\n\r\n  Metamorphosis\r\n  Franz Kafka\r\n\r\nTranslated by David Wyllie\r\n\r\n\r\n\r\nI\r\n\r\n\r\nOne morning, when Gregor

In [35]:
data = response.text.split('\n')
data[0]


'\ufeffThe Project Gutenberg EBook of Metamorphosis, by Franz Kafka\r'

In [36]:
# Drop the first 253 lines of the downloaded text.
# NOTE(review): 253 is a hard-coded offset tied to this particular download. The
# first remaining line shown below is mid-sentence story text, so the offset appears
# to skip part of the story as well as the Gutenberg header -- confirm the intended
# start line.
# NOTE: `data` is overwritten in place, so re-running this cell drops another 253 lines.
data = data[253:]
data[0]

'away from the bed, bend down with the load and then be patient and\r'

In [37]:
len(data)

2110

In [38]:
data = " ".join(data)
data[:1000]


'away from the bed, bend down with the load and then be patient and\r careful as he swang over onto the floor, where, hopefully, the\r little legs would find a use.  Should he really call for help\r though, even apart from the fact that all the doors were locked?\r Despite all the difficulty he was in, he could not suppress a smile\r at this thought.\r \r After a while he had already moved so far across that it would have\r been hard for him to keep his balance if he rocked too hard.  The\r time was now ten past seven and he would have to make a final\r decision very soon.  Then there was a ring at the door of the flat.\r "That\'ll be someone from work", he said to himself, and froze very\r still, although his little legs only became all the more lively as\r they danced around.  For a moment everything remained quiet.\r "They\'re not opening the door", Gregor said to himself, caught in\r some nonsensical hope.  But then of course, the maid\'s firm steps\r went to the door as ever and o

In [39]:
def clean_text(doc):
    """Turn raw text into a list of lowercase, purely alphabetic word tokens.

    The text is split on whitespace, every character in ``string.punctuation``
    is removed from each piece, and pieces that are not entirely alphabetic
    afterwards (numbers, empty strings, mixed tokens) are discarded.

    Args:
        doc: The text to tokenize.

    Returns:
        A list of lowercase words in their original order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw_word in doc.split():
        bare = raw_word.translate(strip_punctuation)
        # Tokens such as "42" or "--" are not alphabetic once stripped; skip them.
        if bare.isalpha():
            cleaned.append(bare.lower())
    return cleaned
tokens = clean_text(data)
print(tokens[:50])

['away', 'from', 'the', 'bed', 'bend', 'down', 'with', 'the', 'load', 'and', 'then', 'be', 'patient', 'and', 'careful', 'as', 'he', 'swang', 'over', 'onto', 'the', 'floor', 'where', 'hopefully', 'the', 'little', 'legs', 'would', 'find', 'a', 'use', 'should', 'he', 'really', 'call', 'for', 'help', 'though', 'even', 'apart', 'from', 'the', 'fact', 'that', 'all', 'the', 'doors', 'were', 'locked', 'despite']


In [40]:
len(tokens)

22607

In [41]:
# Each training line holds 50 context words plus the 1 word to be predicted.
length = 50 + 1

# Slide a window of `length` tokens over the corpus, one token at a time. `end` is
# the exclusive end index of each window. Windows are produced for end values up to
# 200001 at most, which caps the number of lines for very large corpora.
# Note: range() excludes its stop value, so the window that would end on the very
# last token (end == len(tokens)) is not produced.
last_end = min(len(tokens), 200001 + 1)
lines = [' '.join(tokens[end - length:end]) for end in range(length, last_end)]

print(len(lines))

22556


# Build LSTM Model and Prepare X and y

#### Import all the necessary libraries used to pre-process the data and create the layers of the neural network.

In [42]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

##### We are going to create a unique numerical token for each unique word in the dataset. fit_on_texts() updates the internal vocabulary based on a list of texts. texts_to_sequences() transforms each text in texts into a sequence of integers.

In [43]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)


##### sequences contains a list of integer sequences created by the tokenizer. Each line in sequences has 51 words. Now we will split each line such that the first 50 words are in X and the last word is in y.


In [44]:
# Convert the list of integer sequences into a NumPy array so it can be sliced by column.
# Assumes every sequence has the same length (51 per the note above) -- the 2-D
# slicing below relies on that.
sequences = np.array(sequences)
# Every column except the last is the model input; the last column is the word to predict.
X, y = sequences[:, :-1], sequences[:,-1]
X[0]


array([ 103,   29,    1,  245, 2883,   98,   14,    1, 1435,    3,   48,
         30,  618,    3,  756,   13,    6, 1434,  107,  165,    1,  149,
         86, 2880,    1,   78,  225,   21,  530,   12,  156,  193,    6,
        142,  754,   17,  180,  116,   49, 1433,   29,    1,  753,   11,
         26,    1,  455,   58,  617,  329])

In [45]:
vocab_size = len(tokenizer.word_index) + 1


##### to_categorical() converts a class vector (integers) to binary class matrix. num_classes is the total number of classes which is vocab_size.

In [46]:
y = to_categorical(y, num_classes=vocab_size)


In [47]:
seq_length = X.shape[1]
seq_length

50

# LSTM Model


##### A Sequential model is appropriate for a plain stack of layers where each layer has exactly one input tensor and one output tensor

In [48]:
# Next-word prediction network, declared as a single list of layers:
#   1. Embedding: maps each word index to a 50-dimensional vector.
#   2. LSTM(100) returning the full sequence so a second LSTM can be stacked on it.
#   3. LSTM(100) returning only its final output.
#   4. Dense(100) with ReLU activation.
#   5. Dense(vocab_size) with softmax: one probability per vocabulary index.
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_length),
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])


In [49]:
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            144250    
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 2885)              291385    
Total params: 586,535
Trainable params: 586,535
Non-trainable params: 0
_________________________________________________________________


In [50]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

##### After compiling the model we will now train it using model.fit() on the training dataset. We will use 100 epochs to train the model. An epoch is one iteration over the entire X and y data provided. batch_size is the number of samples per gradient update, i.e. the weights will be updated after every 256 training examples.

##### We are now going to generate words using the model. For this we need a set of 50 words to predict the 51st word. So we are taking a random line.


In [23]:
# Train for 100 epochs with 256 samples per gradient update.
# NOTE(review): the saved execution count of this cell (23) is lower than that of the
# cells that build and compile `model` (48 and 50), so the recorded run was not
# top-to-bottom. Re-run this cell after building the model; otherwise the generation
# cells further down presumably use freshly initialised, untrained weights.
model.fit(X, y, batch_size = 256, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x24a9db10250>

In [51]:
seed_text=lines[12343]
seed_text

'condition seemed serious enough to remind even his father that gregor despite his current sad and revolting form was a family member who could not be treated as an enemy on the contrary as a family there was a duty to swallow any revulsion for him and to be patient just'

###### generate_text_seq() generates n_words words after the given seed_text. We pre-process the seed_text before predicting: first we encode it using the same tokenizer that was used to encode the training data, then we pad or truncate it to exactly 50 word indices using pad_sequences(). Next we use the model to predict the index of the most likely next word and look that index up in the tokenizer's vocabulary to get the word. Finally we append the predicted word to both seed_text and text, and repeat the process.

In [52]:
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
    """Greedily generate ``n_words`` words that continue ``seed_text``.

    On every step the current text is encoded with ``tokenizer``, padded or
    truncated (from the front) to ``text_seq_length`` indices, and passed to
    ``model``. The vocabulary index with the highest predicted score is turned
    back into a word, appended to the running text, and used as context for
    the next step.

    Args:
        model: Trained model whose ``predict`` returns one score per
            vocabulary index for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        text_seq_length: Number of word indices the model expects as input.
        seed_text: Text used as the initial context.
        n_words: Number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed text is not
        included). An index with no vocabulary entry (e.g. the padding
        index 0) contributes an empty string.
    """
    # Build the reverse vocabulary once instead of scanning word_index for every word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    generated = []
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        encoded = pad_sequences([encoded], maxlen=text_seq_length, truncating='pre')

        # Sequential.predict_classes() has been removed from Keras; taking the argmax
        # of predict() yields the same class index and works on old and new versions.
        # verbose=0 keeps predict() from printing a progress bar on every step.
        scores = model.predict(encoded, verbose=0)
        # Reduce the one-row result to a plain int so the lookup is unambiguous.
        predicted_index = int(np.argmax(scores, axis=-1)[0])

        predicted_word = index_to_word.get(predicted_index, '')
        seed_text = seed_text + ' ' + predicted_word
        generated.append(predicted_word)

    return ' '.join(generated)
       


            


        
 

##### We can see that the next 100 words are predicted by the model for the seed_text.


In [53]:
generate_text_seq(model, tokenizer, seq_length, seed_text, 100)


'chance chance chance added sleeve covering goes goes abandoned abandoned shock shock shock moment solid solid solid awful awful example example example family dark dark expected expected moment moment reply balls balls balls problem imagined imagined imagined imagined imagined rage rage nearer explained explained nearer explained explained nearer explained explained total limitation kiss harmed harmed forgotten forgotten intention intention intention intention repelled repelled repelled repelled shock shock downloading he slight slight attached shoulders shoulders shoulders explained explained explained explained explained chosen chosen chosen run run run run run run run fancy fancy fancy salts salts salts salts contact contact load'