In [1]:
import os
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from tensorflow.keras.utils import to_categorical
from keras.utils.data_utils import get_file
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
from google.colab import files
uploaded = files.upload()

Saving wonderland.txt to wonderland.txt


In [3]:
path = r'/content/wonderland.txt'
with open(path, encoding='utf-8') as f:
    text = f.read()

In [4]:
print('corpus length:',len(text))

corpus length: 163781


In [5]:
print('some part of text:',text[:150])

some part of text: ﻿Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll

This eBook is for the use of anyone anywhere at no cost and with
almost no re


In [6]:
tokens=text.replace('--', ' ').split()
cleaned_tokens=[]
table=str.maketrans('','', string.punctuation)
for word in tokens:
    word=word.translate(table)
    if word.isalpha():
        cleaned_tokens.append(word.lower())

In [7]:
min_count=2
unknown_token='<unk>'
word2index={unknown_token: 0}
index2word=[unknown_token]

filtered_words=0
counter=Counter(cleaned_tokens)
for word, count in counter.items():
    if count>=min_count:
        index2word.append(word)
        word2index[word]=len(word2index)
    else:
        filtered_words+=1

num_classes=len(word2index)
print('vocabulary size: ',num_classes)
print('filtered words: ',filtered_words)

vocabulary size:  1702
filtered words:  1385


In [8]:
step=3
maxlen=40
X=[]
y=[]
for i in range(0,len(cleaned_tokens)-maxlen,step):
    sentence=cleaned_tokens[i:i+maxlen]
    next_word=cleaned_tokens[i+maxlen]
    X.append([word2index.get(word,0) for word in sentence])
    y.append(word2index.get(next_word,0))
X=np.array(X)
Y=to_categorical(y,num_classes)
print('sequence dimension: ',X.shape)
print('target dimension: ',Y.shape)
print('example sequence:\n',X[0])

sequence dimension:  (9812, 40)
target dimension:  (9812, 1702)
example sequence:
 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 19
 24 25 26 27 28 29 30 29 31 32 33 29 34 13 35 15]


In [9]:
model=Sequential()
model.add(Embedding(num_classes,output_dim=50,input_length=maxlen,))
model.add(LSTM(256))
model.add(Dense(num_classes,activation='softmax'))

In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 50)            85100     
                                                                 
 lstm (LSTM)                 (None, 256)               314368    
                                                                 
 dense (Dense)               (None, 1702)              437414    
                                                                 
Total params: 836,882
Trainable params: 836,882
Non-trainable params: 0
_________________________________________________________________


In [12]:
address1='lstm_weights.hdf5'
print('model checkpoint address: ',address1)

history=model.fit(X,Y,batch_size=64,epochs=200, verbose=1,validation_split=0.2)

model_info={'history': history,'model':model}

model checkpoint address:  lstm_weights.hdf5
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Ep

In [13]:
loss,accuracy = model.evaluate(X,Y,batch_size=64)
print("Loss is ",loss*100,'%')
print("Accuracy is ",accuracy*100,'%')

Loss is  293.9980745315552 %
Accuracy is  81.24745488166809 %


In [14]:
def check_prediction(model, num_predict):
    true_print_out='Actual words: '
    pred_print_out='Predicted words: '
    for i in range(num_predict):
        x=X[i]
        prediction=model.predict(x[np.newaxis, :], verbose = 0)
        index=np.argmax(prediction)
        true_print_out+=index2word[y[i]]+' '
        pred_print_out+=index2word[index]+' '

    print(true_print_out)
    print()
    print(pred_print_out)


In [15]:
model=model_info['model']
check_prediction(model,15)

Actual words: the license this online <unk> in lewis date <unk> last <unk> of gutenberg adventures alices 

Predicted words: the license this online <unk> in lewis date <unk> last <unk> of gutenberg adventures alices 
