In [81]:
#import dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense,Dropout,LSTM #dense,dropout,LSTM are all layers
from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [82]:
#load data.Step-2:Localization->break data into tokens
#loading and opening our data in .txt form
file = open("frankenstein-2.txt").read()


In [83]:
#tokenization->The process of breaking a stream of text into words,phrases,symbols or meaningful elements
#standardisation
def tokenize_words(input):
    input = input.lower()  #lowercase everything to standardise it
    tokenizer = RegexpTokenizer(r'\w+')  #instantiating the tokenizer
    tokens = tokenizer.tokenize(input)  #tokenizing the text into tokens
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return "".join(filtered)
processed_inputs = tokenize_words(file)
                            

In [84]:
#neural network understands only numbers.So convert characters to numbers
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c,i) for i,c in enumerate(chars))

In [85]:
#check if words to chars and chars to nums has worked:
input_len = len(processed_inputs)
vocab_len = len(chars)
print("Total number of characters:", input_len)
print("Total vocabulary:", vocab_len)

Total number of characters: 232972
Total vocabulary: 37


In [86]:
#seq length
seq_length = 100
x_data = []
y_data = []

In [87]:
for i in range(0,input_len - seq_length,1):
    in_seq = processed_inputs[i:i+seq_length]
    out_seq = processed_inputs[i+seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])
    
n_patterns = len(x_data)
print("Total Patterns", n_patterns)

Total Patterns 232872


In [88]:
#convert i/p seq to np array etc.,
X = numpy.reshape(x_data,(n_patterns,seq_length,1))
X = X/float(vocab_len)

In [89]:
#One-hot encoding
y = np_utils.to_categorical(y_data)

In [90]:
#create LSTM model
#1.Create sequential model:
model = Sequential()
#add layers 1 by 1
model.add(layers.LSTM(256,input_shape=(X.shape[1],X.shape[2])))
model.add(layers.Dropout(0.2))
#model.add(layers.LSTM(128))
model.add(layers.Dropout(0.2))
#model.add(layers.LSTM(64))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(y.shape[1],activation='softmax'))

In [91]:
#compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [92]:
#saving weights
filepath = 'model_weights_saved.hdf5'
checkpoint = ModelCheckpoint(filepath,monitor='loss',verbose = 1,save_best_only = True, mode='min')
desired_callbacks = [checkpoint]

In [93]:
#fit model and let it train
model.fit(X,y,epochs=4,batch_size=256,callbacks=desired_callbacks)

Epoch 1/4
Epoch 00001: loss improved from inf to 2.93153, saving model to model_weights_saved.hdf5
Epoch 2/4
Epoch 00002: loss improved from 2.93153 to 2.89760, saving model to model_weights_saved.hdf5
Epoch 3/4
Epoch 00003: loss improved from 2.89760 to 2.88349, saving model to model_weights_saved.hdf5
Epoch 4/4
Epoch 00004: loss improved from 2.88349 to 2.87965, saving model to model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x7f70d30d1cf8>

In [97]:
#recompile model with saved weights
filename = 'model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [98]:
num_to_char = dict((i,e) for i,e in enumerate(chars))

In [108]:
# random seed to help generate
start = numpy.random.randint(0,len(x_data) - 1)
pattern = x_data[start]
print("Random Seed: ")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed: 
" arlysuspectfraudfalsedealinganotherfriendhoweverstronglymayattachedmayspitecontemplatedsuspicionenjo "


In [110]:
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]

    sys.stdout.write(result)

    pattern.append(index)
    pattern = pattern[1:len(pattern)]

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee

In [None]:
#if epoch is increased,it can recognize patterns much better and pattern other than e also will be recognized by our model.
