In [None]:
import numpy as np
import sys
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense , Dropout , LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint



In [None]:
# Read the entire corpus file into memory as one string.
with open(
    'frankenstein-2.txt',
    mode='r',
    encoding='latin-1',
    errors='ignore',
) as file:
    text = file.read()

In [None]:

def tokenize_words(input_text):
    """Normalize raw text into a space-separated string of content words.

    The text is lowercased, split into runs of word characters (``\\w+``),
    and every token found in NLTK's English stopword list is dropped.

    Args:
        input_text: The raw text to clean.

    Returns:
        A single string containing the surviving tokens joined by one space.
    """
    input_text = input_text.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input_text)
    # Fetch the stopword list once and keep it in a set: calling
    # stopwords.words() inside the comprehension would re-evaluate it for
    # every token and make each membership test a linear scan.
    english_stopwords = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in english_stopwords]
    return " ".join(filtered_tokens)

# Cleaned corpus: lowercased, stopwords removed, tokens joined by single spaces.
processed_inputs = tokenize_words(text)


In [None]:
# Character vocabulary: every distinct character in the cleaned corpus, sorted
# so the integer codes are stable from run to run.
chars = sorted(set(processed_inputs))
# Forward lookup (character -> integer code) used to encode training windows.
char_to_num = {c: i for i, c in enumerate(chars)}
# Inverse lookup (integer code -> character). The text-generation cells decode
# with it; it was previously never defined, which raised NameError on a fresh
# kernel.
num_to_char = {i: c for i, c in enumerate(chars)}
input_len = len(processed_inputs)
vocab_len = len(chars)

print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)


In [None]:
seq_length = 100

# Slide a window of seq_length characters over the corpus one step at a time.
# Each window (integer-encoded) is an input sample; the character immediately
# after the window is its target.
window_starts = range(input_len - seq_length)

x_data = [
    [char_to_num[ch] for ch in processed_inputs[begin:begin + seq_length]]
    for begin in window_starts
]
y_data = [
    char_to_num[processed_inputs[begin + seq_length]]
    for begin in window_starts
]

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)


In [None]:
# Arrange the encoded windows as (n_patterns, seq_length, 1) and divide by the
# vocabulary size so every input value falls in [0, 1).
X = np.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

# One-hot encode the target character codes.
y = np_utils.to_categorical(y_data)

# Three stacked LSTM layers, each followed by dropout, ending in a softmax
# over the target classes.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [None]:

# Targets are one-hot vectors, so train with categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Save the model to disk whenever the training loss reaches a new minimum.
filepath = "modelweights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [None]:
# Train the network; the checkpoint callback runs during fitting.
model.fit(
    X,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

In [None]:
# Restore the lowest-loss weights written by the ModelCheckpoint callback so
# that text generation uses the best epoch rather than whatever the final
# epoch left in memory, then recompile the restored model.
model.load_weights(filepath)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Pick a random training window to seed generation. np.random.randint treats
# its upper bound as exclusive, so passing len(x_data) makes every pattern
# (including the last) selectable and also works when only one pattern exists.
start = np.random.randint(0, len(x_data))
pattern = x_data[start]
print("Random Seed:")
# Decode the integer-encoded seed back to text for display.
print("\"" + "".join([num_to_char[value] for value in pattern]) + "\"")

In [None]:

# Generate 1000 characters greedily: predict the next character from the
# current window, emit it, then slide the window forward by one position.
for i in range(1000):
    # Same layout and scaling as the training inputs: (1, window_len, 1),
    # with codes divided by the vocabulary size.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Most probable class, as a plain int so the window stays a list of ints.
    index = int(np.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a NEW list for the next window. Appending in place would mutate
    # the seed list that still lives inside x_data and corrupt the dataset.
    pattern = pattern[1:] + [index]