In [1]:
#importing libraries
import numpy
import sys
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, LSTM

In [2]:
#load data
file=open("frankenstein.txt").read()

In [3]:
#tokenization
#standardization
def tokenize_words(input):
    input=input.lower()
    tokenizer=RegexpTokenizer(r'\w+')
    tokens=tokenizer.tokenize(input)
    filtered=filter(lambda token: token not in stopwords.words('english'), tokens)
    return "".join(filtered)
    
processed_inputs=tokenize_words(file)

In [4]:
#chars to numbers
chars= sorted(list(set(processed_inputs)))
char_to_num= dict((c,i) for i, c in enumerate(chars))

In [7]:
#check if words to char or chars to num has worked
input_len=len(processed_inputs)
vocab_len=len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 232972
Total vocab: 37


In [8]:
#seq length
seq_length=100
x_data=[]
y_data=[]

In [13]:
#loop through the sequence
for i in range(0, input_len - seq_length, 1):
    in_seq=processed_inputs[i:i+seq_length]
    out_seq=processed_inputs[i+seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

n_patterns=len(x_data)
print("Total patterns:", n_patterns)

Total patterns: 232872


In [14]:
#convert input sequence to np array and so on
X= numpy.reshape(x_data, (n_patterns, seq_length, 1))
X=X/float(vocab_len)

In [16]:
#one-hot-encoding
y=to_categorical(y_data)

In [17]:
#creating the model
model= Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0,2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0,2))
model.add(LSTM(128))
model.add(Dropout(0,2))
model.add(Dense(y.shape[1], activation='softmax'))

  super().__init__(**kwargs)


In [21]:
#compiling the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [47]:
#saving weights
filepath="model_weights_saved.keras"
checkpoint=ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks=[checkpoint]

In [49]:
#fitting the model and training
model.fit(X,y, epochs=4, batch_size=256, callbacks=desired_callbacks)

Epoch 1/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 2.9382
Epoch 1: loss improved from None to 2.91548, saving model to model_weights_saved.keras
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1737s[0m 2s/step - loss: 2.9155
Epoch 2/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 2.9089
Epoch 2: loss improved from 2.91548 to 2.90923, saving model to model_weights_saved.keras
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1731s[0m 2s/step - loss: 2.9092
Epoch 3/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 2.9095
Epoch 3: loss improved from 2.90923 to 2.90898, saving model to model_weights_saved.keras
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1761s[0m 2s/step - loss: 2.9090
Epoch 4/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 2.8908
Epoch 4: loss improved from 2.90898 to 2.87789, saving model 

<keras.src.callbacks.history.History at 0x23ff47a6000>

In [23]:
#recompileing model with same weights
filename="model_weights_saved.keras"
model.load_weights(filename)
model.compile(loss='categorical crossentropy', optimizer='adam')

  saveable.load_own_variables(weights_store.get(inner_path))


In [25]:
#output of the model back into characters
num_to_char=dict((i,c) for i,c in enumerate(chars))

In [27]:
#random seeds to help generate
start=numpy.random.randint(0, len(x_data)-1)
pattern=x_data[start]
print("Random Seed: ")
print("\"", ' '.join([num_to_char[value] for value in pattern]), "\"")

Random Seed: 
" g s m i n d s e i z e d l i k e l i c h e n r o c k w i s h e d s o m e t i m e s s h a k e t h o u g h t f e e l i n g l e a r n e d o n e m e a n s o v e r c o m e s e n s a t i o n p a i n d e a t "


In [29]:
#generate the text
for i in range(1000):
    x=numpy.reshape(pattern, (1,len(pattern),1))
    x=x/float(vocab_len)
    prediction=model.predict(x, verbose=0)
    index=numpy.argmax(prediction)
    result=num_to_char[index]
    seq_in=[num_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern=pattern[1:len(pattern)]

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee