In [16]:
# importing the dependencies
import numpy 
import sys
import keras
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# load data
# loading the data and opening our input data in the form of a txt file
# Project Gutenburg/berg is where the data can be found (google it)
with open('/content/drive/My Drive/Colab Notebooks/Frankenstein.txt', 'r') as docs: 
     file1 = docs.read() 

In [0]:
# tokenization 
# standardization
# what is Tokenization? Tokenization is the process of breaking a stream of text up into words phases symbols or a meaningful elements
def tokenize_words(input):
    # lowercase everything to standardize it
    input = input.lower()
    # instantiating the tonkenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # tokenizing the text into tokens
    tokens = tokenizer.tokenize(input)
    # filtering the stopwords using the lambda
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return "".join(filtered)  
# preprocess the input data, make tokens
processed_inputs = tokenize_words(file1)

In [0]:
# chars to numbers
# convert characters in our input to numbers 
# we'll sort the list of the set all characters that appears in out i/p text and then use the enumerate fn to get numbers that represents the characters
# we'll thencreat a dictionary that stores the keys and values, or the characters and the number the repesent them
chars = sorted (list(set(processed_inputs)))
chars_to_num = dict((c,i) for i, c in enumerate(chars))

In [21]:
# check if words to chars or chars to num (?!) has worked?
# just so we get an idea of whether our process of converting words to characters has worked
# we jsut print the length of our variables
input_len = len(processed_inputs)
vocab_len = len(chars)
print("Total no. of characters: ",input_len)
print("Total vocab: ", vocab_len)

Total no. of characters:  241873
Total vocab:  42


In [0]:
# seq_length
# we're defining how long we want an individual sequence here
# an individual sequence is a complete mapping of input characeters as integers
seq_length = 100
x_data = []
y_data = []

In [23]:
# loop through the sequence
# here we' re goining through the entire list pf i/p and converting the chars to numbers with a for loop
# this will create a bunch of sequence where each sequence starts with the next character in the i/p data
# begining with the first character 
for i in range(0, input_len - seq_length,1):
    #define the i/p and o/p of sqeuences
    # i/p is the current character plus total sequence length 
    in_seq = processed_inputs[i:i + seq_length]
    #out sequence is the initial character plus total sequence length
    out_seq = processed_inputs[i + seq_length]
    #converting the list of characters to integers based on previous values and appending the values to our lists
    x_data.append([chars_to_num[char] for char in in_seq])
    y_data.append(chars_to_num[out_seq])

#check to see how many total input sequences we have
n_patterns = len(x_data)
print("TOtal Patterns", n_patterns)

TOtal Patterns 241773


In [0]:
#convert input sequence to np array that our network can use
X = numpy.reshape(x_data, (n_patterns, seq_length,1))
X = X/float(vocab_len)

In [0]:
# one-hot encoding our label
y = np_utils.to_categorical(y_data)

In [0]:
# creating the sequencial model
# creating a sequencial model
# dropout is used to prevent overfitting
model = Sequential()
model.add(LSTM(256, input_shape = (X.shape[1], X.shape[2]), return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [0]:
# comppile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [0]:
# saving weights
filepath ='model_weights_saved.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [14]:
# fit model and let it train
model.fit(X,y, epochs=4,  batch_size=256, callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.93664, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.93664 to 2.91542, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.91542 to 2.90606, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.90606 to 2.87682, saving model to model_weights_saved.hdf5


<keras.callbacks.callbacks.History at 0x7fe34c8f4f28>

In [0]:
# recompile model with the saved weigths
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [0]:
# output of the model back into characters
num_to_char = dict((i,c) for i,c in enumerate(chars))

In [37]:
#random seed to help generate
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random seed: ")
print("\"",''.join([num_to_char[value] for value in pattern]), "\"")

Random seed: 
" ughtinformationkrempegivenconcerninglecturesalthoughcouldconsentgohearlittleconceitedfellowdeliverse "


In [38]:
# generate the text 
for i in range (1000):
  x = numpy.reshape(pattern,(1,len(pattern),1))
  x = x/float(vocab_len)
  prediction = model.predict(x, verbose = 0)
  index = numpy.argmax(prediction)
  result = num_to_char[index]
  seq_in = [num_to_char[value] for value in pattern]
  sys.stdout.write(result)
  pattern.append(index)
  pattern = pattern [1:len(pattern)]

reerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereere