<a href="https://colab.research.google.com/github/ShimriFernando/MyCaptainAI-ML/blob/master/ML_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy 
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Using TensorFlow backend.


In [3]:
# Read the whole corpus into memory as a single string. The `with` block
# closes the file handle as soon as the read finishes instead of leaving it
# open for the lifetime of the kernel.
with open("Frankenstein-21.txt") as corpus_file:
    file = corpus_file.read()

In [4]:
def tokenize_words(input):
    """Normalize raw text into a space-separated string of non-stop-word tokens.

    The text is lower-cased, split into runs of word characters (``\\w+``),
    and every token found in NLTK's English stop-word list is dropped.

    Args:
        input: The raw text to clean.

    Returns:
        The surviving tokens joined by single spaces, in their original order.
        An input with no surviving tokens yields an empty string.
    """
    # Load the stop-word list once per call and keep it in a set. Calling
    # stopwords.words() inside the per-token filter rebuilt the list for every
    # token and made each membership test a linear scan.
    stop_words = set(stopwords.words('english'))
    tokens = RegexpTokenizer(r'\w+').tokenize(input.lower())
    return " ".join(token for token in tokens if token not in stop_words)

# Clean the raw corpus text; everything downstream works on this string.
processed_inputs = tokenize_words(input=file)

In [5]:
# Vocabulary: every distinct character of the cleaned text, in sorted order.
chars = sorted(set(processed_inputs))
# Encode each character as its position in `chars`.
char_to_num = {symbol: position for position, symbol in enumerate(chars)}

In [6]:
# Size of the cleaned corpus in characters, and the number of distinct characters.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 3906
Total vocab: 29


In [7]:
seq_length = 100

# Slide a window of `seq_length` characters over the text one step at a time.
# Each window (integer-encoded) is an input; the character right after it is
# the target the model should predict.
window_starts = range(input_len - seq_length)
x_data = [
    [char_to_num[symbol] for symbol in processed_inputs[begin:begin + seq_length]]
    for begin in window_starts
]
y_data = [char_to_num[processed_inputs[begin + seq_length]] for begin in window_starts]

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 3806


In [8]:
# Arrange the encoded windows as (n_patterns, seq_length, 1) and scale the
# integer codes by the vocabulary size.
X = numpy.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [9]:
# One-hot encode the target characters. The width is pinned to the vocabulary
# size: without num_classes, to_categorical infers it from the largest target
# value, so the output layer would come out too narrow whenever the
# highest-indexed character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [10]:
# Three stacked LSTM layers (256, 256 and 128 units), each followed by 20%
# dropout, feeding a softmax layer with one output per column of `y`.
# The first two LSTMs return full sequences so the next LSTM receives a
# sequence; the last one returns only its final state.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [11]:
# Targets are one-hot vectors, hence categorical cross-entropy; optimize with Adam.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [12]:
# Write the weights to disk whenever the training loss reaches a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [23]:
# Train for 100 epochs in batches of 1000 windows; the checkpoint callback
# handles saving the weights.
model.fit(
    x=X,
    y=y,
    batch_size=1000,
    epochs=100,
    verbose=1,
    callbacks=desired_callbacks,
)

Epoch 1/100

Epoch 00001: loss improved from 2.81508 to 2.80985, saving model to model_weights_saved.hdf5
Epoch 2/100

Epoch 00002: loss improved from 2.80985 to 2.80357, saving model to model_weights_saved.hdf5
Epoch 3/100

Epoch 00003: loss improved from 2.80357 to 2.79572, saving model to model_weights_saved.hdf5
Epoch 4/100

Epoch 00004: loss improved from 2.79572 to 2.78608, saving model to model_weights_saved.hdf5
Epoch 5/100

Epoch 00005: loss improved from 2.78608 to 2.77515, saving model to model_weights_saved.hdf5
Epoch 6/100

Epoch 00006: loss did not improve from 2.77515
Epoch 7/100

Epoch 00007: loss improved from 2.77515 to 2.76986, saving model to model_weights_saved.hdf5
Epoch 8/100

Epoch 00008: loss improved from 2.76986 to 2.75471, saving model to model_weights_saved.hdf5
Epoch 9/100

Epoch 00009: loss improved from 2.75471 to 2.74876, saving model to model_weights_saved.hdf5
Epoch 10/100

Epoch 00010: loss improved from 2.74876 to 2.74495, saving model to model_weig

<keras.callbacks.callbacks.History at 0x7f429670e630>

In [24]:
# Restore the checkpointed weights from disk, then compile again with the
# same loss and optimizer used for training.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [25]:
# Inverse lookup: integer code -> character, used to decode model output.
num_to_char = dict(enumerate(chars))


In [32]:
# Pick a random training window to seed generation. randint's upper bound is
# exclusive, so passing len(x_data) makes every window (including the last)
# eligible; the previous `len(x_data) - 1` could never select the last one.
start = numpy.random.randint(0, len(x_data))
# Work on a copy so that changes made to the generation window never write
# back into the training data held in x_data.
pattern = list(x_data[start])
print("Random Seed:")
print( "\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
"  occupied duties new situation relinquished many public employments devoted education children eldes "


In [33]:
# Generate 1000 characters: predict the next character from the current
# window, emit it, then slide the window forward by one.
for i in range(1000):
    # Same shaping and scaling as the training inputs, for a batch of one.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x/float(vocab_len)
    prediction = model.predict(x, verbose = 0)
    # Greedy decoding: take the most probable class. Convert to a plain int
    # so the window stays a list of Python ints like the seed.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a NEW list for the next window instead of appending in place, so
    # a seed list shared with x_data is never mutated.
    pattern = pattern[1:] + [index]

as alssintien satt piarsi ciillint stocicsi fossosinn sessed sennen doied botert pooe bomdated lonthr sar sess cone rendd mond motthr man man reve monte wear connene miaeen lorter aather dete rever ahnce consene manren dnnse teved sears sems mine coccd podten man hens reare creed tereon mans piotiee seaeuser aaupy rirte tife eosliee sever aroorrent pemc ceupirity pase coulived livervad snntent masrene sene tottes massed mnnten sears sersed mnndrad senter sers cirpined siaeen ruuent pouha bouranee sone suppor mine sented lotllat seass counered sivery aettiotien stecetion doild bett roorr pobliety foelt cotloved aare inpery pose sise resenn derol sive momd coodd mond mont persen sespent dlil betlortent mamc posters mime ceuiinee leverad tenter aateer liver coucere  ffcmient soonn cespirie live toouuen matr cotiitien diellas aaslott tessed senned severa pooter tecinnn fesporit nereer lott mitted mond peteral mene reauuas mensened snne settelted mann piverat seme coulived aeutirtent doictm