In [None]:
import json
import os

import numpy as np
import tensorflow as tf
import tensorflowjs as tfjs
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

## Import Text

In [None]:
# Read the whole training corpus into memory as one UTF-8 string.
with open("./trainingData/sherlockS.txt", mode='r', encoding='utf-8') as corpus_file:
    mytext = corpus_file.read()

## Tokenize Text

In [None]:
# Fit the tokenizer on the full corpus (passed as a single document) to
# build the word -> integer index mapping.
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([mytext])

# Vocabulary size used for the embedding input and the softmax output.
# NOTE(review): the +1 presumably accounts for index 0 being reserved
# (word_index ids start at 1) -- confirm against the Keras Tokenizer docs.
total_words = 1 + len(mytokenizer.word_index)

In [None]:
# Turn every line of the corpus into its token-id list, then emit each
# prefix of length >= 2 as one training example: the last id of a prefix
# is the word to predict, the ids before it are the context.
my_input_sequences = [
    line_tokens[:prefix_len]
    for line_tokens in mytokenizer.texts_to_sequences(mytext.split('\n'))
    for prefix_len in range(2, len(line_tokens) + 1)
]

In [None]:
# The longest n-gram defines the common width all sequences are padded to.
max_sequence_len = max(map(len, my_input_sequences))

# Pad on the left ('pre') so the final column of every row is always the
# last real token of that sequence.
input_sequences = np.array(
    pad_sequences(
        my_input_sequences,
        padding='pre',
        maxlen=max_sequence_len,
    )
)

In [None]:
# Features are every column but the last; the label is the last column
# (the token that follows the context).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [None]:
# One-hot encode the next-word labels over the whole vocabulary.
# The labels are taken from input_sequences directly instead of from `y`,
# so this cell is idempotent: re-running it no longer feeds the already
# one-hot matrix back into to_categorical (which would produce a rank-3
# array). to_categorical already returns a NumPy array, so the extra
# np.array(...) copy of this large matrix is dropped.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

## Create Model Architecture

In [None]:
# Next-word predictor: token ids -> 100-dim embeddings -> 150-unit LSTM ->
# softmax over the vocabulary. Each input row has max_sequence_len - 1
# tokens because the last column of the padded sequences is the label.
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_len - 1),
    LSTM(150),
    Dense(total_words, activation='softmax'),
])

# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" to the cell output.
model.summary()

## Train Model

In [None]:
# Labels are one-hot vectors, so the (non-sparse) categorical
# cross-entropy loss is used.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 100 epochs with per-epoch progress output. Kept as the cell's
# last expression so its return value is still displayed.
model.fit(
    X,
    y,
    epochs=100,
    verbose=1,
)

## Save Model for JS

In [None]:

# Convert the trained Keras model with the tensorflowjs converter and write
# the result under output_dir.
output_dir = "./tfjs_models/RNN"
tfjs.converters.save_keras_model(model, output_dir)

## Save Word Index

In [None]:
# Plain-dict copy of the tokenizer's word -> index mapping.
wordIndex = dict(mytokenizer.word_index)

# open(..., 'w') does not create missing parent directories, so make sure
# the output folder exists; otherwise a fresh checkout fails with
# FileNotFoundError.
os.makedirs('./tokenizer', exist_ok=True)

# ensure_ascii=False writes non-ASCII words verbatim (as UTF-8) instead of
# \uXXXX escapes.
with open('./tokenizer/wordIndex.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(wordIndex, ensure_ascii=False))

## Save Max Sequence Length

In [None]:
# Persist the padded sequence length as plain text next to the word index.
# The output folder is created first because open(..., 'w') does not create
# missing parent directories.
os.makedirs('./tokenizer', exist_ok=True)
with open('./tokenizer/max_sequence_len.txt', 'w', encoding='utf-8') as f:
    f.write(str(max_sequence_len))