In [2]:
import json
import os

import numpy as np
import tensorflow as tf
import tensorflowjs as tfjs
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

## Import Text

In [3]:
# Read the whole training corpus into memory as a single string.
with open("./trainingData/sherlockS.txt", mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

## Tokenize Text

In [4]:
# Fit a word-level tokenizer on the entire corpus (passed as a one-document list).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size used for the output layer and the one-hot targets:
# one slot per entry in word_index, plus one extra slot.
total_words = 1 + len(tokenizer.word_index)

In [5]:
# Build the training examples: for every line of the corpus, emit each prefix
# of its token ids that is at least two tokens long (context + next word).
# Lines that tokenize to fewer than two tokens contribute nothing.
input_sequences = []
for token_ids in tokenizer.texts_to_sequences(text.split('\n')):
    input_sequences.extend(
        token_ids[:prefix_len] for prefix_len in range(2, len(token_ids) + 1)
    )

In [6]:
# Length of the longest n-gram prefix; every sequence is padded up to this.
max_sequence_len = max(len(seq) for seq in input_sequences)

# Left-pad ('pre') so the word to predict always sits in the last column.
# pad_sequences already returns a NumPy array, so wrapping it in np.array()
# would only create a second full copy of the data.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

In [None]:
# Split each padded sequence into its context (every column but the last)
# and its target (the last column, i.e. the id of the next word).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [None]:
# One-hot encode the next-word ids (the last column of the padded sequences).
# Encoding straight from `input_sequences` rather than from `y` keeps this cell
# idempotent: re-running it no longer one-hot encodes an already encoded `y`.
# to_categorical already returns a NumPy array, so the extra np.array() copy of
# this large dense matrix is dropped.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

## Create Model Architecture

In [None]:
# Plain feed-forward next-word classifier: the padded context (all tokens but
# the last) goes through a stack of ReLU Dense layers and ends in a softmax
# over the whole vocabulary.
# NOTE(review): the first Dense layer receives the raw integer token ids as
# numeric features; an Embedding layer (already imported) may be a better
# fit — confirm this is intentional for the FFNN variant.
model = Sequential([
    Dense(150, input_shape=(max_sequence_len-1,), activation='relu'),
    Dense(300, activation='relu'),
    Dense(300, activation='relu'),
    Dense(300, activation='relu'),
    Dense(300, activation='relu'),
    Dense(150, activation='relu'),
    Dense(total_words, activation='softmax'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

## Train Model

In [None]:
# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the full data set for 100 epochs with per-epoch progress output.
model.fit(
    X,
    y,
    epochs=100,
    verbose=1,
)

## Save Model for JS

In [None]:

# Convert the trained Keras model to TensorFlow.js artifacts in this directory.
output_dir = "./tfjs_models/FFNN"
tfjs.converters.save_keras_model(model, output_dir)

## Save Word Index

In [7]:
# Persist the tokenizer's word -> integer-id mapping as JSON.
word_index_path = './tokenizer/wordIndex.json'

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists instead of failing with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(word_index_path), exist_ok=True)

# Plain-dict copy of the vocabulary (replaces the manual key-by-key copy loop).
wordIndex = dict(tokenizer.word_index)

# ensure_ascii=False keeps non-ASCII words readable in the UTF-8 output file.
with open(word_index_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(wordIndex, ensure_ascii=False))

## Save Max Sequence Length

In [8]:
# Persist the padded sequence length as plain text next to the word index.
max_sequence_len_path = './tokenizer/max_sequence_len.txt'

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists instead of failing with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(max_sequence_len_path), exist_ok=True)

# Explicit encoding for consistency with the word-index file; the content is
# ASCII digits, so the bytes written are unchanged.
with open(max_sequence_len_path, 'w', encoding='utf-8') as f:
    f.write(str(max_sequence_len))