In [1]:
import numpy as np

with open('../data/wizard_of_oz.txt', 'r') as file:
    raw_text = file.read()

In [2]:
from nltk import download, word_tokenize

# download('punkt')

text = word_tokenize(raw_text)
print(len(text))

51652


In [3]:
words = sorted(list(set(text)))
print('total words:', len(words))

total words: 4719


In [4]:
from category_encoders import OneHotEncoder

encoder = OneHotEncoder()
encoded_words = np.array(encoder.fit_transform(words), dtype=np.int8)

In [5]:
index_of_word = dict((w, i) for i, w in enumerate(words))
word_from_word = dict((i, w) for i, w in enumerate(words))

#### cut the text in semi-redundant sequences of 40 characters, in steps of 3

In [15]:
tokens_per_sentence = 60
sentences = []

zeros = np.zeros((len(words)))

encode_word = lambda word: index_of_word[word]
encode_sentence = lambda sent: np.array([encode_word(word) for _, word in enumerate(sent)])

for i in range(0, len(text) - tokens_per_sentence, 3):
    excerpt = encode_sentence(text[i: i + tokens_per_sentence])
    next_word = encode_word(text[i + tokens_per_sentence])
    sentences.append((excerpt, next_word))
print('nb sequences:', len(sentences))
print('nb sentence shape', np.shape(sentences[0][0]))

nb sequences: 17198
nb sentence shape (60,)


In [18]:
import numpy as np

X = np.zeros((len(sentences), tokens_per_sentence, np.shape(encoded_words)[1]), dtype=int)
y = np.zeros((len(sentences), np.shape(encoded_words)[1]), dtype=int)
print('Vectorization...', np.shape(X), np.shape(y))
for row, (sentence, next_word) in enumerate(sentences):
    X[row, np.arange(sentence), sentence] = 1
    y[row, next_word] = 1

print("Vector of 1st letter of 1st sentence:", X[0][0])

Vectorization... (17198, 60, 4719) (17198, 4719)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
from keras import Input, models, layers, losses, optimizers, activations, metrics

print('Build model...')
model = models.Sequential()
model.add(Input(shape=np.shape(X)[1:]))
model.add(layers.LSTM(units=256))
model.add(layers.Dropout(rate=0.7))
model.add(layers.Dense(units=64, activation=activations.relu))
model.add(layers.Dropout(rate=0.2))
model.add(layers.Dense(units=np.shape(y)[1], activation=activations.sigmoid))

model.summary()

In [None]:
model.compile(loss=losses.binary_crossentropy, optimizer=optimizers.legacy.RMSprop(learning_rate=0.01), metrics=[metrics.binary_accuracy])

In [None]:
model.fit(X, y, epochs=5)

In [None]:
import random

start_index = random.randint(0, len(text) - tokens_per_sentence - 1)

input_sentence = text[start_index: start_index + tokens_per_sentence]

print(" ".join(input_sentence))

for i in range(15):
    x = np.expand_dims(encode_sentence(input_sentence), axis=0)
    x = np.asarray(x).astype(np.int8)

    prediction = model.predict(x, verbose=0)
    
    next_index = tuple((1 if value > 0.1 else 0 for value in prediction[0]))
    
    next_char = word_from_encoded[next_index]
    print(next_char)
    input_sentence = np.append(np.delete(input_sentence, 0), next_char)
