In [2]:
import numpy as np
import re

with open('../data/wizard_of_oz.txt', 'r') as file:
    raw_text = file.read()

In [3]:
raw_text = raw_text.lower()
raw_text = re.sub('\d', '$1 ', raw_text)
raw_text = re.sub('[^\w\s]+', ' ', raw_text)
raw_text = re.sub('[_-]', ' ', raw_text)
raw_text = re.sub('\s+', ' ', raw_text)
text = np.array(raw_text.split())
print(len(text))

43187


In [4]:
from nltk import download, word_tokenize

# download('punkt')

text = word_tokenize(raw_text)
print(len(text))

43215


In [5]:
words = sorted(list(set(text)))
print('total words:', len(words))

total words: 4140


In [6]:
from category_encoders import BinaryEncoder

encoder = BinaryEncoder()
encoded_words = np.array(encoder.fit_transform(words), dtype=np.int8)
np.shape(encoded_words)

(4140, 13)

In [7]:
encoded_from_word = dict((w, encoded_words[i]) for i, w in enumerate(words))
word_from_encoded = dict((tuple(encoded_words[i]), w) for i, w in enumerate(words))

# cut the text in semi-redundant sequences of 40 characters, in steps of 3

In [8]:
tokens_per_sentence = 60
sentences = []

encode_word = lambda word: encoded_from_word[word]
encode_sentence = np.vectorize(encode_word, otypes=[np.ndarray], signature='()->(n)')

for i in range(0, len(text) - tokens_per_sentence, 3):
    excerpt = encode_sentence(text[i: i + tokens_per_sentence])
    next_word = encode_word(text[i + tokens_per_sentence])
    sentences.append((excerpt, next_word))
print('nb sequences:', len(sentences))

nb sequences: 14385


In [9]:
import numpy as np

X = np.zeros((len(sentences), tokens_per_sentence, np.shape(encoded_words)[1]), dtype=int)
y = np.zeros((len(sentences), np.shape(encoded_words)[1]), dtype=int)
print('Vectorization...', np.shape(X), np.shape(y))
for row, (sentence, next_word) in enumerate(sentences):
    X[row] = sentence
    y[row] = next_word

print("Vector of 1st letter of 1st sentence:", X[0][0])

Vectorization... (14385, 60, 13) (14385, 13)
Vector of 1st letter of 1st sentence: [0 0 1 0 0 0 0 1 0 0 0 0 1]


In [18]:
from keras import Input, models, layers, losses, optimizers, activations, metrics

print('Build model...')
model = models.Sequential()
model.add(Input(shape=np.shape(X)[1:]))
model.add(layers.Embedding(input_dim=np.shape(y)[0], output_dim=64))
model.add(layers.Reshape((-1, -1)))
model.add(layers.LSTM(units=256))
model.add(layers.Dropout(rate=0.7))
model.add(layers.Dense(units=64, activation=activations.relu))
model.add(layers.Dropout(rate=0.2))
model.add(layers.Dense(units=np.shape(y)[1], activation=activations.sigmoid))

model.summary()

Build model...


ValueError: Exception encountered when calling layer "reshape" (type Reshape).

There must be at most one unknown dimension in output_shape. Received: output_shape=[-1, -1].

Call arguments received by layer "reshape" (type Reshape):
  • inputs=tf.Tensor(shape=(None, 60, 13, 64), dtype=float32)

In [None]:
model.compile(loss=losses.binary_crossentropy, optimizer=optimizers.legacy.RMSprop(learning_rate=0.01), metrics=[metrics.binary_accuracy])

In [None]:
model.fit(X, y, epochs=5)

In [None]:
import random

start_index = random.randint(0, len(text) - tokens_per_sentence - 1)

input_sentence = text[start_index: start_index + tokens_per_sentence]

print(" ".join(input_sentence))

for i in range(15):
    x = np.expand_dims(encode_sentence(input_sentence), axis=0)
    x = np.asarray(x).astype(np.int8)

    prediction = model.predict(x, verbose=0)
    
    next_index = tuple((1 if value > 0.1 else 0 for value in prediction[0]))
    
    next_char = word_from_encoded[next_index]
    print(next_char)
    input_sentence = np.append(np.delete(input_sentence, 0), next_char)
