In [1]:
import pandas as pd
import numpy as np
import string

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
import keras.utils as ku

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Data sources:
# https://en.wikipedia.org/wiki/Little_Red_Riding_Hood
# https://www.storyarts.org/library/aesops/stories/boy.html

In [3]:
# Load data: read the whole corpus inside a context manager so the file
# handle is closed as soon as the text is in memory
with open('text_data.txt', 'r', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()

# Normalise: lower-case everything, then turn each newline into a sentence break
raw_text = raw_text.lower()
raw_text = raw_text.replace('\n', '. ')
# The step above can leave a stand-alone ' . ' (e.g. from a blank line);
# collapse it back to a single space
raw_text = raw_text.replace(' . ', ' ')
raw_text[:100]

'little red riding hood. the story centers around a girl named little red riding hood, after the red '

In [4]:
# Split the normalised text into a list of sentences with NLTK's sent_tokenize
sentences = sent_tokenize(raw_text)
# Show how many sentences were found, then preview the first three
print(len(sentences))
sentences[:3]

50


['little red riding hood.',
 'the story centers around a girl named little red riding hood, after the red hooded cape that she wears.',
 'the girl walks through the woods to deliver food to her sickly grandmother (wine and cake depending on the translation).']

In [16]:
# Remove punctuations
def remove_punctuation(txt):
    """Return `txt` without ASCII punctuation and without non-ASCII characters.

    Args:
        txt: The string to clean.

    Returns:
        A new string in which every character listed in ``string.punctuation``
        has been deleted and every non-ASCII character has been dropped.
        Whitespace is left untouched.
    """
    # Map every punctuation character to None so translate() deletes it
    strip_table = str.maketrans('', '', string.punctuation)
    no_punct = txt.translate(strip_table)
    # Round-trip through bytes: decoding as ASCII with 'ignore' discards
    # every byte that belongs to a non-ASCII character
    ascii_only = no_punct.encode("utf8").decode("ascii", 'ignore')
    return ascii_only


# Convert to sequence of tokens
# Module-level tokenizer: fitted inside sequence_of_tokens() and read again
# by other cells of this notebook
tokenizer = Tokenizer()

def sequence_of_tokens(sentences):
    """Fit the shared tokenizer and expand every sentence into n-gram prefixes.

    Args:
        sentences: The texts to fit the module-level ``tokenizer`` on and to
            encode.

    Returns:
        A tuple ``(input_sequences, total_words)``:
        ``input_sequences`` is a list of token-id lists; for a sentence encoded
        as ``[t1, t2, ..., tn]`` it contains ``[t1, t2]``, ``[t1, t2, t3]``,
        ... up to the full sentence (sentences with fewer than two tokens
        contribute nothing). ``total_words`` is ``len(tokenizer.word_index) + 1``.
    """
    # Side effect: (re)fits the global tokenizer on the given sentences
    tokenizer.fit_on_texts(sentences)
    vocab_size = len(tokenizer.word_index) + 1

    ngram_sequences = []
    for encoded in tokenizer.texts_to_sequences(sentences):
        # Every prefix of length >= 2: the last token of each prefix is later
        # used as the label, the tokens before it as the features
        ngram_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return ngram_sequences, vocab_size


# Generate padded sequences
def generate_padded_sequences(input_sequences, total_words=None):
    """Left-pad the n-gram sequences and split them into features and labels.

    Args:
        input_sequences: Non-empty list of token-id lists (as produced by
            ``sequence_of_tokens``).
        total_words: Number of classes for the one-hot labels. When omitted,
            it is derived from the module-level ``tokenizer`` as
            ``len(tokenizer.word_index) + 1`` so the function no longer relies
            on a ``total_words`` global assigned in some other cell.

    Returns:
        A tuple ``(feature, label, max_sequence_len)``:
        ``feature`` holds every padded sequence without its last token,
        ``label`` is the one-hot encoding of each sequence's last token, and
        ``max_sequence_len`` is the length of the longest input sequence.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; nothing to pad")
    if total_words is None:
        total_words = len(tokenizer.word_index) + 1

    max_sequence_len = max(len(seq) for seq in input_sequences)
    # padding='pre' puts the zeros in front, so the last column is always the
    # final (label) token of each sequence
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # Feature: first word up to the second-to-last word. Label: last word
    feature, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)
    return feature, label, max_sequence_len


# Set callbacks
def create_callbacks(model_name):
    """Create the checkpoint callback used during training.

    Args:
        model_name: File path the checkpoint is written to.

    Returns:
        A single ``ModelCheckpoint`` instance (not a list) that monitors the
        training ``loss`` in ``min`` mode with ``save_best_only=True``.
    """
    checkpoint_options = dict(
        filepath=model_name,
        monitor='loss',
        mode='min',
        save_best_only=True,
    )
    return ModelCheckpoint(**checkpoint_options)

# Build the model
def build_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=128, dropout_rate=0.2):
    """Build and compile the next-word prediction network.

    The architecture is Embedding -> LSTM -> Dropout -> softmax Dense,
    compiled with categorical cross-entropy and the Adam optimizer.

    Args:
        max_sequence_len: Length of the longest padded sequence, label token
            included; the model input is therefore ``max_sequence_len - 1``
            tokens long.
        total_words: Vocabulary size; used both as the embedding input
            dimension and as the number of output classes.
        embedding_dim: Size of each embedding vector (default 10).
        lstm_units: Number of LSTM units (default 128).
        dropout_rate: Dropout rate applied after the LSTM (default 0.2).

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # The label token is split off from every sequence, hence the "- 1"
    model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_len - 1))
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    # One output unit per vocabulary index, matching the one-hot labels
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model


def generate_text(input_text, num_generated_words, model, max_sequence_len):
    """Greedily extend `input_text` with words predicted by `model`.

    On every step the current text is encoded with the module-level
    ``tokenizer``, left-padded to the model's input length, and the
    highest-scoring vocabulary index is appended as the next word.

    Args:
        input_text: Seed text to continue.
        num_generated_words: Maximum number of words to append.
        model: Trained model whose ``predict`` returns one score per
            vocabulary index.
        max_sequence_len: Padded sequence length used in training (label token
            included); the model input is ``max_sequence_len - 1`` tokens.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early if the predicted index has no word (e.g. the
        padding index 0) instead of raising ``KeyError``.
    """
    for _ in range(num_generated_words):
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # A single sequence is passed in, so row 0 holds its prediction
        scores = model.predict(token_list, verbose=0)
        predicted_index = int(np.argmax(scores, axis=-1)[0])
        # Look the word up on the tokenizer itself rather than on a separate
        # inverse-mapping global that only exists if another cell was run
        predicted_word = tokenizer.index_word.get(predicted_index)
        if predicted_word is None:
            break
        input_text += ' ' + predicted_word

    return input_text


In [15]:
# Remove punctuation (and non-ASCII characters) from every sentence.
# Note: this rebinds `sentences`, so later cells see the cleaned strings.
sentences = [remove_punctuation(s) for s in sentences]
sentences[:3]

['little red riding hood',
 'the story centers around a girl named little red riding hood after the red hooded cape that she wears',
 'the girl walks through the woods to deliver food to her sickly grandmother wine and cake depending on the translation']

In [9]:
# Convert to sequences of tokens: fits the shared tokenizer on the cleaned
# sentences and expands each one into its n-gram prefixes (length 2, 3, ...)
input_sequences, total_words = sequence_of_tokens(sentences)
# Preview the first ten n-gram sequences
input_sequences[:10]

[[19, 20],
 [19, 20, 21],
 [19, 20, 21, 22],
 [1, 28],
 [1, 28, 103],
 [1, 28, 103, 58],
 [1, 28, 103, 58, 6],
 [1, 28, 103, 58, 6, 33],
 [1, 28, 103, 58, 6, 33, 104],
 [1, 28, 103, 58, 6, 33, 104, 19]]

In [14]:
# Inverse tokenizer: maps token id -> word (the reverse of tokenizer.word_index)
inverse_tokenizer = {token: word for word, token in tokenizer.word_index.items()}

# Vocabulary size returned by sequence_of_tokens (len(word_index) + 1)
print(total_words)

# Examples: token ids assigned to the four words of the first sentence
print(
    tokenizer.word_index['little'],
    tokenizer.word_index['red'],
    tokenizer.word_index['riding'],
    tokenizer.word_index['hood'],
)

287
19 20 21 22


In [51]:
# Generate padded sequences: left-pad every n-gram sequence to a common length,
# then split into features (all tokens but the last) and one-hot labels (the last token)
feature, label, max_sequence_len = generate_padded_sequences(input_sequences)
# Length of the longest n-gram sequence (label token included)
max_sequence_len

94

In [21]:
# Preview the first five feature rows (zeros in front come from padding='pre')
feature[:5]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 19],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 19, 20],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0

In [22]:
# Preview the first five one-hot label rows
label[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [33]:
# Build the model
model = build_model(max_sequence_len, total_words)
# Checkpoint: create_callbacks returns a single ModelCheckpoint instance
checkpoint = create_callbacks('text_generation.hdf5')
# Fit the model. Keras documents `callbacks` as a list of callback instances,
# so the single checkpoint is wrapped in a list
model.fit(feature, label, epochs=50, verbose=0, callbacks=[checkpoint])

<keras.src.callbacks.History at 0x7da519e40eb0>

In [49]:
# Extend the seed phrase by 2 predicted words
generate_text('little red riding', 2, model, max_sequence_len)

'little red riding hood and'

In [52]:
# Extend the seed phrase by 10 predicted words.
# NOTE(review): 'sheperd' looks like a misspelling of 'shepherd'; if it is not in
# the fitted vocabulary the tokenizer presumably drops it from the model input — confirm
generate_text('the red riding hood and the sheperd boy', 10, model, max_sequence_len)

'the red riding hood and the sheperd boy here returned the village sheep the village sheep when the'

In [53]:
# Extend the seed phrase by 10 predicted words
generate_text('the villagers think', 10, model, max_sequence_len)

'the villagers think came up the hill to help the boy drive the'

In [54]:
# Extend the seed phrase by 10 predicted words.
# NOTE(review): the seed is capitalised while the corpus was lower-cased at load
# time; this presumably relies on the tokenizer lower-casing its input — confirm
generate_text('The villagers came running', 10, model, max_sequence_len)

'The villagers came running up the hill to help the boy drive the wolf'

In [55]:
# Extend the seed phrase by 5 predicted words
generate_text('food in the basket', 5, model, max_sequence_len)

'food in the basket version the wolf leaves the'

In [56]:
# Extend the seed phrase by 5 predicted words.
# NOTE(review): the seed is capitalised while the corpus was lower-cased at load
# time; this presumably relies on the tokenizer lower-casing its input — confirm
generate_text('The grandmother were locked', 5, model, max_sequence_len)

'The grandmother were locked and attempts to them grandmother'