In [34]:
import numpy as np
from pickle import dump, load
from keras.utils import to_categorical
from keras .models import Sequential
from keras.layers import Dense, LSTM, Input
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

Data preparation

In [1]:
def load_doc(filename):
    """Read a whole text file into memory.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        The complete contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [2]:
def save_doc(lines, filename):
    """Write an iterable of strings to a text file, one item per line.

    The items are joined with '\\n', so no newline is written after the
    last item. An existing file at ``filename`` is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even if write() fails.
    with open(filename, 'w') as file:
        file.write(data)

In [3]:
# Collapse every run of whitespace (including newlines) into a single space
# so the rhyme becomes one continuous line of characters.
tokens = load_doc('rhyme.txt').split()
raw_text = ' '.join(tokens)

In [4]:
# Slide a window of `length` input characters plus one target character
# across the text, advancing one character at a time.
length = 10
sequences = [
    raw_text[start : start + length + 1]
    for start in range(len(raw_text) - length)
]
print(f"Total sequences: {len(sequences)}")

# Persist the windows, one per line, for the training section.
out_filename = 'char_sequences.txt'
save_doc(sequences, out_filename)
print(sequences[:5])

Total sequences: 399
['Sing a song', 'ing a song ', 'ng a song o', 'g a song of', ' a song of ']


Train Language Model

In [22]:
def define_model(x, vocab_size):
    """Build, compile and summarise the LSTM language model.

    Args:
        x: Array-like whose ``shape[1]`` and ``shape[2]`` give the input
            shape of one sample.
        vocab_size: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model (its summary is printed as a
        side effect).
    """
    sample_shape = (x.shape[1], x.shape[2])
    model = Sequential([
        Input(shape=sample_shape),
        LSTM(75),
        Dense(vocab_size, activation='softmax'),
    ])
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    return model

In [23]:
# Reload the character windows saved by the data-preparation section.
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
# One window per line; raw_text itself still contains the '\n' separators.
lines = raw_text.split('\n')

In [6]:
# Assign every distinct character an integer id, in sorted order.
# NOTE(review): raw_text still holds the '\n' line separators, so '\n'
# receives an id (and counts towards vocab_size) even though no entry of
# `lines` contains it.
chars = sorted(set(raw_text))
mapping = {c: i for i, c in enumerate(chars)}
vocab_size = len(mapping)
print(mapping)

{'\n': 0, ' ': 1, "'": 2, ',': 3, '.': 4, ';': 5, 'A': 6, 'B': 7, 'C': 8, 'E': 9, 'F': 10, 'H': 11, 'S': 12, 'T': 13, 'W': 14, 'a': 15, 'b': 16, 'c': 17, 'd': 18, 'e': 19, 'f': 20, 'g': 21, 'h': 22, 'i': 23, 'k': 24, 'l': 25, 'm': 26, 'n': 27, 'o': 28, 'p': 29, 'q': 30, 'r': 31, 's': 32, 't': 33, 'u': 34, 'w': 35, 'x': 36, 'y': 37}


In [7]:
# Translate each line into its list of integer character ids.
sequences = [[mapping[char] for char in line] for line in lines]
print(f"Total sequences: {len(sequences)}")

Total sequences: 399


In [9]:
# separate into input and output
# `sequences` is deliberately left untouched so this cell can be re-run:
# rebinding it to the one-hot data would make a second run slice the wrong
# array.
sequence_array = np.array(sequences)
# All columns but the last are the input characters; the last is the target.
X_ids, Y_ids = sequence_array[:, :-1], sequence_array[:, -1]
# to_categorical appends a class axis, so the 2-D id matrix becomes
# (n_sequences, n_input_chars, vocab_size) in a single call.
X = to_categorical(X_ids, num_classes=vocab_size)
Y = to_categorical(Y_ids, num_classes=vocab_size)

In [18]:
# Sanity check: show the shapes of the one-hot inputs and targets.
print(X.shape, Y.shape, sep='\n')

(399, 10, 38)
(399, 38)


In [72]:
# Build and fit the model on the one-hot encoded sequences.
model = define_model(X, vocab_size)
model.fit(X, Y, epochs = 100, verbose = 2)
model.save('model.h5')

# Persist the character -> id mapping next to the model. The context manager
# guarantees the pickle is flushed and the handle closed.
with open('mapping.pkl', 'wb') as mapping_file:
    dump(mapping, mapping_file)

Epoch 1/100
13/13 - 8s - 606ms/step - accuracy: 0.0677 - loss: 3.6109
Epoch 2/100
13/13 - 0s - 17ms/step - accuracy: 0.1604 - loss: 3.5046
Epoch 3/100
13/13 - 0s - 20ms/step - accuracy: 0.1905 - loss: 3.2002
Epoch 4/100
13/13 - 0s - 16ms/step - accuracy: 0.1905 - loss: 3.0429
Epoch 5/100
13/13 - 0s - 20ms/step - accuracy: 0.1905 - loss: 3.0067
Epoch 6/100
13/13 - 0s - 19ms/step - accuracy: 0.1905 - loss: 2.9894
Epoch 7/100
13/13 - 0s - 24ms/step - accuracy: 0.1905 - loss: 2.9715
Epoch 8/100
13/13 - 0s - 22ms/step - accuracy: 0.1905 - loss: 2.9505
Epoch 9/100
13/13 - 0s - 19ms/step - accuracy: 0.1905 - loss: 2.9300
Epoch 10/100
13/13 - 0s - 19ms/step - accuracy: 0.1905 - loss: 2.9096
Epoch 11/100
13/13 - 0s - 18ms/step - accuracy: 0.2030 - loss: 2.8805
Epoch 12/100
13/13 - 0s - 18ms/step - accuracy: 0.1955 - loss: 2.8564
Epoch 13/100
13/13 - 0s - 16ms/step - accuracy: 0.2080 - loss: 2.8284
Epoch 14/100
13/13 - 0s - 17ms/step - accuracy: 0.2306 - loss: 2.7903
Epoch 15/100
13/13 - 0s - 18



Generate Text

In [73]:
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend ``seed_text`` one character at a time using the model.

    At every step the text generated so far is integer-encoded, cut or
    zero-padded at the front to ``seq_length`` ids, one-hot encoded and
    passed to ``model.predict``; the highest-scoring class is converted
    back to a character and appended.

    Args:
        model: Object with a ``predict(batch, verbose=0)`` method returning
            one row of class scores per input sequence.
        mapping: Dict from character to integer id.
        seq_length: Number of characters fed to the model per step.
        seed_text: Text that starts the generation.
        n_chars: Number of prediction steps to run.

    Returns:
        ``seed_text`` followed by the generated characters. A predicted id
        with no entry in ``mapping`` contributes no character.

    Raises:
        KeyError: If the text contains a character missing from ``mapping``.
    """
    # Invert the lookup once instead of scanning the mapping on every step.
    index_to_char = {index: char for char, index in mapping.items()}
    vocab_size = len(mapping)

    in_text = seed_text
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # keep only the most recent seq_length ids
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=vocab_size)
        # Force a (1, timesteps, vocab) batch; this also covers the case where
        # to_categorical drops a trailing axis of size 1 (seq_length == 1).
        encoded = encoded.reshape(1, -1, vocab_size)

        scores = model.predict(encoded, verbose=0)
        # argmax over the class axis yields one id per batch row; take the
        # single row as a plain int so it can be used as a dict key.
        predicted_index = int(np.argmax(scores, axis=-1)[0])

        # Append the predicted character itself. The previous code appended
        # the loop variable, which was the last mapping key when nothing
        # matched.
        in_text += index_to_char.get(predicted_index, '')
    return in_text

In [74]:
# Reload the character mapping saved after training. The context manager
# closes the handle instead of leaving it to the garbage collector.
# NOTE: pickle can execute arbitrary code on load; only open files this
# notebook produced itself.
with open('mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)

# test start of rhyme
print(generate_seq(model, mapping, 10, 'Sing a son', 20))
# test mid-line
print(generate_seq(model, mapping, 10, 'king was i', 20))
# test not in original
print(generate_seq(model, mapping, 10, 'hello worl', 20))

Sing a song of sixpence, A poc
king was in his counting house
hello worl he iieng. ahn ting 
