In [3]:
import numpy as np
import pandas as pd
import pickle
import os
from string import punctuation
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

In [5]:
# Load the corpus as a single string, skipping lines that consist solely of digits.
# Lines read from a file keep their trailing newline and '12\n'.isdigit() is False,
# so each line is stripped before the test; otherwise nothing would be filtered.
with open('poems.txt', encoding='utf8') as f:
    poems = ''.join(line for line in f if not line.strip().isdigit())

In [6]:
# Vocabulary: every distinct character of the corpus, joined in sorted order.
vocab = ''.join(sorted({ch for ch in poems}))

In [7]:
# Two-way lookup between a character and its position in `vocab`.
char2int = dict(zip(vocab, range(len(vocab))))
int2char = dict(enumerate(vocab))

In [8]:
# Persist both lookup tables. The context managers guarantee each file is
# flushed and closed as soon as its dump finishes.
with open('char2int.pickle', 'wb') as char2int_file:
    pickle.dump(char2int, char2int_file)
with open('int2char.pickle', 'wb') as int2char_file:
    pickle.dump(int2char, int2char_file)

In [9]:
# Translate the whole corpus into an array of integer character ids.
encoded_text = np.array(list(map(char2int.__getitem__, poems)))

In [10]:
# Wrap the encoded corpus in a tf.data.Dataset; each element is one entry of encoded_text.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [11]:
# Sanity check: show the first ten ids next to the characters they decode to.
for char in char_dataset.take(10):
    char_id = char.numpy()
    print(char_id, int2char[char_id])

45 S
60 h
57 e
2  
55 c
67 o
73 u
64 l
56 d
66 n


In [12]:
# Training configuration used by the cells below.
EPOCHS = 30            # value passed as `epochs` to model.fit
BATCH_SIZE = 256       # samples per training batch
sequence_length = 100  # number of characters in each model input

In [13]:
# Group consecutive character ids into chunks of 2*sequence_length + 1 ids;
# drop_remainder discards the final chunk if it is shorter than that.
sequences = char_dataset.batch(2*sequence_length + 1, drop_remainder = True)

In [14]:
# Decode and print the first chunk to confirm the ids map back to readable text.
for sequence in sequences.take(1):
    decoded_chunk = ''.join(int2char[char_id] for char_id in sequence.numpy())
    print(decoded_chunk)

She couldn't help but sting my finger,

clinging a moment before I flung her

to the ground. Her gold is true, not the trick

evening light plays on my roses.

She curls into herself, stinger twitching


In [15]:
def split_sample(sample):
    """Turn one chunk of character ids into a dataset of (input, target) pairs.

    The window starting at position ``i`` pairs the ``sequence_length`` ids
    beginning at ``i`` with the single id that immediately follows them.
    A window is emitted for ``i = 0`` and then for every ``i`` in
    ``range(1, (len(sample) - 1) // 2)``, in that order.

    Args:
        sample: one chunk of character ids taken from ``sequences``.

    Returns:
        A ``tf.data.Dataset`` of ``(input, target)`` tuples.
    """
    def window_at(start):
        # Single-element dataset holding the window that begins at `start`.
        stop = start + sequence_length
        return tf.data.Dataset.from_tensors((sample[start:stop], sample[stop]))

    windows = window_at(0)
    for start in range(1, (len(sample) - 1) // 2):
        windows = windows.concatenate(window_at(start))
    return windows

In [16]:
# Expand every chunk into its (input, target) samples and flatten them into one dataset.
dataset = sequences.flat_map(split_sample)

In [17]:
def one_hot_samples(input_, target):
    """One-hot encode an (input, target) pair over the vocabulary.

    Args:
        input_: character ids of the input window.
        target: character id that follows the window.

    Returns:
        Tuple ``(encoded_input, encoded_target)`` where both parts are one-hot
        encoded with depth ``len(vocab)``.
    """
    depth = len(vocab)
    encoded_input = tf.one_hot(input_, depth)
    encoded_target = tf.one_hot(target, depth)
    return encoded_input, encoded_target

In [18]:
# Replace the integer ids of every sample with one-hot vectors over the vocabulary.
dataset = dataset.map(one_hot_samples)

In [19]:
# Decode the first two samples back to text and report their tensor shapes.
for element in dataset.take(2):
    features, label = element
    decoded_input = ''.join(int2char[np.argmax(char_vector)] for char_vector in features.numpy())
    decoded_target = int2char[np.argmax(label.numpy())]
    print('Input:', decoded_input)
    print('Target:', decoded_target)
    print('Input shape:', features.shape)
    print('Target shape:', label.shape)
    print('='*50, '\n')

Input: She couldn't help but sting my finger,

clinging a moment before I flung her

to the ground. Her gol
Target: d
Input shape: (100, 99)
Target shape: (99,)

Input: he couldn't help but sting my finger,

clinging a moment before I flung her

to the ground. Her gold
Target:  
Input shape: (100, 99)
Target shape: (99,)



In [20]:
# Training pipeline: repeat the samples indefinitely, shuffle through a
# 1024-element buffer, then emit full batches of BATCH_SIZE samples.
# NOTE(review): repeat() comes before shuffle(), so the shuffle buffer can mix
# samples from consecutive passes over the data — confirm this is intended.
ds = dataset.repeat().shuffle(1024).batch(BATCH_SIZE, drop_remainder = True)

In [21]:
# Two stacked LSTM layers with dropout in between, followed by a softmax over
# the vocabulary that predicts the next character.
model = Sequential()
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(256, input_shape = (sequence_length, len(vocab)), return_sequences = True))
model.add(Dropout(0.3))
model.add(LSTM(256))
model.add(Dense(len(vocab), activation = 'softmax'))

In [22]:
# Place the saved model inside the 'results' directory that the training cell
# creates; the bare file name would write it to the working directory instead
# and leave that directory unused.
model_weights_path = os.path.join('results', f'{sequence_length}.h5')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 100, 256)          364544    
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               525312    
                                                                 
 dense (Dense)               (None, 99)                25443     
                                                                 
Total params: 915,299
Trainable params: 915,299
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [24]:
# Create the output directory; exist_ok avoids the separate existence check
# and does not fail when the directory is already there.
os.makedirs('results', exist_ok=True)

# `ds` repeats forever, so steps_per_epoch decides how many batches count as one epoch.
# NOTE(review): this step count assumes one sample per character position, while
# split_sample emits fewer samples per chunk, so an "epoch" here covers more than
# one pass over the data — confirm this is intended.
model.fit(
    ds,
    steps_per_epoch = (len(encoded_text) - sequence_length) // BATCH_SIZE,
    epochs = EPOCHS,
)
model.save(model_weights_path)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
