In [19]:
import numpy as np

import tensorflow as tf

In [6]:
# Read the raw text dump into memory. The encoding is pinned so that decoding
# does not depend on the platform's locale default.
# NOTE(review): assumes the dump is UTF-8 encoded - confirm against the file.
with open('../datasets/mltensor.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [7]:
# Keep only the span from the start marker up to (but excluding) the end marker.
start_indx = text.find('Python Machine Learning')
end_indx = text.find('[ 741 ]')

# str.find returns -1 when a marker is missing, which would silently produce a
# wrong slice. This happens, for example, when this cell is executed a second
# time: `text` has already been trimmed and no longer contains the end marker.
if start_indx == -1 or end_indx == -1:
    raise ValueError(
        'Start/end marker not found in text; re-read the source file '
        'before running this cell again.')

text = text[start_indx:end_indx]
char_set = set(text)

print('Total Length:', len(text))

Total Length: 1252868


In [9]:
# Report how many distinct characters occur in the trimmed text.
num_unique_chars = len(char_set)
print('Unique Characters:', num_unique_chars)

Unique Characters: 272


In [10]:
# Sort the character set so that every character gets a stable position.
chars_sorted = sorted(char_set)
# Forward lookup: character -> integer id (its position in the sorted list).
char2int = dict(zip(chars_sorted, range(len(chars_sorted))))
# Reverse lookup: indexing this array with an id yields the character.
char_array = np.array(chars_sorted)

In [11]:
# Translate every character of the text into its integer id (int32 array).
text_encoded = np.array(list(map(char2int.__getitem__, text)), dtype=np.int32)
print("Text encoded shape: ", text_encoded.shape)

Text encoded shape:  (1252868,)


In [14]:
# Sanity check: the first 15 characters next to their integer ids.
head_chars, head_ids = text[:15], text_encoded[:15]
print(head_chars, '=== Encoded ==> ', head_ids)

Python Machine  === Encoded ==>  [50 90 85 73 80 79  2 47 66 68 73 74 79 70  2]


In [17]:
# Sanity check in the other direction: decode ids 15..34 back into characters.
ids_slice = text_encoded[15:35]
print(ids_slice, '=== reverse ==>', ''.join(char_array[ids_slice]))

[46 70 66 83 79 74 79 72  1 54 73 74 83 69  2 39 69 74 85 74] === reverse ==> Learning
Third Editi


In [21]:
# Build a dataset whose elements are the individual character ids.
ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)

# Peek at the first five elements and map each id back to its character.
for example in ds_text_encoded.take(5):
    char_id = example.numpy()
    print('{} --> {}'.format(char_id, char_array[char_id]))

50 --> P
90 --> y
85 --> t
73 --> h
80 --> o


In [22]:
# A chunk holds seq_len + 1 ids: one more than the input length, so that the
# input and the one-step-shifted target can both be cut from the same chunk.
seq_len = 50
chunk_size = seq_len + 1

# Group consecutive ids into fixed-size chunks; an incomplete final chunk is
# dropped.
ds_chunks = ds_text_encoded.batch(chunk_size, drop_remainder=True)

In [23]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    Args:
        chunk: Any sliceable sequence.

    Returns:
        A tuple ``(input_seq, target_seq)``: ``input_seq`` is ``chunk`` without
        its last element and ``target_seq`` is ``chunk`` without its first
        element.
    """
    return chunk[:-1], chunk[1:]

# Apply split_input_target to every chunk, yielding (input, target) pairs.
ds_sequences = ds_chunks.map(split_input_target)

In [24]:
# Decode the first two (input, target) pairs back to text as a sanity check.
# Each pair is unpacked into its own names instead of a loop variable called
# `sample`: that name is also used by the text-generation function defined
# later in this notebook, and re-running this cell would overwrite it.
for input_ids, target_ids in ds_sequences.take(2):
    print('  Input (x) : ', repr(''.join(char_array[input_ids.numpy()])))
    print('  Target (y)  : ', repr(''.join(char_array[target_ids.numpy()])))
    print()

  Input (x) :  'Python Machine Learning\nThird Edition\nMachine Lear'
  Target (y)  :  'ython Machine Learning\nThird Edition\nMachine Learn'

  Input (x) :  'ing and Deep Learning with Python,\nscikit-learn, a'
  Target (y)  :  'ng and Deep Learning with Python,\nscikit-learn, an'



In [25]:
# Number of sequences per mini-batch and size of the shuffle buffer.
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs, then group them into mini-batches.
ds = (
    ds_sequences
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

#### Building a character-level RNN model.

In [26]:
def build_model(vocab_size, embedding_dim, rnn_units):
    """Assemble the character-level model: Embedding -> LSTM -> Dense.

    Args:
        vocab_size: Number of distinct ids; used both as the embedding input
            size and as the width of the final Dense layer.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the LSTM layer.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The LSTM returns the full
        sequence, and the Dense layer is created without an activation
        argument.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
    model.add(tf.keras.layers.LSTM(rnn_units, return_sequences=True))
    model.add(tf.keras.layers.Dense(vocab_size))
    return model

In [28]:
# Set training parameters.
# Vocabulary size: one id per entry of char_array.
charset_size = len(char_array)
# Size of each embedding vector (passed to build_model).
embedding_dim = 256
# Number of LSTM units (passed to build_model).
rnn_units = 512

In [29]:
# Seed TensorFlow's global random generator before the model is created.
tf.random.set_seed(42)

# Instantiate the network with the parameters chosen above and show its layers.
model = build_model(charset_size, embedding_dim, rnn_units)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         69632     
                                                                 
 lstm (LSTM)                 (None, None, 512)         1574912   
                                                                 
 dense (Dense)               (None, None, 272)         139536    
                                                                 
Total params: 1784080 (6.81 MB)
Trainable params: 1784080 (6.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [30]:
# from_logits=True: the loss is given the model's raw outputs, not probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn)

In [31]:
# Train on the shuffled, batched dataset for 20 epochs.
model.fit(ds, epochs=20)

Epoch 1/20
Epoch 2/20

#### Evaluation phase – generating new text passages.

In [None]:
# Seed the global generator so the sampling demo below is repeatable.
tf.random.set_seed(42)

# One row of three equal logits.
logits = [[1.0, 1.0, 1.0]]
probabilities = tf.math.softmax(logits).numpy()[0]
print('Probabilities: ', probabilities)

In [None]:
# Draw 10 class indices from the categorical distribution given by `logits`.
samples = tf.random.categorical(logits, num_samples=10)
tf.print(samples.numpy())

In [None]:
def sample(model, starting_str,
           len_generated_txt=700, 
           max_input_length=50, scale_factor=1.0):
    encoded_input = [char2int[s] for s in starting string]
    encoded_input = tf.reshape(encoded_input, (1, -1))
    
    generated_str = starting_str
    model.reset_states()
    for i in range(len_generated_txt):
        logits = model(encoded_input)
        logits = tf.squeeze(logits, 0)
        
        scaled_logits = logits * scale_factor
        new_char_idx = tf.random.categorical(scaled_logits, num_samples=1)
        new_char_idx = tf.squeeze(new_char_idx)[0].numpy()