In [1]:
import os
import shutil
import tensorflow as tf
import pickle
import numpy as np
import random as rnd

# set random seed
rnd.seed(32)

2023-09-06 21:25:04.859070: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
dirname = ''
filename = 'shakespeare_data.txt'

# Read the corpus, keeping every non-blank line with its leading and trailing
# whitespace removed. The encoding is passed explicitly so that decoding does
# not depend on the platform's default locale encoding.
with open(os.path.join(dirname, filename), encoding='utf-8') as data_file:
    lines = [stripped for stripped in (raw.strip() for raw in data_file) if stripped]

In [3]:
# Record how many non-blank lines were loaded and print two of them as a
# quick sanity check.
n_lines = len(lines)
print(f"Number of lines: {n_lines}")
print(f"Sample line at position 0 {lines[0]}")
print(f"Sample line at position 999 {lines[999]}")

Number of lines: 125097
Sample line at position 0 A LOVER'S COMPLAINT
Sample line at position 999 With this night's revels and expire the term


In [4]:
# Lowercase every line, writing the results back into the same list object
lines[:] = [text.lower() for text in lines]

print(f"Number of lines: {n_lines}")
print(f"Sample line at position 0 {lines[0]}")
print(f"Sample line at position 999 {lines[999]}")

Number of lines: 125097
Sample line at position 0 a lover's complaint
Sample line at position 999 with this night's revels and expire the term


In [4]:
# Hold out the last 1000 lines for validation and train on everything before.
# NOTE(review): `lines` is rebound to a slice of itself, so running this cell
# a second time removes another 1000 lines from the training data.
eval_lines = lines[-1000:] # Create a holdout validation set
lines = lines[:-1000] # Leave the rest for training

print(f"Number of lines for training: {len(lines)}")
print(f"Number of lines for validation: {len(eval_lines)}")

Number of lines for training: 124097
Number of lines for validation: 1000


<a name='1-2'></a>
### 1.2 - Convert a Line to Tensor

Now that we have our list of lines, we will convert each character in that list to a number. We can use Python's `ord` function to do it. 

Given a string representing one Unicode character, the `ord` function returns an integer representing the Unicode code point of that character.



In [5]:
# Show the Unicode code point that `ord` returns for a few sample characters
for sample_char in "abc xyz123":
    print(f"ord('{sample_char}'): {ord(sample_char)}")

ord('a'): 97
ord('b'): 98
ord('c'): 99
ord(' '): 32
ord('x'): 120
ord('y'): 121
ord('z'): 122
ord('1'): 49
ord('2'): 50
ord('3'): 51


<a name='ex-1'></a>
### line_to_tensor

Write a function that takes in a single line and transforms each character into its unicode integer.  This returns a list of integers, which we'll refer to as a tensor.
- Use a special integer to represent the end of the sentence (the end of the line).
- This will be the EOS_int (end of sentence integer) parameter of the function.
- Include the EOS_int as the last integer of the tensor.
- For this exercise, we will use the number `1` to represent the end of a sentence.

In [6]:
def line_to_tensor(line, EOS_int=1):
    """Encode a line of text as a list of integer code points.

    Args:
        line (str): The text to encode.
        EOS_int (int, optional): Marker appended after the last character to
            signal the end of the line. Defaults to 1.

    Returns:
        list: `ord` of every character in `line`, in order, followed by `EOS_int`.
    """
    # Map each character to its code point, then close the sequence with the marker
    return [ord(symbol) for symbol in line] + [EOS_int]

In [7]:
# Testing our output
line_to_tensor('abc xyz')

[97, 98, 99, 32, 120, 121, 122, 1]

<a name='1-3'></a>
### 1.3 - Batch Generator 

Most of the time in Natural Language Processing, and AI in general, we train on batches of data. Here, we will prepare the text so that it can later be split into batches of character sequences.
- Every line (sentence) is converted into a list of integers ending with the end-of-sentence integer, and all of these lists are concatenated into a single numpy array. That array is later cut into fixed-length sequences, so no zero padding is needed.

In [8]:
def stack_to_tensor(data_lines, line_to_tensor=line_to_tensor):
    """Encode every line and concatenate the results into one flat array.

    Args:
        data_lines (list): Lines of text to encode.
        line_to_tensor (function, optional): Converts a single line into a
            list of integers. Defaults to line_to_tensor.

    Returns:
        numpy.ndarray: 1-D int64 array holding the encoded lines back to back,
        in the order in which they appear in `data_lines`.
    """
    tensor_l = []
    for line in data_lines:
        # Each encoded line already carries its own end-of-line marker, so
        # the lines can simply be appended one after another.
        tensor_l.extend(line_to_tensor(line))

    # Pin the dtype: without it an empty `data_lines` would yield a float64
    # array instead of an integer one.
    return np.array(tensor_l, dtype=np.int64)

In [9]:
# Number of characters in each model input sequence
max_length = 100

# Encode the training lines into one flat integer array and wrap it in a
# dataset whose elements are the individual integers.
train_data_tensor = stack_to_tensor(data_lines=lines)
train_char_dataset = tf.data.Dataset.from_tensor_slices(train_data_tensor)

# Same treatment for the held-out validation lines
val_data_tensor = stack_to_tensor(data_lines=eval_lines)
val_char_dataset = tf.data.Dataset.from_tensor_slices(val_data_tensor)
val_char_dataset

2023-09-06 21:25:17.857279: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-06 21:25:17.857399: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-06 21:25:17.899520: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the 

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [10]:
# Group the character stream into chunks of max_length + 1 integers; the extra
# element lets each chunk be split into an input and a one-step-shifted target
# of max_length elements each. A trailing chunk that is too short is dropped.
train_sequences = train_char_dataset.batch(max_length + 1, drop_remainder=True)
val_sequences = val_char_dataset.batch(max_length + 1, drop_remainder=True)

In [11]:
train_sequences

<_BatchDataset element_spec=TensorSpec(shape=(101,), dtype=tf.int64, name=None)>

In [12]:
# Create the target text by left shift of one character
def split_input_target(chunk):
    """Split a chunk into an input sequence and its next-element target.

    Args:
        chunk: Sliceable sequence of encoded characters.

    Returns:
        tuple: (`chunk` without its last element, `chunk` without its first
        element), so that target[i] is the element following input[i].
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
train_dataset = train_sequences.map(split_input_target)
val_dataset = val_sequences.map(split_input_target)

In [13]:
# Create training batches
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possible infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# The validation data is neither shuffled nor truncated: its order does not
# affect the aggregated metrics, and keeping the final partial batch means
# every validation sequence is evaluated.
val_dataset = val_dataset.batch(BATCH_SIZE)

In [14]:
train_dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

<a name='2'></a>
## 2 - Defining the GRU Model

Now that we have the input and output tensors, we will go ahead and initialize our model. We will be implementing the `GRULM`, gated recurrent unit model.

In [15]:
# Model hyper-parameters.
# vocab_size bounds the integer ids the model accepts and predicts.
vocab_size = 256 # 256 characters
# Width of the character embedding (also passed as the GRU width when the
# model is built below).
embedding_dims = 1024
# NOTE(review): n_GRU is not referenced by the build_model definition or call
# that follow in this notebook.
n_GRU = 2

In [16]:
def build_model(vocab_size, embedding_dims, rnn_units, batch_size=None, n_gru=2):
    """Build a character-level language model: Embedding -> stacked GRUs -> Dense.

    Args:
        vocab_size (int): Number of distinct input ids; also the size of the
            output layer (one logit per id).
        embedding_dims (int): Size of the embedding vectors.
        rnn_units (int): Number of units in every GRU layer.
        batch_size (int, optional): Not used by this model; accepted so that
            existing calls passing it keep working.
        n_gru (int, optional): Number of stacked GRU layers. Defaults to 2.

    Returns:
        tf.keras.Sequential: The uncompiled model. Its Dense layer has no
        activation, so it outputs raw logits.
    """
    layers = [tf.keras.layers.Embedding(vocab_size, embedding_dims)]

    # Every GRU returns the full sequence so that the next layer (and finally
    # the Dense layer) receives one vector per time step.
    layers.extend(
        tf.keras.layers.GRU(rnn_units,
                            return_sequences=True,
                            recurrent_initializer="glorot_uniform")
        for _ in range(n_gru)
    )

    layers.append(tf.keras.layers.Dense(vocab_size))

    return tf.keras.Sequential(layers)

In [17]:
# Build the model from the hyper-parameters defined above; the GRU width is
# set equal to the embedding width.
model = build_model(
    vocab_size=vocab_size,
    embedding_dims=embedding_dims,
    rnn_units=embedding_dims,
    batch_size=BATCH_SIZE
)

In [18]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

# Train with Adam on the loss above and report accuracy alongside it
model.compile(loss=loss, optimizer="adam", metrics=["accuracy"])

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 1024)        262144    
                                                                 
 gru (GRU)                   (None, None, 1024)        6297600   
                                                                 
 gru_1 (GRU)                 (None, None, 1024)        6297600   
                                                                 
 dense (Dense)               (None, None, 256)         262400    
                                                                 
Total params: 13119744 (50.05 MB)
Trainable params: 13119744 (50.05 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


<a name='3'></a>
## 3 - Training

Now we are going to train our model. As usual, we have to define the cost function, the optimizer, and decide whether we will be training it on a `gpu` or `cpu`. We also have to feed in a built model.

In [20]:
# Train for up to 30 epochs, with an early-stopping callback (patience of 3)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=3)

model.fit(
    train_dataset,
    epochs=30,
    validation_data=val_dataset,
    callbacks=[early_stopping],
)

Epoch 1/30
 53/790 [=>............................] - ETA: 26:55 - loss: 3.4296 - accuracy: 0.1865

KeyboardInterrupt: 

## Evaluation

In [None]:
# Evaluate on the validation set.
# NOTE(review): this rebinds the name `loss`, which until now referred to the
# loss function passed to model.compile; after this cell it holds the result
# returned by evaluate instead.
loss, acc = model.evaluate(val_dataset)

In [None]:
# `loss` holds the result of model.evaluate(val_dataset), so the exponential
# of it is the perplexity on the validation set, not on the training set.
val_perplexity = tf.exp(loss)
val_perplexity

## Generate text

In [None]:
def generate_text(model, start_string, num_generate=40, temperature=1.0):
    """Sample a continuation of `start_string` from a character-level model.

    Args:
        model: Callable mapping an integer tensor of shape (1, length) to
            logits of shape (1, length, vocab_size). It is assumed to keep no
            state between calls.
        start_string (str): Prompt to continue. Must not be empty.
        num_generate (int, optional): Number of characters to sample.
            Defaults to 40.
        temperature (float, optional): Divides the logits before sampling;
            values below 1 make the output more predictable, values above 1
            more random. Must be positive. Defaults to 1.0.

    Returns:
        str: `start_string` followed by the sampled characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not positive.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    eos_int = 1

    # Encode the prompt the same way as the training lines (lowercased), but
    # drop the end-of-line marker that line_to_tensor appends: keeping it
    # would ask the model to continue after the line has already ended.
    token_ids = line_to_tensor(start_string.lower(), EOS_int=eos_int)[:-1]
    text_generated = []

    for _ in range(num_generate):
        # The model carries nothing over from the previous call, so the whole
        # sequence generated so far is fed in again to keep the context.
        logits = model(tf.expand_dims(token_ids, 0))

        # Only the prediction for the last position is needed
        last_logits = logits[:, -1, :] / temperature
        predicted_id = int(tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy())

        token_ids.append(predicted_id)
        # The end-of-line marker is not a printable character; show it as a
        # line break instead.
        text_generated.append("\n" if predicted_id == eos_int else chr(predicted_id))

    return start_string + ''.join(text_generated)

In [None]:
print(generate_text(model, start_string="How are"))

In [35]:
import tensorflow as tf
import numpy as np
import os
import time

In [36]:
dirname = ''
filename = 'shakespeare_data.txt'

# Read the corpus, keeping every non-blank line with its leading and trailing
# whitespace removed. The encoding is passed explicitly so that decoding does
# not depend on the platform's default locale encoding.
with open(os.path.join(dirname, filename), encoding='utf-8') as data_file:
    lines = [stripped for stripped in (raw.strip() for raw in data_file) if stripped]

In [37]:
def create_vocab(lines):
    """Build the character vocabulary of a collection of lines.

    Args:
        lines: Iterable of strings.

    Returns:
        tuple: (vocab, idx2char) where `vocab` maps each distinct character to
        an integer id assigned in order of first appearance (starting at 0),
        and `idx2char` is the list of characters indexed by that id.
    """
    # dict.fromkeys drops repeats while preserving first-seen order
    idx2char = list(dict.fromkeys(symbol for text in lines for symbol in text))
    vocab = {symbol: index for index, symbol in enumerate(idx2char)}
    return vocab, idx2char

In [38]:
vocab, idx2char = create_vocab(lines)
vocab, len(vocab)

({'A': 0,
  ' ': 1,
  'L': 2,
  'O': 3,
  'V': 4,
  'E': 5,
  'R': 6,
  "'": 7,
  'S': 8,
  'C': 9,
  'M': 10,
  'P': 11,
  'I': 12,
  'N': 13,
  'T': 14,
  'F': 15,
  'o': 16,
  'f': 17,
  'a': 18,
  'h': 19,
  'i': 20,
  'l': 21,
  'w': 22,
  's': 23,
  'e': 24,
  'c': 25,
  'n': 26,
  'v': 27,
  'm': 28,
  'b': 29,
  'r': 30,
  'd': 31,
  'p': 32,
  't': 33,
  'u': 34,
  'y': 35,
  'g': 36,
  ',': 37,
  '-': 38,
  ';': 39,
  'k': 40,
  '.': 41,
  'U': 42,
  'W': 43,
  ':': 44,
  'q': 45,
  'z': 46,
  'x': 47,
  'H': 48,
  'j': 49,
  'B': 50,
  '!': 51,
  'Y': 52,
  'D': 53,
  '?': 54,
  'K': 55,
  'G': 56,
  'J': 57,
  '\t': 58,
  '(': 59,
  ')': 60,
  '|': 61,
  '[': 62,
  ']': 63,
  'Q': 64,
  'Z': 65,
  '&': 66,
  'X': 67,
  '2': 68,
  '1': 69,
  '3': 70,
  '4': 71,
  '5': 72,
  '6': 73,
  '7': 74,
  '8': 75,
  '9': 76,
  '0': 77,
  '$': 78},
 79)

In [39]:
def create_tensor_data(lines, vocab):
    """Encode all lines as one flat array of vocabulary ids.

    Args:
        lines: Iterable of strings.
        vocab (dict): Maps each character to its integer id.

    Returns:
        numpy.ndarray: The ids of every character of every line, in order.
        Nothing is inserted between consecutive lines.

    Raises:
        KeyError: If a character is missing from `vocab`.
    """
    return np.array([vocab[symbol] for text in lines for symbol in text])

tensor_data = create_tensor_data(lines, vocab)
tensor_data

array([ 0,  1,  2, ..., 26, 33, 63])

In [40]:
# Number of characters in each model input sequence
seq_length = 100

# Stream the encoded characters one by one, then group them into chunks of
# seq_length + 1 so each chunk can be split into an input and a shifted
# target; a trailing chunk that is too short is dropped.
char_dataset = tf.data.Dataset.from_tensor_slices(tensor_data)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Return (input, target) where target is `chunk` shifted left by one.

    The input is every element of `chunk` except the last; the target is
    every element except the first.
    """
    all_but_last, all_but_first = slice(None, -1), slice(1, None)
    return chunk[all_but_last], chunk[all_but_first]

In [41]:
dataset = sequences.map(split_input_target)
dataset

<_MapDataset element_spec=(TensorSpec(shape=(100,), dtype=tf.int64, name=None), TensorSpec(shape=(100,), dtype=tf.int64, name=None))>

In [42]:
# Number of sequences per training batch
batch_size = 64
# Number of elements held in the shuffle buffer
buffer_size = 10000

# Shuffle the (input, target) pairs and group them into batches of exactly
# batch_size; a final smaller batch is dropped.
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [43]:
# Model hyper-parameters
vocab_size = len(vocab)   # one id per distinct character in the corpus
embedding_dim = 256       # size of each character embedding
rnn_units = 512           # units in every GRU layer
gru_units = 2             # number of stacked GRU layers (used as the loop count in build_model)

def build_model(vocab_size, embedding_dim, rnn_units, gru_units, batch_size):
    """Build a stateful character-level model: Embedding -> GRU stack -> Dense.

    Args:
        vocab_size (int): Number of distinct input ids and of output logits.
        embedding_dim (int): Size of the embedding vectors.
        rnn_units (int): Number of units in every GRU layer.
        gru_units (int): Number of GRU layers to stack.
        batch_size (int): Fixed batch size declared on the input layer.

    Returns:
        tf.keras.Sequential: The uncompiled model; the final Dense layer has
        no activation, so it outputs raw logits.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None]
    )

    # Identical GRU layers, each emitting one output per time step
    recurrent_stack = [
        tf.keras.layers.GRU(units=rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer="glorot_uniform")
        for _ in range(gru_units)
    ]

    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, *recurrent_stack, projection])

In [44]:
# Build the training model with the batch size used by `dataset` and show
# its layers.
model = build_model(vocab_size, embedding_dim, rnn_units, gru_units, batch_size)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (64, None, 256)           20224     
                                                                 
 gru_4 (GRU)                 (64, None, 512)           1182720   
                                                                 
 gru_5 (GRU)                 (64, None, 512)           1575936   
                                                                 
 dense_2 (Dense)             (64, None, 79)            40527     
                                                                 
Total params: 2819407 (10.76 MB)
Trainable params: 2819407 (10.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [45]:
def loss(y_true, y_pred):
    """Sparse categorical cross-entropy of logits `y_pred` against integer ids `y_true`."""
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(y_true, y_pred, from_logits=True)

In [46]:
# Train with Adam on the custom loss defined above; no extra metrics are tracked
model.compile(optimizer="adam", loss=loss)

In [47]:
# Directory that receives the checkpoints, and the per-epoch file name
# pattern (the `{epoch}` placeholder is left for the callback to fill in).
checkpoint_dir = './training_checkpoints_custom'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the model weights, not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

EPOCHS = 10

# Train and keep the returned history object for later inspection
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [49]:
# Locate the most recent checkpoint first, so a missing training run is
# reported clearly instead of failing inside load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; run the training cell first."
    )

# Rebuild the same architecture with a batch size of 1 (one prompt at a time
# during generation) and restore the trained weights into it.
model = build_model(vocab_size, embedding_dim, rnn_units, gru_units, batch_size=1)
model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [52]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Sample a continuation of `start_string` from a stateful character model.

    Args:
        model: Stateful Keras model built for a batch size of 1 that maps a
            (1, length) tensor of vocabulary ids to (1, length, vocab_size)
            logits.
        start_string (str): Prompt to continue. Must be non-empty and contain
            only characters present in the module-level `vocab`.
        num_generate (int, optional): Number of characters to sample.
            Defaults to 1000.
        temperature (float, optional): Divides the logits before sampling;
            values below 1 make the output more predictable, values above 1
            more random. Must be positive. Defaults to 1.0.

    Returns:
        str: `start_string` followed by the sampled characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not positive.
        KeyError: If `start_string` contains a character missing from `vocab`.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    unknown_chars = [ch for ch in dict.fromkeys(start_string) if ch not in vocab]
    if unknown_chars:
        raise KeyError(
            f"start_string contains characters missing from the vocabulary: {unknown_chars}"
        )

    input_eval = tf.expand_dims([vocab[s] for s in start_string], 0)
    text_generated = []

    # Start from a clean recurrent state so earlier calls do not leak in
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)

        # Only the prediction for the last position is needed
        last_logits = predictions[:, -1, :] / temperature
        predicted_id = int(tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy())

        # The layers are stateful, so the context so far lives in the model;
        # only the newly sampled character has to be fed in next.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [53]:
# Sample a continuation of the prompt and print it.
# NOTE(review): the prompt is spelled "ROMMEO" — possibly a typo for "ROMEO"; confirm.
print(generate_text(model, start_string=u"ROMMEO: "))

ROMMEO: this was more wit![To Yorkill honour from England,But this antic thy belly--for there are gates religious fortune.GLOUCESTER	And let the full maid lose. How my handpet any of the helpingly, as you understand thyself,That it shall plot.Bring de enlord of me; I did beseech you, sir, if best doth lend mightBut 'tis made but such shores, being if you killTan are bound I have no gates of me about theing love: and in 'll reap your pillow.[Music striken]CLIFFORD	[Seats dinder this hand, an you think I warrantShe more than made the name of flesh, hatick so Boult for it.Messenger	Repent to be father that of deathing stocks the' how more thieves.Hence! a know me a nerve in malice, de cast.TRANIO	Sawyou, further water! thou art good five lord to heaven!ANTIPHOLUSOF SYRACUSE	Give me the western hidden knight in the cred cubb of Pease.Virtue that bless thee! must thou wouldst contentMy suit diffird with oaths, the king bestrew the earthWelcomere he is, for thy summanded foesWere so quick no