In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# 1. Data Preparation (Synthetic for simplicity)
# Reverse a sequence of characters, e.g., "abc" -> "cba"

In [None]:
def generate_data(num_samples=10000, max_len=10):
    chars = 'abcdefghijklmnopqrstuvwxyz'
    input_texts = []
    target_texts = []
    for _ in range(num_samples):
        length = np.random.randint(1, max_len + 1)
        input_seq = ''.join(np.random.choice(list(chars), length))
        target_seq = input_seq[::-1]
        input_texts.append(input_seq)
        target_texts.append(target_seq)
    return input_texts, target_texts

input_texts, target_texts = generate_data()

In [3]:
input_texts

['hcttiypvd',
 'hlqjskwyh',
 'hxkjvnp',
 'ykorsuc',
 'hpxcsi',
 'a',
 'vtbfcze',
 'b',
 'hdj',
 'nxgkshfpa',
 'yeit',
 'hgcoq',
 'f',
 'yfd',
 'byrsrj',
 'nzuxonwg',
 'k',
 'jmtyphjz',
 'dsl',
 'rqt',
 'uhpcmuri',
 'vq',
 'ejlmwelbi',
 'lucmpwzjm',
 'ydgesbhm',
 'bqkyiffmn',
 'eg',
 'sxyvxea',
 'qhvmasnyvs',
 'sks',
 'm',
 'oigwpqhtcv',
 'jujg',
 'tx',
 'qs',
 'qlmdgjipik',
 'obyil',
 'lj',
 'nqurlup',
 'vwaaewa',
 'qy',
 'k',
 'v',
 'fszxyg',
 'fbh',
 'j',
 'zaujz',
 'cnonf',
 'xebmquj',
 'jgzcehkqu',
 'navdm',
 'm',
 'zt',
 'jc',
 'bmhkc',
 'kmu',
 'byveyzu',
 'phqzzuzsl',
 'em',
 'p',
 'sf',
 'ifcud',
 'bjiamaa',
 'ylbrpqgvg',
 'sm',
 'gxctptyx',
 'pq',
 'vxlhgcegz',
 'kpuje',
 'xsalowxi',
 'qlzcomudc',
 'tkjduirrez',
 'ddhvtmwtwa',
 'lm',
 'gjs',
 'odthigbwda',
 'ialm',
 'lzapi',
 'mrmhda',
 'g',
 'xrwofyqe',
 'ajy',
 'mvratn',
 'thdjnnungl',
 'lrvxvuw',
 'hqktxsov',
 'dvxywdij',
 'rclhf',
 'lyumg',
 'rvxfymq',
 'tbeut',
 'xgh',
 'hzuqxa',
 'sx',
 'ehpicjxeed',
 'f',
 'l',
 'r',
 '

In [4]:
# Add special tokens and create vocabulary
input_vocab = sorted(list(set("".join(input_texts))))
target_vocab = sorted(list(set("".join(target_texts))))

input_vocab_size = len(input_vocab) + 3  # +3 for <pad>, <start>, and <end>
target_vocab_size = len(target_vocab) + 3  # +3 for <pad>, <start>, and <end>


In [6]:
input_vocab_size, target_vocab_size

(29, 29)

In [7]:
input_token_index = dict([(char, i + 2) for i, char in enumerate(input_vocab)])
input_token_index["<pad>"] = 0
input_token_index["<start>"] = 1
input_token_index["<end>"] = len(input_vocab) + 2

In [9]:
target_token_index = dict([(char, i + 2) for i, char in enumerate(target_vocab)])
target_token_index["<pad>"] = 0
target_token_index["<start>"] = 1
target_token_index["<end>"] = len(target_vocab) + 2

reverse_target_token_index = dict((i, char) for char, i in target_token_index.items())

max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts]) + 1 # +1 for <end>

In [10]:
# Vectorize the data
def vectorize_sequences(texts, token_index, max_len):
    vectorized_data = np.zeros((len(texts), max_len), dtype='int32')
    for i, text in enumerate(texts):
        for t, char in enumerate(text):
            vectorized_data[i, t] = token_index[char]
    return vectorized_data

In [11]:
encoder_input_data = vectorize_sequences(input_texts, input_token_index, max_encoder_seq_length)
decoder_input_data = np.zeros((len(target_texts), max_decoder_seq_length), dtype='int32')
decoder_target_data = np.zeros((len(target_texts), max_decoder_seq_length, target_vocab_size), dtype='float32')

for i, target_text in enumerate(target_texts):
    decoder_input_data[i, 0] = target_token_index['<start>']
    for t, char in enumerate(target_text):
        decoder_input_data[i, t + 1] = target_token_index[char]
        decoder_target_data[i, t, target_token_index[char]] = 1.
    decoder_target_data[i, len(target_text), target_token_index['<end>']] = 1.

In [12]:
# 2. Build the Encoder-Decoder with Attention Model

# Encoder
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(enc_units, return_sequences=True, return_state=True)

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self, batch_sz):
        return tf.zeros((batch_sz, self.gru.units))

In [13]:
# Attention (Bahdanau Attention)
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # values encoder output shape == (batch_size, max_len, hidden size)

        # expand_dims to add time axis to query
        query_with_time_axis = tf.expand_dims(query, 1) # (batch_size, 1, hidden size)

        # score shape == (batch_size, max_len, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_len, units)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_len, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [14]:
# Decoder
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(dec_units, return_sequences=True, return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_len, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_wei

In [16]:
# Model Parameters
embedding_dim = 256
units = 512
batch_size = 64
epochs = 10

encoder = Encoder(input_vocab_size, embedding_dim, units)
decoder = Decoder(target_vocab_size, embedding_dim, units)

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


In [17]:
# 3. Training Step
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([target_token_index['<start>']] * batch_size, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [18]:
# 4. Training Loop

# Prepare dataset for training
BUFFER_SIZE = len(encoder_input_data)
dataset = tf.data.Dataset.from_tensor_slices((encoder_input_data, decoder_input_data)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(batch_size, drop_remainder=True)

for epoch in range(epochs):
    enc_hidden = encoder.initialize_hidden_state(batch_size)
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(len(encoder_input_data) // batch_size)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')

    print(f'Epoch {epoch+1} Loss {total_loss.numpy() / (len(encoder_input_data) // batch_size):.4f}')

NameError: in user code:

    File "C:\Users\sande\AppData\Local\Temp\ipykernel_18612\3002211202.py", line 16, in train_step  *
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
    File "d:\Project-to-learn\.machine\lib\site-packages\keras\src\utils\traceback_utils.py", line 122, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\sande\AppData\Local\Temp\ipykernel_18612\430463626.py", line 30, in call
        return x, state, attention_wei

    NameError: Exception encountered when calling Decoder.call().
    
    [1mname 'attention_wei' is not defined[0m
    
    Arguments received by Decoder.call():
      • x=tf.Tensor(shape=(64, 1), dtype=int32)
      • hidden=tf.Tensor(shape=(64, 512), dtype=float32)
      • enc_output=tf.Tensor(shape=(64, 10, 512), dtype=float32)
