In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import random
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
import nltk

# Fix the global random seed once, before any data shuffling or weight
# initialisation happens, so repeated runs are comparable.
tf.keras.utils.set_random_seed(1234)

# Length every tokenized sentence is padded to. preprocess() keeps only
# sentences of at most MAX_SENTENCE_LEN-2 words.
MAX_SENTENCE_LEN = 40


In [None]:
def clean_text(text):
    """Normalize a raw utterance into lowercase, space-separated tokens.

    Steps, in order:
      1. lowercase and strip the text;
      2. surround ``? . ! ,`` with spaces so each becomes its own token;
      3. expand common English contractions ("i'm" -> "i am",
         "won't" -> "will not", ...);
      4. restore a dropped final "g" ("goin'" -> "going");
      5. replace every run of characters other than letters and ``? . ! ,``
         with a single space.

    Args:
        text: Raw sentence as a string.

    Returns:
        The cleaned sentence, without leading or trailing whitespace.
    """
    text = text.lower().strip()
    # Separate ?.!, from the neighbouring words with spaces
    text = re.sub(r"([?.!,])", r" \1 ", text)
    # Collapse runs of spaces (the class also matches double quotes) into one space
    text = re.sub(r'[" "]+', " ", text)

    # Applied in order: specific contractions must run before the generic
    # suffix rules ("won't"/"can't" before "n't", "n't" before "n'").
    contractions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"there's", "there is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        # Only a word-final "n'" is a dropped "g" (goin' -> going). Without the
        # lookahead, possessives such as "john's" were turned into "johngs".
        (r"n'(?![a-z])", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Everything that is not a letter or one of ?.!, becomes a single space
    text = re.sub(r"[^a-zA-Z?.!,]+", " ", text)

    # Remove leading/trailing spaces introduced by the substitutions
    return text.strip()

In [None]:
def preprocess(movie_lines, movie_convs, split_ratio, start_tok, end_tok, subword=False):
    """Turn the raw corpus lines into shuffled train/test sentence pairs.

    Args:
        movie_lines: Iterable of raw line records; fields are separated by
            " +++$+++ ", field 0 is the line id and field 4 the utterance.
        movie_convs: Iterable of raw conversation records whose last field is
            a bracketed, quoted list of line ids.
        split_ratio: Fraction of the kept pairs that goes to the training set.
        start_tok: Marker prepended to every sentence when ``subword`` is falsy.
        end_tok: Marker appended to every sentence when ``subword`` is falsy.
        subword: When truthy, sentences are returned without the markers.

    Returns:
        ``(train_input, train_responses), (test_input, test_responses)`` where
        each element is a list of cleaned sentences.
    """
    field_sep = " +++$+++ "

    # line id -> utterance text
    id_to_text = {}
    for record in movie_lines:
        if len(record) == 0:
            continue
        fields = record.split(field_sep)
        id_to_text[fields[0]] = fields[4]

    # every conversation becomes an ordered list of line ids
    conversations = [
        record.split(field_sep)[-1][1:-1].strip("'").split("', '")
        for record in movie_convs
        if len(record) != 0
    ]

    # each pair of consecutive lines is one (prompt, reply) example
    prompts, replies = [], []
    for line_ids in conversations:
        for prompt_id, reply_id in zip(line_ids[:-1], line_ids[1:]):
            prompts.append(clean_text(id_to_text[prompt_id]))
            replies.append(clean_text(id_to_text[reply_id]))

    # keep only pairs where both sides have at most MAX_SENTENCE_LEN-2 words
    word_limit = MAX_SENTENCE_LEN - 2
    filtered_input, filtered_response = [], []
    for prompt, reply in zip(prompts, replies):
        if len(prompt.split()) > word_limit or len(reply.split()) > word_limit:
            continue
        if subword:
            filtered_input.append(prompt)
            filtered_response.append(reply)
        else:
            filtered_input.append(start_tok + " " + prompt + " " + end_tok)
            filtered_response.append(start_tok + " " + reply + " " + end_tok)

    pair_count = len(filtered_input)
    training_size = int(pair_count * split_ratio)

    # shuffle prompts and replies with one shared permutation
    order = np.arange(pair_count)
    random.shuffle(order)
    shuffled_input = [filtered_input[k] for k in order]
    shuffled_response = [filtered_response[k] for k in order]

    train_split = (shuffled_input[:training_size], shuffled_response[:training_size])
    test_split = (shuffled_input[training_size:], shuffled_response[training_size:])
    return train_split, test_split

In [None]:
def tokenize(train_inputs, train_outputs, test_inputs, test_outputs, oov_tok, num_words):
    """Fit a word-level Tokenizer on the training text and encode all splits.

    Args:
        train_inputs, train_outputs: Lists of training sentences; the tokenizer
            is fitted on their concatenation.
        test_inputs, test_outputs: Lists of held-out sentences (encoded only).
        oov_tok: Token used for out-of-vocabulary words.
        num_words: Vocabulary cap passed to the Tokenizer, or None for no cap.

    Returns:
        ``(train_in, train_out), (test_in, test_out), tokenizer`` where the
        four arrays are post-padded to MAX_SENTENCE_LEN.
    """
    tokenizer_kwargs = {"oov_token": oov_tok, "lower": False}
    if num_words is not None:
        # NOTE(review): only this branch overrides `filters`; with
        # num_words=None the Tokenizer's default filter set applies, which
        # presumably also strips the ? . ! , that clean_text keeps - confirm
        # that the difference between the two branches is intended.
        tokenizer_kwargs["num_words"] = num_words
        tokenizer_kwargs["filters"] = '"#$%&()*+-/:;<=>@[\\]^_`{|}~\t\n'
    tokenizer = Tokenizer(**tokenizer_kwargs)

    tokenizer.fit_on_texts(train_inputs+train_outputs)

    def encode(sentences):
        # sentences -> id sequences -> fixed-length, post-padded array
        sequences = tokenizer.texts_to_sequences(sentences)
        return pad_sequences(sequences, padding="post", maxlen=MAX_SENTENCE_LEN)

    train_pair = (encode(train_inputs), encode(train_outputs))
    test_pair = (encode(test_inputs), encode(test_outputs))
    return train_pair, test_pair, tokenizer

In [None]:
def sub_tokenize(train_inputs, train_outputs, test_inputs, test_outputs, oov_tok, num_words):
    """Build a subword encoder on the training text and encode all splits.

    Args:
        train_inputs, train_outputs: Lists of training sentences used to build
            the subword vocabulary.
        test_inputs, test_outputs: Lists of held-out sentences (encoded only).
        oov_tok: Not used by this function; kept so the signature mirrors
            ``tokenize``.
        num_words: Target vocabulary size for the subword encoder.

    Returns:
        ``(train_in, train_out), (test_in, test_out), tokenizer`` where the
        four arrays are post-padded to MAX_SENTENCE_LEN.
    """
    tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        train_inputs+train_outputs, target_vocab_size=num_words)

    # The two ids right after the encoder's own vocabulary mark sentence start/end
    start_id = [tokenizer.vocab_size]
    end_id = [tokenizer.vocab_size + 1]

    def wrap(sentence):
        return start_id + tokenizer.encode(sentence) + end_id

    def pad(sequences):
        return pad_sequences(sequences, padding="post", maxlen=MAX_SENTENCE_LEN)

    # Both sides of a split are indexed by the length of the *input* list
    train_count, test_count = len(train_inputs), len(test_inputs)
    train_in = [wrap(train_inputs[k]) for k in range(train_count)]
    train_out = [wrap(train_outputs[k]) for k in range(train_count)]
    test_in = [wrap(test_inputs[k]) for k in range(test_count)]
    test_out = [wrap(test_outputs[k]) for k in range(test_count)]

    train_pair = (pad(train_in), pad(train_out))
    test_pair = (pad(test_in), pad(test_out))
    return train_pair, test_pair, tokenizer

In [None]:
def train_dataset(train_in, train_out, batch_size):
    """Wrap the padded training arrays in a cached, batched, prefetched tf.data pipeline.

    The decoder is fed ``train_out`` without its last column and is trained to
    predict ``train_out`` without its first column (the same sequence shifted
    by one position).

    Args:
        train_in: 2-D array of encoder token ids.
        train_out: 2-D array of decoder token ids.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` dict pairs.
    """
    features = {"encoder_in": train_in, "decoder_in": train_out[:, :-1]}
    labels = {"outputs": train_out[:, 1:]}

    return (
        tf.data.Dataset.from_tensor_slices((features, labels))
        .cache()
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

In [None]:
def create_pad_mask(input):
    """Return a float mask that is 1.0 wherever ``input`` equals 0 and 0.0 elsewhere.

    Two singleton axes are inserted right after the first axis, so a 2-D
    ``input`` yields a 4-D mask that broadcasts against per-head scores.
    """
    mask = tf.cast(tf.math.equal(input, 0), tf.float32)
    for _ in range(2):
        mask = tf.expand_dims(mask, axis=1)
    return mask

def create_look_ahead_mask(input):
    """Return a mask that hides both padding and later positions.

    The result is the element-wise maximum of the padding mask and a strictly
    upper-triangular matrix of ones, so an entry is 1.0 when the key position
    is padding or lies after the query position.
    """
    length = tf.shape(input)[1]

    # ones on and below the diagonal -> positions that may be attended to
    visible = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
    future_positions = 1 - visible

    return tf.maximum(create_pad_mask(input), future_positions)


In [None]:
class multiHeadAttn_layer(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention with learned Q/K/V/output projections.

    ``embedding_dim`` is split evenly across ``num_heads`` heads; the heads are
    folded into the batch axis for the matrix products and unfolded afterwards.
    """

    def __init__(self, num_heads, embedding_dim, **kwargs):
        # every head must receive the same number of features
        assert embedding_dim%num_heads == 0

        super().__init__(**kwargs)

        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.embedding_dim_per_head = self.embedding_dim // self.num_heads

        self.query_transform = tf.keras.layers.Dense(self.embedding_dim)
        self.key_transform = tf.keras.layers.Dense(self.embedding_dim)
        self.value_transform = tf.keras.layers.Dense(self.embedding_dim)

        # swaps the two axes that follow the batch axis
        self.permute = tf.keras.layers.Permute((2, 1, 3))
        self.dense = tf.keras.layers.Dense(self.embedding_dim)

    def get_config(self):
        """Return the layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {**base_config, "num_heads": self.num_heads, "embedding_dim": self.embedding_dim}

    def _to_heads(self, projected, batch_size, seq_len):
        """Reshape (batch, seq, dim) into (batch*heads, seq, dim_per_head)."""
        split = tf.reshape(projected, (batch_size, seq_len, self.num_heads, self.embedding_dim_per_head))
        split = self.permute(split)
        return tf.reshape(split, (batch_size*self.num_heads, seq_len, self.embedding_dim_per_head))

    def call(self, query, key, value, mask):
        """Attend from ``query`` over ``key``/``value``.

        ``mask`` must broadcast to (batch, heads, query_len, key_len); entries
        equal to 1 receive a -1e9 penalty before the softmax.
        """
        batch_size = tf.shape(query)[0]
        q_seq_len = tf.shape(query)[1]
        k_seq_len = tf.shape(key)[1]

        # linear projections
        projected_q = self.query_transform(query)
        projected_k = self.key_transform(key)
        projected_v = self.value_transform(value)

        # one row of the batch axis per (example, head)
        q_heads = self._to_heads(projected_q, batch_size, q_seq_len)
        k_heads = self._to_heads(projected_k, batch_size, k_seq_len)
        v_heads = self._to_heads(projected_v, batch_size, k_seq_len)

        # scaled similarity between every query and key position
        scale = tf.math.sqrt(tf.cast(self.embedding_dim_per_head, dtype=tf.float32))
        scores = tf.matmul(q_heads, k_heads, transpose_b=True)/ scale

        # the mask is laid out per example, so unfold the head axis to apply it
        scores = tf.reshape(scores, (batch_size, self.num_heads, q_seq_len, k_seq_len))
        scores += mask * -1e9
        scores = tf.reshape(scores, (batch_size*self.num_heads, q_seq_len, k_seq_len))

        attn_weights = tf.nn.softmax(scores, axis=-1)
        context = tf.matmul(attn_weights, v_heads)

        # undo the head folding: back to (batch, query_len, embedding_dim)
        context = tf.reshape(context, (batch_size, self.num_heads, q_seq_len, self.embedding_dim_per_head))
        context = self.permute(context)
        context = tf.reshape(context, (batch_size, q_seq_len, self.embedding_dim))

        # final linear projection
        return self.dense(context)

In [None]:
class PositionalEncoding_layer(tf.keras.layers.Layer):
    """Adds interleaved sine/cosine position signals to its input.

    ``max_len`` is the base of the geometric progression of wavelengths
    (``max_len ** (2i / embedding_dim)``).
    """

    def __init__(self, embedding_dim, max_len=10000, **kwargs):
        super().__init__(**kwargs)

        self.embedding_dim = embedding_dim
        self.max_len = max_len

    def get_config(self):
        """Return the layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {**base_config, "embedding_dim": self.embedding_dim, "max_len": self.max_len}

    def call(self, input):
        """Return ``(input + encoding, encoding)``; both carry the batch axis of ``input``."""
        batch_size = tf.shape(input)[0]
        seq_len = tf.shape(input)[1]

        # one divisor per sin/cos pair
        exponents = tf.range(self.embedding_dim, delta=2, dtype=tf.float32)/self.embedding_dim
        divisors = self.max_len**exponents
        positions = tf.range(seq_len, dtype=tf.float32)

        # angle[p, i] = position p / divisor i
        angles = tf.expand_dims(positions, axis=1) / tf.expand_dims(divisors, axis=0)

        # stacking on a new last axis and flattening interleaves sin and cos
        table = tf.stack([tf.sin(angles), tf.cos(angles)], axis=-1)
        table = tf.reshape(table, (seq_len, self.embedding_dim))

        # the same table is used for every example in the batch
        postn_encoding = tf.repeat(tf.expand_dims(table, axis=0), repeats=batch_size, axis=0)

        return input + postn_encoding, postn_encoding


In [None]:

def postn_encoding_check():
    """Plot the positional encoding of a dummy (2, 10, 64) batch as a heat map."""
    dummy_batch = tf.ones((2, 10, 64))
    encoding = PositionalEncoding_layer(64)(dummy_batch)[1]

    # the encoding is identical for every example, so show the first one only
    plt.pcolormesh(encoding[0])
    plt.ylabel("Position")
    plt.xlabel("Depth")
    plt.xlim((0, 64))
    plt.colorbar()
    plt.show()


In [None]:
class feed_forward_network(tf.keras.layers.Layer):
    """Two-layer position-wise network: Dense(num_units, relu) followed by Dense(embedding_dim)."""

    def __init__(self, embedding_dim, num_units, **kwargs):
        super().__init__(**kwargs)

        self.embedding_dim = embedding_dim
        self.num_units = num_units

        self.dense1 = tf.keras.layers.Dense(self.num_units, activation=tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(self.embedding_dim)

    def get_config(self):
        """Return the layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {**base_config, "embedding_dim": self.embedding_dim, "num_units": self.num_units}

    def call(self, input):
        """Expand to ``num_units`` features, then project back to ``embedding_dim``."""
        hidden = self.dense1(input)
        return self.dense2(hidden)

In [None]:
class encoder_layer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network, each
    followed by dropout, a residual connection and layer normalisation.

    The same Dropout, Add and LayerNormalization instances are used after both
    sub-layers.
    """

    def __init__(self, embedding_dim, num_heads, num_dense_units, dropout_rate, **kwargs):
        super().__init__(**kwargs)

        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.num_dense_units = num_dense_units
        self.dropout_rate = dropout_rate

        self.multiheadAttn = multiHeadAttn_layer(self.num_heads, self.embedding_dim)
        self.feed_forward = feed_forward_network(self.embedding_dim, self.num_dense_units)
        self.dropout = tf.keras.layers.Dropout(self.dropout_rate)
        self.add = tf.keras.layers.Add()
        self.layernorm = tf.keras.layers.LayerNormalization()

    def get_config(self):
        """Return the layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "embedding_dim": self.embedding_dim,
            "num_heads": self.num_heads,
            "num_dense_units": self.num_dense_units,
            "dropout_rate": self.dropout_rate,
        }

    def _residual_norm(self, skip, sublayer_out):
        """Apply dropout to ``sublayer_out``, add ``skip`` and normalise."""
        return self.layernorm(self.add([skip, self.dropout(sublayer_out)]))

    def call(self, input, mask):
        """Run the block on ``input``; ``mask`` is forwarded to the attention layer."""
        attended = self._residual_norm(input, self.multiheadAttn(input, input, input, mask))
        return self._residual_norm(attended, self.feed_forward(attended))


In [None]:
class Encoder(tf.keras.layers.Layer):
    """Stack of ``num_encoding_layers`` encoder blocks followed by a final layer normalisation."""

    def __init__(self, num_encoding_layers, embedding_dim, num_heads, num_dense_units, dropout_rate, **kwargs):
        super().__init__(**kwargs)
        self.num_encoding_layers =  num_encoding_layers
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.num_dense_units = num_dense_units
        self.dropout_rate = dropout_rate

        self.encoder_layers_list = []
        for _ in range(self.num_encoding_layers):
            self.encoder_layers_list.append(
                encoder_layer(self.embedding_dim, self.num_heads, self.num_dense_units, self.dropout_rate)
            )
        self.layernorm = tf.keras.layers.LayerNormalization()

    def get_config(self):
        """Return the layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "num_encoding_layers": self.num_encoding_layers,
            "embedding_dim": self.embedding_dim,
            "num_heads": self.num_heads,
            "num_dense_units": self.num_dense_units,
            "dropout_rate": self.dropout_rate,
        }

    def call(self, input, mask):
        """Pass ``input`` through every block in order, then normalise the result."""
        hidden = input
        for block in self.encoder_layers_list:
            hidden = block(hidden, mask)

        return self.layernorm(hidden)

In [None]:
class decoder_layer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, attention over the encoder
    output and a feed-forward network, each followed by dropout, a residual
    connection and layer normalisation.

    The same Dropout, Add and LayerNormalization instances are used after all
    three sub-layers.
    """

    def __init__(self, embedding_dim, num_heads, num_dense_units, dropout_rate, **kwargs):
        super().__init__(**kwargs)

        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.num_dense_units = num_dense_units
        self.dropout_rate = dropout_rate

        self.multiHeadAttn_self = multiHeadAttn_layer(self.num_heads, self.embedding_dim)
        self.multiHeadAttn_cross = multiHeadAttn_layer(self.num_heads, self.embedding_dim)

        self.feed_forward = feed_forward_network(self.embedding_dim, self.num_dense_units)

        self.dropout = tf.keras.layers.Dropout(self.dropout_rate)
        self.add = tf.keras.layers.Add()
        self.layernorm = tf.keras.layers.LayerNormalization()

    def get_config(self):
        """Return the layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "embedding_dim": self.embedding_dim,
            "num_heads": self.num_heads,
            "num_dense_units": self.num_dense_units,
            "dropout_rate": self.dropout_rate,
        }

    def _residual_norm(self, skip, sublayer_out):
        """Apply dropout to ``sublayer_out``, add ``skip`` and normalise."""
        return self.layernorm(self.add([skip, self.dropout(sublayer_out)]))

    def call(self, input, encoder_output, look_ahead_mask, pad_mask):
        """Run the block.

        ``look_ahead_mask`` is used for the self-attention over ``input``;
        ``pad_mask`` is used when attending over ``encoder_output``.
        """
        self_attended = self._residual_norm(
            input, self.multiHeadAttn_self(input, input, input, look_ahead_mask))

        cross_attended = self._residual_norm(
            self_attended,
            self.multiHeadAttn_cross(self_attended, encoder_output, encoder_output, pad_mask))

        return self._residual_norm(cross_attended, self.feed_forward(cross_attended))


In [None]:
class Decoder(tf.keras.layers.Layer):
    """Stack of ``num_decoding_layers`` decoder blocks followed by a final layer normalisation."""

    def __init__(self, num_decoding_layers, embedding_dim, num_heads, num_dense_units, dropout_rate, **kwargs):
        super().__init__(**kwargs)
        self.num_decoding_layers =  num_decoding_layers
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.num_dense_units = num_dense_units
        self.dropout_rate = dropout_rate

        self.decoder_layers_list = []
        for _ in range(self.num_decoding_layers):
            self.decoder_layers_list.append(
                decoder_layer(self.embedding_dim, self.num_heads, self.num_dense_units, self.dropout_rate)
            )
        self.layernorm = tf.keras.layers.LayerNormalization()

    def get_config(self):
        """Return the layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "num_decoding_layers": self.num_decoding_layers,
            "embedding_dim": self.embedding_dim,
            "num_heads": self.num_heads,
            "num_dense_units": self.num_dense_units,
            "dropout_rate": self.dropout_rate,
        }

    def call(self, input, encoder_output, look_ahead_mask, pad_mask):
        """Pass ``input`` through every block in order, then normalise the result."""
        hidden = input
        for block in self.decoder_layers_list:
            hidden = block(hidden, encoder_output, look_ahead_mask, pad_mask)

        return self.layernorm(hidden)

In [None]:
def Transformer(vocab_size, embedding_dim, num_layers, num_heads, num_dense_units, dropout_rate):
    """Build the encoder-decoder model as a Keras functional ``Model``.

    Inputs are two integer token-id sequences named "encoder_in" and
    "decoder_in"; the single output, named "outputs", holds one logit per
    vocabulary entry for every decoder position.

    Args:
        vocab_size: Size of the embedding tables and of the output layer.
        embedding_dim: Width of embeddings and of all encoder/decoder layers.
        num_layers: Number of blocks in both the encoder and the decoder.
        num_heads: Attention heads per block.
        num_dense_units: Hidden width of each feed-forward network.
        dropout_rate: Dropout rate used inside every block.
    """
    # token-id inputs of variable length
    enc_tokens = tf.keras.Input(shape=(None,), name="encoder_in")
    dec_tokens = tf.keras.Input(shape=(None,), name="decoder_in")

    # Both padding masks are derived from the encoder tokens: one is used for
    # encoder self-attention, the other when the decoder attends over the
    # encoder output. The look-ahead mask comes from the decoder tokens.
    encoder_mask = tf.keras.layers.Lambda(create_pad_mask, output_shape=(1, 1, None))(enc_tokens)
    cross_attn_mask = tf.keras.layers.Lambda(create_pad_mask, output_shape=(1, 1, None))(enc_tokens)
    causal_mask = tf.keras.layers.Lambda(create_look_ahead_mask, output_shape=(1, None, None))(dec_tokens)

    # separate embedding tables for the two sides
    enc_embedded = tf.keras.layers.Embedding(vocab_size, embedding_dim)(enc_tokens)
    dec_embedded = tf.keras.layers.Embedding(vocab_size, embedding_dim)(dec_tokens)

    # the positional layer returns (input + encoding, encoding); keep the sum
    enc_positioned = PositionalEncoding_layer(embedding_dim)(enc_embedded)[0]
    dec_positioned = PositionalEncoding_layer(embedding_dim)(dec_embedded)[0]

    stack_args = (num_layers, embedding_dim, num_heads, num_dense_units, dropout_rate)
    encoded = Encoder(*stack_args)(enc_positioned, encoder_mask)
    decoded = Decoder(*stack_args)(dec_positioned, encoded, causal_mask, cross_attn_mask)

    # project every decoder position onto the vocabulary
    logits = tf.keras.layers.Dense(vocab_size, name="outputs")(decoded)

    return tf.keras.Model(inputs=[enc_tokens, dec_tokens], outputs=logits)


In [None]:
def loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy on logits with padding positions zeroed.

    Per-token losses at positions whose label is 0 are multiplied by zero; the
    result is the mean over all positions, the zeroed ones included.
    """
    labels = tf.reshape(y_true, shape=(-1, MAX_SENTENCE_LEN - 1))

    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
    per_token_loss = cross_entropy(labels, y_pred)

    not_padding = tf.cast(tf.not_equal(labels, 0), tf.float32)
    return tf.reduce_mean(tf.multiply(per_token_loss, not_padding))

In [None]:
def accuracy(y_true, y_pred):
    """Per-token sparse categorical accuracy of ``y_pred`` against ``y_true``."""
    # labels are reshaped to (batch_size, MAX_SENTENCE_LEN - 1) before comparison
    labels = tf.reshape(y_true, shape=(-1, MAX_SENTENCE_LEN - 1))
    per_token_match = tf.keras.metrics.sparse_categorical_accuracy(labels, y_pred)
    return per_token_match

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule.

    ``lr(step) = embedding_dim**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)``,
    i.e. a linear increase during the first ``warmup_steps`` steps followed by
    an inverse-square-root decay.

    Args:
        embedding_dim: Model width used to scale the learning rate.
        warmup_steps: Number of steps over which the rate increases.
    """

    def __init__(self, embedding_dim, warmup_steps=3000):
        super(CustomSchedule, self).__init__()

        # Keep the plain value as well: get_config must return something
        # serialisable, which the tensor below is not.
        self._embedding_dim_value = embedding_dim
        self.embedding_dim = tf.constant(embedding_dim, dtype=tf.float32)
        self.warmup_steps = warmup_steps

    def get_config(self):
        """Return the constructor arguments, keyed by their parameter names.

        The keys match ``__init__`` so that ``CustomSchedule(**config)``
        rebuilds the schedule (the previous version read a non-existent
        ``d_model`` attribute and raised AttributeError).
        """
        return {"embedding_dim": self._embedding_dim_value, "warmup_steps": self.warmup_steps}

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)                    # decay term
        arg2 = step * (self.warmup_steps**-1.5)       # warm-up term

        lr = tf.math.multiply(tf.math.rsqrt(self.embedding_dim), tf.math.minimum(arg1, arg2))

        return lr

In [None]:
def prediction(input, query_sentences, tokenizer, start_token, end_token, model, output=None):
    """Greedily decode a reply for every row of ``input`` and print the dialogue.

    Args:
        input: 2-D array of encoder token ids, one row per query.
        query_sentences: Human-readable query for each row (only printed).
        tokenizer: Word tokenizer exposing an ``index_word`` id -> word mapping.
        start_token: Id fed to the decoder as its first token.
        end_token: Id that terminates a decoded reply.
        model: Model called as ``model.predict([encoder_ids, decoder_ids])``.
        output: Optional reference replies; when given, the mean sentence BLEU
            of the predictions against them is printed.

    Returns:
        None. Results are printed.
    """
    num_inputs = input.shape[0]
    if num_inputs == 0:
        # nothing to decode (and no BLEU scores to average)
        return

    bleu_list = []

    # every decoder sequence starts with just the start token
    prediction_tensor = tf.convert_to_tensor([[start_token]] * num_inputs, dtype=tf.int32)
    input = tf.convert_to_tensor(input, dtype=tf.int32)

    for _ in range(MAX_SENTENCE_LEN):
        model_out = model.predict([input, prediction_tensor], verbose=0)

        # most likely next token for every row
        last_words = model_out[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(last_words, axis=-1), tf.int32)

        # append it so that it becomes part of the next decoder input
        prediction_tensor = tf.concat([prediction_tensor, predicted_id], axis=1)

        # Tokens after a row's first end token are discarded below, so decoding
        # can stop once every row has produced one.
        row_finished = tf.reduce_any(tf.equal(prediction_tensor[:, 1:], end_token), axis=1)
        if bool(tf.reduce_all(row_finished)):
            break

    for idx, pred in enumerate(prediction_tensor.numpy()):
        pred_tokens = []
        # position 0 is the start token fed to the decoder, not a prediction
        for token_id in pred[1:]:
            if token_id == end_token:
                break
            # ids without a word (e.g. the padding id 0) are skipped instead of
            # raising KeyError
            word = tokenizer.index_word.get(int(token_id))
            if word is not None:
                pred_tokens.append(word)
        pred_sentence = " ".join(pred_tokens)

        print(f"User: {query_sentences[idx]}")
        print(f"Chatbot: {pred_sentence}")

        if output is not None:
            bleu_list.append(nltk.translate.bleu_score.sentence_bleu([output[idx].split(" ")], pred_sentence.split(" ")))

    if output is not None and bleu_list:
        print(sum(bleu_list)/len(bleu_list))


In [None]:
def preprocess_testdata(sentence_list, tokenizer, start_token=None, end_token=None):
    """Convert free-text sentences into padded id sequences for the model.

    When ``start_token`` is given, every sentence is cleaned with
    ``clean_text`` and wrapped as ``"<start> sentence <end>"``, the same
    format ``preprocess`` produces for training sentences. (Previously the
    wrapped string was assigned to the loop variable only, so the markers never
    reached the tokenizer.) When ``start_token`` is None the sentences are
    tokenized exactly as passed in.

    Args:
        sentence_list: List of sentences; it is not modified.
        tokenizer: Fitted word tokenizer providing ``texts_to_sequences``.
        start_token: Start marker string, or None to skip cleaning/wrapping.
        end_token: End marker string; required when ``start_token`` is given.

    Returns:
        Array of token ids, post-padded to MAX_SENTENCE_LEN.
    """
    if start_token is not None:
        prepared = [
            start_token + " " + clean_text(sentence) + " " + end_token
            for sentence in sentence_list
        ]
    else:
        prepared = sentence_list

    test_tokens = tokenizer.texts_to_sequences(prepared)
    pad_tokens = pad_sequences(test_tokens, padding="post", maxlen=MAX_SENTENCE_LEN)
    return pad_tokens


In [None]:
class TrainingStopCallback(tf.keras.callbacks.Callback):
    """Saves a checkpoint every 100 epochs and stops training above 70% accuracy."""

    def on_epoch_end(self, epoch, logs=None):
        """Checkpoint and/or stop training based on the epoch's results.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Metric values for the epoch; may be None.
        """
        # avoid a shared mutable default and tolerate a missing logs dict
        logs = logs or {}

        if epoch%100 == 0:
            # self.model is the model being trained; do not depend on a global name
            tf.keras.models.save_model(self.model, filepath=f"/kaggle/working/chatbot_{epoch}.h5", include_optimizer=False)

        # the metric is absent if the model was compiled without it
        current_accuracy = logs.get("accuracy")
        if current_accuracy is not None and current_accuracy > 0.7:
            print("\n70% accuracy reached, training stopped!\n")
            self.model.stop_training = True


In [None]:
# Script entry point: configuration, TPU setup, data loading and tokenization.
# The names assigned here are used by the cells that follow.
if __name__ == "__main__":

    # Data preprocessing constants
    SPLIT_RATIO = 0.9
    START_TOKEN, END_TOKEN = "START", "END"
    OOV_TOKEN = "OOV"


    # Transformer constants
    EMBEDDING_DIM = 128
    NUM_LAYERS = 4
    NUM_HEADS = 8
    UNITS = 512
    DROPOUT = 0.1
    EPOCHS = 250
    NUM_WORDS = None        # None -> the tokenizer vocabulary is not capped
    TRAINING_SIZE = None    # None -> train on every available pair

    # Connect to the TPU and build the distribution strategy used for training.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

    # NOTE(review): newer TensorFlow releases expose this class as
    # tf.distribute.TPUStrategy - confirm against the installed version.
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Global batch size: 512 examples per replica.
    BATCH_SIZE = 512 * tpu_strategy.num_replicas_in_sync

    # Read both corpus files and split them into lines
    with open('/kaggle/input/cornell-moviedialog-corpus/movie_lines.txt', encoding='utf-8', errors='ignore') as f:
        movie_lines = f.read().split('\n')

    with open('/kaggle/input/cornell-moviedialog-corpus/movie_conversations.txt', encoding='utf-8', errors='ignore') as f:
        movie_convs = f.read().split('\n')

    # Preprocess and tokenize the data
    (train_sentences, train_sentences_outputs), (test_sentences, test_outputs) = preprocess(movie_lines, movie_convs, SPLIT_RATIO, START_TOKEN, END_TOKEN, subword=False)

    # Note: test_outputs is rebound here from sentences to padded id arrays.
    (train_inputs, train_outputs), (test_inputs, test_outputs), tokenizer = tokenize(train_sentences,\
                                                                                     train_sentences_outputs, test_sentences, test_outputs, OOV_TOKEN, NUM_WORDS)

    # Optionally train on a prefix of the data only
    if TRAINING_SIZE is not None:
        train_inputs, train_outputs = train_inputs[:TRAINING_SIZE, :], train_outputs[:TRAINING_SIZE, :]

    # Convert the data to a tf.data.Dataset
    dataset = train_dataset(train_inputs, train_outputs, BATCH_SIZE)

    # Clear the Keras backend state before the model is built
    tf.keras.backend.clear_session()

    # One extra id is added on top of the tokenizer's word_index when the
    # vocabulary is not capped.
    if NUM_WORDS is None:
        vocab_size = len(tokenizer.word_index) + 1

    else:
        vocab_size = NUM_WORDS    # for the word tokenizer
        #vocab_size = NUM_WORDS + 2    # for the subword tokenizer



INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local
INFO:tensorflow:Finished initializing TPU system.




INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


In [None]:

# Initialize and compile the model. The learning-rate schedule and optimizer
# are created inside the strategy scope together with the model, so that every
# variable they own is created under the TPU strategy.
with tpu_strategy.scope():
    learning_rate = CustomSchedule(EMBEDDING_DIM)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    model = Transformer(vocab_size, EMBEDDING_DIM, NUM_LAYERS, NUM_HEADS, UNITS, DROPOUT)

    model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])


In [None]:
    # Train for at most EPOCHS epochs; the callback may end training earlier.
    stop_callback = TrainingStopCallback()
    model.fit(dataset, epochs=EPOCHS, verbose=1, callbacks=[stop_callback])

Epoch 1/250


2023-08-10 22:22:12.093102: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2023-08-10 22:22:12.877511: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 7

<keras.callbacks.History at 0x7d86801de160>

In [None]:
    # Sanity check on *training* examples: drop the first and last word (the
    # start/end markers) of every sentence so queries and references print cleanly.
    test_sentences_list = [" ".join(sent.split(" ")[1:-1]) for sent in train_sentences]
    gt_list = [" ".join(sent.split(" ")[1:-1]) for sent in train_sentences_outputs]

    # Decode the first 50 training inputs and report BLEU against their references.
    prediction(
        train_inputs[:50,:],
        test_sentences_list[:50],
        tokenizer,
        tokenizer.word_index[START_TOKEN],
        tokenizer.word_index[END_TOKEN],
        model,
        gt_list[:50],
    )

User: i am here .
Chatbot: you are not going to be late
User: are you okay ?
Chatbot: i am not sure i am just
User: i do not know , i just got i got very dizzy . . . i feel dizzy , max .
Chatbot: well i am glad to see you
User: why yes , thank you .
Chatbot: i am not hungry
User: hand it over . basic plot ?
Chatbot: i am not quite reacting to what i am council
User: see , i told you !
Chatbot: i am not sure i am not
User: we are going over to her place to make salad and pasta . just , you know , nothing special .
Chatbot: okay okay
User: him . . . ?
Chatbot: yes
User: you almost married recently , did not you ?
Chatbot: i am feeling very happy since i was twelve
User: i beat the odds !
Chatbot: you gonna shoot him
User: you are a man of the world , fettes , you would not hold me to promise given in drink .
Chatbot: i would not speak to you mrs lampert
User: she is got a hidden pitbull . maybe she hired someone to kill them .
Chatbot: she is not getting any minds she is not her fault
Us

In [None]:
    # Try the trained model on a few hand-written sentences (no references, so
    # no BLEU score is printed).
    test_input_sentences = ["Hello, what's up?", "What is your plan?", "Are you going to the gym now?", "When is the book due", "What's the point?", "I'm visiting my parents tomorrow", "It'll be windy next week"]
    # NOTE: this rebinds test_inputs, which previously held the padded held-out
    # split returned by tokenize().
    test_inputs = preprocess_testdata(test_input_sentences, tokenizer, START_TOKEN, END_TOKEN)
    prediction(test_inputs, test_input_sentences, tokenizer, tokenizer.word_index[START_TOKEN], tokenizer.word_index[END_TOKEN], model)

User: Hello, what's up?
Chatbot: i am not sure i am sorry
User: What is your plan?
Chatbot: i am not going to let this team get em
User: Are you going to the gym now?
Chatbot: right
User: When is the book due
Chatbot: uh uh we are going to seal it
User: What's the point?
Chatbot: i am not sure i am not going to be a good night
User: I'm visiting my parents tomorrow
Chatbot: immediately
User: It'll be windy next week
Chatbot: i know focusing
