In [1]:
import sys
# Put the directory one level above this notebook on the import path so the
# `src.*` packages used below resolve. Guarded so that re-running the cell
# does not append the same entry again.
if "../" not in sys.path:
    sys.path.append("../")

In [2]:
import os 
import tensorflow as tf

from src.data.TextToToken.MyTfTokenizer import MyTfToken
from src.data.DataObjects.TransformerTextDataObject import TransformerTextDataObject
from src.data.DataLoaders import get_webscrape_data

from src.models.Transformer.Transformer import Transformer

from src.models.LossAndMetrics import masked_loss, masked_accuracy, CustomSchedule
from src.models.Callbacks.callbacks import csv_callback, checkpoint_callback, OutputTextCallback
from src.models.TextGenerators.StandardTransformerGenerator import StandardTransformerGenerator



In [3]:

# Project layout: the project root is resolved from "../" relative to the
# current working directory, and the data folder sits beneath it.
project_directory = os.path.abspath("../")
path_to_data_folder = os.path.join(project_directory, "data/processed/webdata")

# Two separate tokenizers (context side first, then content side), each
# constructed with use_bookmark=True.
context_token, content_token = (MyTfToken(use_bookmark=True) for _ in range(2))

In [4]:
# --- Run identity -----------------------------------------------------------
model_name = "TestS2S"

# --- Data pipeline ----------------------------------------------------------
sequence_length = 62   # used as both context_len and content_len of the dataset
buffer_size = 10000    # forwarded to batch_and_shuffle
batch_size = 64        # forwarded to batch_and_shuffle

# --- Model dimensions -------------------------------------------------------
embedding_dimension = 128   # embedding width handed to the models
dense_dimension = 256       # handed to the models as the LSTM hidden size
num_att_layers = 1
num_heads = 2
dropout_rate = 0.1

In [5]:
# Build the data object from the processed data folder (loaded through
# get_webscrape_data); context and content sequences share one length.
my_data_set = TransformerTextDataObject(
    context_sequencer=context_token,
    content_sequencer=content_token,
    context_len=sequence_length,
    content_len=sequence_length,
    data_loader=get_webscrape_data,
    data_path=path_to_data_folder,
)

# Vocabulary sizes as reported by the data object:
# "eng" is the context side, "shake" is the content side.
vocab_size_eng = my_data_set.context_vocab
vocab_size_shake = my_data_set.content_vocab

In [6]:
# Tiny hand-written token batches: one sequence of three ids on each side.
example_context = [list(range(1, 4))]
example_content = [list(range(5, 8))]

In [7]:
# Training input produced by the data object's batch_and_shuffle helper.
training_dataset = my_data_set.batch_and_shuffle(
    buffer_size=buffer_size,
    batch_size=batch_size,
)

In [8]:
import tensorflow as tf

In [9]:

def masked_loss(label, pred):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Positions whose label equals 0 are treated as padding and excluded from
    the average.

    Args:
        label: Integer class ids; 0 marks a padded position.
        pred: Unnormalised scores (logits) over the classes for each position.

    Returns:
        Scalar tensor: the summed per-position loss divided by the number of
        non-padding positions, or 0 when the batch has no such positions.
    """
    mask = label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_object(label, pred)

    # Zero out the contribution of padded positions.
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask

    # divide_no_nan yields 0 (not NaN) when every position is padding, so a
    # single all-padding batch cannot poison the weights with NaN gradients.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Define the model architecture
def seq2seq_model(input_vocab_size, output_vocab_size, embedding_dim, hidden_units):
    """Build a single-layer LSTM encoder-decoder as a functional Keras model.

    The encoder's final hidden and cell states are used as the initial state
    of the decoder LSTM. The model emits raw logits (no softmax), so it must
    be trained with a loss configured with ``from_logits=True``.

    Args:
        input_vocab_size: Number of rows in the encoder embedding table.
        output_vocab_size: Number of rows in the decoder embedding table and
            width of the output projection.
        embedding_dim: Width of both embedding tables.
        hidden_units: Number of units in the encoder and decoder LSTMs.

    Returns:
        A ``Model`` taking ``[encoder_inputs, decoder_inputs]`` and returning
        one ``output_vocab_size``-wide logit vector per decoder step.
    """
    # Encoder: only the final (h, c) states are kept; the output is unused.
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
    encoder_lstm = LSTM(hidden_units, return_state=True)
    _, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder: full output sequence, seeded with the encoder states.
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(output_vocab_size, embedding_dim)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding(decoder_inputs), initial_state=encoder_states)

    # Linear projection only. The loss used in this notebook applies the
    # softmax itself (from_logits=True); a softmax activation here would be
    # applied twice and flatten the gradients.
    decoder_dense = Dense(output_vocab_size)
    decoder_outputs = decoder_dense(decoder_outputs)

    # Model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return model

# Instantiate the functional model. Each vocabulary size is bumped by one
# (presumably to leave room for the padding id 0 — confirm against the tokenizer).
model = seq2seq_model(
    input_vocab_size=vocab_size_eng + 1,
    output_vocab_size=vocab_size_shake + 1,
    embedding_dim=embedding_dimension,
    hidden_units=dense_dimension,
)

# Compile with Adam and the padding-aware loss.
model.compile(
    loss=masked_loss,
    optimizer='adam',
    metrics=['accuracy'],
)

# Show the layer / parameter table.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 128)            1166720   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 128)            1464320   ['input_2[0][0]']             
                                                                                              

In [11]:
from src.models.TextGenerators.TextGenerator import TextGenerator
from src.data.TextToToken.TextToToken import TextToToken
from src.models.Transformer.Transformer import Transformer
import tensorflow as tf
import numpy as np


class StandardTransformerGenerator2(TextGenerator):
    """Greedy text generator for a two-input (context, content) sequence model.

    The prompt is tokenised once into a fixed-length context vector. Output
    tokens are then produced one at a time: the model is re-run on the content
    generated so far and the arg-max at the last filled position is appended.
    """

    def __init__(self, input_str: str, source_model: Transformer,
                 context_sequencer: TextToToken, content_sequencer: TextToToken,
                 output_len: int, initializer=None):
        """Store the configuration and pre-compute the context vector.

        Args:
            input_str: Prompt text fed to the context side of the model.
            source_model: Callable model taking ``[context, content]``.
            context_sequencer: Tokenizer for the prompt.
            content_sequencer: Tokenizer for the generated text.
            output_len: Maximum number of decoding steps.
            initializer: Optional text used to seed the content sequence;
                when None the content starts from the first bookmark token.
        """
        self.source_model = source_model
        # Window sizes are hard-coded to 62 rather than read from the model
        # (the attribute lookups are kept below as comments).
        self.context_length = 62#source_model.context_length
        self.content_length = 62#source_model.content_length
        self.context_sequencer = context_sequencer
        self.content_sequencer = content_sequencer
        self.input_str = input_str
        self.initializer = initializer
        self.output_len = output_len
        self.context_vector = self.create_context_vector()

    def generate_output(self):
        """Greedily decode up to ``output_len`` tokens and return them as text.

        Decoding stops early when the second bookmark token is predicted, or
        when the content already fills the ``content_length`` window.

        Returns:
            The detokenised content sequence (seed tokens included) joined
            into a single string.
        """
        bookmark_tokens = self.content_sequencer.get_bookmark_tokens()
        if self.initializer is None:
            content_vector = [[bookmark_tokens[0]]]
        else:
            content_vector = self.content_sequencer.tokenise([self.initializer])

        for _ in range(self.output_len):
            current_len = len(content_vector[0])
            # The padded window has only content_length positions; stop
            # instead of indexing past the end of the model output.
            if current_len > self.content_length:
                break

            content_padded = tf.keras.preprocessing.sequence.pad_sequences(
                content_vector, maxlen=self.content_length, padding='post')
            model_output = self.source_model([self.context_vector, content_padded])

            # The next-token prediction sits at the last *filled* position.
            # Deriving it from the current length (rather than counting up
            # from 0) keeps it correct when an initializer seeds several tokens.
            token_index = max(current_len - 1, 0)
            found_token = int(tf.argmax(model_output[0][token_index]))

            if found_token == bookmark_tokens[1]:
                print(f"\nTermination Reached for: {self.input_str[:15]}")
                break
            content_vector = np.hstack((content_vector, [[found_token]]))

        return ''.join(self.content_sequencer.detokenise(content_vector))

    def create_context_vector(self):
        """Tokenise the prompt and pad it to ``context_length``.

        When the context sequencer reports bookmarks as enabled, the first and
        second bookmark tokens are wrapped around the token sequence before
        padding.

        Returns:
            A NumPy array of post-padded token ids, one row per input string.
        """
        token_seq = self.context_sequencer.tokenise([self.input_str])
        token_vals = self.context_sequencer.get_bookmark_tokens()
        if self.context_sequencer.bookmark_status():
            token_seq = [[token_vals[0]] + seq + [token_vals[1]] for seq in token_seq]
        padded_seq = tf.keras.preprocessing.sequence.pad_sequences(
            token_seq, maxlen=self.context_length, padding='post')
        return np.array(padded_seq)


In [12]:
from src.models.Seq2Seq.Seq2Seq import Seq2SeqModel

In [14]:
# Instantiate Seq2SeqModel with one layer; vocabulary sizes are bumped by one.
model = Seq2SeqModel(
    context_vocab=vocab_size_eng + 1,
    content_vocab=vocab_size_shake + 1,
    embedding_dimension=embedding_dimension,
    dense_dimension=dense_dimension,
    num_layers=1,
)

In [15]:
# Generator that samples text from `model` for one fixed prompt, reusing the
# tokenizers held by the data object.
tester = StandardTransformerGenerator2(
    input_str="this is my brother",
    source_model=model,
    context_sequencer=my_data_set.context_sequencer,
    content_sequencer=my_data_set.content_sequencer,
    output_len=sequence_length,
)


In [16]:
from src.models.Callbacks.callbacks import csv_callback, checkpoint_callback, OutputTextCallback

In [17]:
# Callback built from the sample generator, the project root and the run name.
# NOTE(review): presumably emits the "generated text:" lines seen in the
# training log once per epoch — confirm in OutputTextCallback.
output_callback = OutputTextCallback(tester, project_directory, model_name)

In [23]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Layer

In [29]:
class EncoderLayer(Layer):
    """Token embedding followed by a stack of LSTM layers.

    Each LSTM returns its full output sequence plus its final hidden and cell
    states; the sequence feeds the next LSTM and the states are collected.
    """

    def __init__(self, hidden_units, num_layers,embedding_dim,input_vocab_size, **kwargs):
        super().__init__(**kwargs)
        self.embedding = Embedding(input_vocab_size, embedding_dim)

        lstm_stack = []
        for _ in range(num_layers):
            lstm_stack.append(LSTM(hidden_units, return_sequences=True, return_state=True))
        self.encoder_layers = lstm_stack

    def call(self, inputs):
        """Return the last LSTM's output sequence and a flat list of states.

        The state list is ordered ``[h_1, c_1, h_2, c_2, ...]``, one pair per
        LSTM in stacking order.
        """
        sequence = self.embedding(inputs)
        collected_states = []
        for lstm in self.encoder_layers:
            sequence, hidden, cell = lstm(sequence)
            collected_states.extend((hidden, cell))
        return sequence, collected_states

In [30]:
class DecoderLayer(Layer):
    """Token embedding, a stack of LSTM layers and a linear vocabulary projection.

    The layer returns raw logits (no softmax), so it must be trained with a
    loss configured with ``from_logits=True``.
    """

    def __init__(self, hidden_units, num_layers, output_vocab_size,embedding_dim, **kwargs):
        """Create the embedding, ``num_layers`` LSTMs and the output projection.

        Args:
            hidden_units: Number of units in every LSTM.
            num_layers: Number of stacked LSTM layers.
            output_vocab_size: Rows of the embedding table and width of the
                output projection.
            embedding_dim: Width of the embedding table.
            **kwargs: Forwarded to the Keras ``Layer`` constructor.
        """
        super(DecoderLayer, self).__init__(**kwargs)
        self.embedding = Embedding(output_vocab_size, embedding_dim)
        self.decoder_layers = [LSTM(hidden_units, return_sequences=True, return_state=True) for _ in range(num_layers)]
        # Linear projection only. The loss used in this notebook applies the
        # softmax itself (from_logits=True); a softmax activation here would
        # be applied twice and flatten the gradients.
        self.dense = Dense(output_vocab_size)

    def call(self, inputs, encoder_states):
        """Decode ``inputs`` starting from the encoder's final states.

        Args:
            inputs: Decoder token ids.
            encoder_states: Flat list of encoder states; only the last two
                entries are used, and they seed every decoder LSTM.

        Returns:
            One ``output_vocab_size``-wide logit vector per decoder step.
        """
        x = self.embedding(inputs)
        for layer in self.decoder_layers:
            x, _, _ = layer(x, initial_state=encoder_states[-2:])
        outputs = self.dense(x)
        return outputs

In [31]:
class Seq2SeqModel(tf.keras.Model):
    """Encoder-decoder model wiring an EncoderLayer into a DecoderLayer."""

    def __init__(self, input_vocab_size, output_vocab_size, embedding_dim, hidden_units, num_layers, **kwargs):
        super().__init__(**kwargs)
        self.encoder = EncoderLayer(
            hidden_units=hidden_units,
            num_layers=num_layers,
            embedding_dim=embedding_dim,
            input_vocab_size=input_vocab_size,
        )
        self.decoder = DecoderLayer(
            hidden_units=hidden_units,
            num_layers=num_layers,
            output_vocab_size=output_vocab_size,
            embedding_dim=embedding_dim,
        )

    def call(self, inputs):
        """Run the encoder on the first input and the decoder on the second.

        ``inputs`` is a pair ``(encoder_inputs, decoder_inputs)``; the encoder
        output sequence is discarded and only its states reach the decoder.
        """
        context_tokens, content_tokens = inputs
        _, encoder_states = self.encoder(context_tokens)
        return self.decoder(content_tokens, encoder_states)

In [32]:
# Instantiate the Seq2SeqModel class defined in this notebook with one LSTM
# layer on each side; vocabulary sizes are bumped by one.
model = Seq2SeqModel(
    input_vocab_size=vocab_size_eng + 1,
    output_vocab_size=vocab_size_shake + 1,
    embedding_dim=embedding_dimension,
    hidden_units=dense_dimension,
    num_layers=1,
)

In [20]:

def masked_loss(label, pred):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Positions whose label equals 0 are treated as padding and excluded from
    the average.

    Args:
        label: Integer class ids; 0 marks a padded position.
        pred: Unnormalised scores (logits) over the classes for each position.

    Returns:
        Scalar tensor: the summed per-position loss divided by the number of
        non-padding positions, or 0 when the batch has no such positions.
    """
    mask = label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_object(label, pred)

    # Zero out the contribution of padded positions.
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask

    # divide_no_nan yields 0 (not NaN) when every position is padding, so a
    # single all-padding batch cannot poison the weights with NaN gradients.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

In [21]:
# Compile with Adam and the padding-aware loss; track plain accuracy.
model.compile(
    loss=masked_loss,
    optimizer='adam',
    metrics=['accuracy'],
)

In [22]:
# Train for ten epochs with the text-sampling callback attached.
model.fit(
    training_dataset,
    epochs=10,
    callbacks=[output_callback],
)

Epoch 1/10


Termination Reached for: this is my brot

generated text:   you you you you you you the lord 

Epoch 2/10
Termination Reached for: this is my brot

generated text:   you you you you you 

Epoch 3/10
Termination Reached for: this is my brot

generated text:   you you you you you 

Epoch 4/10
 2/78 [..............................] - ETA: 1:59 - loss: 5.8473 - accuracy: 0.0326

KeyboardInterrupt: 

In [14]:
# Run the generator once and print the decoded text.
generated_text = tester.generate_output()
print(generated_text)


Termination Reached for: this is my brot
 
