<a href="https://colab.research.google.com/github/OlatundeEso/Resume/blob/main/My_Simple_Text_Generator_Model_23122022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, LayerNormalization
from tensorflow.keras.utils import to_categorical

# Corpus for training
corpus = [
    "This is a sample sentence.",
    "Another example for self-supervised learning.",
    "Medical science is a great course of study.",
    "NLP tasks can benefit from self-supervised methods."
]

# Tokenize the corpus into words
tokenized_corpus = [tf.keras.preprocessing.text.text_to_word_sequence(sentence) for sentence in corpus]

# Derive input and target sequences
input_sequences = []
target_sequences = []

for sentence_tokens in tokenized_corpus:
    for i in range(len(sentence_tokens) - 1):
        input_sequences.append(sentence_tokens[:i + 1])
        target_sequences.append(sentence_tokens[i + 1])

# Flatten the list of input sequences
flat_input_sequences = [' '.join(seq) for seq in input_sequences]

# Tokenize the input sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(flat_input_sequences)
total_words = len(tokenizer.word_index) + 1

# Convert input and target sequences to numerical sequences
input_sequences_numeric = tokenizer.texts_to_sequences(flat_input_sequences)
target_sequences_numeric = tokenizer.texts_to_sequences(target_sequences)

# Pad sequences for a consistent input size
max_sequence_length = max([len(seq) for seq in input_sequences_numeric])
padded_input_sequences = pad_sequences(input_sequences_numeric, maxlen=max_sequence_length, padding='pre')
padded_target_sequences = pad_sequences(target_sequences_numeric, maxlen=max_sequence_length, padding='pre')
padded_target_sequences_one_hot = to_categorical(padded_target_sequences, num_classes=total_words)

# Build the Transformer model using MultiHeadAttention layer
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=50, input_length=max_sequence_length))
model.add(TransformerBlock(embed_dim=50, num_heads=4, ff_dim=100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(padded_input_sequences, padded_target_sequences, epochs=100, verbose=2)

def generate_text(model, tokenizer, max_sequence_length, seed_text, next_words=1):
    generated_text = seed_text
    for _ in range(next_words):
        # Tokenize the input text
        seed_text_tokens = tf.keras.preprocessing.text.text_to_word_sequence(seed_text)
        # Convert input sequence to numerical sequence
        seed_sequence_numeric = tokenizer.texts_to_sequences([' '.join(seed_text_tokens)])[0]
        # Pad the input sequence
        padded_seed_sequence = pad_sequences([seed_sequence_numeric], maxlen=max_sequence_length, padding='pre')
        # Use the model to predict the next token probabilities
        predicted_probs = model.predict(padded_seed_sequence, verbose=0)[0]
        # Select the index with the highest probability
        predicted_index = np.random.choice(len(predicted_probs))
        # Decode the predicted token to obtain the word
        predicted_word = tokenizer.index_word.get(predicted_index, '')
        # Update the seed text for the next iteration
        seed_text += ' ' + predicted_word
        generated_text += ' ' + predicted_word

    return generated_text

# Example usage with temperature adjustment
seed_text = "Great"
generated_text = generate_text(model, tokenizer, max_sequence_length, seed_text, next_words=10)
print("Generated Text:", generated_text)


Epoch 1/100
1/1 - 3s - loss: 4.6542 - accuracy: 0.0186 - 3s/epoch - 3s/step
Epoch 2/100
1/1 - 0s - loss: 3.0062 - accuracy: 0.1242 - 26ms/epoch - 26ms/step
Epoch 3/100
1/1 - 0s - loss: 1.7746 - accuracy: 0.6832 - 23ms/epoch - 23ms/step
Epoch 4/100
1/1 - 0s - loss: 1.0816 - accuracy: 0.8696 - 23ms/epoch - 23ms/step
Epoch 5/100
1/1 - 0s - loss: 0.8767 - accuracy: 0.8820 - 26ms/epoch - 26ms/step
Epoch 6/100
1/1 - 0s - loss: 0.7737 - accuracy: 0.8820 - 21ms/epoch - 21ms/step
Epoch 7/100
1/1 - 0s - loss: 0.7659 - accuracy: 0.8820 - 24ms/epoch - 24ms/step
Epoch 8/100
1/1 - 0s - loss: 0.7628 - accuracy: 0.8820 - 31ms/epoch - 31ms/step
Epoch 9/100
1/1 - 0s - loss: 0.7330 - accuracy: 0.8820 - 23ms/epoch - 23ms/step
Epoch 10/100
1/1 - 0s - loss: 0.7576 - accuracy: 0.8820 - 25ms/epoch - 25ms/step
Epoch 11/100
1/1 - 0s - loss: 0.7469 - accuracy: 0.8820 - 23ms/epoch - 23ms/step
Epoch 12/100
1/1 - 0s - loss: 0.7183 - accuracy: 0.8820 - 28ms/epoch - 28ms/step
Epoch 13/100
1/1 - 0s - loss: 0.7365 - ac