<a href="https://colab.research.google.com/github/Srivardhan2004/nlp/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Example text data (you can replace this with any larger corpus) text = """ Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods. One day, her mother asked her to take a basket of goodies to her grandmother. On her way through the woods, she met a big bad wolf who wanted to eat her. [CO5]
(i) Build the Transformer Model on above dataset
(ii) Train the model using 20, 60, 70 epochs
(iii) After training, use the model to generate new text by feeding it an initial seed text
(iv) Experimenting and Improving the Model by large dataset and hyper tune parameter.

In [12]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, LayerNormalization, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Sample text data
text = """Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother,
who lived in the woods. One day, her mother asked her to take a basket of goodies to her grandmother.
On her way through the woods, she met a big bad wolf who wanted to eat her."""

# Tokenize and pad data
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
sequences = tokenizer.texts_to_sequences([text])
input_data = pad_sequences(sequences, padding='post')

# Transformer block
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Transformer model
def create_transformer_model(vocab_size, embed_dim, num_heads, ff_dim):
    inputs = tf.keras.Input(shape=(None,))
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    # Pass training=False to the call method
    x = transformer_block(x, training=False)
    outputs = Dense(vocab_size, activation="softmax")(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

vocab_size = len(tokenizer.word_index) + 1
embed_dim = 64
num_heads = 2
ff_dim = 128
model = create_transformer_model(vocab_size, embed_dim, num_heads, ff_dim)

# Compile and train model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
epochs_list = [20, 60, 70]
for epochs in epochs_list:
    model.fit(input_data, input_data, epochs=epochs)

# Text generation
def generate_text(seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=input_data.shape[1], padding='post')
        predicted = model.predict(token_list, verbose=0)
        predicted_id = predicted.argmax()
        # Check if predicted_id is within the vocabulary
        if predicted_id in tokenizer.index_word:
            next_word = tokenizer.index_word[predicted_id]
            seed_text += " " + next_word
        else:
            # Handle out-of-vocabulary words
            seed_text += " <UNK>"  # Replace with a suitable token
    return seed_text

seed_text = "Once upon a time"
generated_text = generate_text(seed_text, next_words=20)
print(generated_text)


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0357 - loss: 4.4802
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.0536 - loss: 3.8564
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1250 - loss: 3.3033
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.3036 - loss: 2.8733
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.4107 - loss: 2.4705
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.4643 - loss: 2.1571
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.5893 - loss: 1.8985
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.6429 - loss: 1.6271
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m