In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the Excel file
df = pd.read_excel("C:\\Users\\prana\\Downloads\\Nouns_new_6.xlsx")

# Prepare input and target texts
input_texts = df['Word'].astype(str).tolist()
target_texts = df['Base Word'].astype(str).tolist()

# Extract suffixes (exclude the first character)
input_suffixes = [word[1:] for word in input_texts]
target_suffixes = [base_word[1:] for base_word in target_texts]

# Combine all characters from input and target suffixes to create a tokenizer
all_texts = input_suffixes + target_suffixes

# Character-level tokenization
tokenizer = Tokenizer(char_level=True, filters='')
tokenizer.fit_on_texts(all_texts)

# Convert texts to sequences of integers
input_sequences = tokenizer.texts_to_sequences(input_suffixes)
target_sequences = tokenizer.texts_to_sequences(target_suffixes)

# Determine the maximum sequence length for padding
max_suffix_length = max(len(seq) for seq in input_sequences)

# Pad sequences to ensure uniform length
encoder_input_data = pad_sequences(input_sequences, maxlen=max_suffix_length, padding='post')
decoder_input_data = pad_sequences(target_sequences, maxlen=max_suffix_length, padding='post')

# Prepare the target data, shifted by one timestep
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]
decoder_target_data[:, -1] = 0  # Padding for the last timestep

# Get the vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Print some statistics
print(f'Max suffix length: {max_suffix_length}')
print(f'Vocabulary size: {vocab_size}')


Max suffix length: 24
Vocabulary size: 39


In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, LayerNormalization, Dropout, MultiHeadAttention
from tensorflow.keras.models import Model

# Define hyperparameters
embedding_dim = 64
num_heads = 4
ff_dim = 128  # Feed forward network dimension
num_layers = 4

# Define the transformer block
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Positional Encoding
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, maxlen, embed_dim):
        super(PositionalEncoding, self).__init__()
        self.pos_emb = self.add_weight("pos_emb", shape=[maxlen, embed_dim])

    def call(self, x):
        return x + self.pos_emb

# Define the model
def build_transformer_model(vocab_size, maxlen, embed_dim, num_heads, ff_dim, num_layers):
    inputs = Input(shape=(maxlen,))
    x = Embedding(vocab_size, embed_dim)(inputs)
    x = PositionalEncoding(maxlen, embed_dim)(x)

    for _ in range(num_layers):
        x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)

    outputs = Dense(vocab_size, activation="softmax")(x)
    return Model(inputs, outputs)

# Build and compile the model
transformer_model = build_transformer_model(vocab_size, max_suffix_length, embedding_dim, num_heads, ff_dim, num_layers)
transformer_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
transformer_model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 24)]              0         
                                                                 
 embedding (Embedding)       (None, 24, 64)            2496      
                                                                 
 positional_encoding (Posit  (None, 24, 64)            1536      
 ionalEncoding)                                                  
                                                                 
 transformer_block (Transfo  (None, 24, 64)            83200     
 rmerBlock)                                                      
                                                                 
 transformer_block_1 (Trans  (None, 24, 64)            83200     
 formerBlock)                                                    
                                                             

In [3]:
# Train the model
transformer_model.fit(encoder_input_data, decoder_target_data,
                      batch_size=64, epochs=50, validation_split=0.2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x268c03a9890>

: 

In [12]:
def decode_sequence(input_seq, first_char):
    # Predict the output sequence (suffix)
    output_seq = transformer_model.predict(input_seq)
    # Decode the sequence into characters
    decoded_suffix = ''.join([tokenizer.index_word.get(np.argmax(char_prob), '') for char_prob in output_seq[0]])
    # Reconstruct the full base word by prepending the first character
    decoded_word = first_char + decoded_suffix.strip()
    return decoded_word

# Test with a new word
test_input = "krishnaya"  # Replace with any new word
first_char = test_input[0]
test_input_suffix = test_input[1:]
test_input_seq = pad_sequences(tokenizer.texts_to_sequences([test_input_suffix]), maxlen=max_suffix_length, padding='post')

# Predict the base word
predicted_base_word = decode_sequence(test_input_seq, first_char)
print(f'Input: {test_input}')
print(f'Predicted Base Word: {predicted_base_word}')


Input: pranavaya
Predicted Base Word: paṣn
