<a href="https://colab.research.google.com/github/ShahistaAfreen/DL_DA6401_A3/blob/main/Sweep_with_attention_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and import packages

In [1]:
!pip install wandb
!pip install xtarfile

Collecting xtarfile
  Downloading xtarfile-0.2.1.tar.gz (7.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: xtarfile
  Building wheel for xtarfile (setup.py) ... [?25l[?25hdone
  Created wheel for xtarfile: filename=xtarfile-0.2.1-py3-none-any.whl size=8056 sha256=750e6e14450bf0cc6cf8cb792a2899abdda8477392af016b7130066039c7ce1b
  Stored in directory: /root/.cache/pip/wheels/73/de/d6/def0eebfc3d5adb3a866d1bd9ae45649e07e6cffb284314a00
Successfully built xtarfile
Installing collected packages: xtarfile
Successfully installed xtarfile-0.2.1


In [2]:
# Start/end markers wrapped around every target word by preprocess_data.
# The training, validation and translate code looks up "\t" (start) and
# "\n" (end) in the target tokenizer, so these must stay tab and newline.
START_TOKEN="\t"
END_TOKEN="\n"

In [3]:
import os
import random
import time
import re
import string
import tarfile
from os.path import exists
from collections import Counter
from tqdm import tqdm

# Data manipulation libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.font_manager import FontProperties

# Deep learning frameworks
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.backend as K
import keras

# Experiment tracking
import wandb

# **Bahdanau Attention**

In [7]:
"""
Attention mechanism for sequence processing
"""
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive (Bahdanau-style) attention over a sequence of encoder outputs.

  A query state and the encoder outputs are each projected by a Dense layer,
  summed, squashed with tanh and reduced to one score per position.
  """

  def __init__(self, units):
    """Create the three projection layers.

    Args:
        units: Width of the two Dense projections that feed the tanh.
    """
    super(BahdanauAttention, self).__init__()

    # Creation order is kept (W1, W2, V) so weight ordering is unchanged.
    self.W1 = tf.keras.layers.Dense(units)  # projects the query state
    self.W2 = tf.keras.layers.Dense(units)  # projects the encoder outputs
    self.V = tf.keras.layers.Dense(1)       # collapses each position to a scalar score

  def call(self, enc_state, enc_out):
    """Return the context vector and the attention weights.

    Args:
        enc_state: Query state; a single tensor or a list of tensors that is
            concatenated along axis 1.
            (assumes each tensor is [batch, units] -- TODO confirm with caller)
        enc_out: Sequence the attention is computed over.
            (assumes [batch, time, units] -- TODO confirm with caller)

    Returns:
        Tuple (context_vector, attention_distribution); the distribution is a
        softmax over axis 1 and the context is the weighted sum over axis 1.
    """
    # Merge the state tensors and insert a length-1 axis so the projection
    # broadcasts against every position of enc_out.
    query = tf.expand_dims(tf.concat(enc_state, axis=1), axis=1)

    # One unnormalised score per position.
    scores = self.V(tf.nn.tanh(self.W1(query) + self.W2(enc_out)))

    # Normalise across axis 1, then take the weighted sum of enc_out.
    weights = tf.nn.softmax(scores, axis=1)
    context = tf.reduce_sum(weights * enc_out, axis=1)

    return context, weights

In [8]:
!pwd

/content


# **Helper functions**

In [5]:
# Markers wrapped around every target word by preprocess_data. They must be
# tab/newline because the training, validation and translate code looks up
# "\t" and "\n" in the target tokenizer; "0"/"1" would leave those lookups
# without a matching vocabulary entry.
START_TOKEN="\t"
END_TOKEN="\n"


"""Retrieve dataset if not already available locally"""
def downloadDataSet():
   """Download and unpack the Dakshina archive into the current directory.

   The download is skipped when ./dakshina_dataset_v1.0.tar already exists and
   the extraction is skipped when ./dakshina_dataset_v1.0/ already exists.
   Failures are reported on stdout rather than raised, so the notebook keeps
   running; success messages are only printed when the step really succeeded.
   """
   dataset_archive_present = exists('./dakshina_dataset_v1.0.tar')
   if not dataset_archive_present:
     print('initiating download process....')
     status = os.system('wget -q https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar -O dakshina_dataset_v1.0.tar')
     if status != 0:
       # wget returns non-zero on network/HTTP errors; do not claim success.
       print(f'download failed (wget exit status {status})')
     else:
       print('download operation completed')

   dataset_folder_present = exists('./dakshina_dataset_v1.0/')
   if not dataset_folder_present:
     print('Beginning extraction process..')
     try:
       # The context manager closes the archive even if extractall() fails.
       with tarfile.open('dakshina_dataset_v1.0.tar', 'r') as dataset_archive:
         dataset_archive.extractall()
     except Exception as e:
       print(f"Extraction error: {e}")
     else:
       print('Extraction finished')

   print('Dataset preparation complete')


def get_files(language):
  """Return the (train, dev, test) lexicon TSV paths for a language code."""
  lexicon_prefix = './dakshina_dataset_v1.0/' + language + '/lexicons/' + language + '.translit.sampled.'
  train_dir, val_dir, test_dir = (
      lexicon_prefix + split + '.tsv' for split in ('train', 'dev', 'test')
  )
  return train_dir, val_dir, test_dir


"""Convert text data to token sequences"""
def tokenize(lang, tokenizer=None):
    """Encode strings as post-padded, character-level id sequences.

    Args:
        lang: List of strings to encode.
        tokenizer: Already-fitted Keras Tokenizer to reuse. When None, a new
            character-level tokenizer is created and fitted on `lang`.

    Returns:
        Tuple (padded id array, tokenizer that produced it).
    """
    char_tokenizer = tokenizer
    if char_tokenizer is None:
        # No tokenizer supplied: learn the character vocabulary from this data.
        char_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
        char_tokenizer.fit_on_texts(lang)

    # Encode, then pad at the end so every row has the same length.
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        char_tokenizer.texts_to_sequences(lang),
        padding='post'
    )

    return padded, char_tokenizer


"""Prepare dataset for model training"""
def preprocess_data(fpath, ip_tokenizer=None, tgt_tokenizer=None):
    """Load a lexicon TSV and turn it into a shuffled tf.data.Dataset.

    Column 1 of the file is used as the model input and column 0 as the
    target; every target string is wrapped in START_TOKEN / END_TOKEN.

    Args:
        fpath: Path to a tab-separated file without a header row.
        ip_tokenizer: Optional fitted tokenizer for the input column; a new one
            is fitted on this file when None.
        tgt_tokenizer: Optional fitted tokenizer for the target column; a new
            one is fitted on this file when None.

    Returns:
        Tuple (dataset of (input, target) id rows, input tokenizer,
        target tokenizer).
    """
    # Read every field as a plain string and disable NA parsing: otherwise
    # words such as "null", "nan" or "NA" are silently turned into NaN, which
    # either becomes the literal text "nan" or breaks the concatenation below.
    data_frame = pd.read_csv(
        fpath,
        sep="\t",
        header=None,
        dtype=str,
        keep_default_na=False,
    )

    input_words = data_frame[1].tolist()
    # Wrap each target word in the start/end markers the decoder relies on.
    target_words = [START_TOKEN + word + END_TOKEN for word in data_frame[0]]

    # Generate token sequences
    input_tensor, ip_tokenizer = tokenize(input_words, tokenizer=ip_tokenizer)
    target_tensor, tgt_tokenizer = tokenize(target_words, tokenizer=tgt_tokenizer)

    # Pair the rows up and shuffle with a buffer covering the whole dataset.
    dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor))
    dataset = dataset.shuffle(len(dataset))

    return dataset, ip_tokenizer, tgt_tokenizer

In [10]:
# Helper function for creating RNN layers
def get_layer(layer_type, units, dropout, return_sequences, return_state):
    """Build one recurrent Keras layer selected by name (case-insensitive).

    Args:
        layer_type: "lstm", "gru" or "rnn".
        units: Number of recurrent units.
        dropout: Value passed as the layer's `dropout` argument.
        return_sequences: Forwarded to the layer.
        return_state: Forwarded to the layer.

    Raises:
        ValueError: If `layer_type` is none of the supported names.
    """
    kind = layer_type.lower()
    if kind not in ("lstm", "gru", "rnn"):
        raise ValueError(f"Unsupported layer type: {layer_type}")

    # The table is built only after validation so an unsupported name fails
    # without touching TensorFlow.
    layer_classes = {
        "lstm": tf.keras.layers.LSTM,
        "gru": tf.keras.layers.GRU,
        "rnn": tf.keras.layers.SimpleRNN,
    }
    return layer_classes[kind](
        units,
        return_sequences=return_sequences,
        return_state=return_state,
        dropout=dropout,
    )


In [9]:
class Parameters():
  """Plain container for the hyper-parameters shared by the model classes.

  Every constructor argument is stored as an attribute of the same name.
  `stats` starts as an empty list. `encoder_vocab_size` and
  `decoder_vocab_size` are placeholders that SequenceTOSequence.create_model
  overwrites from the fitted tokenizers.
  """
  def  __init__(self,  language='te',encoder_layers=1,decoder_layers=1,embedding_dim=128,\
                layer_type='lstm', units=128, dropout=0.5, attention=False,attention_type="Luong",batch_size=128,\
                apply_beam_search=False,apply_teacher_forcing=False,teacher_forcing_ratio=1,\
                 save_outputs=None,epochs=5,wandb=None,beamWidth=5,restoreBestModel=True,\
                 patience=2,encoder_vocab_size=64,decoder_vocab_size=64):
    # Data / architecture
    self.language = language
    self.embedding_dim = embedding_dim
    self.encoder_layers = encoder_layers
    self.decoder_layers = decoder_layers
    self.layer_type = layer_type
    self.units = units
    self.dropout = dropout
    self.encoder_vocab_size = encoder_vocab_size
    self.decoder_vocab_size = decoder_vocab_size

    # Attention
    self.attention = attention
    self.attention_type = attention_type

    # Training loop
    self.epochs = epochs
    self.batch_size = batch_size
    self.apply_teacher_forcing = apply_teacher_forcing
    self.teacher_forcing_ratio = teacher_forcing_ratio
    self.patience = patience
    self.restoreBestModel = restoreBestModel

    # Decoding
    self.apply_beam_search = apply_beam_search
    # Previously accepted but never stored, so callers could not read it back.
    self.beamWidth = beamWidth

    # Logging / bookkeeping
    self.stats = []
    self.wandb = wandb
    self.save_outputs = save_outputs

# **Encoder**

In [11]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by a stack of recurrent layers.

    Every recurrent layer is created with return_sequences=True and
    return_state=True, so `call` returns the per-timestep output of the last
    layer together with the final state(s) of every layer.
    """

    def __init__(self, param):
        """Build the embedding and the recurrent stack from a Parameters-like object.

        Reads layer_type, encoder_layers, units, dropout, encoder_vocab_size
        and embedding_dim from `param`.
        """
        super(Encoder, self).__init__()
        self.layer_type = param.layer_type.lower()
        self.n_layers = param.encoder_layers
        self.units = param.units
        self.dropout = param.dropout
        self.embedding = tf.keras.layers.Embedding(
            param.encoder_vocab_size,
            param.embedding_dim,
            trainable=True
        )

        # The encoder always returns full sequences (the attention layer and
        # the next stacked layer both consume them), so no per-layer switch
        # is needed.
        self.rnn_layers = []
        for _ in range(self.n_layers):
            self.rnn_layers.append(
                get_layer(
                    self.layer_type,
                    self.units,
                    self.dropout,
                    return_sequences=True,
                    return_state=True
                )
            )

        # Kept from the original: marks the model as built up-front.
        self.built = True

    @tf.function
    def call(self, x, hidden=None):
        """Encode a batch of id sequences.

        Args:
            x: Integer ids, [batch_size, seq_len].
            hidden: Flat list of initial states in layer order; for LSTM two
                entries (h, c) per layer, otherwise one per layer. Zeros are
                used when None. Layers for which the list is too short run
                without an explicit initial state.

        Returns:
            Tuple (output sequence of the last layer, flat list of final
            states in the same layout as `hidden`).

        NOTE(review): no `training` flag is forwarded to the recurrent layers;
        confirm that dropout is actually active during training.
        """
        batch_size = tf.shape(x)[0]

        x = self.embedding(x)  # [batch_size, seq_len, embedding_dim]

        if hidden is None:
            hidden = self.initialize_hidden_state(batch_size)

        is_lstm = self.layer_type == "lstm"
        states_per_layer = 2 if is_lstm else 1

        states = []
        current_input = x

        for i, rnn in enumerate(self.rnn_layers):
            first = i * states_per_layer
            if first + states_per_layer <= len(hidden):
                # Enough states were supplied for this layer.
                if is_lstm:
                    initial_state = [hidden[first], hidden[first + 1]]
                else:
                    initial_state = hidden[first]
                results = rnn(current_input, initial_state=initial_state)
            else:
                results = rnn(current_input)

            # results is (output, state) or (output, h_state, c_state).
            current_input = results[0]
            states.extend(results[1:])

        return current_input, states

    def initialize_hidden_state(self, batch_size):
        """Return zero initial states: (h, c) per layer for LSTM, else one per layer."""
        states_per_layer = 2 if self.layer_type == "lstm" else 1
        return [
            tf.zeros((batch_size, self.units))
            for _ in range(self.n_layers * states_per_layer)
        ]

# **Decoder**

In [12]:
class Decoder(tf.keras.Model):
    """Single-step decoder: embedding, optional attention, recurrent stack, softmax.

    All recurrent layers except the last return sequences; the last returns
    only its final output, which feeds the softmax Dense layer.
    """

    def __init__(self, param):
        """Build the decoder from a Parameters-like object.

        Reads layer_type, decoder_layers, units, dropout, attention,
        attention_type (defaulting to 'bahdanau' when absent),
        decoder_vocab_size and embedding_dim from `param`.
        """
        super(Decoder, self).__init__()

        self.layer_type = param.layer_type.lower()
        self.n_layers = param.decoder_layers
        self.units = param.units
        self.dropout = param.dropout
        self.attention = param.attention
        self.attention_type = getattr(param, 'attention_type', 'bahdanau')

        # Token embedding
        self.embedding_layer = layers.Embedding(
            input_dim=param.decoder_vocab_size,
            output_dim=param.embedding_dim,
            trainable=True
        )

        # Projects the last recurrent output to a distribution over the vocabulary
        self.dense = layers.Dense(param.decoder_vocab_size, activation="softmax")

        # Only created when attention is requested
        if self.attention:
            self.attention_layer = BahdanauAttention(self.units)

        # Recurrent stack: only the final layer drops the time axis
        self.rnn_layers = []
        last_index = self.n_layers - 1
        for idx in range(self.n_layers):
            self.rnn_layers.append(
                get_layer(
                    self.layer_type, self.units, self.dropout,
                    return_sequences=idx < last_index, return_state=True
                )
            )

        # Kept from the original: marks the model as built up-front.
        self.built = True

    @tf.function
    def call(self, x, hidden, enc_out=None):
        """Run one decoding step.

        Args:
            x: Current input ids, [batch_size, 1].
            hidden: Flat list of states in layer order; for LSTM two entries
                (h, c) per layer, otherwise one per layer. Layers for which the
                list is too short run without an explicit initial state.
            enc_out: Encoder output sequence, used only when attention is on.

        Returns:
            Tuple (vocabulary probabilities, flat list of new states,
            attention weights or None).
        """
        x = self.embedding_layer(x)  # [batch_size, 1, embedding_dim]

        attention_weights = None

        # Regroup the flat state list into one entry per layer.
        is_lstm = self.layer_type == "lstm"
        width = 2 if is_lstm else 1
        hidden_states = []
        for idx in range(self.n_layers):
            start = idx * width
            if start + width <= len(hidden):
                if is_lstm:
                    hidden_states.append([hidden[start], hidden[start + 1]])
                else:
                    hidden_states.append(hidden[start])
            else:
                hidden_states.append(None)

        # The first entry of the flat list is the attention query.
        attention_state = hidden[0] if hidden else None

        if self.attention and enc_out is not None and attention_state is not None:
            context_vector, attention_weights = self.attention_layer(attention_state, enc_out)

            # Give the context a length-1 time axis and prepend it to the
            # embedding along the feature axis.
            x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Feed the step through the recurrent stack, collecting new states.
        states = []
        current_input = x
        for idx, rnn in enumerate(self.rnn_layers):
            layer_state = hidden_states[idx]
            if layer_state is not None:
                results = rnn(current_input, initial_state=layer_state)
            else:
                results = rnn(current_input)

            # results is (output, state) or (output, h_state, c_state).
            output = results[0]
            states.extend(results[1:])
            current_input = output

        # If the final output still has a time axis, keep only the last step.
        if len(output.shape) > 2:
            output = output[:, -1, :]

        prediction = self.dense(output)

        return prediction, states, attention_weights

# **Sequence-to-Sequence Model (`SequenceTOSequence`)**

In [13]:

class SequenceTOSequence():
    """Encoder-decoder transliteration model with training, evaluation and
    greedy decoding helpers.

    Typical use: construct with a Parameters object, call `build`, then
    `set_vocabulary` (which creates the encoder/decoder), then `fit`.
    """

    def __init__(self, parameters):
        """Copy the hyper-parameters this class needs from `parameters`."""
        # Basic configuration
        self.param = parameters
        self.embedding_dim = parameters.embedding_dim
        self.encoder_layers = parameters.encoder_layers
        self.decoder_layers = parameters.decoder_layers
        self.layer_type = parameters.layer_type
        self.units = parameters.units
        self.dropout = parameters.dropout
        self.batch_size = parameters.batch_size

        # Attention settings
        self.attention = parameters.attention
        self.attention_type = parameters.attention_type

        # Per-epoch statistics appended by fit()
        self.stats = []

        self.apply_beam_search = parameters.apply_beam_search

        # Early-stopping settings
        self.patience = parameters.patience
        self.restoreBestModel = parameters.restoreBestModel

        # Teacher forcing
        self.apply_teacher_forcing = parameters.apply_teacher_forcing
        self.teacher_forcing_ratio = parameters.teacher_forcing_ratio

    def build(self, loss, metric, optimizer='adam', lr=0.001):
        """Attach the loss, the metric and an optimizer.

        Args:
            loss: Callable loss(y_true, y_pred).
            metric: Keras metric object with update_state/result/reset_state.
            optimizer: 'adam', 'nadam', or anything else for RMSprop.
            lr: Learning rate for the chosen optimizer.
        """
        self.loss = loss

        # A single if/elif chain: with two independent `if`s, 'adam' fell into
        # the final `else` and was silently replaced by RMSprop.
        if optimizer == 'adam':
            self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        elif optimizer == 'nadam':
            self.optimizer = tf.keras.optimizers.Nadam(learning_rate=lr)
        else:
            self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)

        self.metric = metric

    def set_vocabulary(self, input_tokenizer, targ_tokenizer):
        """Store both tokenizers and create the encoder/decoder sized for them."""
        self.input_tokenizer = input_tokenizer
        self.targ_tokenizer = targ_tokenizer
        self.create_model()

    def create_model(self):
        """Instantiate the encoder and decoder.

        The vocabulary sizes are taken from the tokenizers (word_index size
        plus one for the padding id) and written back into self.param.
        """
        encoder_vocab_size = len(self.input_tokenizer.word_index) + 1
        decoder_vocab_size = len(self.targ_tokenizer.word_index) + 1
        self.param.encoder_vocab_size = encoder_vocab_size
        self.param.decoder_vocab_size = decoder_vocab_size

        self.encoder = Encoder(self.param)

        # The decoder reads self.param.attention to decide whether to add an
        # attention layer.
        self.decoder = Decoder(self.param)

    @tf.function
    def train(self, input, target, enc_state):
        """Run one optimisation step on a batch.

        Args:
            input: Input ids, [batch_size, max_input_len].
            target: Target ids, [batch_size, max_target_len].
            enc_state: Initial encoder state list.

        Returns:
            Tuple (loss summed over steps divided by the target length,
            current value of the running metric).

        NOTE(review): self.apply_teacher_forcing, self.teacher_forcing_ratio
        and self.batch_size are read while the function is traced; changing
        them between calls with identical input shapes may not take effect.
        """
        loss = 0

        # Draw the teacher-forcing decision inside the graph. A Python
        # random.random() call here would run only once, while tracing, and
        # the result would be frozen for every later batch.
        if self.apply_teacher_forcing:
            use_teacher_forcing = tf.random.uniform([]) < self.teacher_forcing_ratio
        else:
            use_teacher_forcing = tf.constant(False)

        with tf.GradientTape() as tape:
            enc_out, enc_state = self.encoder(input, enc_state)

            # The decoder starts from the encoder's final states.
            dec_state = enc_state

            # Start token for every sequence in the batch.
            dec_input = tf.expand_dims([self.targ_tokenizer.word_index["\t"]] * self.batch_size, 1)

            for t in range(1, target.shape[1]):
                preds, dec_state, _ = self.decoder(dec_input, dec_state, enc_out)

                loss += self.loss(target[:, t], preds)
                self.metric.update_state(target[:, t], preds)

                # Next input: the gold token under teacher forcing, otherwise
                # the model's own greedy prediction.
                predicted_ids = tf.cast(tf.argmax(preds, axis=1), target.dtype)
                next_ids = tf.where(use_teacher_forcing, target[:, t], predicted_ids)
                dec_input = tf.expand_dims(next_ids, 1)

            batch_loss = loss / tf.cast(target.shape[1], dtype=tf.float32)

        # Gradients are taken and applied after leaving the tape context so
        # the optimizer update itself is not recorded.
        variables = self.encoder.trainable_variables + self.decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

        return batch_loss, self.metric.result()

    def fit(self, dataset, val_dataset, batch_size=128, epochs=5, wandb=None, apply_teacher_forcing=True, teacher_forcing_ratio=0.7):
        """Train with early stopping on validation accuracy.

        Args:
            dataset: Unbatched training tf.data.Dataset of (input, target).
            val_dataset: Unbatched validation dataset.
            batch_size: Batch size; also stored on the instance.
            epochs: Maximum number of epochs.
            wandb: Optional object with a `log(dict)` method for per-epoch stats.
            apply_teacher_forcing: Stored on the instance; enables teacher forcing.
            teacher_forcing_ratio: Stored on the instance; probability of
                teacher forcing for a batch.

        Training stops once validation accuracy has dropped below the best
        value for `self.patience` consecutive epochs. If `restoreBestModel` is
        set, the weights from the best epoch are then loaded back.
        """
        self.batch_size = batch_size
        self.apply_teacher_forcing = apply_teacher_forcing
        self.teacher_forcing_ratio = teacher_forcing_ratio

        # Only full batches are used: the trailing partial batch is skipped by
        # take(steps_per_epoch) below.
        steps_per_epoch = len(dataset) // self.batch_size
        dataset = dataset.batch(self.batch_size, drop_remainder=False)

        sample_inp, sample_targ = next(iter(dataset))
        self.max_target_len = sample_targ.shape[1]
        self.max_input_len = sample_inp.shape[1]

        # Public handles kept for compatibility. They alias the live objects,
        # so the actual snapshot of the best epoch lives in best_weights.
        self.bestEncoder = self.encoder
        self.bestDecoder = self.decoder
        self.bestoptimizer = self.optimizer
        best_weights = None

        accuracyDegradePatience = 0
        self.oldaccuracy = 0
        for epoch in tqdm(range(1, epochs+1), total=epochs, desc="Epochs "):
            if accuracyDegradePatience >= self.patience:
                break

            ## Training loop ##
            total_loss = 0
            total_acc = 0
            self.metric.reset_state()

            starting_time = time.time()
            enc_state = self.encoder.initialize_hidden_state(self.batch_size)

            for batch, (input, target) in enumerate(dataset.take(steps_per_epoch)):
                batch_loss, acc = self.train(input, target, enc_state)
                total_loss += batch_loss
                total_acc += acc

            avg_acc = total_acc / steps_per_epoch
            avg_loss = total_loss / steps_per_epoch

            ## Validation ## (evaluate resets the metric and builds its own state)
            avg_val_loss, avg_val_acc = self.evaluate(val_dataset, batch_size=self.batch_size)

            if self.oldaccuracy > avg_val_acc:
                # Worse than the best epoch so far.
                accuracyDegradePatience += 1
            else:
                # New best: copy the weight values so later epochs cannot
                # overwrite them.
                best_weights = (self.encoder.get_weights(), self.decoder.get_weights())
                self.bestEncoder = self.encoder
                self.bestDecoder = self.decoder
                self.bestoptimizer = self.optimizer
                self.oldaccuracy = avg_val_acc
                accuracyDegradePatience = 0

            print("\nTrain Loss: {0:.4f} Train Accuracy: {1:.4f} Validation Loss: {2:.4f} Validation Accuracy: {3:.4f}".format(
                avg_loss, avg_acc*100, avg_val_loss, avg_val_acc*100))

            time_taken = time.time() - starting_time

            self.stats.append({
                "epoch": epoch,
                "train_loss": avg_loss,
                "val_loss": avg_val_loss,
                "train_acc": avg_acc*100,
                "val_acc": avg_val_acc*100,
                "training time": time_taken
            })

            if not (wandb is None):
                wandb.log(self.stats[-1])

            print(f"\nTime taken for the epoch {time_taken:.4f}")

        # Checked after the loop so the restore also happens when patience
        # runs out on the very last epoch.
        if (accuracyDegradePatience >= self.patience
                and self.restoreBestModel
                and best_weights is not None):
            self.encoder.set_weights(best_weights[0])
            self.decoder.set_weights(best_weights[1])

        print("\nModel trained successfully !!")

    @tf.function
    def validation(self, inp, trgt, encoder_state):
        """Score one batch without updating weights.

        The decoder is always fed its own greedy prediction (no teacher forcing).

        Args:
            inp: Input ids, [batch_size, max_input_len].
            trgt: Target ids, [batch_size, max_target_len].
            encoder_state: Initial encoder state list.

        Returns:
            Tuple (loss summed over steps divided by the target length,
            current value of the running metric).
        """
        loss = 0

        encoder_output, encoder_state = self.encoder(inp, encoder_state)

        decoder_state = encoder_state

        # Start token for every sequence in the batch.
        decoder_input = tf.expand_dims([self.targ_tokenizer.word_index["\t"]] * self.batch_size, 1)

        for t in range(1, trgt.shape[1]):
            prediction, decoder_state, _ = self.decoder(decoder_input, decoder_state, encoder_output)

            loss += self.loss(trgt[:, t], prediction)
            self.metric.update_state(trgt[:, t], prediction)

            # Feed the greedy prediction back in.
            predicted_ids = tf.argmax(prediction, axis=1)
            decoder_input = tf.expand_dims(predicted_ids, 1)

        batch_loss = loss / tf.cast(trgt.shape[1], dtype=tf.float32)

        return batch_loss, self.metric.result()

    def evaluate(self, test_dataset, batch_size=None):
        """Return (average loss, average accuracy) over full batches of a dataset.

        Args:
            test_dataset: Unbatched tf.data.Dataset of (input, target).
            batch_size: Batch size to use; when given it is also stored on the
                instance, when None the instance's current batch size is used.
        """
        # Previously a None batch_size reached the integer division below.
        if batch_size is None:
            batch_size = self.batch_size
        else:
            self.batch_size = batch_size

        # Only complete batches are scored (drop_remainder=True).
        steps_per_epoch_test = len(test_dataset) // batch_size
        test_dataset = test_dataset.batch(batch_size, drop_remainder=True)

        total_test_loss = 0
        total_test_acc = 0
        self.metric.reset_state()

        enc_state = self.encoder.initialize_hidden_state(self.batch_size)

        for batch, (input, target) in enumerate(test_dataset.take(steps_per_epoch_test)):
            batch_loss, acc = self.validation(input, target, enc_state)
            total_test_loss += batch_loss
            total_test_acc += acc

        avg_test_acc = total_test_acc / steps_per_epoch_test
        avg_test_loss = total_test_loss / steps_per_epoch_test

        return avg_test_loss, avg_test_acc

    def translate(self, word, get_heatmap=False):
        """Greedily decode the transliteration of one input string.

        Requires `fit` to have run, because it reads max_input_len and
        max_target_len set there.

        Args:
            word: Input string.
            get_heatmap: When True, collect the attention weights of every step.

        Returns:
            Tuple (decoded string without the end token, list of attention
            weight tensors; empty unless get_heatmap is True and attention is on).
        """
        start = "\t"
        end = "\n"
        word = start + word + end

        # Encode and pad to the input length seen during training.
        inputs = self.input_tokenizer.texts_to_sequences([word])
        inputs = tf.keras.preprocessing.sequence.pad_sequences(
            inputs,
            maxlen=self.max_input_len,
            padding="post"
        )

        result = ""
        att_wts = []

        # Batch of one.
        enc_state = self.encoder.initialize_hidden_state(1)
        enc_out, enc_state = self.encoder(inputs, enc_state)

        dec_state = enc_state

        # Start token as first decoder input.
        dec_input = tf.expand_dims([self.targ_tokenizer.word_index[start]], 1)

        for t in range(1, self.max_target_len):
            preds, dec_state, attention_weights = self.decoder(dec_input, dec_state, enc_out)

            if get_heatmap and attention_weights is not None:
                att_wts.append(attention_weights)

            predicted_id = tf.argmax(preds, axis=1)

            # Ids with no vocabulary entry (e.g. the padding id) map to "<UNK>".
            next_char = self.targ_tokenizer.index_word.get(predicted_id.numpy().item(), "<UNK>")

            result += next_char

            # Feed the prediction back in.
            dec_input = tf.expand_dims(predicted_id, 1)

            if next_char == end:
                break

        # Strip the end token if it was produced.
        if result.endswith(end):
            result = result[:-1]

        return result, att_wts

# **wandb**

In [15]:
import wandb

In [16]:
# Authenticate with Weights & Biases once, up front. wandb.login() only handles
# credentials, so no run is opened here; each sweep trial starts its own run
# inside train_wandb().
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mna21b050[0m ([33mna21b050-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Data Preprocessing

In [17]:

# Sequence delimiters: a tab marks the start of a word, a newline marks its end.
START_TOKEN, END_TOKEN = "\t", "\n"


In [18]:
# Fetch and unpack the dataset. downloadDataSet is defined elsewhere in the
# notebook; the recorded output below shows its download/extraction progress.
downloadDataSet()

initiating download process....
download operation completed
Beginning extraction process..
Extraction finished
Dataset preparation complete


In [19]:
# Language code handed to get_files to pick the data split locations
# (presumably "te" = Telugu — confirm against get_files).
language="te"
# get_files is defined elsewhere in the notebook; its three results are bound
# to the train / validation / test locations used by the cells below.
train_dir, val_dir, test_dir = get_files(language)

In [20]:
# Build the training dataset; this call also returns the input and target
# tokenizers that the rest of the notebook uses.
dataset, input_tokenizer, targ_tokenizer = preprocess_data(train_dir)
# Pass those tokenizers back in for the validation split (presumably so both
# splits share one vocabulary — confirm in preprocess_data).
val_dataset, _, _ = preprocess_data(val_dir,input_tokenizer,targ_tokenizer)

In [21]:
# train data
# NOTE(review): this repeats the training preprocess_data call from the previous
# cell and rebinds input_tokenizer / targ_tokenizer after val_dataset was already
# built with the earlier pair. Looks redundant — confirm and consider removing.
dataset, input_tokenizer, targ_tokenizer = preprocess_data(train_dir)

## Sweep Configuration

In [22]:
# W&B sweep definition: Bayesian search ("bayes") that maximizes "val_acc".
# Every hyperparameter is a discrete choice, so each entry of the inner mapping
# is expanded into W&B's {"values": [...]} form.
# NOTE(review): "val_acc" has to match the metric key logged during training —
# confirm against the model's fit() implementation.
sweep_config = dict(
    name="DL_Assignment3_Rnn",
    method="bayes",
    metric=dict(name="val_acc", goal="maximize"),
    parameters={
        hyperparameter: {"values": choices}
        for hyperparameter, choices in {
            # Architecture
            "num_of_encoders": [1, 2, 3],
            "num_of_decoders": [1, 2, 3],
            "cell_type": ["gru", "lstm"],
            # Optimization
            "lr": [0.001, 0.005],
            "optimizer": ["adam", "rmsprop"],
            # Regularization and model size
            "dropout": [0.3, 0.5],
            "latent_dim": [128, 256, 512],
            "inp_emb_size": [64, 128, 256],
            # Data pipeline
            "batch_size": [32, 64, 128],
        }.items()
    },
)

# wandb runs

In [23]:
# This is the main function to use to train/fine-tune the model using wandb runs
def train_wandb():
    """Train one attention-based seq2seq model for a single W&B sweep trial.

    Starts a W&B run, reads the hyperparameters chosen for this trial from
    ``wandb.config``, names the run after them, then builds and fits a
    ``SequenceTOSequence`` model with attention enabled.

    Relies on notebook-level globals prepared in earlier cells: ``dataset``,
    ``val_dataset``, ``input_tokenizer`` and ``targ_tokenizer``.

    Returns:
        None. Metrics are reported through the ``wandb`` module handed to
        ``model.fit``.
    """
    wandb.init()
    config = wandb.config

    # Run name: every swept hyperparameter rendered as "key(value)" and joined
    # with "_". Joining guarantees that no part is dropped and that there is no
    # trailing separator to trim afterwards.
    name_keys = (
        "num_of_encoders",
        "num_of_decoders",
        "cell_type",
        "latent_dim",
        "lr",
        "optimizer",
        "dropout",
        "inp_emb_size",
        "batch_size",
    )
    wandb.run.name = "_".join(f"{key}({config[key]})" for key in name_keys)

    # Map the sweep's hyperparameter names onto the Parameters constructor.
    param = Parameters(
        language="te",
        embedding_dim=config["inp_emb_size"],
        encoder_layers=config["num_of_encoders"],
        decoder_layers=config["num_of_decoders"],
        layer_type=config["cell_type"],
        units=config["latent_dim"],
        dropout=config["dropout"],
        epochs=15,
        batch_size=config["batch_size"],
    )
    # Settings that are fixed for every trial rather than swept.
    param.apply_teacher_forcing = True
    param.teacher_forcing_ratio = 1
    param.patience = 5
    param.attention = True

    model = SequenceTOSequence(param)
    model.set_vocabulary(input_tokenizer, targ_tokenizer)

    model.build(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metric=tf.keras.metrics.SparseCategoricalAccuracy(),
        optimizer=config["optimizer"],
        lr=config["lr"],
    )

    model.fit(
        dataset,
        val_dataset,
        epochs=param.epochs,
        wandb=wandb,
        teacher_forcing_ratio=param.teacher_forcing_ratio,
    )



In [24]:
# Register the sweep with W&B under the given entity/project; the returned id
# is what the agent cell below consumes.
sweep_id = wandb.sweep(
    sweep_config,
    entity="na21b050-iit-madras",
    project="DA6401_Assignment3",
)

Create sweep with ID: 3chor4a8
Sweep URL: https://wandb.ai/na21b050-iit-madras/DA6401_Assignment3/sweeps/3chor4a8


In [None]:
# Launch a sweep agent in this kernel: it runs train_wandb once (count=1) with
# a configuration selected for the sweep.
wandb.agent(
    sweep_id,
    function=train_wandb,
    count=1,
)

[34m[1mwandb[0m: Agent Starting Run: 36oy2opv with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	inp_emb_size: 256
[34m[1mwandb[0m: 	latent_dim: 128
[34m[1mwandb[0m: 	lr: 0.005
[34m[1mwandb[0m: 	num_of_decoders: 1
[34m[1mwandb[0m: 	num_of_encoders: 3
[34m[1mwandb[0m: 	optimizer: rmsprop


Epochs :   7%|▋         | 1/15 [06:20<1:28:40, 380.06s/it]


Train Loss: 0.5745 Train Accuracy: 74.4717 Validation Loss: 1.4500 Validation Accuracy: 79.2393

Time taken for the epoch 380.0484


Epochs :  13%|█▎        | 2/15 [12:01<1:17:21, 357.05s/it]


Train Loss: 0.2127 Train Accuracy: 92.4727 Validation Loss: 1.2167 Validation Accuracy: 83.5534

Time taken for the epoch 340.9487


Epochs :  20%|██        | 3/15 [17:43<1:10:05, 350.48s/it]


Train Loss: 0.1576 Train Accuracy: 94.7616 Validation Loss: 1.1114 Validation Accuracy: 85.8020

Time taken for the epoch 342.6463


Epochs :  27%|██▋       | 4/15 [23:25<1:03:36, 346.98s/it]


Train Loss: 0.1325 Train Accuracy: 95.7174 Validation Loss: 1.0958 Validation Accuracy: 86.3245

Time taken for the epoch 341.6053


Epochs :  33%|███▎      | 5/15 [29:08<57:36, 345.68s/it]  


Train Loss: 0.1181 Train Accuracy: 96.1778 Validation Loss: 1.1814 Validation Accuracy: 85.7141

Time taken for the epoch 343.3825


Epochs :  40%|████      | 6/15 [34:51<51:42, 344.68s/it]


Train Loss: 0.1086 Train Accuracy: 96.5209 Validation Loss: 1.0986 Validation Accuracy: 86.5572

Time taken for the epoch 342.7117
