# In [None]:
# model_architectures.py

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Bidirectional, Dropout, Layer, Reshape
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.regularizers import l2

# --- Custom Attention Layer (Bahdanau Attention) ---
# This layer calculates attention weights and applies them to the GRU outputs.
class BahdanauAttention(Layer):
    """
    Additive (Bahdanau-style) attention implemented as a Keras layer.

    A query vector is scored against every timestep of a value sequence via
    ``V(tanh(W1(query) + W2(values)))``. The scores are normalised with a
    softmax over the time axis and then used to build a weighted sum of the
    values (the context vector).
    """

    def __init__(self, units, **kwargs):
        """
        Create the three projection layers used by the scoring function.

        Args:
            units (int): Output width of the W1 and W2 projections.
            **kwargs: Forwarded unchanged to the base ``Layer`` constructor
                      (e.g. ``name``).
        """
        super().__init__(**kwargs)
        self.units = units
        # Sub-layers are created in the order W1, W2, V.
        self.W1 = Dense(units, name='attention_W1')
        self.W2 = Dense(units, name='attention_W2')
        self.V = Dense(1, name='attention_V')

    def build(self, input_shape):
        # No weights are declared here: the Dense projections are already
        # instantiated in __init__, so this only defers to the base class.
        super().build(input_shape)

    def call(self, query, values):
        """
        Run the attention forward pass.

        Args:
            query (tf.Tensor): Summary vector to attend with (e.g. the final
                               state of a recurrent encoder).
                               Expected shape: (batch_size, hidden_dim)
            values (tf.Tensor): Sequence to attend over (e.g. the per-timestep
                                outputs of a recurrent encoder).
                                Expected shape: (batch_size, seq_len, hidden_dim)

        Returns:
            tuple: ``(context_vector, attention_weights)`` where
                   ``context_vector`` is the attention-weighted sum of
                   ``values`` over axis 1 and ``attention_weights`` is the
                   softmax of the scores over axis 1.
        """
        # Insert a length-1 time axis so the projected query broadcasts
        # against every timestep of the projected values.
        query_expanded = tf.expand_dims(query, 1)

        # Additive score: one unnormalised scalar per timestep.
        projected_query = self.W1(query_expanded)
        projected_values = self.W2(values)
        score = self.V(tf.nn.tanh(projected_query + projected_values))

        # Normalise across the time axis (axis 1).
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weight each timestep and collapse the time axis.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

    def get_config(self):
        """Return the base layer config extended with ``units``."""
        base_config = super().get_config()
        base_config['units'] = self.units
        return base_config

    @classmethod
    def from_config(cls, config):
        """Recreate the layer from a dictionary produced by ``get_config``."""
        return cls(**config)

def build_bigru_attention_model(max_words: int, max_sequence_length: int, embedding_dim: int,
                                gru_units: int, dense_units: int, dropout_rate: float,
                                num_labels: int, l2_reg_factor: float) -> "tf.keras.Model":
    """
    Builds a stacked Bidirectional GRU model with a Bahdanau Attention mechanism.

    Architecture: Embedding -> 3 x (BiGRU + Dropout) -> query BiGRU ->
    BahdanauAttention -> Dense(relu) -> Dense(sigmoid).

    Args:
        max_words (int): Vocabulary size (``input_dim`` of the Embedding layer).
        max_sequence_length (int): Length of the input token sequences.
        embedding_dim (int): Dimension of the word embeddings.
        gru_units (int): Number of units in each GRU direction.
        dense_units (int): Number of units in the intermediate Dense layer.
        dropout_rate (float): Dropout rate applied after every major layer.
        num_labels (int): Number of sigmoid outputs (multi-label classification).
        l2_reg_factor (float): L2 regularization factor for GRU and Dense kernels.

    Returns:
        tf.keras.Model: The assembled, *uncompiled* Keras model. The caller
        must call ``model.compile(...)`` before training.
    """
    print("\nBuilding the BiGRU + Attention model...")

    l2_regularizer = l2(l2_reg_factor)

    def _bigru(return_sequences, name):
        # One L2-regularised Bidirectional GRU; shared by all recurrent layers below.
        return Bidirectional(GRU(units=gru_units, return_sequences=return_sequences,
                                 kernel_regularizer=l2_regularizer,
                                 recurrent_regularizer=l2_regularizer),
                             name=name)

    # Input Layer
    input_layer = Input(shape=(max_sequence_length,), name='input_lyrics')

    # Embedding Layer.
    # The sequence length is already fixed by the Input layer above, so the
    # deprecated `input_length` argument is intentionally not passed.
    x = Embedding(input_dim=max_words,
                  output_dim=embedding_dim,
                  name='word_embedding')(input_layer)
    x = Dropout(dropout_rate, name='embedding_dropout')(x)

    # Three stacked Bidirectional GRU layers, each followed by Dropout.
    # Layer names (bigru_1..3, bigru_dropout_1..3) are kept stable so that
    # previously saved weights can still be matched by name.
    for layer_idx in range(1, 4):
        x = _bigru(True, f'bigru_{layer_idx}')(x)
        x = Dropout(dropout_rate, name=f'bigru_dropout_{layer_idx}')(x)
    sequence_states = x

    # Query Generator GRU for Attention: summarises the sequence into one vector.
    query = _bigru(False, 'query_generator_gru')(sequence_states)

    # Attention Mechanism.
    # The attention projection width is set to gru_units * 2, i.e. the size of
    # the concatenated forward/backward GRU output. The attention weights are
    # not part of the model outputs, so they are discarded here.
    attention_layer = BahdanauAttention(gru_units * 2, name='bahdanau_attention')
    context_vector, _ = attention_layer(query, sequence_states)
    context_vector = Dropout(dropout_rate, name='attention_dropout')(context_vector)

    # First Dense Layer with L2 regularization
    dense_layer_1 = Dense(units=dense_units, activation='relu',
                          kernel_regularizer=l2_regularizer, name='dense_1')(context_vector)
    dense_layer_1 = Dropout(dropout_rate, name='dense_dropout_1')(dense_layer_1)

    # Output Layer: independent sigmoid per label
    output_layer = Dense(num_labels, activation='sigmoid',
                         kernel_regularizer=l2_regularizer, name='output_emotions')(dense_layer_1)

    # Create the Model
    model = Model(inputs=input_layer, outputs=output_layer, name='BiGRU_Attention_Model')
    print("BiGRU + Attention model built.")
    return model

def build_bert_xlstm_model(num_labels: int, lstm_units: int, dense_units: int,
                           dropout_rate: float, l2_reg_factor: float, *,
                           embedding_dim: int = 768,
                           num_dense_layers: int = 5) -> "tf.keras.Model":
    """
    Builds a model that takes precomputed BERT embeddings as input and processes
    them with a Bidirectional LSTM followed by a stack of Dense layers.

    Each input vector is reshaped to a sequence of length 1 before the LSTM,
    so the recurrent layer sees a single timestep per example.

    Args:
        num_labels (int): Number of sigmoid outputs (multi-label classification).
        lstm_units (int): Number of units in each LSTM direction.
        dense_units (int): Number of units in each intermediate Dense layer.
        dropout_rate (float): Dropout rate applied after the LSTM and each Dense layer.
        l2_reg_factor (float): L2 regularization factor for LSTM and Dense kernels.
        embedding_dim (int, keyword-only): Size of the input embedding vector.
            Defaults to 768 (bert-base-uncased).
        num_dense_layers (int, keyword-only): Number of Dense + Dropout blocks
            between the LSTM and the output layer. Defaults to 5.

    Returns:
        tf.keras.Model: The assembled, *uncompiled* Keras model. The caller
        must call ``model.compile(...)`` before training.
    """
    print("\nBuilding the BERT + XLSTM model...")
    l2_regularizer = l2(l2_reg_factor)

    # Input layer for one fixed-size embedding vector per example
    input_layer = Input(shape=(embedding_dim,), name='bert_embeddings_input')

    # Reshape for LSTM: (batch_size, 1, embedding_dim)
    x = Reshape((1, embedding_dim), name='reshape_for_lstm')(input_layer)

    # Bidirectional LSTM layer
    x = Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=False,
                                           kernel_regularizer=l2_regularizer,
                                           recurrent_regularizer=l2_regularizer),
                      name='bidirectional_lstm')(x)
    x = Dropout(dropout_rate, name='lstm_dropout')(x)

    # Stack of Dense layers with L2 regularization and Dropout.
    # Depth is tunable via `num_dense_layers`; layer names are 1-based.
    for layer_idx in range(1, num_dense_layers + 1):
        x = Dense(dense_units, activation='relu', kernel_regularizer=l2_regularizer,
                  name=f'dense_layer_{layer_idx}')(x)
        x = Dropout(dropout_rate, name=f'dense_dropout_{layer_idx}')(x)

    # Output Layer: independent sigmoid per label
    output_layer = Dense(num_labels, activation='sigmoid',
                         kernel_regularizer=l2_regularizer, name='output_emotions')(x)

    model = Model(inputs=input_layer, outputs=output_layer, name='BERT_XLSTM_Model')
    print("BERT + XLSTM model built.")
    return model