In [316]:
import tensorflow as tf
from keras import layers, models
from keras.layers import Embedding, Dropout, LayerNormalization

from MIDI_data_extractor import MIDI_data_extractor

In [317]:
def create_padding_mask(seq, n=4):
    mask = tf.cast(seq == -1, tf.float32)
    return tf.reshape(mask, (tf.shape(mask)[0], *(1,) * (n-2), tf.shape(mask)[-1]))

In [318]:
def create_look_ahead_mask(seq_len):
    mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    return tf.cast(mask, tf.float32)


In [319]:
def create_mask(inp, n=4):
    padding_mask = create_padding_mask(inp, n)
    seq_len = tf.shape(inp)[1]  # Get the sequence length dynamically
    look_ahead_mask = create_look_ahead_mask(seq_len)
    return tf.maximum(padding_mask, look_ahead_mask)

In [320]:
class RelativePositionMultiHeadAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads):
        super(RelativePositionMultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.depth = embed_dim // num_heads

        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.relative_embeddings = self.add_weight("relative_embeddings", shape=[embed_dim, embed_dim])

        self.final_dense = layers.Dense(embed_dim)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])


    def scaled_dot_product_attention(self, query, key, value, mask):
        matmul_qk = tf.matmul(query, key, transpose_b=True)
        # compute relative position embeddings
        rel_embeddings = self.relative_embeddings[:tf.shape(query)[-2], :]
        rel_embeddings = tf.pad(rel_embeddings, [[tf.shape(query)[-2], 0], [0, 0]])

        # reshaping the relative embeddings to match the query shape
        rel_embeddings = tf.reshape(rel_embeddings, [1, 1, tf.shape(query)[-2], tf.shape(query)[-2], self.num_heads, self.depth])
        rel_embeddings = tf.tile(rel_embeddings, [tf.shape(query)[0], tf.shape(query)[1], 1, 1, 1, 1])

        matmul_qk += rel_embeddings
        # scale matmul_qk
        dk = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # add the mask to the scaled tensor.
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        # softmax is normalized on the last axis (seq_len_k) so that the scores
        # add up to 1.
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-3)  # Updated axis here
        output = tf.matmul(attention_weights, value)
        return output, attention_weights

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.query_dense(q)
        k = self.key_dense(k)
        v = self.value_dense(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # Use the mask
        scaled_attention, attention_weights = self.scaled_dot_product_attention(q, k, v, mask[:, tf.newaxis, :, :])

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.embed_dim))
        output = self.final_dense(concat_attention)

        return output

In [321]:
class PositionWiseFeedForwardNetwork(layers.Layer):
    def __init__(self, embed_dim, ffn_dim):
        super(PositionWiseFeedForwardNetwork, self).__init__()
        self.embed_dim = embed_dim
        self.ffn_dim = ffn_dim

        self.dense1 = layers.Dense(ffn_dim, activation="relu")
        self.dense2 = layers.Dense(embed_dim)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return x

In [322]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ffn_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = RelativePositionMultiHeadAttention(embed_dim, num_heads)
        self.ffn = PositionWiseFeedForwardNetwork(embed_dim, ffn_dim)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training, mask):
        attn_output = self.att(inputs, inputs, inputs, mask)
        print(f"Shape of attn_output: {attn_output.shape}")
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [323]:
def create_multi_output_model(input_dim, embed_dim, num_heads, ffn_dim, num_blocks, output_dims, dropout_rate=0.1):
    inputs = layers.Input(shape=(None,input_dim))
    mask = create_mask(inputs)
    x = Embedding(input_dim, embed_dim)(inputs)
    for _ in range(num_blocks):
        # Use the mask
        x = TransformerBlock(embed_dim, num_heads, ffn_dim, dropout_rate)(x, training=None, mask=mask)

    outputs = [layers.Dense(dim, activation='softmax')(x) for dim in output_dims]

    return models.Model(inputs=inputs, outputs=outputs)

# Unique output dimensions from the list provided
output_dims = [129,129,129,129,129,2,200,32,32,128,64,30,3000,129,5,129]

# Create the model
model = create_multi_output_model(input_dim=16, embed_dim=64, num_heads=4, ffn_dim=128, num_blocks=2, output_dims=output_dims)
model.compile('adam', ['sparse_categorical_crossentropy']*len(output_dims))

ValueError: Exception encountered when calling layer "transformer_block_37" (type TransformerBlock).

in user code:

    File "C:\Users\ilove\AppData\Local\Temp\ipykernel_2140\3159178080.py", line 12, in call  *
        attn_output = self.att(inputs, inputs, inputs, mask)
    File "C:\Users\ilove\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\ilove\AppData\Local\Temp\__autograph_generated_filegmkrvd7h.py", line 18, in tf__call
        scaled_attention = ag__.converted_call(ag__.ld(tf).transpose, (ag__.ld(scaled_attention),), dict(perm=[0, 2, 1, 3]), fscope)

    ValueError: Exception encountered when calling layer 'relative_position_multi_head_attention_37' (type RelativePositionMultiHeadAttention).
    
    in user code:
    
        File "C:\Users\ilove\AppData\Local\Temp\ipykernel_2140\3293392212.py", line 59, in call  *
            scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
    
        ValueError: Dimension must be 6 but is 4 for '{{node transformer_block_37/relative_position_multi_head_attention_37/transpose_5}} = Transpose[T=DT_FLOAT, Tperm=DT_INT32](transformer_block_37/relative_position_multi_head_attention_37/MatMul_1, transformer_block_37/relative_position_multi_head_attention_37/transpose_5/perm)' with input shapes: [?,4,?,4,4,16], [4].
    
    
    Call arguments received by layer 'relative_position_multi_head_attention_37' (type RelativePositionMultiHeadAttention):
      • v=tf.Tensor(shape=(None, None, 16, 64), dtype=float32)
      • k=tf.Tensor(shape=(None, None, 16, 64), dtype=float32)
      • q=tf.Tensor(shape=(None, None, 16, 64), dtype=float32)
      • mask=tf.Tensor(shape=(None, 1, None, 16), dtype=float32)


Call arguments received by layer "transformer_block_37" (type TransformerBlock):
  • inputs=tf.Tensor(shape=(None, None, 16, 64), dtype=float32)
  • training=None
  • mask=tf.Tensor(shape=(None, 1, None, 16), dtype=float32)

In [None]:
model.summary()

In [None]:
model.fit(MIDI_data_extractor('chaconne.mid'),epochs = 2, verbose=1)