In [1]:
import mido
import numpy as np
import matplotlib.pyplot as plt
import PreProcessor as pp
from keras.models import Model
from tensorflow.keras.layers import MultiHeadAttention, Embedding, Dense, LayerNormalization, Dropout, Softmax, concatenate, Add
from keras import optimizers
import tensorflow as tf
from utils import *

In [2]:
# Load the MIDI corpus and convert the "Classical" entry into songs.
dataset = pp.load_dataset("../adl-piano-midi")
ClassicSongs = pp.files_to_songs(dataset["Classical"])

# Lookup tables returned by the preprocessor, in the order it yields them
# (presumably value <-> index maps for each event attribute; see PreProcessor).
(
    channel_to_ind, ind_to_channel,
    note_to_ind, ind_to_note,
    velocity_to_ind, ind_to_velocity,
) = pp.dicts_from_songs(ClassicSongs)
time_range = pp.ranges_from_songs(ClassicSongs)

# Vocabulary sizes used to dimension the embeddings and output heads below.
n_Channels, n_Notes, n_Velocities = (
    len(lookup) for lookup in (channel_to_ind, note_to_ind, velocity_to_ind)
)

print(
    "\nNumber of channels   :", n_Channels,
    "\nNumber of notes      :", n_Notes,
    "\nNumber of velocities :", n_Velocities,
    "\nTime range           :", time_range[0], time_range[1],
)


Number of channels   : 12 
Number of notes      : 105 
Number of velocities : 128 
Time range           : 0.0 190.285


In [3]:
class MusicEmbedding(tf.keras.layers.Layer):
    """Embeds a (channel, note, velocity, time) event stream into d_model features.

    Channel, note and velocity indices each go through their own Embedding;
    the time value is appended unchanged as one extra feature. The
    ``d_model - 1`` embedding dimensions are shared out roughly in proportion
    to the three vocabulary sizes, with the channel embedding absorbing the
    rounding remainder.

    Args:
        n_Channels: number of distinct channel indices.
        n_Notes: number of distinct note indices.
        n_Velocities: number of distinct velocity indices.
        d_model: total width of the concatenated output (must be >= 1).

    Raises:
        ValueError: if ``d_model`` is smaller than 1.
    """
    def __init__(self, n_Channels, n_Notes, n_Velocities, d_model):
        super().__init__()
        if d_model < 1:
            # The previous incremental search for the channel width never
            # terminated for such values; fail loudly instead.
            raise ValueError(f"d_model must be >= 1, got {d_model}")
        self.d_model = d_model

        tot_dim = n_Channels + n_Notes + n_Velocities
        self.d_Notes = int((d_model-1)*n_Notes/tot_dim)
        self.d_Velocities = int((d_model-1)*n_Velocities/tot_dim)
        # Channels take whatever is left so the three widths sum to exactly
        # d_model - 1 (one slot is reserved for the time feature).
        self.d_Channels = int(d_model - 1 - self.d_Notes - self.d_Velocities)

        self.Channel_Embedding = Embedding(n_Channels, self.d_Channels)
        self.Notes_Embedding = Embedding(n_Notes, self.d_Notes)
        self.Velocities_Embedding = Embedding(n_Velocities, self.d_Velocities)

    def call(self, x):
        """Embed one batch of events.

        Args:
            x: sequence ``[channels, notes, velocities, times]``; the first
               three are fed to Embedding layers, the fourth is used as-is.

        Returns:
            The three embeddings and the time feature concatenated along the
            last axis.
        """
        chan = self.Channel_Embedding(x[0])
        note = self.Notes_Embedding(x[1])
        velo = self.Velocities_Embedding(x[2])

        # TODO: consider scaling time to [0, 1] (divide by time_range[1]).
        # Cast explicitly so the time feature matches the embedding dtype
        # (NumPy float inputs default to float64).
        time = tf.expand_dims(tf.cast(x[3], chan.dtype), -1)
        return concatenate([chan,note,velo,time])

In [4]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sinusoidal positional encoding to its input.

    The table is precomputed once for ``seq_length`` positions and sliced to
    the actual input length on every call, so inputs shorter than
    ``seq_length`` are accepted.

    Args:
        d_model: size of the last axis of the input (odd values supported).
        seq_length: number of positions to precompute.
    """
    def __init__(self, d_model, seq_length):
        super().__init__()
        self.d_model = d_model
        self.seq_length = seq_length
        self.pos_encoding = self.positional_encoding(seq_length, d_model)

    def positional_encoding(self, length, depth):
        """Build the ``(1, length, depth)`` float32 sin/cos encoding table.

        The first half of the last axis holds the sines, the second half the
        cosines.
        """
        half_depth = depth/2
        # Round up so an odd depth still gets enough columns; the surplus
        # column is trimmed after concatenation.
        n_freqs = int(np.ceil(half_depth))

        positions = np.arange(length)[:, np.newaxis]             # (seq, 1)
        depths = np.arange(n_freqs)[np.newaxis, :]/half_depth    # (1, n_freqs)

        angle_rates = 1 / (10000**depths)         # (1, n_freqs)
        angle_rads = positions * angle_rates      # (seq, n_freqs)

        pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)],axis=-1)
        pos_encoding = pos_encoding[:, :int(depth)]  # no-op when depth is even

        return tf.cast(pos_encoding, dtype=tf.float32)[tf.newaxis, :, :]

    def call(self, x):
        """Return ``x`` plus the encoding for its first ``x.shape[1]`` positions."""
        # Scaling x by sqrt(d_model) before the addition (to set the relative
        # scale of embedding and positional encoding) is deliberately not applied.
        seq_len = tf.shape(x)[1]
        return x + self.pos_encoding[:, :seq_len, :]

In [5]:
class SelfAttentionBlock(tf.keras.layers.Layer):
    """Causal multi-head self-attention with a residual connection and layer norm.

    Args:
        num_heads: number of attention heads.
        d_model: passed to MultiHeadAttention as its second positional
            argument (the per-head key dimension).
        **kwargs: forwarded to MultiHeadAttention.
    """
    # Open question from the author: should dropout be applied here?
    def __init__(self, num_heads, d_model, **kwargs):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads, d_model, **kwargs)
        self.layer_norm = LayerNormalization()
        self.add = Add()

    def call(self, x):
        """Attend over ``x`` (query = key = value) and return norm(x + attention)."""
        attended = self.mha(query=x, key=x, value=x, use_causal_mask=True)
        residual = self.add([x, attended])
        return self.layer_norm(residual)

In [6]:
class FeedForward(tf.keras.layers.Layer):
    """Position-wise feed-forward sub-layer with residual connection and layer norm.

    Args:
        d_model: output width of the second Dense layer.
        dff: width of the hidden ReLU Dense layer.
        dropout_rate: rate of the Dropout applied after the second Dense layer.
    """
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        stack = [
            Dense(dff, activation='relu'),
            Dense(d_model),
            Dropout(dropout_rate),
        ]
        self.seq = tf.keras.Sequential(stack)
        self.add = Add()
        self.layer_norm = LayerNormalization()

    def call(self, x):
        """Return norm(x + feed_forward(x))."""
        transformed = self.seq(x)
        return self.layer_norm(self.add([x, transformed]))

In [7]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder layer: causal self-attention followed by a feed-forward block.

    Args:
        d_model: model width, forwarded to both sub-blocks.
        num_heads: number of attention heads.
        dff: hidden width of the feed-forward block.
        dropout_rate: dropout rate of the feed-forward block.
    """
    def __init__(self,d_model,num_heads,dff,dropout_rate=0.1):
        super().__init__()
        self.self_attention_block = SelfAttentionBlock(num_heads,d_model)
        # Forward dropout_rate; it was previously accepted but ignored, so the
        # feed-forward block always used its own default.
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x):
        """Apply self-attention, then the feed-forward block."""
        x = self.self_attention_block(x)
        x = self.ffn(x)
        return x

In [17]:
class Transformer(tf.keras.Model):
    """Decoder-only transformer over (channel, note, velocity, time) events.

    Args:
        n_Channels: channel vocabulary size (also the width of the channel head).
        n_Notes: note vocabulary size (also the width of the note head).
        n_Velocities: velocity vocabulary size; upper bound of the velocity head.
        max_time: upper bound of the time head.
        seq_length: number of positions the positional encoding is built for.
        d_model: model width.
        n_layers: number of stacked DecoderLayer blocks.
        n_heads: attention heads per decoder layer.
        dff: hidden width of each feed-forward block.
        dropout_rate: dropout rate passed to each decoder layer.
    """
    def __init__(self, n_Channels, n_Notes, n_Velocities, max_time, seq_length, d_model, n_layers, n_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.embedding = MusicEmbedding(n_Channels, n_Notes, n_Velocities, d_model)
        self.pos_enc = PositionalEncoding(d_model, seq_length)
        self.layer_norm = LayerNormalization()
        self.dec_layers = [DecoderLayer(d_model, n_heads, dff, dropout_rate) for _ in range(n_layers)]
        # Categorical heads: one softmax distribution per position.
        self.out_chan = Dense(n_Channels, activation="softmax")
        self.out_note = Dense(n_Notes, activation="softmax")
        # Scalar heads squashed into a bounded range. The velocity head is
        # bounded by the velocity vocabulary size (it was mistakenly scaled
        # by n_Notes before).
        self.out_velo = Dense(1, activation = lambda x: (tf.sigmoid(x) * n_Velocities))
        self.out_time = Dense(1, activation = lambda x: ((1 + tf.tanh(x)) * 0.5 * max_time))

    def call(self, inputs):
        """Run the model.

        Args:
            inputs: ``[channels, notes, velocities, times]`` as expected by
                MusicEmbedding.

        Returns:
            ``[chan, note, velo, time]``: softmax outputs for channel and
            note, a scalar in (0, n_Velocities) for velocity and a scalar in
            (0, max_time) for time, one per position.
        """
        x = self.embedding(inputs)
        x = self.pos_enc(x)
        x = self.layer_norm(x)
        for dec_layer in self.dec_layers:
            x = dec_layer(x)
        chan = self.out_chan(x)
        note = self.out_note(x)
        velo = self.out_velo(x)
        time = self.out_time(x)
        return [chan, note, velo, time]

In [10]:
# Smoke test: push one hand-made sequence (batch of 1, 4 events) through the
# individual layers in order.
chan = np.array([[1,2,2,0]],dtype=int)
note = np.array([[0,54,78,39]],dtype=int)
velo = np.array([[127,0,32,49]],dtype=int)
time = np.array([[0.541,0.0236,1.754,0.0416]])

dim = 50
seq_length = 4
n_heads = 2

musEmb = MusicEmbedding(n_Channels, n_Notes, n_Velocities, dim)
posEnc = PositionalEncoding(dim, seq_length)
selAtt = SelfAttentionBlock(n_heads,dim)
fedFor = FeedForward(dim,4*dim)

# Invoke the layers themselves rather than their .call() methods so Keras'
# __call__ machinery (input conversion, dtype casting, building) runs.
x = musEmb([chan,note,velo,time])
y = posEnc(x)
z = selAtt(y)
o = fedFor(z)
print(o)

tf.Tensor(
[[[-9.1796756e-01 -6.3600618e-01 -7.5247103e-01 -1.4177717e+00
   -8.7639356e-01 -3.9225024e-01 -1.8076243e+00 -1.0548118e+00
   -1.6961154e-01 -3.2920593e-01 -1.3802953e+00 -1.6918079e+00
   -8.0640751e-01 -1.0653987e+00 -3.8881657e-01 -6.2033200e-01
   -9.7946101e-01 -8.9179856e-01 -4.5252046e-01 -3.5649103e-01
   -1.2561321e+00 -9.1505116e-01 -1.1268779e+00 -1.5286310e+00
   -7.4414563e-01  8.1690013e-01  4.9796620e-01  4.6457461e-01
    5.9958440e-01  8.7763017e-01  9.2653829e-01  1.3074914e+00
    5.6613386e-01  3.7282428e-01  1.0854530e+00  4.4058508e-01
    1.3508781e+00  9.5127088e-01  1.5607020e+00  9.9361354e-01
    8.6978495e-01  4.3149260e-01  1.5279782e+00  6.5811598e-01
    2.7541986e-01  7.1707636e-01  1.1144626e+00  7.0899892e-01
    1.4358337e+00  2.0069711e+00]
  [ 5.5762464e-01  1.7696032e-01  1.2240556e-01 -9.9351454e-01
   -7.4468023e-01 -2.9302263e-01 -1.6682713e+00 -1.0001571e+00
   -2.1284811e-01 -5.5945241e-01 -1.6420459e+00 -1.6939449e+00
   -9.0124

In [18]:
# Hyper-parameters for the model.
seq_length = 30
d_model = 64
n_layers = 1
n_heads = 4
dff = 4*d_model

# NOTE(review): the model is built with a literal sequence length of 4 (the
# length of the hand-made demo arrays), not the seq_length defined above;
# confirm which is intended.
ClassicTransformer = Transformer(
    n_Channels=n_Channels,
    n_Notes=n_Notes,
    n_Velocities=n_Velocities,
    max_time=time_range[1],
    seq_length=4,
    d_model=d_model,
    n_layers=n_layers,
    n_heads=n_heads,
    dff=dff,
)

In [19]:
# Call the model itself (not .call()) so Keras' __call__ handles input
# conversion, dtype casting and layer building.
ClassicTransformer([chan,note,velo,time])

[<tf.Tensor: shape=(1, 4, 12), dtype=float32, numpy=
 array([[[0.04526208, 0.03121842, 0.02693846, 0.03178246, 0.07159364,
          0.07889612, 0.12914987, 0.03235696, 0.02352519, 0.17941017,
          0.25475916, 0.09510753],
         [0.01877262, 0.01186827, 0.02507685, 0.02738062, 0.03044505,
          0.07157721, 0.10186744, 0.03092128, 0.01620302, 0.16287616,
          0.44444248, 0.05856897],
         [0.0205715 , 0.01789005, 0.01924839, 0.0257921 , 0.01138469,
          0.20217595, 0.06241456, 0.02549324, 0.02078154, 0.08315939,
          0.48527342, 0.02581519],
         [0.01930026, 0.0186222 , 0.04439155, 0.02144288, 0.01613703,
          0.10259394, 0.13752444, 0.01788878, 0.01531351, 0.12229156,
          0.46669003, 0.01780377]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 4, 105), dtype=float32, numpy=
 array([[[0.00266814, 0.0023477 , 0.00488229, 0.00600979, 0.0033704 ,
          0.01198287, 0.00966611, 0.01609921, 0.00291979, 0.00310389,
          0.00309689, 0.01000643, 