In [1]:
import tensorflow as tf
import pdb
from functools import partial
from tqdm import tqdm_notebook as tqdm
# IPython
from IPython.display import clear_output
# Random
import random
from random import randint
# Keras
from keras.layers import *
from keras.models import Model
from keras.utils import to_categorical, plot_model
import keras.backend as K
from keras.optimizers import RMSprop
from keras.callbacks import *
# Numpy
import numpy as np
from numpy import array
from numpy import argmax
from numpy import array_equal
### Data Loading
from adlframework.processors.general_processors import crop, reshape, pdb_trace
from adlframework.processors.lstm_processors import crop_and_label
from adlframework.processors.midi_processors import midi_to_np, notes_to_classification, make_time_relative
from adlframework.filters.general_filters import min_array_shape, threshold_label
from adlframework.retrievals.BlobLocalCache import BlobLocalCache
from adlframework.datasource import DataSource
from adlframework.dataentity.midi_de import MidiDataEntity

Using TensorFlow backend.


In [2]:
#### Set keras session
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = "0"
set_session(tf.Session(config=config))

## Data

In [3]:
NUM_NOTES = 10

In [4]:
def convert_to_one_hot(sample):
    data, label = sample
    timesteps = data.shape[-2]
    ret = []
    for i, v  in enumerate([12, 11, 88]):
        z = np.zeros((timesteps, v))
        for ts in range(timesteps):
            z[ts][data[ts][i]] = 1
        ret.append(z)
    return label, ret

In [5]:
### Prefilter
prefilters = [partial(min_array_shape, min_shape=(NUM_NOTES, 4)),]
### Controllers
controllers = [midi_to_np,
                partial(crop, shape=(NUM_NOTES, 3)),
                make_time_relative,
                notes_to_classification,
                convert_to_one_hot
              ]

### Load Data
base = '../local_cache/alex_midiset/v2/'
midi_retrieval = BlobLocalCache(base+'midis/', base+'labels/')
midi_ds = DataSource(midi_retrieval, MidiDataEntity,
                         verbosity=0,
						controllers=controllers,
						backend='madmom',
						batch_size=50,
                        max_mem_percent=.7,
                        workers=6,
                        queue_size=100
                        )

# train_ds, temp = DataSource.split(midi_ds, split_percent=.6) # Train at .6
# val_ds, test_ds = DataSource.split(temp, split_percent=.6) # Val at .24, test at .16

Retrieval not named, so won't be cached.


In [6]:
def get_batch(ds, bs=10):
    while True:
        batch = ds.next(bs)[1]
        #### X1
        onset = np.array(list([x[0] for x in batch]))
        duration = np.array(list([x[1] for x in batch]))
        note = np.array(list([x[2] for x in batch]))
        #### X2
        onset_offset = np.zeros(onset.shape)
        duration_offset = np.zeros(duration.shape)
        note_offset = np.zeros(note.shape)
        onset_offset[:,1:] = onset[:,:-1]
        duration_offset[:,1:] = duration[:,:-1]
        note_offset[:,1:] = note[:,:-1]
        #### Y
        # Same as X1, so just repeat
        yield ([onset, duration, note, onset_offset, duration_offset, note_offset], [onset, duration, note])

### Define LSTM Model

In [15]:
latent_dim=2
epochs = 100

In [5]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, 3))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, 3))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(3, activation='relu')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [6]:
data = train_ds.next(1000)[0].astype(float)

In [7]:
data[:,:,0] = data[:,:,0]/12.0
data[:,:,1] = data[:,:,1]/12.0
data[:,:,2] = data[:,:,2]/88.0

In [8]:
out = np.zeros(data.shape)

In [9]:
out[:, 1:, :] = data[:, :-1, :]

In [12]:
model.compile(optimizer='adam', loss='mae')
model.fit([data, data], out,
          batch_size=10,
          epochs=epochs,
          validation_split=0.2)

Train on 800 samples, validate on 200 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

<keras.callbacks.History at 0x7f9ba8f5b590>

## Test Model

In [14]:
test = test_ds.next(1)[0].astype(float)

In [15]:
test[:,:,0] = test[:,:,0]/12.0
test[:,:,1] = test[:,:,1]/12.0
test[:,:,2] = test[:,:,2]/88.0

In [18]:
model.predict(np.array([test]), np.array([test]))

ValueError: The model expects 2  arrays, but only received one array. Found: array with shape (1, 1, 100, 3)

## VAE Testing

In [7]:
NUM_NOTES = 88
NUM_DURATIONS = 11
NUM_ENCODER_CELLS = 256
LATENT_VECTOR_SIZE = 10

In [8]:
#######################################
######### Training Encoder ############
#######################################
# Encoder Inputs
note_input_enc = Input(shape=(None, NUM_NOTES), name='note_enc')
onset_input_enc = Input(shape=(None, NUM_DURATIONS+1), name='onset_enc')
duration_input_enc = Input(shape=(None, NUM_DURATIONS), name='duration_enc')
encoder_input = [onset_input_enc, duration_input_enc, note_input_enc]
# LSTM Layer 1
note_encoder = LSTM(NUM_ENCODER_CELLS, return_sequences=True, name='note_1')(note_input_enc)
onset_encoder = LSTM(NUM_ENCODER_CELLS, return_sequences=True, name='onset_1')(onset_input_enc)
duration_encoder = LSTM(NUM_ENCODER_CELLS, return_sequences=True, name='duration_1')(duration_input_enc)
# Joint Network
enc_concat = concatenate([onset_encoder, duration_encoder, note_encoder], name='concat_enc', axis=2)
joint_encoder = LSTM(LATENT_VECTOR_SIZE, return_state=True, name='joint_enc') # Joint
encoder_outputs, state_h, state_c = joint_encoder(enc_concat) # Seperate hidden states
#     shp = state_h_prime = Dense(256)(state_h)
#     scp = state_c_prime = Dense(256)(state_c)
encoder_states = [state_h, state_c]

In [9]:
########################################
######### Training Decoder #############
########################################
# Decoder Inputs
note_input_dec = Input(shape=(None, NUM_NOTES), name='dec_note')
onset_input_dec = Input(shape=(None, NUM_DURATIONS+1), name='dec_onset')
duration_input_dec = Input(shape=(None, NUM_DURATIONS), name='dec_duration')
decoder_input = [onset_input_dec, duration_input_dec, note_input_dec]
# LSTM Layer 1
note_decoder = LSTM(NUM_ENCODER_CELLS, return_sequences=True, name='dec_note_1')(note_input_dec)
onset_decoder = LSTM(NUM_ENCODER_CELLS, return_sequences=True, name='dec_onset_1')(onset_input_dec)
duration_decoder = LSTM(NUM_ENCODER_CELLS, return_sequences=True, name='dec_duration_1')(duration_input_dec)
# Joint Network
dec_concat = concatenate([note_decoder, onset_decoder, duration_decoder], name='dec_concat', axis=2)
decoder_lstm_layer = LSTM(LATENT_VECTOR_SIZE, return_sequences=True, return_state=True, name='dec_joint')
joint_decoder, _, _ = decoder_lstm_layer(dec_concat, initial_state=encoder_states) # Joint
note_output = Dense(NUM_NOTES, activation='softmax', name='dec_note_out')
onset_output = Dense(NUM_DURATIONS+1, activation='softmax', name='dec_onset_out')
duration_output = Dense(NUM_DURATIONS, activation='softmax', name='dec_duration_out')
# Define training model
model = Model(encoder_input+decoder_input,
              [onset_output(joint_decoder),
               duration_output(joint_decoder),
               note_output(joint_decoder)])

In [10]:
########################################
######### Inference Encoder ############
########################################
# define inference encoder
encoder_model = Model(encoder_input, encoder_states)

In [11]:
###########################################
############ Inference Decoder ############
###########################################
# define inference decoder
decoder_state_input_h = Input(shape=(LATENT_VECTOR_SIZE,), name='dec_h')
decoder_state_input_c = Input(shape=(LATENT_VECTOR_SIZE,), name='dec_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm_layer(dec_concat, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

# Define output model
decoder_model = Model(decoder_input+decoder_states_inputs,
                      [onset_output(decoder_outputs),
                       duration_output(decoder_outputs),
                       note_output(decoder_outputs)]+decoder_states)


In [12]:
plot_model(encoder_model, to_file='enc_model.png')
plot_model(model, to_file='model.png')
plot_model(decoder_model, to_file='dec_model.png')

## Define Loss

In [13]:
def loss_across_categories(y_true, y_pred):
    return K.categorical_crossentropy(y_true[:, 0], y_pred[:, 0]) + \
           K.categorical_crossentropy(y_true[:, 1], y_pred[:, 1]) + \
           3*K.categorical_crossentropy(y_true[:, 2], y_pred[:, 2]) # Extra weight to notes

In [14]:
### Compile models!
opt = RMSprop(decay=0.0001)
encoder_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])
decoder_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])

## Train!

In [15]:
EPOCHS = 2000
batch_size = 10
steps_per_epoch = 100

In [16]:
### Callbacks
tb = TensorBoard(log_dir='./exp4', histogram_freq=0,  
          write_graph=True, write_images=True)

In [None]:
model.fit_generator(get_batch(midi_ds, batch_size),
                    epochs=EPOCHS,
                    steps_per_epoch=steps_per_epoch,
                    callbacks=[tb])


Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
 21/100 [=====>........................] - ETA: 17s - loss: 5.9828 - dec_onset_out_loss: 1.3217 - dec_duration_out_loss: 0.6700 - dec_note_out_loss: 3.9910 - dec_onset_out_acc: 0.6219 - dec_duration_out_acc: 0.8390 - dec_note_out_acc: 0.0795