# Task

Inputs: The power spectrum of an audio clip at a particular time.
The size of window is determined by the number of samples inside a quanta.

Outputs: An 88-dimensional vector, containing probabilities of whether or not a particular note was played.

In [1]:
import tensorflow as tf 
from tensorflow.keras import layers, models, metrics

import numpy as np 
import scipy as sp 
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, "../")

import midiio
import frequency_analysis as frqa

## Preprocessing step

In [3]:
MIDI_START = 20
MIDI_END = 127

FREQUENCY_COUNT = 257

In [4]:
# Let us define our input and output data
input_columns = ["f" + str(i) for i in range(0, FREQUENCY_COUNT)]
label_columns = [i for i in range(MIDI_START, MIDI_END+1)]

In [5]:
def get_data(wav_path, csv_path):
    # Open the midifile to see what is inside. Also open the accopmanying labels.
    data, sample_rate = midiio.read_file(wav_path)

    music_dataframe = pd.read_csv(csv_path)

    music_dataframe
    WINDOW_SAMPLES = sample_rate * 0.05

    # For the model, construct the training data as follows:
    music_dataframe_copy = music_dataframe.copy()
    training_data = pd.DataFrame(columns = ['start_time'] + input_columns + label_columns)

    # Create a linear series where points are WINDOW_SAMPLES apart from each other. 
    serialized_data = []
    xs = np.linspace(0, len(data), int (len(data) / WINDOW_SAMPLES), endpoint=False)

    # Iterate over the music dataframe. Construct a one hot encoded vector for this particular time based on the note value
    # At the given time.
    i = 0

    for x in xs:
        notes_on = music_dataframe_copy.query("start_time <= " + str(int(x))).query("end_time >= " + str(int(x)))
        music_dataframe_copy.drop(notes_on.index, axis='index', inplace=True)
        
        note_vec = [0 for i in range(MIDI_START, MIDI_END + 1)]
        if len(notes_on) != 0:
            for n in notes_on['note']:
                note_vec[n - MIDI_START] = 1
        
        
        f, power = frqa.get_frequencies(data[int(x) : int(x + WINDOW_SAMPLES)], sample_rate)
        training_data.loc[len(training_data.index)] = [x, *note_vec, *np.abs(power)]

    return training_data

In [6]:
training_data = get_data("../data/musicnet/1727.wav", "../data/musicnet/1727.csv")
training_data

Unnamed: 0,start_time,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,118,119,120,121,122,123,124,125,126,127
0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.062404e-15,3.430190e-15,4.673528e-15,5.891381e-15,5.858527e-15,4.660433e-15,3.481408e-15,3.053658e-15,3.204686e-15,1.679974e-15
1,2.205047e+03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.011342e-14,1.171406e-14,1.312596e-14,1.258901e-14,9.964661e-15,8.648433e-15,1.016250e-14,1.066891e-14,8.073073e-15,3.101940e-15
2,4.410095e+03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.789673e-15,6.257600e-15,5.665051e-15,5.725820e-15,5.550485e-15,5.254208e-15,5.236469e-15,5.166796e-15,4.378481e-15,1.875576e-15
3,6.615142e+03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.709834e-15,9.390881e-15,8.752375e-15,8.979006e-15,8.766644e-15,7.252050e-15,5.618708e-15,4.705924e-15,4.384813e-15,2.169624e-15
4,8.820189e+03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.340424e-15,5.782643e-15,7.306006e-15,9.065092e-15,9.491797e-15,8.408622e-15,7.915499e-15,9.190990e-15,1.064468e-14,5.548123e-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8936,1.970430e+07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.186895e-14,1.498090e-14,1.442782e-14,1.157023e-14,1.027975e-14,1.136856e-14,1.275910e-14,1.295489e-14,1.223149e-14,5.885627e-15
8937,1.970651e+07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.631299e-14,1.595025e-14,1.540118e-14,1.425347e-14,1.321235e-14,1.198773e-14,1.053063e-14,9.884222e-15,9.968695e-15,5.025379e-15
8938,1.970871e+07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.016615e-14,9.817810e-15,9.414397e-15,1.076868e-14,1.251101e-14,1.265427e-14,1.213434e-14,1.157699e-14,1.006929e-14,4.495790e-15
8939,1.971092e+07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.925631e-15,1.179639e-14,1.286866e-14,1.003691e-14,6.503629e-15,5.479009e-15,6.987898e-15,9.172150e-15,1.074676e-14,5.659694e-15


# Model Creation

In [None]:
def jaccard_distance(y_true, y_pred, smooth=100):
    """ Calculates mean of Jaccard distance as a loss function """
    intersection = tf.reduce_sum(y_true * y_pred)
    sum_ = tf.reduce_sum(y_true + y_pred)
    jac = (intersection + smooth) / (sum_ - intersection + smooth)
    jd =  (1 - jac) * smooth
    return tf.reduce_mean(jd)

In [72]:
# We create a tensorflow model as follows

note_classifier = models.Sequential()

note_classifier.add(layers.Dense(200, activation=tf.nn.elu, input_shape=(257,)))
note_classifier.add(layers.Dense(108, activation=tf.nn.elu))
note_classifier.add(layers.Lambda(lambda x: tf.abs(x)))


note_classifier.compile(optimizer="Adagrad", loss=jaccard_distance)

In [73]:
inputs = training_data[input_columns]
labels = training_data[label_columns]

In [74]:
note_classifier.fit(inputs, labels, batch_size=64, epochs=256)

Epoch 1/256
Epoch 2/256
Epoch 3/256
Epoch 4/256
Epoch 5/256
Epoch 6/256
Epoch 7/256
Epoch 8/256
Epoch 9/256
Epoch 10/256
Epoch 11/256
Epoch 12/256
Epoch 13/256
Epoch 14/256
Epoch 15/256
Epoch 16/256
Epoch 17/256
Epoch 18/256
Epoch 19/256
Epoch 20/256
Epoch 21/256
Epoch 22/256
Epoch 23/256
Epoch 24/256
Epoch 25/256
Epoch 26/256
Epoch 27/256
Epoch 28/256
Epoch 29/256
Epoch 30/256
Epoch 31/256
Epoch 32/256
Epoch 33/256
Epoch 34/256
Epoch 35/256
Epoch 36/256
Epoch 37/256
Epoch 38/256
Epoch 39/256
Epoch 40/256
Epoch 41/256
Epoch 42/256
Epoch 43/256
Epoch 44/256
Epoch 45/256
Epoch 46/256
Epoch 47/256
Epoch 48/256
Epoch 49/256
Epoch 50/256
Epoch 51/256
Epoch 52/256
Epoch 53/256
Epoch 54/256
Epoch 55/256
Epoch 56/256
Epoch 57/256
Epoch 58/256
Epoch 59/256
Epoch 60/256
Epoch 61/256
Epoch 62/256
Epoch 63/256
Epoch 64/256
Epoch 65/256
Epoch 66/256
Epoch 67/256
Epoch 68/256
Epoch 69/256
Epoch 70/256
Epoch 71/256
Epoch 72/256
Epoch 73/256
Epoch 74/256
Epoch 75/256
Epoch 76/256
Epoch 77/256
Epoch 78

<keras.callbacks.History at 0x1fbb1be6220>

# Model Evaluation

In [75]:
test_data = get_data('../data/musicnet/1759.wav', '../data/musicnet/1759.csv')
test_data

Unnamed: 0,start_time,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,118,119,120,121,122,123,124,125,126,127
0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.205164e+03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.410328e+03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.615492e+03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.820657e+03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3888,8.573678e+06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3889,8.575883e+06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3890,8.578089e+06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3891,8.580294e+06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
test_input = test_data[input_columns]
test_labels = test_data[label_columns]
result = note_classifier.evaluate(test_input, test_labels, batch_size=64)
result



2.596452236175537

In [77]:
# Save the model

note_classifier.save("note_classifier.h5", )

# Unsupervised Task

In [78]:
# Load a model
classifier_model = models.load_model("note_classifier.h5", custom_objects={
    "jaccard_distance": jaccard_distance
})

In [79]:
# Load the file
data, sample_rate = midiio.read_file("../data/piano.mp3")
WINDOW_SAMPLES = sample_rate * 0.05

data = data[0]
xs = np.linspace(0, len(data), int (len(data) / WINDOW_SAMPLES), endpoint=False)
inputs = []
for x in xs:
    f, power = frqa.get_frequencies(data[int(x) : int(x + WINDOW_SAMPLES)], sample_rate)
    inputs.append(np.abs(power))

inputs = np.array(inputs)
inputs.shape

  return f(*args, **kwargs)


(4706, 257)

In [80]:
classifier_model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 200)               51600     
_________________________________________________________________
dense_13 (Dense)             (None, 108)               21708     
_________________________________________________________________
lambda_1 (Lambda)            (None, 108)               0         
Total params: 73,308
Trainable params: 73,308
Non-trainable params: 0
_________________________________________________________________


In [92]:
# Use the model to identify the notes

label_vector = classifier_model.predict(inputs)

In [99]:
from midi.midi_representation import *

In [97]:
# Make the midi file :))

comp : Composition = Composition("Worse Apple")
comp.add_track(1, 1000)
track : Track= comp.tracks[0]

track.add_tempo_event(0, 130)
i = 0

virtual_keyboard = [0 for i in range(MIDI_START, MIDI_END + 1)]
for x in xs:
    note_vec = label_vector[i]
    note_vec[note_vec > 0.8] = 1
    note_vec[note_vec <= 0.8] = 0

    temp_keyboard = virtual_keyboard and note_vec

    for y in range(MIDI_START, MIDI_END + 1):
        if temp_keyboard[y - MIDI_START] == 1 and virtual_keyboard[y - MIDI_START] != 1:
            track.add_note_on_event(int(x), 1, y, 120)
        else: 
            track.add_note_off_event(int(x), 1, y)
    i = i + 1

In [103]:
comp.finalize()

ValueError: invalid literal for int() with base 10: '0.0'