## Link with drive

In [1]:
from google.colab import drive

drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
%pwd

'/content'

## Setup

In [3]:
!sudo apt install -y fluidsynth

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  fluid-soundfont-gm libfluidsynth2 libinstpatch-1.0-2 qsynth
  timgm6mb-soundfont
Suggested packages:
  fluid-soundfont-gs timidity jackd musescore
The following NEW packages will be installed:
  fluid-soundfont-gm fluidsynth libfluidsynth2 libinstpatch-1.0-2 qsynth
  timgm6mb-soundfont
0 upgraded, 6 newly installed, 0 to remove and 24 not upgraded.
Need to get 126 MB of archives.
After this operation, 157 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 fluid-soundfont-gm all 3.1-5.1 [119 MB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 libinstpatch-1.0-2 amd64 1.1.2-2build1 [238 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/universe amd64 timgm6mb-soundfont all 1.3-3 [5,420 kB]
Get:4 http://archive.ubuntu.com/ubuntu focal/universe amd64 libfluidsynth2 amd64 2.1.1-2 [

In [4]:
!pip install --upgrade pyfluidsynth

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyfluidsynth
  Downloading pyFluidSynth-1.3.2-py3-none-any.whl (19 kB)
Installing collected packages: pyfluidsynth
Successfully installed pyfluidsynth-1.3.2


In [5]:
!pip install pretty_midi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16
  Downloading mido-1.2.10-py2.py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty_midi: filename=pretty_midi-0.2.10-py3-none-any.whl size=5592303 sha256=d50474b15577469f4daf06f4e2008ea25cb2cbf172b7d63f008cd18d0f04da0d
  Stored in directory: /root/.cache/pip/wheels/75/ec/20/b8e937a5bcf1de547ea5ce465db7de7f6761e15e6f0a01e25f
Successfully built pretty_midi
Installing collected packages: mid

## Imports

In [38]:
import collections
import datetime
import fluidsynth
import glob
import numpy as np
import pathlib
import pandas as pd
import pretty_midi
import seaborn as sns
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import RNN
from tensorflow.keras import backend
from sklearn.model_selection import train_test_split

from IPython import display
from matplotlib import pyplot as plt
from typing import Optional

In [7]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

_SAMPLING_RATE = 16000

## Download the Maestro dataset

In [8]:
data_dir = pathlib.Path('/content/gdrive/MyDrive/Deep Learning/Project/dataset')

In [9]:
filenames = glob.glob(str(data_dir/'**/*.midi'))
print('Number of files:', len(filenames))

Number of files: 1276


## Testing the dataset

In [10]:
sample_file = filenames[874]
print(sample_file)

/content/gdrive/MyDrive/Deep Learning/Project/dataset/2009/MIDI-Unprocessed_11_R1_2009_06-09_ORIG_MID--AUDIO_11_R1_2009_11_R1_2009_06_WAV.midi


In [11]:
pm = pretty_midi.PrettyMIDI(sample_file)

In [12]:
def display_audio(pm: pretty_midi.PrettyMIDI, seconds=30):
  waveform = pm.fluidsynth(fs=_SAMPLING_RATE)
  waveform_short = waveform[:seconds*_SAMPLING_RATE]
  return display.Audio(waveform_short, rate=_SAMPLING_RATE)

In [13]:
display_audio(pm)

In [14]:
print('Number of instruments:', len(pm.instruments))
instrument = pm.instruments[0]
instrument_name = pretty_midi.program_to_instrument_name(instrument.program)
print('Instrument name:', instrument_name)

Number of instruments: 1
Instrument name: Acoustic Grand Piano


## Extract notes to be passed to model

In [15]:
for i, note in enumerate(instrument.notes[:10]):
  note_name = pretty_midi.note_number_to_name(note.pitch)
  duration = note.end - note.start
  print(f'{i}: pitch={note.pitch}, note_name={note_name},'
        f' duration={duration:.4f}')

0: pitch=75, note_name=D#5, duration=0.1120
1: pitch=63, note_name=D#4, duration=0.3516
2: pitch=72, note_name=C5, duration=0.1029
3: pitch=60, note_name=C4, duration=0.0924
4: pitch=52, note_name=E3, duration=0.1107
5: pitch=68, note_name=G#4, duration=1.2057
6: pitch=80, note_name=G#5, duration=0.2253
7: pitch=72, note_name=C5, duration=0.2930
8: pitch=75, note_name=D#5, duration=0.0469
9: pitch=68, note_name=G#4, duration=0.0638


In [16]:
def midi_to_notes(midi_file: str) -> pd.DataFrame:
  pm = pretty_midi.PrettyMIDI(midi_file)
  instrument = pm.instruments[0]
  notes = collections.defaultdict(list)

  # Sort the notes by start time
  sorted_notes = sorted(instrument.notes, key=lambda note: note.start)
  prev_start = sorted_notes[0].start

  for note in sorted_notes:
    start = note.start
    end = note.end
    notes['pitch'].append(note.pitch)
    notes['start'].append(start)
    notes['end'].append(end)
    notes['step'].append(start - prev_start)
    notes['duration'].append(end - start)
    prev_start = start

  return pd.DataFrame({name: np.array(value) for name, value in notes.items()})

In [17]:
raw_notes = midi_to_notes(sample_file)
raw_notes.head()

Unnamed: 0,pitch,start,end,step,duration
0,68,0.959635,2.165365,0.0,1.205729
1,63,1.388021,1.739583,0.428385,0.351562
2,75,1.619792,1.731771,0.231771,0.111979
3,72,1.644531,1.747396,0.02474,0.102865
4,60,1.751302,1.84375,0.106771,0.092448


In [19]:
raw_notes

Unnamed: 0,pitch,start,end,step,duration
0,68,0.959635,2.165365,0.000000,1.205729
1,63,1.388021,1.739583,0.428385,0.351562
2,75,1.619792,1.731771,0.231771,0.111979
3,72,1.644531,1.747396,0.024740,0.102865
4,60,1.751302,1.843750,0.106771,0.092448
...,...,...,...,...,...
1047,46,131.682292,131.852865,0.462240,0.170573
1048,55,131.953125,132.009115,0.270833,0.055990
1049,63,132.063802,132.209635,0.110677,0.145833
1050,70,132.243490,132.326823,0.179688,0.083333


In [20]:
get_note_names = np.vectorize(pretty_midi.note_number_to_name)
sample_note_names = get_note_names(raw_notes['pitch'])
sample_note_names[:10]

array(['G#4', 'D#4', 'D#5', 'C5', 'C4', 'E3', 'G#5', 'C5', 'D#5', 'G#4'],
      dtype='<U3')

## Create a MIDI file

In [21]:
def notes_to_midi(
  notes: pd.DataFrame,
  out_file: str, 
  instrument_name: str,
  velocity: int = 100,  # note loudness
) -> pretty_midi.PrettyMIDI:

  pm = pretty_midi.PrettyMIDI()
  instrument = pretty_midi.Instrument(
      program=pretty_midi.instrument_name_to_program(
          instrument_name))

  prev_start = 0
  for i, note in notes.iterrows():
    start = float(prev_start + note['step'])
    end = float(start + note['duration'])
    note = pretty_midi.Note(
        velocity=velocity,
        pitch=int(note['pitch']),
        start=start,
        end=end,
    )
    instrument.notes.append(note)
    prev_start = start

  pm.instruments.append(instrument)
  pm.write(out_file)
  return pm

In [22]:
example_file = 'example.midi'
example_pm = notes_to_midi(
    raw_notes, out_file=example_file, instrument_name=instrument_name)

In [23]:
display_audio(example_pm)

## Create the training dataset


In [24]:
num_files = 5
all_notes = []
for f in filenames[:num_files]:
  notes = midi_to_notes(f)
  all_notes.append(notes)

all_notes = pd.concat(all_notes)

In [25]:
n_notes = len(all_notes)
print('Number of notes parsed:', n_notes)

Number of notes parsed: 21615


In [40]:
key_order = ['pitch', 'step', 'duration']
total_notes = np.stack([all_notes[key] for key in key_order], axis=1)

In [41]:
train_notes, test_notes = train_test_split(total_notes, test_size=0.3)

In [45]:
notes_train_ds = tf.data.Dataset.from_tensor_slices(train_notes)
notes_train_ds.element_spec

TensorSpec(shape=(3,), dtype=tf.float64, name=None)

In [46]:
notes_test_ds = tf.data.Dataset.from_tensor_slices(test_notes)
notes_test_ds.element_spec

TensorSpec(shape=(3,), dtype=tf.float64, name=None)

In [47]:
def create_sequences(dataset, seq_length,vocab_size = 128,):
  seq_length = seq_length+1

  windows = dataset.window(seq_length, shift=1, stride=1,
                              drop_remainder=True)

  flatten = lambda x: x.batch(seq_length, drop_remainder=True)
  sequences = windows.flat_map(flatten)
  
  def scale_pitch(x):
    x = x/[vocab_size,1.0,1.0]
    return x

  def split_labels(sequences):
    inputs = sequences[:-1]
    labels_dense = sequences[-1]
    labels = {key:labels_dense[i] for i,key in enumerate(key_order)}

    return scale_pitch(inputs), labels

  return sequences.map(split_labels, num_parallel_calls=tf.data.AUTOTUNE)

In [48]:
seq_length = 100
vocab_size = 128
seq_train_ds = create_sequences(notes_train_ds, seq_length, vocab_size)
seq_train_ds.element_spec

(TensorSpec(shape=(100, 3), dtype=tf.float64, name=None),
 {'pitch': TensorSpec(shape=(), dtype=tf.float64, name=None),
  'step': TensorSpec(shape=(), dtype=tf.float64, name=None),
  'duration': TensorSpec(shape=(), dtype=tf.float64, name=None)})

In [49]:
seq_test_ds = create_sequences(notes_test_ds, seq_length, vocab_size)
seq_test_ds.element_spec

(TensorSpec(shape=(100, 3), dtype=tf.float64, name=None),
 {'pitch': TensorSpec(shape=(), dtype=tf.float64, name=None),
  'step': TensorSpec(shape=(), dtype=tf.float64, name=None),
  'duration': TensorSpec(shape=(), dtype=tf.float64, name=None)})

In [50]:
for seq, target in seq_train_ds.take(1):
  print('sequence shape:', seq.shape)
  print()
  print('target:', target)

sequence shape: (100, 3)

target: {'pitch': <tf.Tensor: shape=(), dtype=float64, numpy=52.0>, 'step': <tf.Tensor: shape=(), dtype=float64, numpy=0.23229166666664014>, 'duration': <tf.Tensor: shape=(), dtype=float64, numpy=2.7020833333333485>}


In [51]:
for seq, target in seq_test_ds.take(1):
  print('sequence shape:', seq.shape)
  print()
  print('target:', target)

sequence shape: (100, 3)

target: {'pitch': <tf.Tensor: shape=(), dtype=float64, numpy=64.0>, 'step': <tf.Tensor: shape=(), dtype=float64, numpy=0.07604166666666856>, 'duration': <tf.Tensor: shape=(), dtype=float64, numpy=0.0677083333333286>}


In [52]:
batch_size = 64
buffer_size = n_notes - seq_length
train_ds = (seq_train_ds
            .shuffle(buffer_size)
            .batch(batch_size, drop_remainder=True)
            .cache()
            .prefetch(tf.data.experimental.AUTOTUNE))

In [82]:
batch_size = 64
buffer_size = n_notes - seq_length
test_ds = (seq_test_ds
            .shuffle(buffer_size)
            .batch(batch_size, drop_remainder=True)
            .cache()
            .prefetch(tf.data.experimental.AUTOTUNE))

In [57]:
train_ds.element_spec

(TensorSpec(shape=(64, 100, 3), dtype=tf.float64, name=None),
 {'pitch': TensorSpec(shape=(64,), dtype=tf.float64, name=None),
  'step': TensorSpec(shape=(64,), dtype=tf.float64, name=None),
  'duration': TensorSpec(shape=(64,), dtype=tf.float64, name=None)})

In [83]:
test_ds.element_spec

(TensorSpec(shape=(64, 100, 3), dtype=tf.float64, name=None),
 {'pitch': TensorSpec(shape=(64,), dtype=tf.float64, name=None),
  'step': TensorSpec(shape=(64,), dtype=tf.float64, name=None),
  'duration': TensorSpec(shape=(64,), dtype=tf.float64, name=None)})

## Create and train the model

In [59]:
def mse_with_positive_pressure(y_true: tf.Tensor, y_pred: tf.Tensor):
  mse = (y_true - y_pred) ** 2
  positive_pressure = 10 * tf.maximum(-y_pred, 0.0)
  return tf.reduce_mean(mse + positive_pressure)

In [60]:
class MinimalRNNCell(tf.keras.layers.Layer):

    def __init__(self, units, **kwargs):
        self.units = units
        self.state_size = units
        super(MinimalRNNCell, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
                                      initializer='uniform',
                                      name='kernel')
        self.recurrent_kernel = self.add_weight(
            shape=(self.units, self.units),
            initializer='uniform',
            name='recurrent_kernel')
        self.built = True

    def call(self, inputs, states):
        prev_output = states[0]
        h = backend.dot(inputs, self.kernel)
        output = h + backend.dot(prev_output, self.recurrent_kernel)
        return output, [output]


In [75]:
input_shape = (seq_length, 3)
learning_rate = 0.005

cells = [MinimalRNNCell(32), MinimalRNNCell(64)]
inputs = keras.Input(input_shape)
RNNlayer = RNN(cells)
y = RNNlayer(inputs)

outputs = {
  'pitch': tf.keras.layers.Dense(128, name='pitch')(y),
  'step': tf.keras.layers.Dense(1, name='step')(y),
  'duration': tf.keras.layers.Dense(1, name='duration')(y),
}

model = tf.keras.Model(inputs, outputs)

loss = {
      'pitch': tf.keras.losses.SparseCategoricalCrossentropy(),
      'step': mse_with_positive_pressure,
      'duration': mse_with_positive_pressure,
}

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

model.compile(loss=loss, optimizer=optimizer)

model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 100, 3)]     0           []                               
                                                                                                  
 rnn_2 (RNN)                    (None, 64)           7264        ['input_3[0][0]']                
                                                                                                  
 duration (Dense)               (None, 1)            65          ['rnn_2[0][0]']                  
                                                                                                  
 pitch (Dense)                  (None, 128)          8320        ['rnn_2[0][0]']                  
                                                                                            

In [76]:
losses = model.evaluate(train_ds, return_dict=True)
losses



{'loss': 8.962301254272461,
 'duration_loss': 0.15756820142269135,
 'pitch_loss': 8.774677276611328,
 'step_loss': 0.030054202303290367}

In [77]:
model.compile(
    loss=loss,
    loss_weights={
        'pitch': 0.05,
        'step': 1.0,
        'duration':1.0,
    },
    optimizer=optimizer,
)

In [78]:
model.evaluate(train_ds, return_dict=True)



{'loss': 0.6263566017150879,
 'duration_loss': 0.15756820142269135,
 'pitch_loss': 8.774677276611328,
 'step_loss': 0.030054202303290367}

In [79]:
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath='/content/gdrive/MyDrive/Deep Learning/Project/training_checkpoints/ckpt_{epoch}',
        save_weights_only=True),
    tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        patience=5,
        verbose=1,
        restore_best_weights=True),
]

In [80]:
epochs = 50

history = model.fit(
    train_ds,
    epochs=epochs,
    callbacks=callbacks,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 9: early stopping


In [84]:
model.evaluate(test_ds)



[0.3817161023616791, 0.106112539768219, 4.864051818847656, 0.03240077570080757]

In [86]:
model.save('/content/gdrive/MyDrive/Deep Learning/Project/musicGenerator.h5')

## Generate notes

In [91]:
def predict_next_note(notes, keras_model, temperature = 1.0):
  """Generates a note as a tuple of (pitch, step, duration), using a trained sequence model."""

  assert temperature > 0

  # Add batch dimension
  inputs = tf.expand_dims(notes, 0)

  predictions = model.predict(inputs)
  pitch_logits = predictions['pitch']
  step = predictions['step']
  duration = predictions['duration']
 
  pitch_logits /= temperature
  pitch = tf.random.categorical(pitch_logits, num_samples=1)
  pitch = tf.squeeze(pitch, axis=-1)
  duration = tf.squeeze(duration, axis=-1)
  step = tf.squeeze(step, axis=-1)

  # `step` and `duration` values should be non-negative
  step = tf.maximum(0, step)
  duration = tf.maximum(0, duration)

  return int(pitch), float(step), float(duration)

In [92]:
print(seq_length)

100


In [106]:
temperature = 2.5
num_predictions = 100

sample_notes = np.stack([raw_notes[key] for key in key_order], axis=1)
start = 500

# The initial sequence of notes; pitch is normalized similar to training
# sequences
input_notes = (
    sample_notes[start:seq_length+start] / np.array([vocab_size, 1, 1]))

generated_notes = []
prev_start = 0
for _ in range(num_predictions):
  pitch, step, duration = predict_next_note(input_notes, model, temperature)
  start = prev_start + step
  end = start + duration
  input_note = (pitch, step, duration)
  generated_notes.append((*input_note, start, end))
  input_notes = np.delete(input_notes, 0, axis=0)
  input_notes = np.append(input_notes, np.expand_dims(input_note, 0), axis=0)
  prev_start = start

generated_notes = pd.DataFrame(
    generated_notes, columns=(*key_order, 'start', 'end'))



In [107]:
generated_notes.head(30)

Unnamed: 0,pitch,step,duration,start,end
0,36,0.102427,0.194981,0.102427,0.297408
1,101,0.142054,1.573793,0.244481,1.818274
2,17,0.0,4.427605,0.244481,4.672086
3,124,0.0,1.842692,0.244481,2.087173
4,94,0.0,5.280482,0.244481,5.524962
5,45,0.0,4.963699,0.244481,5.208179
6,91,0.0,2.933041,0.244481,3.177522
7,65,0.0,4.387147,0.244481,4.631628
8,109,0.0,3.453498,0.244481,3.697979
9,61,0.0,5.219418,0.244481,5.463899


In [108]:
out_file = '/content/gdrive/MyDrive/Deep Learning/Project/output.midi'
out_pm = notes_to_midi(
    generated_notes, out_file=out_file, instrument_name=instrument_name)
display_audio(out_pm)