<a href="https://colab.research.google.com/github/eswens13/style_transfer/blob/dev/erik/Keras_Style_Transfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup


## Imports and Installs

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Installing packages and cloning git repos
!pip install music21
!pip install h5py
!git clone https://github.com/Skuldur/Classical-Piano-Composer.git

fatal: destination path 'Classical-Piano-Composer' already exists and is not an empty directory.


In [3]:
# imports
import keras
from keras import backend as K
import tensorflow as tf
import music21
import h5py
import os
import numpy as np

Using TensorFlow backend.


In [0]:
os.chdir('Classical-Piano-Composer')

# Define Loss Function

In [0]:
def content_loss(input_sequence, output_sequence):
  cl = tf.linalg.norm(input_sequence - output_sequence)
  return cl


def style_loss(trained_model, new_model):
  num_layers = len(trained_model.layers)
  loss = tf.constant(0.0, dtype=tf.float32)
  
  # Loop through weights
  for i in range(num_layers):
    trained_layer = trained_model.layers[i]
    new_layer = new_model.layers[i]
    # Check if layer is LSTM or Dense
    if not (str(type(trained_layer) == "<class 'keras.layers.recurrent.LSTM'>")) \
      and not (str(type(trained_layer) == "<class 'keras.layers.core.Dense'>")):
      continue
  
    trained_weights = trained_layer.weights
    new_weights = new_layer.weights 
    for j in range(len(trained_layer.weights)):
      # WTF??  Why does tf.linalg.norm(trained_weights[j] - new_weights[j]) work?
      # We think that, because some of these differences are very small, the
      # square root has a very hard time with the small input and results in NaN.
      loss += tf.reduce_sum(tf.square(trained_weights[j] - new_weights[j]))
      
  return loss


def total_loss(input_sequence, output_sequence, trained_model, new_model, alpha, beta):
  return tf.cast(alpha*content_loss(input_sequence, output_sequence), tf.float64) \
          + tf.cast(beta*style_loss(trained_model, new_model), tf.float64)

In [0]:
#style_loss(model, model)

In [0]:
#input_sequence = tf.zeros(shape=[2,2])
#output_sequence = tf.zeros(shape=[2,2])

#with tf.Session() as sess:
#  init = tf.global_variables_initializer()
#  sess.run(init)
#  x = sess.run(content_loss(input_sequence, output_sequence))
#  y = sess.run(total_loss(input_sequence, output_sequence, model, model, 1, 1))
#  print(y)"""

# Generate a Sequence

In [0]:
from lstm import get_notes, prepare_sequences

In [0]:
""" This module generates notes for a midi file using the
  trained neural network """
import pickle
import numpy
import keras
from music21 import instrument, note, stream, chord
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Activation

def generate(idx, notes):
  """ Generate a piano midi file """
  #load the notes used to train the model
  #notes = get_notes()

  # Get all pitch names
  pitchnames = sorted(set(item for item in notes))
  # Get all pitch names
  n_vocab = len(set(notes))

  network_input, normalized_input = prepare_sequences(notes, pitchnames, n_vocab)
  model = create_network(normalized_input, n_vocab)
  prediction_output, pattern = generate_notes(model, network_input, idx, pitchnames, n_vocab)
  create_midi(prediction_output)
  
  return pattern


def prepare_sequences(notes, pitchnames, n_vocab):
  """ Prepare the sequences used by the Neural Network """
  # map between notes and integers and back
  note_to_int = dict((note, number) for number, note in enumerate(pitchnames))

  sequence_length = 100
  network_input = []
  output = []
  for i in range(0, len(notes) - sequence_length, 1):
      sequence_in = notes[i:i + sequence_length]
      sequence_out = notes[i + sequence_length]
      network_input.append([note_to_int[char] for char in sequence_in])
      output.append(note_to_int[sequence_out])

  n_patterns = len(network_input)

  # reshape the input into a format compatible with LSTM layers
  normalized_input = numpy.reshape(network_input, (n_patterns, sequence_length, 1))
  # normalize input
  normalized_input = normalized_input / float(n_vocab)

  return (network_input, normalized_input)

def create_network(network_input, n_vocab):
  """ create the structure of the neural network """
  model = Sequential()
  model.add(LSTM(
      512,
      input_shape=(network_input.shape[1], network_input.shape[2]),
      return_sequences=True
  ))
  model.add(Dropout(0.3))
  model.add(LSTM(512, return_sequences=True))
  model.add(Dropout(0.3))
  model.add(LSTM(512))
  model.add(Dense(256))
  model.add(Dropout(0.3))
  model.add(Dense(n_vocab+1))
  model.add(Activation('softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

  # Load the weights to each node
  model.load_weights('new_weights.hdf5')

  return model

def generate_notes(model, network_input, idx, pitchnames, n_vocab):
  """ Generate notes from the neural network based on a sequence of notes """
  # pick a random sequence from the input as a starting point for the prediction
  #start = numpy.random.randint(0, len(network_input)-1)
  start = idx

  int_to_note = dict((number, note) for number, note in enumerate(pitchnames))

  pattern = network_input[start]
  prediction_output = []

  # generate 500 notes
  #for note_index in range(500):
  for note_index in range(100):

      prediction_input = numpy.reshape(pattern, (1, len(pattern), 1))
      prediction_input = prediction_input / float(n_vocab)

      prediction = model.predict(prediction_input, verbose=0)

      index = numpy.argmax(prediction)
      result = int_to_note[index]
      prediction_output.append(result)

      pattern.append(index)
      pattern = pattern[1:len(pattern)]

  return prediction_output, pattern

def create_midi(prediction_output):
  """ convert the output from the prediction to notes and create a midi file
      from the notes """
  offset = 0
  output_notes = []

  # create note and chord objects based on the values generated by the model
  for pattern in prediction_output:
      # pattern is a chord
      if ('.' in pattern) or pattern.isdigit():
          notes_in_chord = pattern.split('.')
          notes = []
          for current_note in notes_in_chord:
              new_note = note.Note(int(current_note))
              new_note.storedInstrument = instrument.Piano()
              notes.append(new_note)
          new_chord = chord.Chord(notes)
          new_chord.offset = offset
          output_notes.append(new_chord)
      # pattern is a note
      else:
          new_note = note.Note(pattern)
          new_note.offset = offset
          new_note.storedInstrument = instrument.Piano()
          output_notes.append(new_note)

      # increase offset each iteration so that notes do not stack
      offset += 0.5

  midi_stream = stream.Stream(output_notes)

  midi_stream.write('midi', fp='test_output.mid')
  
  
def generate_notes_from_melody(normalized_meloday_seq, network_input, idx, pitchnames, n_vocab):
  """ Generate notes from the neural network based on a sequence of notes """
  # pick a random sequence from the input as a starting point for the prediction
  #start = numpy.random.randint(0, len(network_input)-1)
  start = idx

  int_to_note = dict((number, note) for number, note in enumerate(pitchnames))

  #pattern = network_input[start]
  pattern = normalized_meloday_seq
  prediction_output = []

  # generate 500 notes
  #for note_index in range(500):
  for note_index in range(100):

      prediction_input = numpy.reshape(pattern, (1, len(pattern), 1))
      prediction_input = prediction_input / float(n_vocab)

      prediction = model.predict(prediction_input, verbose=0)

      index = numpy.argmax(prediction)
      result = int_to_note[index]
      prediction_output.append(result)

      pattern = np.concatenate((pattern, np.array([index])))
      pattern = pattern[1:len(pattern)]

  return prediction_output, pattern


def generate_from_melody(normalized_melody_seq, notes):
  """ Generate a piano midi file """
  #load the notes used to train the model
  #notes = get_notes()

  # Get all pitch names
  pitchnames = sorted(set(item for item in notes))
  # Get all pitch names
  n_vocab = len(set(notes))

  network_input, normalized_input = prepare_sequences(notes, pitchnames, n_vocab)
  model = create_network(normalized_input, n_vocab)
  prediction_output, pattern = generate_notes_from_melody(normalized_melody_seq, network_input, 0, pitchnames, n_vocab)
  create_midi(prediction_output)
  
  return pattern

In [10]:
notes = get_notes()

Parsing midi_songs/AT.mid
Parsing midi_songs/VincentPiano.mid
Parsing midi_songs/Gold_Silver_Rival_Battle.mid
Parsing midi_songs/ff4-town.mid
Parsing midi_songs/Kingdom_Hearts_Traverse_Town.mid
Parsing midi_songs/Finalfantasy6fanfarecomplete.mid
Parsing midi_songs/ff7themep.mid
Parsing midi_songs/Final_Fantasy_7_-_Judgement_Day_Piano.mid
Parsing midi_songs/electric_de_chocobo.mid
Parsing midi_songs/JENOVA.mid
Parsing midi_songs/dontbeafraid.mid
Parsing midi_songs/FFVII_BATTLE.mid
Parsing midi_songs/EyesOnMePiano.mid
Parsing midi_songs/sera_.mid
Parsing midi_songs/ff4pclov.mid
Parsing midi_songs/mining.mid
Parsing midi_songs/Kingdom_Hearts_Dearly_Beloved.mid
Parsing midi_songs/FF4.mid
Parsing midi_songs/traitor.mid
Parsing midi_songs/BlueStone_LastDungeon.mid
Parsing midi_songs/pkelite4.mid
Parsing midi_songs/Fiend_Battle_(Piano).mid
Parsing midi_songs/Rachel_Piano_tempofix.mid
Parsing midi_songs/sandy.mid
Parsing midi_songs/dayafter.mid
Parsing midi_songs/DOS.mid
Parsing midi_songs/Fie

In [0]:
pitchnames = sorted(set(item for item in notes))
n_vocab = len(set(notes))
input_sequences, output_sequences = prepare_sequences(notes, pitchnames, n_vocab)

In [0]:
preds = np.array(generate(0, notes))

In [13]:
np.array(preds) / n_vocab

array([0.96089385, 0.79608939, 0.85195531, 0.96089385, 0.79608939,
       0.85195531, 0.99441341, 0.79608939, 0.85195531, 0.96089385,
       0.79608939, 0.85195531, 0.96089385, 0.79608939, 0.85195531,
       0.96648045, 0.88826816, 0.83240223, 0.79608939, 0.96089385,
       0.79608939, 0.83240223, 0.96089385, 0.79608939, 0.83240223,
       0.99441341, 0.79608939, 0.83240223, 0.96089385, 0.79608939,
       0.83240223, 0.96089385, 0.79608939, 0.83240223, 0.96648045,
       0.88826816, 0.81284916, 0.79608939, 0.99441341, 0.79608939,
       0.83240223, 0.96089385, 0.79608939, 0.81284916, 0.99441341,
       0.79608939, 0.83240223, 0.99441341, 0.79608939, 0.83240223,
       0.96089385, 0.79608939, 0.83240223, 0.96648045, 0.88826816,
       0.81284916, 0.79608939, 0.99441341, 0.79608939, 0.83240223,
       0.96089385, 0.79608939, 0.81284916, 0.99441341, 0.79608939,
       0.83240223, 0.99441341, 0.79608939, 0.83240223, 0.96089385,
       0.79608939, 0.83240223, 0.96648045, 0.88826816, 0.81284

# Define Training Loop

In [0]:
def transfer_style(input_melody, trained_model, prog_model, hparams):
  
  # Parse the dictionary of hyperparameters.
  learning_rate = None
  alpha = None
  beta = None
  epse = None
  if not hparams['learning_rate']:
    learning_rate = 0.001
  else:
    learning_rate = hparams['learning_rate']
  if not hparams['alpha']:
    alpha = 1.0
  else:
    alpha = hparams['alpha']
  if not hparams['beta']:
    beta = 1.0
  else:
    beta = hparams['beta']
  if not hparams['eps']:
    eps = 0.1
  else:
    eps = hparams['eps']
  
  # Make sure we're not backpropagating to the pre-trained model layers.
  for layer in trained_model.layers:
    layer.trainable = False
    
  vars_to_update = []
      
  # Define the training loop.
  last_tot_loss = 999999999999.99
  output_sequence = None
  with tf.Session() as sess:
    
     # Assume that the prog_model has randomly initialized all variables in LSTM
    # and Dense layers.
    var_init = tf.global_variables_initializer()
    sess.run(var_init)
    
    for i in range(len(prog_model.layers)):
      layer = prog_model.layers[i]
      layer_name = layer.name
      
      # Check if layer is LSTM or Dense
      # TODO: Do we really want to update the Dense layers here?
      if not (str(type(layer) == "<class 'keras.layers.recurrent.LSTM'>")) and \
          not (str(type(layer) == "<class 'keras.layers.core.Dense'>")):
        continue        
      
      vars_to_update += \
        [tf_var for tf_var in tf.trainable_variables() if layer_name in tf_var.name]
    
    # Pass input seq and output seq and two different models to total_loss.
    output_sequence = generate_from_melody(input_melody, notes)
    output_sequence /= n_vocab
    cont_loss = content_loss(input_melody, output_sequence)
    sty_loss = style_loss(trained_model, prog_model)
    tot_loss = total_loss(input_melody, output_sequence, trained_model, prog_model, alpha, beta)
    
    # Update the weights in each layer of prog_model based on the total loss.
    update_op = tf.train.RMSPropOptimizer(learning_rate, name="ERIK_RMS_PROP") \
      .minimize(tot_loss, var_list=vars_to_update)
    
    # Assume that the prog_model has randomly initialized all variables in LSTM
    # and Dense layers.
    var_init = tf.global_variables_initializer()
    sess.run(var_init)
    
    satisfied = False
    counter = 1
    while counter <= 20:
      print("Epoch {}:".format(counter))
      print("\tWe are not satisfied!")
      # Generate an output sequence from the input sequence.
      if counter != 1:
        output_sequence = generate_from_melody(input_melody, notes)
        output_sequence /= n_vocab
      tl = sess.run(tot_loss)
      print("\tLoss: {}".format(tl))
      tot_loss = total_loss(input_melody, output_sequence, trained_model, prog_model, alpha, beta)
      sess.run(update_op)
      
      # Decide when we are satisfied.
      loss_diff = sess.run(abs(last_tot_loss - tot_loss))
      last_tot_loss = tl
      satisfied = (loss_diff < eps)
      counter += 1

  return output_sequence
        

# Generate Input Melodies

In [0]:
# Parse a MIDI file and extrac the notes.
twinkle_stream = music21.converter.parse("/content/drive/My Drive/deep_learning/midis/melodies/twinkle_twinkle.mid")
only_part = twinkle_stream.parts[0]
melody_notes = []
for the_note in only_part:
  if isinstance(the_note, music21.note.Note):
    melody_notes.append(the_note)

# Convert the notes in the melody to our vocab.
melody = []
for the_note in melody_notes:
  note_str = the_note.step + str(the_note.octave)
  index = pitchnames.index(note_str)
  melody.append(index)

# Sample Training

In [16]:
melody = np.array(melody) / n_vocab
curr_ind = 0
while len(melody) < 100:
  melody = np.concatenate((melody, [melody[curr_ind]]))
  curr_ind += 1

print("Twinkle Length: {}".format(len(melody)))
print(melody)

Twinkle Length: 100
[0.86871508 0.86871508 0.99162011 0.99162011 0.79608939 0.79608939
 0.99162011 0.95810056 0.95810056 0.92458101 0.92458101 0.88826816
 0.88826816 0.86871508 0.99162011 0.99162011 0.95810056 0.95810056
 0.92458101 0.92458101 0.88826816 0.99162011 0.99162011 0.95810056
 0.95810056 0.92458101 0.92458101 0.88826816 0.86871508 0.86871508
 0.99162011 0.99162011 0.79608939 0.79608939 0.99162011 0.95810056
 0.95810056 0.92458101 0.92458101 0.88826816 0.88826816 0.86871508
 0.86871508 0.86871508 0.99162011 0.99162011 0.79608939 0.79608939
 0.99162011 0.95810056 0.95810056 0.92458101 0.92458101 0.88826816
 0.88826816 0.86871508 0.99162011 0.99162011 0.95810056 0.95810056
 0.92458101 0.92458101 0.88826816 0.99162011 0.99162011 0.95810056
 0.95810056 0.92458101 0.92458101 0.88826816 0.86871508 0.86871508
 0.99162011 0.99162011 0.79608939 0.79608939 0.99162011 0.95810056
 0.95810056 0.92458101 0.92458101 0.88826816 0.88826816 0.86871508
 0.86871508 0.86871508 0.99162011 0.991620

In [0]:
# Load pre-trained model
from keras.layers import LSTM, Dropout, Dense, Activation
from keras import Sequential
from keras import backend as K

K.clear_session()
tf.reset_default_graph()

model = Sequential()
model.add(LSTM(512, input_shape=(100, 1), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(512))
model.add(Dense(256))
model.add(Dropout(0.3))
model.add(Dense(359))
model.add(Activation('softmax'))
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
# Load the weights to each node
model.load_weights('new_weights.hdf5')

# Make a copy of the trained model that we'll modify for style transfer purposes.
hparams = {}
hparams['learning_rate'] = 1e-3
hparams['alpha'] = 5e-2   # Weight of content loss
hparams['beta'] = 5e-4    # Weight of style loss
hparams['eps'] = 5.0      # Delta loss stopping criteria

transfer_model = Sequential()
transfer_model.add(LSTM(512, input_shape=(100, 1), return_sequences=True))
transfer_model.add(Dropout(0.3))
transfer_model.add(LSTM(512, return_sequences=True))
transfer_model.add(Dropout(0.3))
transfer_model.add(LSTM(512))
transfer_model.add(Dense(256))
transfer_model.add(Dropout(0.3))
transfer_model.add(Dense(359))
transfer_model.add(Activation('softmax'))

In [56]:
out_song = transfer_style(melody, model, transfer_model, hparams)

Epoch 1:
	We are not satisfied!
	Loss: 4.2150325444789285
Epoch 2:
	We are not satisfied!
	Loss: 3.9830627866545614
Epoch 3:
	We are not satisfied!
	Loss: 3.9830542035857137
Epoch 4:
	We are not satisfied!
	Loss: 3.983045143679708
Epoch 5:
	We are not satisfied!
	Loss: 3.983035845355123
Epoch 6:
	We are not satisfied!
	Loss: 3.983026308611959
Epoch 7:
	We are not satisfied!
	Loss: 3.98301534135732
Epoch 8:
	We are not satisfied!
	Loss: 3.9830043741026815
Epoch 9:
	We are not satisfied!
	Loss: 3.9829929300108846
Epoch 10:
	We are not satisfied!
	Loss: 3.9829807706633504
Epoch 11:
	We are not satisfied!
	Loss: 3.982967896060079
Epoch 12:
	We are not satisfied!
	Loss: 3.982953829363912
Epoch 13:
	We are not satisfied!
	Loss: 3.982939762667745
Epoch 14:
	We are not satisfied!
	Loss: 3.9829247422972616
Epoch 15:
	We are not satisfied!
	Loss: 3.9829085298338827
Epoch 16:
	We are not satisfied!
	Loss: 3.9828918405333456
Epoch 17:
	We are not satisfied!
	Loss: 3.982873959139913
Epoch 18:
	We a

# Future Work

One of the things we didn't have much time for was hyperparameter optimization.  Since the training loop takes a LONG time to run, it was infeasible for us to fine tune the learning rate and the weights for style and content loss.