# **Melody Generation from Images**

## **Importing Libraries**

In [27]:
from numpy import array
from pickle import load, dump
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import add
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from glob import glob
import os
from numpy import argmax
from music21 import converter, instrument, note, chord, stream, meter,duration, interval, pitch, tempo, midi
from tensorflow.keras.applications.inception_v3 import InceptionV3

from os import listdir
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model

**Mount Drive**

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

To include data folder into drive,
go to the below link and click on add shortcut to drive
https://drive.google.com/drive/folders/1TTg5wH78eE0yR_OHJZTJzF3wGv8t0Q1x?usp=sharing

**Declare Constants**

In [2]:
#data folder path
data_dir = r'E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\' 
#midi note and duration separator
separator = "@"
#image classifier model labels
label_VGG16 = "VGG16"
label_InceptionV3 = "InceptionV3"
#image model currently in use
current_ImgModel = label_VGG16

## **Midi Preprocessing**

Obtain all midi files from folder

In [6]:
songs = glob(data_dir + 'midi/*.MID')

### **Function Definitions**

Define function to convert list to string

In [3]:
def listToString(listObj): 
    
    # initialize an empty string
    stringText = "" 
    
    # traverse in the string  
    for element in listObj: 
        stringText = stringText + " " + element  
    
    # return string  
    return stringText 

Function to transpose a song to C maj if Major or A min if Minor

In [4]:
def transpose(song):
    #Transposes song to C maj/A min

    # get key using music21
    key = song.analyze("key")
    # get interval for transposition. E.g., Bmaj -> Cmaj
    if key.mode == "major":
        intervalSong = interval.Interval(key.tonic, pitch.Pitch("C"))
    elif key.mode == "minor":
        intervalSong = interval.Interval(key.tonic, pitch.Pitch("A"))

    # transpose song by calculated interval
    tranposed_song = song.transpose(intervalSong)
    return tranposed_song

Function to check if rests are more than one bar length

In [5]:
#function to check if rest duration is acceptables
def isRestAcceptable(rest, timeSignature):
  #check is time signature denominator is 4
  if (timeSignature.denominator == 4):
      #if denominator 4, length of one bar is 4 beats
      barDuration = timeSignature.numerator 
  #check is time signature denominator is 8 
  elif (timeSignature.denominator == 8):
      #if denominator is 8, length of one bar is numerator/2
      barDuration = timeSignature.numerator/2.0
  #check is time signature denominator is 2
  elif (timeSignature.denominator == 2):
      #if denominator is 2, length of one bar is numerator * 2
      barDuration = timeSignature.numerator * 2.0
  #return false if rest duration is greater than 2 bars
  if barDuration * 2 < rest.quarterLength:
      return False
  
  return True    

Function to get bar in which 1st note occurs

In [6]:
def getStartingBar(score):
    for i in range(51):
        bar = score.measures(i-1,i)
        barflat = bar.flat
        for event in barflat:
            if isinstance(event, note.Note):
                num = i
                break
        else:
            continue
        break;
    return num

Define function to create midi data file

In [7]:
def createDatafile():
  #create a file named midiData.txt and append
  midiData_file = open(data_dir + "midiData.txt", "w")
#iterate over each midi file
  print("Writing midi data to file...")
  songCount = 0
  for file in songs:
    print(file)
    try:
        midi = converter.parse(file) #convert to midi
    except:
        print("Error in parsing " + str(file))
        continue
    #midi = midi.stripTies()
    #tranpose song
    try:
        midi = transpose(midi)
    except:
        print("Error in obtaining key for " + str(file))
        continue
    #obtain 30 bars
    songTempo = ''
    try:
        firstbar = midi.measures(0,3)
        firstbarNotes = firstbar.flat
        for element in firstbarNotes:  
            if isinstance(element, tempo.MetronomeMark): 
                songTempo = str(element.number) + separator + "tempo"
        startBar = getStartingBar(midi)
        midi = midi.measures(startBar, startBar + 7)
    except Exception as e:
      print("Error in measures for " + str(file))
      continue
    midi = midi.flat #combine all parts and get a single notes part
    
    songCount += 1
    #initialize timeSignature, notes list and count variables
    timeSignature = ''
    notes = []
    #count = 0

    #loop over each midi event
    #midi events include timesignatures, instruments, notes, rests and chords
    for event in midi: 
      #only obtain 50 events
      # if count == 50:
      #   break;
      # count += 1 #increase count
      #check if event is a timeSignature
      if isinstance(event, meter.TimeSignature ):
        #save time signature to file as '3/4@time'
        timeSignature = event.ratioString + separator + "time" 
        timeSignatureEvent = event
      #check if event is a note
      if isinstance(event, note.Note):
        #save note to file as 'C1@0.5' where 'C1' is midi note and '0.5' is the duration
        notes.append(str(event.pitch) + separator + str(event.quarterLength))  
      #check if event is a chord
      elif(isinstance(event, chord.Chord)):
        #save chord to file as '1.8.2@0.5' where '1.8.2' are notes in the chord and '0.5' is the duration
        notes.append(('.'.join(str(n) for n in event.normalOrder))+ separator + str(event.quarterLength))
      #check if event is a rest
      elif(isinstance(event, note.Rest)):
        #check if rest duration is acceptable
        if isRestAcceptable(event, timeSignatureEvent):
          #save rest as 'r@0.5' where 'r' indicates that it is a rest and '0.5'  is the duration
          notes.append('r' + separator + str(event.quarterLength))

    #save the sequence to the file
    jpgFilenamesList = glob(data_dir + 'images/' + os.path.basename(file).split('.')[0] + '*.*')
    simImageCount = 0
    for image in jpgFilenamesList:
        simImageCount += 1
        #first write the file id which is same as the image id
        sequence = os.path.basename(image).split('.')[0] + " " + "<start> " + songTempo + " " + timeSignature + listToString(notes) + " <end>" #append <start> and <end> tokens
        #if current song is the last in the list, don't add a newline character to the end
        if((len(songs)== songs.index(file) +1) and simImageCount == len(jpgFilenamesList)):
            pass
        else:
            sequence = sequence + "\n"
        midiData_file.write(sequence)
  
  print(f"Finishes writing midi data. {songCount} songs written.")
  #close file
  midiData_file.close()

Function to load a file

In [8]:
def load_file(fileName):
  #open file in read mode
  file = open(fileName, 'r')
  #obtain content in file
  content = file.read()
  #close file
  file.close()

  return content

Function to map file content to dict, create vocabulary and obtain max length of a midi sequence

In [9]:
def dict_vocab_maxLength(content):
  #initialize dictionary
  midiDict = dict()
  vocabulary = set()
  maxLength = 0
  #loop over each line in content -> each midi song is separated by new line
  for line in content.split('\n'):
    #split line into tokens by white space
    tokens = line.split()
    #first token is image id, rest are midi events
    imageId, midi = tokens[0], tokens[1:]
    #create a list if not created
    if imageId not in midiDict:
      midiDict[imageId] = list()
    midiDict[imageId].append(midi)
    #build vocabulary
    vocabulary.update(midi)
    #store max length
    if len(tokens) > maxLength:
      maxLength = len(tokens)
  #return dict, vocabulary and maxlength
  return midiDict, vocabulary, maxLength

Function to tokenize vocabulary and return tokenizer

In [10]:
def tokenize(vocab):
  #filter 50 midi events with low frequency
  num_words= len(vocab) - 10
  #initialize tokenizer object without any filters
  tokenizer = Tokenizer(num_words=num_words, filters='')
  #generate tokens
  tokenizer.fit_on_texts(vocab)
  print("Number of tokens: " + str(len(tokenizer.word_index)))
  return tokenizer

### **Midi Preprocessing steps**

Create a text file with all midi data of all songs

In [180]:
createDatafile()

Writing midi data to file...
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00008.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00009.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00016.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00018.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00021.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00024.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00027.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00028.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00029.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00030.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\midi\00033.MID
E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\mi

Load Saved file

In [11]:
content = load_file(data_dir + "midiData.txt")

Create a Dictionary with midi data, obtain vocabulary and get the max length of a midi sequence

In [12]:
train_midi, vocabulary, maxLength = dict_vocab_maxLength(content)
#print vocabulary length
vocabSize = len(vocabulary) + 1
print("Vocabulary length = " + str(vocabSize))
#print max length of sequence
print("Maximum length sequence = " + str(maxLength))

Vocabulary length = 340
Maximum length sequence = 114


Tokenize vocabulary and save tokenizer pickle

In [183]:
tokenizer = tokenize(vocabulary)

Number of tokens: 339


In [184]:
dump(tokenizer, open(data_dir + 'tokenizer.pkl', 'wb'))

## **Image Preprocessing**

### **Function Definitions**

Function to extract image features

In [8]:
def extract_Image_features(directory, modelName = current_ImgModel):
  #check for the model needed and initialize the model and image size
  if modelName == label_VGG16:
    model = VGG16()
    imgSize = (224, 224)
  else:
    model = InceptionV3(weights='imagenet')
    imgSize = (299, 299)

  #remove the last layer of the model to obtain the features
  #VGG16 has 4096 and InceptionV3 has 2048
  model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

  #print model summary
  print(model.summary())

  #intialize dictionary to extract features
  features = dict()

  #loop over each image in directory
  for imgName in listdir(directory):
    #get image path
    path = directory + '/' + imgName
    #load image and resize
    image = load_img(path, target_size=imgSize)
    #convert image to numpy array
    image = img_to_array(image)
    #reshape image to suit model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    #preprocess image for the model
    image = preprocess_input(image)
    #extract features
    feature = model.predict(image, verbose=0)
    #get image id
    image_id = imgName.split('.')[0]
    #append to dictionary
    features[image_id] = feature

    #print image name
    print("Features extracted for " + imgName)
  
  return features

Function to load photo features from pickle file

In [13]:
def load_featuresPickle(path):
  #load all features
  features = load(open(path, 'rb'))
  return features

### Image Preprocessing Steps

In [187]:
features = extract_Image_features(data_dir + 'images')

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

Dump feature to pickle file

In [188]:
#print length of features
print("Extracted features : %d" % len(features))
# save to pickle file
dump(features, open(data_dir + 'features.pkl', 'wb'))

Extracted features : 300


# **Model Creation, Training and related Functions**

## **Function Definitions**

Function to create a deep learning model

In [14]:
def create_Model(vocabSize, maxLength):
  #image feature extractor model
  if current_ImgModel == label_VGG16:
    inputs1 = Input(shape = (4096, ))
  elif current_ImgModel == label_InceptionV3:
    inputs1 = Input(shape = (2048, ))

  fe1 = Dropout(0.5)(inputs1)
  fe2 = Dense(256, activation='relu')(fe1)

  #midi sequence model
  inputs2 = Input(shape=(maxLength,))
  #Embedding layer
  se1 = Embedding(vocabSize, 256, mask_zero=True)(inputs2)
  se2 = Dropout(0.2)(se1)
  se3 = LSTM(128, return_sequences=True)(se2)
  se4 = Dropout(0.2)(se3)
  se5 = LSTM(256)(se4)

  #decoder model
  decoder1 = add([fe2, se5])
  decoder2 = Dense(256, activation='relu')(decoder1)
  outputs = Dense(vocabSize, activation='softmax')(decoder2)

  #Merger model
  model = Model(inputs=[inputs1, inputs2], outputs=outputs)
  #compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam')
  #print model summary
  print("Model Summary")
  model.summary()
  plot_model(model, to_file=data_dir + 'model.png', show_shapes=True)
  return model

Function to generate input sequences

In [15]:
def create_sequences(tokenizer, max_length, midi_list, photo, vocab_size):
  #initialize input lists
  X1, X2, y = list(), list(), list()
  #loop through each midi song for the image
  for midi in midi_list:
    #encode the midi sequence
    seq = tokenizer.texts_to_sequences([midi])[0]
    #generate multiple X,y pairs from one midi file
    for i in range(1, len(seq)):
      #generate input and output pair
      in_seq, out_seq = seq[:i], seq[i]
      #pad input sequence
      in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
      #encode output Sequence
      out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
      #append to input lists
      X1.append(photo)
      X2.append(in_seq)
      y.append(out_seq)
  
  return array(X1), array(X2), array(y)

Function to generate data for progressive loading

In [16]:
def data_generator(midiData, photos, tokenizer, max_length, vocab_size):
  #loop forever
  while True:
    for key, midi_list in midiData.items():
      #get photo features
      photo = photos[key][0]
      in_img, in_seq, out_word = create_sequences(tokenizer, max_length, midi_list, photo, vocab_size)
      yield [in_img, in_seq], out_word

Function to map output integer to midi vocabulary

In [17]:
def get_midiString_for_Integer(integer, tokenizer):
  #loop through tokenizer to find a match
  for midiString, index in tokenizer.word_index.items():
    if index == integer:
      return midiString
  return None

Function to generate midi string for image

In [18]:
def generate_midiSequence(model, tokenizer, photo, maxLength):
  #create initial token
  midiSequence = '<start>' #string with midi events including <start> and <end> tokens
  prediction_list = [] #list with midi events without <start> and <end> tokens

  #iterate over max length of a sequence
  for i in range(maxLength):
    #encode sequence
    sequence =  tokenizer.texts_to_sequences([midiSequence])[0]
    #pad sequence
    sequence = pad_sequences([sequence], maxlen=maxLength)
    #predict next midi event string
    yhat = model.predict([photo,sequence], verbose=0)
    #print(yhat)
    #obtain event with highest probability
    #import numpy as np
    #yhat2 = np.argsort(np.max(yhat, axis=0))[-2]
    yhat = argmax(yhat)
    #map integer to midi string
    midiString = get_midiString_for_Integer(yhat, tokenizer)
    #stop if cannot find
    if midiString is None:
      break
    #append midiString to sequence
    if not midiString == '<start>':
        midiSequence += ' ' + midiString
    #stop if end of midi
    if midiString == '<end>':
      break
    #append midi event string to prediction list
    if not midiString == '<start>':
        prediction_list.append(midiString)
  
  return midiSequence, prediction_list

Function to create a midi from prediction list

In [19]:
def create_midi(prediction_output, midiName):
  #initiate offset to 0
  offset = 0.0
  #initiate midi stram
  midi_stream = stream.Stream()

  #loop over each midiString patter in prediction Output
  for pattern in prediction_output:
    #Seperate midiString into event and time by @ symbol
    patternString = pattern.split('@')[0] #0 position stores midi event
    if pattern.split('@')[1] == 'tempo':
      #if event is a tempo, append to midiStream and continue to next iteration
      tp0 = tempo.MetronomeMark(patternString)
      tp0.setQuarterBPM(int(float(patternString))) 
      midi_stream.append(tp0)
      continue
    #check if position 1 == time to check if event is a timeSignature
    if pattern.split('@')[1] == 'time':
      #if event is a timeSignature, append to midiStream and continue to next iteration
      ts0 = meter.TimeSignature(patternString)
      midi_stream.append(ts0)
      continue
    #check if '.' in patterString  or it patternString is a digit to detemine if the event is a chord
    if ('.' in patternString) or patternString.isdigit():
        #if event is a chord, obtain notes
        notes_in_chord = patternString.split('.')
        notes = []
        #loop for each note and create a notes list
        for current_note in notes_in_chord:
          new_note = note.Note(int(current_note))
          new_note.storedInstrument = instrument.Piano()
          notes.append(new_note)
        #create a chord using notes list
        new_chord = chord.Chord(notes)
        #set duration of chord 
        new_chord.quarterLength = eval(pattern.split('@')[1])
        #update offset
        new_chord.offset = offset 
        offset += new_chord.quarterLength
        #append chord to midi Stream
        midi_stream.append(new_chord)
    # pattern is a note or rest
    else:
      #if pattern is a rest
      if patternString == 'r':
        #create rest event
        new_note = note.Rest()
        #set duration
        new_note.quarterLength = eval(pattern.split('@')[1])
        #update offest
        new_note.offset = offset 
        offset += new_note.quarterLength
        #append to midi stream
        midi_stream.append(new_note)
      else:
        #if pattern is a note
        #create note
        new_note = note.Note(patternString)
        #set note duration
        new_note.quarterLength = eval(pattern.split('@')[1])
        #update offset
        new_note.offset = offset 
        offset += new_note.quarterLength
        new_note.storedInstrument = instrument.Piano()
        midi_stream.append(new_note)
              
  midi_stream.makeMeasures(inPlace = True)

  print('Saving Output file as midi....')

  midi_stream.write('midi', fp=data_dir + midiName + '.mid')

Function to extract features of image to generate music

In [20]:
def extract_featuresPredict(path, modelName = current_ImgModel):
  #initialize model and resize image
  if modelName == label_VGG16:
    model = model = VGG16()
    image = load_img(path, target_size=(224, 224))
  elif modelName == label_InceptionV3:
    model = InceptionV3(weights='imagenet')
    image = load_img(filename, target_size=(299, 299))

  #remove last layer of model
  model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
  #convert image to numpy array
  image = img_to_array(image)
  #reshape array 
  image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
  #preprocess image
  image = preprocess_input(image)
  #obtain features
  features = model.predict(image, verbose=0)

  return features

Model Creation and Training Steps

Load training photo features

In [17]:
train_features = load_featuresPickle(data_dir + 'features.pkl')

Training midi data is already loaded in train_midi dictionary

Next create the model

In [17]:
#create model
model = create_Model(vocabSize= vocabSize, maxLength=maxLength)
#initialize epochs
epochs = 250
#initials steps to generate data
steps = len(train_midi)

Model Summary
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 114)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 114, 256)     87040       ['input_2[0][0]']                
                                                                                                  
 dropout_1 (Dropout)            (None, 114, 256)     0           ['embedding[0][0]']              
                                                                                                  
 input_1 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                

Define path to save best model and initialize Checkpoint

In [239]:
#save weights
filepath = data_dir + 'training_2/cp.ckpt'
checkpoint_dir = os.path.dirname(filepath)
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_weights_only=True, save_best_only=True, mode='min')

Train model

In [202]:
for i in range(epochs):
  #create data generator
  generator  = data_generator(train_midi, train_features, tokenizer, maxLength, vocabSize)
  #fit model for one epoch
  print("Epoch No : " + str(i + 1))
  model.fit(generator, epochs=1, steps_per_epoch=steps, callbacks=[checkpoint], verbose=1)

Epoch No : 1
Epoch 1: loss improved from inf to 5.09580, saving model to E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\training_2\cp.ckpt
Epoch No : 2
Epoch 1: loss improved from 5.09580 to 4.60135, saving model to E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\training_2\cp.ckpt
Epoch No : 3
Epoch 1: loss improved from 4.60135 to 4.34320, saving model to E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\training_2\cp.ckpt
Epoch No : 4
Epoch 1: loss improved from 4.34320 to 4.04429, saving model to E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\training_2\cp.ckpt
Epoch No : 5
Epoch 1: loss improved from 4.04429 to 3.93800, saving model to E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\training_2\cp.ckpt
Epoch No : 6
Epoch 1: loss improved from 3.93800 to 3.68447, saving model to E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\training_2\cp.ckpt
Epoch No : 7
Epoch 1: loss

KeyboardInterrupt: 

In [None]:
#resume training
#load weights
filepath = data_dir + 'training_2/cp.ckpt'
checkpoint_dir = os.path.dirname(filepath)
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_weights_only=True, save_best_only=True, mode='min')

In [203]:
model = create_Model(vocabSize, maxLength)
model.load_weights(filepath)
tokenizer = load(open(data_dir + 'tokenizer.pkl', 'rb'))

Model Summary
Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_16 (InputLayer)          [(None, 114)]        0           []                               
                                                                                                  
 embedding_6 (Embedding)        (None, 114, 256)     87040       ['input_16[0][0]']               
                                                                                                  
 dropout_19 (Dropout)           (None, 114, 256)     0           ['embedding_6[0][0]']            
                                                                                                  
 input_15 (InputLayer)          [(None, 4096)]       0           []                               
                                                                              

In [204]:
for i in range(epochs):
  #create data generator
  generator = data_generator(train_midi, train_features, tokenizer, maxLength, vocabSize)
  #fit model for one epoch
  print("Epoch No : " + str(i + 1))
  model.fit(generator, epochs=1, steps_per_epoch=steps, callbacks=[checkpoint], verbose=1)

Epoch No : 1
Epoch 1: loss improved from 0.15950 to 0.14733, saving model to E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\training_2\cp.ckpt
Epoch No : 2
Epoch 1: loss did not improve from 0.14733
Epoch No : 3
Epoch 1: loss did not improve from 0.14733
Epoch No : 4
Epoch 1: loss did not improve from 0.14733
Epoch No : 5
Epoch 1: loss did not improve from 0.14733
Epoch No : 6
Epoch 1: loss did not improve from 0.14733
Epoch No : 7
Epoch 1: loss did not improve from 0.14733
Epoch No : 8
Epoch 1: loss did not improve from 0.14733
Epoch No : 9
Epoch 1: loss did not improve from 0.14733
Epoch No : 10
Epoch 1: loss did not improve from 0.14733
Epoch No : 11
Epoch 1: loss improved from 0.14733 to 0.13522, saving model to E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\training_2\cp.ckpt
Epoch No : 12
Epoch 1: loss did not improve from 0.13522
Epoch No : 13
Epoch 1: loss did not improve from 0.13522
Epoch No : 14
Epoch 1: loss improved from 0.13522 to 0

KeyboardInterrupt: 

In [9]:
#resume training
#load weights
filepath = data_dir + 'training_2/cp.ckpt'
checkpoint_dir = os.path.dirname(filepath)
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_weights_only=True, save_best_only=True, mode='min')

In [86]:
model = create_Model(vocabSize, maxLength)
model.load_weights(filepath)
tokenizer = load(open(data_dir + 'tokenizer.pkl', 'rb'))

Model Summary
Model: "model_50"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_72 (InputLayer)          [(None, 114)]        0           []                               
                                                                                                  
 embedding_20 (Embedding)       (None, 114, 256)     87040       ['input_72[0][0]']               
                                                                                                  
 dropout_61 (Dropout)           (None, 114, 256)     0           ['embedding_20[0][0]']           
                                                                                                  
 input_71 (InputLayer)          [(None, 4096)]       0           []                               
                                                                             

In [87]:
for i in range(epochs):
  #create data generator
  generator = data_generator(train_midi, train_features, tokenizer, maxLength, vocabSize)
  #fit model for one epoch
  print("Epoch No : " + str(i + 1))
  model.fit(generator, epochs=1, steps_per_epoch=steps, callbacks=[checkpoint], verbose=1)

Epoch No : 1
Epoch 1: loss improved from inf to 0.12512, saving model to E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\training_2\cp.ckpt
Epoch No : 2
Epoch 1: loss improved from 0.12512 to 0.10249, saving model to E:\uniSlides\CS\Research\FinalDataset\FinalData\unique_2VGG_FINAL\\training_2\cp.ckpt
Epoch No : 3
Epoch 1: loss did not improve from 0.10249
Epoch No : 4
Epoch 1: loss did not improve from 0.10249
Epoch No : 5

KeyboardInterrupt: 

Generate a melody for a test image

In [25]:
import tensorflow as tf
#load tokenizer from pickle
tokenizer = load(open(data_dir + 'tokenizer.pkl', 'rb'))
#load best model
model = create_Model(vocabSize, maxLength)
model.load_weights(filepath)

images = glob(data_dir + 'testImages/*.jpg')
for img in images:
    photo = extract_featuresPredict(img)
    #generate midi
    midiSequence, prediction_output = generate_midiSequence(model, tokenizer, photo, maxLength)

    print("Midi Sequnce in text")
    print(midiSequence)

    #create midi file
    create_midi(prediction_output, img.split('.')[0].rsplit('\\')[-1])

Model Summary
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 114)]        0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 114, 256)     87040       ['input_8[0][0]']                
                                                                                                  
 dropout_10 (Dropout)           (None, 114, 256)     0           ['embedding_3[0][0]']            
                                                                                                  
 input_7 (InputLayer)           [(None, 4096)]       0           []                               
                                                                              

# Evalution

Function to calculate average tone span (semitones between lowest and highest note in the melody)

In [31]:
def getTonespan(testingSongs):
    scale = {"C": 0, "C#": 1, "D-" : 1, "D" : 2, 'D#' : 3, "E-": 3, "E" : 4, "F-": 4, "E#" : 5, "F" : 5, "F#" : 6, "G-" : 6, "G" : 7, 
             "G#": 8, "A-": 8, "A": 9, "A#": 10, "B-": 10, "B": 11, "C-":11, "B#": 12 }
    tonespans = []
    for file in testingSongs:
        #print(file)
        try:
            midi = converter.parse(file) #convert to midi
        except:
            print("Error in parsing " + str(file))
            continue
        #tranpose song
        try:
            midi = transpose(midi)
        except:
            print("Error in obtaining key for " + str(file))
            continue
        startBar = getStartingBar(midi)
        midi = midi.measures(startBar, startBar + 7)
        midi = midi.flat
        minNote = {"value": float('inf'), "name": ''}
        minOct = float('inf')
        maxNote = {"value": float('-inf'), "name": ''}
        maxOct = float('-inf')
        for event in midi:
            if isinstance(event, note.Note):
                if(int(str(event.pitch)[-1]) < minOct):
                    minOct = int(str(event.pitch)[-1])
                if(int(str(event.pitch)[-1]) > maxOct):
                    maxOct = int(str(event.pitch)[-1])
        #print("Min : " + str(minOct) + " Max: " + str(maxOct))
        for event in midi:
            if isinstance(event, note.Note):
                if(int(str(event.pitch)[-1]) == minOct):
                    if( scale[str(event.pitch)[:-1]] < minNote["value"]):
                        minNote["value"] = scale[str(event.pitch)[:-1]] 
                        minNote["name"] = event
                if(int(str(event.pitch)[-1]) == maxOct):
                    if( scale[str(event.pitch)[:-1]] > maxNote["value"]):
                        maxNote["value"] = scale[str(event.pitch)[:-1]] 
                        maxNote["name"] = event
        #print("MinNote: " + str(minNote["value"]) + " name: " + str(minNote["name"].pitch))
        #print("MaxNote: " + str(maxNote["value"]) + " name: " + str(maxNote["name"].pitch))

        semitones = interval.Interval(minNote["name"], maxNote["name"]).semitones
        #print("semitones: " + str(semitones))
        tonespans.append(int(semitones))

    avg = sum(tonespans)/len(tonespans)
    return avg
    

Calculate avg tonespan for all midi files in dataset

In [32]:
datasetSongs = glob(data_dir + 'midi/*.mid')
print("Avg tone span in dataset: " + str(getTonespan(datasetSongs)))

Avg tone span in dataset: 11.883333333333333


Calculate avg tonespan for a sample of 16 midi files generated from the model

In [33]:
testingSongs = glob(data_dir + 'testing/*.mid')
print("Avg tone span in dataset: " + str(getTonespan(testingSongs)))

Avg tone span in dataset: 16.533333333333335


Function to calculate the scale consistency of a set of midi files

In [34]:
def getAvgScaleConsistency(testingSongs):
    A_minor = ["A", "B", "C", "D", 'E', "F", "G#"]
    C_major = ["C", "D", "E", "F", 'G', "A", "B"]

    allRatios = []

    for file in testingSongs:
        try: 
            midi = converter.parse(file) #convert to midi
        except:
            print("Error in parsing " + str(file))
            continue
        #tranpose song
        try:
            midi = transpose(midi)
        except:
            print("Error in obtaining key for " + str(file))
            continue

        startBar = getStartingBar(midi)
        midi = midi.measures(startBar, startBar + 7)
        midi = midi.flat

        correctNotes = 0
        FG_list = []
        GF_list = []
        totNotes = 0
        key = midi.analyze("key")
        if key.mode == "major":
            for event in midi:
                if isinstance(event, note.Note):
                    totNotes += 1
                    if (str(event.pitch)[:-1] in C_major):
                                correctNotes += 1
        elif key.mode == "minor":
            for event in midi:
                if isinstance(event, note.Note):
                    totNotes += 1
                    if((str(event.pitch)[:-1] == "F#") & (len(FG_list) == 0)):
                        FG_list.append("F#")
                        continue
                    if(str(event.pitch)[:-1] == "G#"):
                        if(len(FG_list) == 1):
                            if(FG_list[0] == "F#"):
                                correctNotes += 2
                                FG_list = []
                                continue




                    if((str(event.pitch)[:-1] == "G")  & (len(GF_list) == 0)):
                        GF_list.append("G")
                        continue
                    if(str(event.pitch)[:-1] == "F"):
                        if(len(FG_list) == 1):
                            if(FG_list[0] == "G"):
                                correctNotes += 2
                                GF_list = []
                                continue


                    if (str(event.pitch)[:-1] in A_minor):
                        correctNotes += 1

        allRatios.append(correctNotes/totNotes)

    print("Final avg scale consistency percentage : " + str(sum(allRatios)/len(allRatios) * 100))

Calculate scale consistency of original dataset

In [35]:
testingSongs = glob(data_dir + 'midi/*.mid')
getAvgScaleConsistency(testingSongs)

Final avg scale consistency percentage : 95.35331443422385


Calculate scale consistency of testing dataset

In [36]:
testingSongs = glob(data_dir + 'testing/*.mid')
getAvgScaleConsistency(testingSongs)

Final avg scale consistency percentage : 96.23026037490698
