In [20]:
# convert to a '.py' file
#!jupyter nbconvert --to script *.ipynb

In [19]:
!pip3 install jams
!pip install matplotlib
!pip install audiolazy

Collecting audiolazy
  Downloading audiolazy-0.6-py2.py3-none-any.whl (121 kB)
Installing collected packages: audiolazy
Successfully installed audiolazy-0.6


In [1]:
import numpy as np
import os
import pprint

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

import scipy.io.wavfile as wav
import jams

import librosa
import audiolazy as al

## Process input

In [16]:
def timeKey(t):
    return t[0]

def loadNoteData(f):
    #f = open('/content/JamsFiles/04_Rock1-90-C_solo.jams', 'r')

   # data = f.read()
    data = jams.load(f)
    notes = []

    for i in range (0, 6):
        for j in data.annotations["note_midi"][i]["data"]:
            notes.append([j[0],j[1],j[2]])
            
    notes.sort(key=timeKey)

    #pprint.pprint(notes)
    return notes

In [17]:
def load_audio_file(song_path, scale = False):
    srate, source_audio = wav.read(song_path)
    if scale:
      source_audio = source_audio.astype(np.float32) / max(max(source_audio),abs(min(source_audio)))

    return source_audio, srate

In [18]:
#function to process the raw audio data.
#local uses sklearn's MinMaxScaler
#global divides all numbers in data by the largest absolute value in dataset
def x_data_process(raw_data, method='local'):
  scaled = raw_data
  if method == 'local':
    scaler = preprocessing.MinMaxScaler((-1,1))
    scaled = scaler.fit_transform(raw_data)
  elif method == 'global':
    scaled = np.array(raw_data)
    scaled = scaled / max(scaled.max(), abs(scaled.min()))
  return scaled

In [19]:
#TODO: Change this so it pads to match the shape of frame. not just the length
def pad(proper_size, frame):
  padsize = winSize-len(frame)
  temp = None
  if len(frame.shape) > 1:
    temp = np.zeros((frame.shape[0],padsize))
  else:
    temp = np.zeros(padsize)

  return np.concatenate((frame,temp))


In [38]:
#iterate through each song, one frame at a time based off hop and window size
def songProcess(song_audio,notes,sr,hopSize = 1024,winSize = 4096,features = [],labels = []):

  testCount = 0

  offsets = np.arange(0,len(song_audio),hopSize)
  for (i,o) in enumerate(offsets):
    testCount += 1

    original_frame_size = 0

    frame = song_audio[o:o+winSize]

    note = 0
    tiebreak = []

    #Create labels from jams file data -- if multiple notes in frame, take longest duration
    for j,(time,duration,value) in enumerate(notes):
      note_start = time*sr
      note_end = (time+duration)*sr
      if o <= note_start < o+winSize:                 #note starts in frame
        tiebreak.append(j)
      elif o <= note_end < o+winSize:                  #note ends in frame
        tiebreak.append(j)
      elif note_start < o and o+winSize <= note_end: #note continuous thru frame
        note = value

    #if multiple notes in frame, choose one that played the longest in frame
    if len(tiebreak) > 0:
      if len(tiebreak) == 1:
        note = notes[tiebreak[0]][2]
      else:
        max_dur = 0
        max_note = 0
        for index in tiebreak:
          note_start = notes[index][0] * sr
          note_duration = notes[index][1] * sr
          frame_dur = 0
          if note_start < o:
            frame_dur = note_duration+note_start-o
          elif note_start+note_duration > o+winSize:
            frame_dur = o+winSize - note_start
          else:
            frame_dur = note_duration

          if frame_dur > max_dur:
            max_dur = frame_dur
            max_note = notes[index][2]
        note = max_note


    #pad feature matrix
    if len(frame) < winSize:
      frame = pad(winSize, frame)

    #append to feature and labels
    print(frame.min())
    features.append(frame)
    labels.append(round(note))  #quantize to the nearest midi value

  return features,labels,testCount

## Predict

In [21]:
"""Taking the song path, trained model, and same hopSize and winSize as used in training,
Converts song into predicted midi information"""
def predict(song_path, model, hopSize, winSize):

  song_audio,sr = load_audio_file(song_path, scale=False)
  offsets = np.arange(0,len(song_audio),hopSize)
  x_data = []
  for (i,o) in enumerate(offsets):
    frame = song_audio[o:o+winSize]
    #pad
    if len(frame) < winSize:
      frame = pad(winSize, frame)

    #append to features
    x_data.append(frame)

  #preprocess data
  x_data = x_data_process(x_data)

  #column 0 is note, column 1 is time
  midi_info = np.zeros((len(offsets),2))
  raw_results = model.predict(x_data)

  for i,frame_note in enumerate(raw_results):
    midi_info[i][0] = frame_note
    midi_info[i][1] = hopSize*i/sr

    #TODO: create proper midi file from this note / time information
      #Also determine method to covert midi into sheet music

  return midi_info

## Data classification using Neural Network

In [22]:
def neuralNetwork(XTrain, XTest, YTrain, YTest):
  #Currently, using successful values from A1, further adjustment with the processed dataset will be needed
  MLPC = MLPClassifier(hidden_layer_sizes=(71, 21), max_iter=10000, alpha=0.0001, learning_rate_init=0.001, solver='adam')
  MLPC.fit(XTrain, YTrain)

  return accuracy_score(YTest, MLPC.predict(XTest)), MLPC

## Data classification using SVM

In [23]:
def gaussianSVM(XTrain, XTest, YTrain, YTest):
  #Currently, using successful values from A2, further adjustment with the processed dataset will be needed
  svc = SVC(kernel = 'rbf', gamma = 0.1, C = 10)
  svc.fit(XTrain, YTrain)

  return accuracy_score(YTest, svc.predict(XTest))

In [24]:
def linearSVM(XTrain, XTest, YTrain, YTest):
  #Currently, using successful values from A2, further adjustment with the processed dataset will be needed
  svc = LinearSVC(C = 0.0225)

  svc.fit(XTrain, YTrain)
  accuracy_score(YTest, svc.predict(XTest))

## Driver

In [25]:
# LOAD SONG AND JAM FILES
song_path = r'DataSets/audio_mono-mic'
jam_path = r'DataSets/annotation'
MODE = 'solo'
inputFiles = list(zip([x for x in os.listdir(song_path) if MODE in x],[x for x in os.listdir(jam_path) if MODE in x]))

In [44]:
##Set up features and labels for ML
numSongs = 5
counter = numSongs

notecounter = 0

hopSize = 1024
winSize = 1024

XData,YData = [],[]
XData_pre,YData_pre = [],[]
for song_file,jam_file in inputFiles:
  song = os.path.join(song_path,song_file)
  jam = os.path.join(jam_path,jam_file)

  audio,sr = load_audio_file(song, scale=False)
  note_info = loadNoteData(jam)

# Can pass in a feature matrix and label array if we want to concat multiple songs together
  XData,YData,notecount = songProcess(audio,note_info,sr, hopSize, winSize, XData, YData)

  #Prescaled Xdata for experiement
  audio_prescaled,sr = load_audio_file(song,scale=True)
  XData_pre,YData_pre,notecount = songProcess(audio_prescaled,note_info,sr,hopSize,winSize, XData_pre, YData_pre)

  notecounter += notecount
  counter -= 1
  if counter == 0:
    break

-8785
-10499
-7694
-8317
-6570
-5774
-5971
-6422
-6841
-6734
-7234
-5364
-5676
-5233
-4716
-4503
-4544
-4536
-4495
-3141
-1944
-5627
-4889
-4323
-4093
-3675
-3429
-3240
-3289
-3289
-3289
-3494
-3355
-3182
-3240
-3043
-2608
-2288
-2264
-3978
-3347
-3158
-2518
-2329
-2961
-2584
-2822
-2518
-2379
-2444
-2362
-1977
-1681
-1755
-1821
-1485
-1173
-1353
-1214
-910
-763
-7185
-10122
-6701
-6242
-5561
-4880
-4651
-4085
-3839
-3814
-3642
-2789
-2576
-2444
-2108
-2042
-2215
-2223
-2034
-1977
-2133
-6160
-3896
-4216
-3847
-3576
-2945
-2502
-2346
-2288
-2288
-2362
-2092
-2100
-2083
-2075
-2092
-1903
-3691
-2731
-3076
-3043
-3035
-2633
-2887
-2805
-2879
-2904
-2617
-2469
-2190
-2051
-1977
-1969
-1804
-1772
-1616
-1066
-1870
-5996
-4011
-4150
-3593
-3675
-3371
-2961
-2781
-2428
-2247
-2338
-2247
-2190
-2239
-2182
-2174
-1977
-2059
-1796
-4470
-5807
-5044
-4257
-4077
-4126
-3896
-3675
-3461
-3560
-3371
-3371
-3109
-2912
-2863
-2822
-2764
-2781
-2789
-2280
-5668
-5742
-6324
-4610
-3478
-3871
-3830
-325

In [13]:
#Train model
XData_postscaled = x_data_process(XData)

XTrain,XTest,YTrain,YTest = train_test_split(XData_postscaled,YData,test_size=0.2)
accuracy, trained_model = neuralNetwork(XTrain,XTest,YTrain,YTest)
print("NN Model Test Accuracy:",accuracy)


NN Model Test Accuracy: 0.6886446886446886


In [14]:
## Call predict function
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

midi_info = predict(song,trained_model,hopSize,winSize)

# print(midi_info[:10])

## Testing Ground

In [49]:
### XDATA SCALING EXPERIMENT
#Method 1: Entire X dataset is scaled by largest abs value
#Method 2: perform a -1,1 scale at train test split via sklearn. This scaled audio files together, after they are loaded and concatenated.
#Method 3: perform a -1,1 scale on the audio file at initial loading. This scales audio files individually, not recognizing the values of other files

XData_global = x_data_process(XData,method='global')
XData_local = x_data_process(XData)

N_avg = 3
results = {}
for j,xdata in enumerate((XData_global,XData_local,XData_pre)):
  results[j] = 0
  for i in range(N_avg):
    XTrain,XTest,YTrain,YTest = train_test_split(xdata,YData,test_size=0.2)
    results[j] += neuralNetwork(XTrain,XTest,YTrain,YTest)[0]
  results[j] /= N_avg

print("Method 1 accuracy:",results[0])
print("Method 2 accuracy:",results[1])
print("Method 3 accuracy:",results[2])

0
0
1
2
1
0
1
2
Method 1 accuracy: 0.6706349206349206
Method 2 accuracy: 0.68009768009768


In [13]:
f = open('/content/JamsFiles/04_Rock1-90-C_solo.jams', 'r')

# data = f.read()
data = jams.load(f)

In [30]:
print("\ttime\tduration\tnotes")
pprint.pprint(loadNoteData(data))

	time	duration	notes
[[1.0279111111111092, 0.13351473922902812, 49.193803230707154],
 [1.304237641723354, 0.08707482993197146, 52.114584297971554],
 [1.3941242630385489, 0.23800453514738962, 53.06152785960328],
 [1.679385034013606, 0.13931972789115576, 56.14443073972567],
 [1.9566866213151926, 0.2902494331065739, 56.17702536590502],
 [2.3031265306122393, 0.09287981859410621, 53.08339188734166],
 [2.6073442176870714, 0.12190476190475863, 57.14782805032149],
 [2.9405641723355984, 0.12190476190475863, 60.91999064053075],
 [3.2740108843537357, 0.09868480725623385, 60.33037973363317],
 [3.5828317460317436, 0.5050340136054388, 61.522481510672925],
 [4.3526956916099735, 0.11029478458049624, 57.98474034208964],
 [4.642673015873008, 0.09287981859410621, 59.10588569730126],
 [4.9913351473922845, 0.7372335600907007, 61.11095551176679],
 [7.287298866213149, 0.2844444444444463, 63.46856030989381],
 [7.575303401360543, 0.7488435374149631, 61.16104776571266],
 [8.350292063492056, 0.4063492063492049, 

In [96]:
#File loading Test
for song,jam in inputFiles:
  if song[:-13] != jam[:-10]:
    print("error with:",song,jam)

In [97]:
#Test XData padding
for x in XData:
  if len(x) < 4096:
    print("Error with Xdata padding")

In [98]:
#Check to see if XData and YData arrays are being appended properly
if notecounter != len(XData):
  print("Error: XData and YData arrays are not being apppended properly")

In [205]:
# From the predict() function above to get the note and time info we can see what note and freq it represents

# Reference: https://pythonhosted.org/audiolazy/lazy_midi.html

print('MIDI \tNote  \tFreq \t\t\tTime')
for note, time in midi_info:
    if note != 0.0:
        print(note,'\t',al.midi2str(note),'\t',al.midi2freq(note),'\t',time)

MIDI 	Note  	Freq 			Time
50.0 	 D3 	 146.8323839587038 	 1.8575963718820863
60.0 	 C4 	 261.6255653005986 	 1.9504761904761905
60.0 	 C4 	 261.6255653005986 	 3.9706122448979593
50.0 	 D3 	 146.8323839587038 	 5.828208616780046
60.0 	 C4 	 261.6255653005986 	 7.894784580498866
60.0 	 C4 	 261.6255653005986 	 20.456780045351476
60.0 	 C4 	 261.6255653005986 	 21.91963718820862
69.0 	 A4 	 440.0 	 22.709115646258503
