In [114]:
!pip3 install jams
!pip install matplotlib



In [113]:
import numpy as np
import os
import pprint

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import scipy.io.wavfile as wav
import jams


## Process input

In [116]:
def timeKey(t):
    return t[0]

def loadNoteData(f):
    #f = open('/content/JamsFiles/04_Rock1-90-C_solo.jams', 'r')

   # data = f.read()
    data = jams.load(f)
    notes = []

    for i in range (0, 6):
        for j in data.annotations["note_midi"][i]["data"]:
            notes.append([j[0],j[1],j[2]])
            
    notes.sort(key=timeKey)

    #pprint.pprint(notes)
    return notes

In [117]:
#function to process the raw audio data.
def x_data_process(raw_frame):
    return raw_frame

In [118]:
def foo():
    return

In [128]:
#iterate through each song, one frame at a time based off hop and window size
def songProcess(audio,notes,sr,hopSize = 1024,winSize = 4096,features = [],labels = []):

    testCount = 0

    offsets = np.arange(0,len(audio),hopSize)
    for (i,o) in enumerate(offsets):
        testCount += 1

        frame = audio[o:o+winSize]
        note = 0
        tiebreak = []

        #Create labels from jams file data -- if multiple notes in frame, take longest duration
        for j,(time,duration,value) in enumerate(notes):
            note_start = time*sr
            note_end = (time+duration)*sr
            if o <= note_start < o+winSize:                 #note starts in frame
                tiebreak.append(j)
            elif o <= note_end < o+winSize:                  #note ends in frame
                tiebreak.append(j)
            elif note_start < o and o+winSize <= note_end: #note continuous thru frame
                note = value

        #if multiple notes in frame, choose one that played the longest in frame
        if len(tiebreak) > 0:
            if len(tiebreak) == 1:
                note = notes[tiebreak[0]][2]
            else:
                max_dur = 0
                max_note = 0
                for index in tiebreak:
                    note_start = notes[index][0] * sr
                    note_duration = notes[index][1] * sr
                    frame_dur = 0
                    if note_start < o:
                        frame_dur = note_duration+note_start-o
                    elif note_start+note_duration > o+winSize:
                        frame_dur = o+winSize - note_start
                    else:
                        frame_dur = note_duration

                    if frame_dur > max_dur:
                        max_dur = frame_dur
                        max_note = notes[index][2]
                note = max_note


        #pad feature matrix
        if len(frame) < winSize:
            pad = winSize-len(frame)
            temp = np.zeros(pad)
            frame = np.concatenate((frame,temp))
        features.append(x_data_process(frame))    #process raw frame... somehow??
        labels.append(round(note))  #quantize to the nearest midi value


    return features,labels,testCount

## Data classification using Neural Network

In [129]:
def neuralNetwork(XTrain, XTest, YTrain, YTest):
    #Currently, using successful values from A1, further adjustment with the processed dataset will be needed
    MLPC = MLPClassifier(hidden_layer_sizes=(71, 21), max_iter=10000, alpha=0.0001, learning_rate_init=0.001, solver='adam')
    MLPC.fit(XTrain, YTrain)

    return accuracy_score(YTest, MLPC.predict(XTest)), MLPC

## Data classification using SVM

In [130]:
def gaussianSVM(XTrain, XTest, YTrain, YTest):
    #Currently, using successful values from A2, further adjustment with the processed dataset will be needed
    svc = SVC(kernel = 'rbf', gamma = 0.1, C = 10)
    svc.fit(XTrain, YTrain)

    return accuracy_score(YTest, svc.predict(XTest))

In [131]:
def linearSVM(XTrain, XTest, YTrain, YTest):
    #Currently, using successful values from A2, further adjustment with the processed dataset will be needed
    svc = LinearSVC(C = 0.0225)

    svc.fit(XTrain, YTrain)
    accuracy_score(YTest, svc.predict(XTest))

## Predict

In [132]:
"""Taking the song path, trained model, and same hopSize and winSize as used in training,
Converts song into predicted midi information"""
def predict(song_path, model, hopSize, winSize):

    sample_rate,audio = wav.read(song_path)
    offsets = np.arange(0,len(audio),hopSize)
    x_data = []
    for (i,o) in enumerate(offsets):
        frame = audio[o:o+winSize]
        frame = x_data_process(frame)
        #pad
        if len(frame) < winSize:
            pad = winSize-len(frame)
            temp = np.zeros(pad)
            frame = np.concatenate((frame,temp))
        x_data.append(frame)

      #column 0 is note, column 1 is time
    midi_info = np.zeros((len(offsets),2))
    raw_results = model.predict(x_data)

    for i,frame_note in enumerate(raw_results):
        midi_info[i][0] = frame_note
        midi_info[i][1] = hopSize*i/sample_rate

    #TODO: create proper midi file from this note / time information
      #Also determine method to covert midi into sheet music

    return midi_info

## Driver

In [133]:
# LOAD SONG AND JAM FILES
song_path = r'DataSets/audio_mono-mic'
jam_path = r'DataSets/annotation'
MODE = 'solo'
inputFiles = list(zip([x for x in os.listdir(song_path) if MODE in x],[x for x in os.listdir(jam_path) if MODE in x]))

In [134]:
XData,YData = [],[]

numSongs = 5
counter = numSongs

notecounter = 0

hopSize = 1024
winSize = 1024


for song_file,jam_file in inputFiles:
    song = os.path.join(song_path,song_file)
    jam = os.path.join(jam_path,jam_file)

    sampleRate,audio = wav.read(song)
    note_info = loadNoteData(jam)
    #NOTE: window size used in previous works with dataset where 0.2s
    XData,YData,notecount = songProcess(audio,note_info,sampleRate, hopSize=hopSize, winSize = winSize) #default window and hop. Can pass in a feature matrix and label array if we want to concat multiple songs together
    notecounter += notecount
    counter -= 1
    if counter == 0:
        break


#Split data
XTrain, XTest, YTrain, YTest = train_test_split(XData, YData, test_size = 0.2)

In [135]:
#Call ML functions..
# gaussianSVM(XTrain,YTrain)
# linearSVM(XTrain,YTrain)
song_result, trained_model = neuralNetwork(XTrain,XTest,YTrain,YTest)
print(str(numSongs)+" song neural network accuracy:",song_result)

5 song neural network accuracy: 0.25386493083807976


In [136]:
## Call predict function
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

midi_info = predict(song, trained_model, hopSize, winSize)



for note,time in midi_info:
    print(note,time)

69.0 0.0
0.0 0.023219954648526078
0.0 0.046439909297052155
0.0 0.06965986394557823
0.0 0.09287981859410431
0.0 0.11609977324263039
0.0 0.13931972789115646
0.0 0.16253968253968254
0.0 0.18575963718820862
0.0 0.2089795918367347
0.0 0.23219954648526078
0.0 0.25541950113378686
0.0 0.2786394557823129
0.0 0.301859410430839
0.0 0.3250793650793651
0.0 0.34829931972789113
0.0 0.37151927437641724
0.0 0.3947392290249433
0.0 0.4179591836734694
0.0 0.44117913832199546
0.0 0.46439909297052157
0.0 0.4876190476190476
0.0 0.5108390022675737
0.0 0.5340589569160997
61.0 0.5572789115646258
0.0 0.5804988662131519
0.0 0.603718820861678
61.0 0.626938775510204
0.0 0.6501587301587302
0.0 0.6733786848072563
0.0 0.6965986394557823
0.0 0.7198185941043084
0.0 0.7430385487528345
0.0 0.7662585034013606
0.0 0.7894784580498866
0.0 0.8126984126984127
0.0 0.8359183673469388
0.0 0.8591383219954648
0.0 0.8823582766439909
0.0 0.905578231292517
0.0 0.9287981859410431
0.0 0.9520181405895691
0.0 0.9752380952380952
0.0 0.99845

## Testing Ground

In [13]:
f = open('/content/JamsFiles/04_Rock1-90-C_solo.jams', 'r')

# data = f.read()
data = jams.load(f)

In [30]:
print("\ttime\tduration\tnotes")
pprint.pprint(loadNoteData(data))

	time	duration	notes
[[1.0279111111111092, 0.13351473922902812, 49.193803230707154],
 [1.304237641723354, 0.08707482993197146, 52.114584297971554],
 [1.3941242630385489, 0.23800453514738962, 53.06152785960328],
 [1.679385034013606, 0.13931972789115576, 56.14443073972567],
 [1.9566866213151926, 0.2902494331065739, 56.17702536590502],
 [2.3031265306122393, 0.09287981859410621, 53.08339188734166],
 [2.6073442176870714, 0.12190476190475863, 57.14782805032149],
 [2.9405641723355984, 0.12190476190475863, 60.91999064053075],
 [3.2740108843537357, 0.09868480725623385, 60.33037973363317],
 [3.5828317460317436, 0.5050340136054388, 61.522481510672925],
 [4.3526956916099735, 0.11029478458049624, 57.98474034208964],
 [4.642673015873008, 0.09287981859410621, 59.10588569730126],
 [4.9913351473922845, 0.7372335600907007, 61.11095551176679],
 [7.287298866213149, 0.2844444444444463, 63.46856030989381],
 [7.575303401360543, 0.7488435374149631, 61.16104776571266],
 [8.350292063492056, 0.4063492063492049, 

In [96]:
#File loading Test
for song,jam in inputFiles:
    if song[:-13] != jam[:-10]:
        print("error with:",song,jam)

In [None]:
#Test XData padding
for x in XData:
    if len(x) < 4096:
        print("Error with Xdata padding")

Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding
Error with Xdata padding


In [137]:
#Check to see if XData and YData arrays are being appended properly
if notecounter != len(XData):
    print("Error: XData and YData arrays are not being apppended properly")