In [181]:
!pip3 install jams
!pip install matplotlib
!pip install audiolazy



In [155]:
import numpy as np
import os
import pprint

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import scipy.io.wavfile as wav
import jams
import audiolazy as al

## Process input

In [116]:
def timeKey(t):
    return t[0]

def loadNoteData(f):
    #f = open('/content/JamsFiles/04_Rock1-90-C_solo.jams', 'r')

   # data = f.read()
    data = jams.load(f)
    notes = []

    for i in range (0, 6):
        for j in data.annotations["note_midi"][i]["data"]:
            notes.append([j[0],j[1],j[2]])
            
    notes.sort(key=timeKey)

    #pprint.pprint(notes)
    return notes

In [117]:
#function to process the raw audio data.
def x_data_process(raw_frame):
    return raw_frame

In [118]:
def foo():
    return

In [156]:
#iterate through each song, one frame at a time based off hop and window size
def songProcess(audio, notes, sr, hopSize = 1024, winSize = 4096, features = [], labels = []):

    testCount = 0

    offsets = np.arange(0,len(audio),hopSize)
    for (i,o) in enumerate(offsets):
        testCount += 1

        frame = audio[o:o+winSize]
        note = 0
        tiebreak = []

        #Create labels from jams file data -- if multiple notes in frame, take longest duration
        for j,(time,duration,value) in enumerate(notes):
            note_start = time*sr
            note_end = (time+duration)*sr
            if o <= note_start < o+winSize:                 #note starts in frame
                tiebreak.append(j)
            elif o <= note_end < o+winSize:                  #note ends in frame
                tiebreak.append(j)
            elif note_start < o and o+winSize <= note_end: #note continuous thru frame
                note = value

        #if multiple notes in frame, choose one that played the longest in frame
        if len(tiebreak) > 0:
            if len(tiebreak) == 1:
                note = notes[tiebreak[0]][2]
            else:
                max_dur = 0
                max_note = 0
                for index in tiebreak:
                    note_start = notes[index][0] * sr
                    note_duration = notes[index][1] * sr
                    frame_dur = 0
                    if note_start < o:
                        frame_dur = note_duration+note_start-o
                    elif note_start+note_duration > o+winSize:
                        frame_dur = o+winSize - note_start
                    else:
                        frame_dur = note_duration

                    if frame_dur > max_dur:
                        max_dur = frame_dur
                        max_note = notes[index][2]
                note = max_note


        #pad feature matrix
        if len(frame) < winSize:
            pad = winSize-len(frame)
            temp = np.zeros(pad)
            frame = np.concatenate((frame,temp))
        features.append(x_data_process(frame))    #process raw frame... somehow??
        labels.append(round(note))  #quantize to the nearest midi value


    return features,labels,testCount

## Data classification using Neural Network

In [129]:
def neuralNetwork(XTrain, XTest, YTrain, YTest):
    #Currently, using successful values from A1, further adjustment with the processed dataset will be needed
    MLPC = MLPClassifier(hidden_layer_sizes=(71, 21), max_iter=10000, alpha=0.0001, learning_rate_init=0.001, solver='adam')
    MLPC.fit(XTrain, YTrain)

    return accuracy_score(YTest, MLPC.predict(XTest)), MLPC

## Data classification using SVM

In [130]:
def gaussianSVM(XTrain, XTest, YTrain, YTest):
    #Currently, using successful values from A2, further adjustment with the processed dataset will be needed
    svc = SVC(kernel = 'rbf', gamma = 0.1, C = 10)
    svc.fit(XTrain, YTrain)

    return accuracy_score(YTest, svc.predict(XTest))

In [131]:
def linearSVM(XTrain, XTest, YTrain, YTest):
    #Currently, using successful values from A2, further adjustment with the processed dataset will be needed
    svc = LinearSVC(C = 0.0225)

    svc.fit(XTrain, YTrain)
    accuracy_score(YTest, svc.predict(XTest))

## Predict

In [174]:
"""Taking the song path, trained model, and same hopSize and winSize as used in training,
Converts song into predicted midi information"""
def predict(song_path, model, hopSize, winSize):

    sample_rate, audio = wav.read(song_path)
    offsets = np.arange(0, len(audio), hopSize)
    x_data = []
    
    for (i,o) in enumerate(offsets):
        frame = audio[o:o+winSize]
        frame = x_data_process(frame)
        #padint
        if len(frame) < winSize:
            pad = winSize-len(frame)
            temp = np.zeros(pad)
            frame = np.concatenate((frame,temp))
        x_data.append(frame)

    #column 0 is note, column 1 is time
    midi_info = np.zeros((len(offsets), 2))
    raw_results = model.predict(x_data) 
    
    for i, frame_note in enumerate(raw_results):
        midi_info[i][0] = frame_note
        midi_info[i][1] = hopSize*i/sample_rate
        
    
    # TODO: create proper midi file from this note / time information
    
    
    # Also determine method to covert midi into sheet music
    

    return midi_info

## Driver

In [167]:
# LOAD SONG AND JAM FILES
song_path = r'DataSets/audio_mono-mic'
jam_path = r'DataSets/annotation'
MODE = 'solo'
inputFiles = list(zip([x for x in os.listdir(song_path) if MODE in x],[x for x in os.listdir(jam_path) if MODE in x]))

In [168]:
XData,YData = [],[]

numSongs = 5
counter = numSongs

notecounter = 0

hopSize = 1024
winSize = 1024


for song_file, jam_file in inputFiles:
    song = os.path.join(song_path, song_file)
    jam = os.path.join(jam_path, jam_file)

    sampleRate, audio = wav.read(song)
    note_info = loadNoteData(jam)
    #NOTE: window size used in previous works with dataset where 0.2s
    XData, YData, notecount = songProcess(audio, note_info, sampleRate, hopSize=hopSize, winSize = winSize) #default window and hop. Can pass in a feature matrix and label array if we want to concat multiple songs together
    notecounter += notecount
    counter -= 1
    
    if counter == 0:
        break


#Split data
XTrain, XTest, YTrain, YTest = train_test_split(XData, YData, test_size = 0.2)


In [202]:
#Call ML functions..
# gaussianSVM(XTrain,YTrain)
# linearSVM(XTrain,YTrain)
song_result, trained_model = neuralNetwork(XTrain, XTest, YTrain, YTest)
print(str(numSongs)+" song neural network accuracy:", song_result)

5 song neural network accuracy: 0.27943570265870865


In [204]:
## Call predict function
# song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

ssong = 'DataSets/audio_mono-mic/04_Rock1-90-C#_solo_mic.wav'

midi_info = predict(song, trained_model, hopSize, winSize)

# for note, time in midi_info:
#     print(note,time)

## Testing Ground

In [13]:
f = open('/content/JamsFiles/04_Rock1-90-C_solo.jams', 'r')

# data = f.read()
data = jams.load(f)

In [None]:
print("\ttime\tduration\tnotes")
pprint.pprint(loadNoteData(data))

In [96]:
#File loading Test
for song,jam in inputFiles:
    if song[:-13] != jam[:-10]:
        print("error with:",song,jam)

In [None]:
#Test XData padding
for x in XData:
    if len(x) < 4096:
        print("Error with Xdata padding")

In [137]:
#Check to see if XData and YData arrays are being appended properly
if notecounter != len(XData):
    print("Error: XData and YData arrays are not being apppended properly")

In [205]:
# From the predict() function above to get the note and time info we can see what note and freq it represents

# Reference: https://pythonhosted.org/audiolazy/lazy_midi.html

print('MIDI \tNote  \tFreq \t\t\tTime')
for note, time in midi_info:
    if note != 0.0:
        print(note,'\t',al.midi2str(note),'\t',al.midi2freq(note),'\t',time)

MIDI 	Note  	Freq 			Time
50.0 	 D3 	 146.8323839587038 	 1.8575963718820863
60.0 	 C4 	 261.6255653005986 	 1.9504761904761905
60.0 	 C4 	 261.6255653005986 	 3.9706122448979593
50.0 	 D3 	 146.8323839587038 	 5.828208616780046
60.0 	 C4 	 261.6255653005986 	 7.894784580498866
60.0 	 C4 	 261.6255653005986 	 20.456780045351476
60.0 	 C4 	 261.6255653005986 	 21.91963718820862
69.0 	 A4 	 440.0 	 22.709115646258503
