In [67]:
# convert to a '.py' file
#!jupyter nbconvert --to script *.ipynb

In [86]:
!pip3 install jams
!pip install matplotlib
!pip install audiolazy
!pip install librosa



In [139]:
import numpy as np
import os
import pprint

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import scipy.io.wavfile as wav
import jams

import librosa
import audiolazy as al

## Process input

In [140]:
def timeKey(t):
    return t[0]

def loadNoteData(f):
    #f = open('/content/JamsFiles/04_Rock1-90-C_solo.jams', 'r')

   # data = f.read()
    data = jams.load(f)
    notes = []

    for i in range (0, 6):
        for j in data.annotations["note_midi"][i]["data"]:
            notes.append([j[0],j[1],j[2]])
            
    notes.sort(key=timeKey)

    #pprint.pprint(notes)
    return notes

In [141]:
def load_audio_file(song_path, scale = False):
    srate, source_audio = wav.read(song_path)
    if scale:
        source_audio = audio.astype(np.float32) / max(max(audio),abs(min(audio)))

    return source_audio, srate

In [142]:
#function to process the raw audio data.
def x_data_process(raw_data):
    scaler = preprocessing.MinMaxScaler((-1,1))
    scaled = scaler.fit_transform(raw_data)
    return scaled

In [143]:
#TODO: Change this so it pads to match the shape of frame. not just the length
def pad(proper_size, frame):
    padsize = winSize-len(frame)
    temp = None
    if len(frame.shape) > 1:
        temp = np.zeros((frame.shape[0],padsize))
    else:
        temp = np.zeros(padsize)

    return np.concatenate((frame,temp))

In [144]:
# iterate through each song, one frame at a time based off hop and window size
def songProcess(song_audio,notes,sr,hopSize = 1024,winSize = 4096,features = [],labels = []):

  testCount = 0

  offsets = np.arange(0,len(song_audio),hopSize)
  for (i,o) in enumerate(offsets):
    testCount += 1

    original_frame_size = 0

    frame = audio[o:o+winSize]

    if original_frame_size == 0:
      original_frame_size = frame.shape

    note = 0
    tiebreak = []

    #Create labels from jams file data -- if multiple notes in frame, take longest duration
    for j,(time,duration,value) in enumerate(notes):
      note_start = time*sr
      note_end = (time+duration)*sr
      if o <= note_start < o+winSize:                 #note starts in frame
        tiebreak.append(j)
      elif o <= note_end < o+winSize:                  #note ends in frame
        tiebreak.append(j)
      elif note_start < o and o+winSize <= note_end: #note continuous thru frame
        note = value

    #if multiple notes in frame, choose one that played the longest in frame
    if len(tiebreak) > 0:
      if len(tiebreak) == 1:
        note = notes[tiebreak[0]][2]
      else:
        max_dur = 0
        max_note = 0
        for index in tiebreak:
          note_start = notes[index][0] * sr
          note_duration = notes[index][1] * sr
          frame_dur = 0
          if note_start < o:
            frame_dur = note_duration+note_start-o
          elif note_start+note_duration > o+winSize:
            frame_dur = o+winSize - note_start
          else:
            frame_dur = note_duration

          if frame_dur > max_dur:
            max_dur = frame_dur
            max_note = notes[index][2]
        note = max_note


    #pad feature matrix
    if len(frame) < winSize:
      frame = pad(winSize, frame)

    #append to feature and labels
    features.append(frame)
    labels.append(round(note))  #quantize to the nearest midi value

  return features,labels,testCount

## Predict

In [145]:
"""Taking the song path, trained model, and same hopSize and winSize as used in training,
Converts song into predicted midi information"""
def predict(song_path, model, hopSize, winSize):
    
    audio,sr = load_audio_file(song_path, scale=False)
    offsets = np.arange(0,len(audio),hopSize)
    x_data = []
    for (i,o) in enumerate(offsets):
        frame = audio[o:o+winSize]
        #pad
        if len(frame) < winSize:
            frame = pad(winSize, frame)

        #append to features
        x_data.append(frame)

    #preprocess data
    x_data = x_data_process(x_data)

    #column 0 is note, column 1 is time
    midi_info = np.zeros((len(offsets),2))
    raw_results = model.predict(x_data)

    for i,frame_note in enumerate(raw_results):
        midi_info[i][0] = frame_note
        midi_info[i][1] = hopSize*i/sr

        #TODO: create proper midi file from this note / time information
          #Also determine method to covert midi into sheet music

    return midi_info

## Data classification using Neural Network

In [146]:
def neuralNetwork(XTrain, XTest, YTrain, YTest):
    #Currently, using successful values from A1, further adjustment with the processed dataset will be needed
    MLPC = MLPClassifier(hidden_layer_sizes=(71, 21), max_iter=10000, alpha=0.0001, learning_rate_init=0.001, solver='adam')
    MLPC.fit(XTrain, YTrain)

    return accuracy_score(YTest, MLPC.predict(XTest)), MLPC

In [172]:
def shawn_nn(XTrain, XTest, YTrain, YTest, hidden_size, max_iter, solver, activation):
    #Currently, using successful values from A1, further adjustment with the processed dataset will be needed
    MLPC = MLPClassifier(hidden_layer_sizes=hidden_size, max_iter=max_iter, alpha=0.0001, 
                         learning_rate_init=0.001, solver=solver, activation=activation)
    MLPC.fit(XTrain, YTrain)

    return accuracy_score(YTest, MLPC.predict(XTest)), MLPC


## Data classification using SVM

In [147]:
def gaussianSVM(XTrain, XTest, YTrain, YTest):
    #Currently, using successful values from A2, further adjustment with the processed dataset will be needed
    svc = SVC(kernel = 'rbf', gamma = 0.1, C = 10)
    svc.fit(XTrain, YTrain)

    return accuracy_score(YTest, svc.predict(XTest))

In [148]:
def linearSVM(XTrain, XTest, YTrain, YTest):
    #Currently, using successful values from A2, further adjustment with the processed dataset will be needed
    svc = LinearSVC(C = 0.0225)

    svc.fit(XTrain, YTrain)
    accuracy_score(YTest, svc.predict(XTest))

## Driver

In [149]:
# LOAD SONG AND JAM FILES
song_path = r'DataSets/audio_mono-mic'
jam_path = r'DataSets/annotation'
MODE = 'solo'
inputFiles = list(zip([x for x in os.listdir(song_path) if MODE in x],[x for x in os.listdir(jam_path) if MODE in x]))

In [197]:
##Set up features and labels for ML
numSongs = 5
counter = numSongs

notecounter = 0

hopSize = 1024
winSize = 1024

XData,YData = [],[]
for song_file,jam_file in inputFiles:
    song = os.path.join(song_path,song_file)
    jam = os.path.join(jam_path,jam_file)
    
    audio_prescaled,sr = load_audio_file(song,scale=True)
    audio,sr = load_audio_file(song, scale=False)
    note_info = loadNoteData(jam)

    #NOTE: window size used in previous works with dataset where 0.2s
    XData_prescaled,YData,notecount = songProcess(audio_prescaled,note_info,sr, hopSize=hopSize, winSize = winSize) # Can pass in a feature matrix and label array if we want to concat multiple songs together


    XData,YData,notecount = songProcess(audio,note_info,sr, hopSize=hopSize, winSize = winSize) # Can pass in a feature matrix and label array if we want to concat multiple songs together


    notecounter += notecount
    counter -= 1
    if counter == 0:
        break


In [196]:
# XData_prescaled = np.array(XData_prescaled)
# XData_prescaled = XData_prescaled / max(XData_prescaled.max(), abs(XData_prescaled.min()))
# print(X1.min(axis=0))
# print(X1.max(axis=0))

# XData_post = np.array(x_data_process(XData))
# print(X2.min(axis=0))
# print(X2.max(axis=0))

In [153]:
### XDATA SCALING EXPERIMENT
#Method 1: perform a -1,1 scale on the audio file at initial loading. This scales audio files individually, not recognizing the values of other files
#Method 2: perform a -1,1 scale at train test split via sklearn. This scaled audio files together, after they are loaded and concatenated.
#Method 3: control. No scaling
XData_prescaled = np.array(XData_prescaled)
XData_prescaled = XData_prescaled / max(XData_prescaled.max(), abs(XData_prescaled.min()))
XData_postscaled = x_data_process(XData)

N_avg = 3
results = {}
for j,xdata in enumerate((XData_prescaled,XData_postscaled,XData)):
    results[j] = 0
    XTrain,XTest,YTrain,YTest = train_test_split(xdata,YData,test_size=0.2)
    for i in range(N_avg):
        results[j] += neuralNetwork(XTrain,XTest,YTrain,YTest)[0]
    results[j] /= N_avg

print("Method 1 accuracy:",results[0])
print("Method 2 accuracy:",results[1])
print("Control. No scaling accuracy:",results[2])

Method 1 accuracy: 0.5180363439110388
Method 2 accuracy: 0.47206400867914294
Control. No scaling accuracy: 0.279224301600217


In [None]:
# using shawns models to compare method accuracies 
XData_prescaled = np.array(XData_prescaled)
XData_prescaled = XData_prescaled / max(XData_prescaled.max(), abs(XData_prescaled.min()))
XData_postscaled = x_data_process(XData)

hidden_size=(60,60)
max_iter=100000
solver='lbfgs'
activation='relu'

N_avg = 3
results = {}
for j,xdata in enumerate((XData_prescaled,XData_postscaled,XData)):
    results[j] = 0
    XTrain,XTest,YTrain,YTest = train_test_split(xdata,YData,test_size=0.2)
    for i in range(N_avg):
        results[j] = shawn_nn(XTrain,XTest,YTrain,YTest,hidden_size, max_iter, solver, activation)[0]
    results[j] /= N_avg

print("Method 1 accuracy:",results[0])
print("Method 2 accuracy:",results[1])
print("Control. No scaling accuracy:",results[2])

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


## NN Training - Different Models

In [170]:
#Train model
XData_postscaled = x_data_process(XData)

XTrain,XTest,YTrain,YTest = train_test_split(XData_postscaled,YData,test_size=0.2)
accuracy, trained_model = neuralNetwork(XTrain,XTest,YTrain,YTest)
print("NN Model Test Accuracy:",accuracy)

NN Model Test Accuracy: 0.45402766476810413


In [191]:
# Shawn's Testing - best model so far
XData_postscaled = x_data_process(XData)
XTrain,XTest,YTrain,YTest = train_test_split(XData_postscaled,YData,test_size=0.2)
hidden_size=(60,60)
max_iter=100000
solver='lbfgs'
activation='relu'
accuracy, trained_model = shawn_nn(XTrain,XTest,YTrain,YTest,hidden_size, max_iter, solver, activation)
print("NN Model Test Accuracy:",accuracy)


NN Model Test Accuracy: 0.6265256305939788


In [195]:
# Shawn's Testing - best model so far
XData_postscaled = x_data_process(XData)
XTrain,XTest,YTrain,YTest = train_test_split(XData_prescaled,YData,test_size=0.2)
hidden_size=(60,60)
max_iter=100000
solver='lbfgs'
activation='relu'
accuracy, trained_model = shawn_nn(XTrain,XTest,YTrain,YTest,hidden_size, max_iter, solver, activation)
print("NN Model Test Accuracy:",accuracy)

NN Model Test Accuracy: 0.7046379170056957


In [103]:
## Call predict function
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

midi_info = predict(song, trained_model, hopSize, winSize)

# print(midi_info[:10])

[[0.         0.        ]
 [0.         0.02321995]
 [0.         0.04643991]
 [0.         0.06965986]
 [0.         0.09287982]
 [0.         0.11609977]
 [0.         0.13931973]
 [0.         0.16253968]
 [0.         0.18575964]
 [0.         0.20897959]]


## Testing Ground

In [13]:
f = open('/content/JamsFiles/04_Rock1-90-C_solo.jams', 'r')

# data = f.read()
data = jams.load(f)

In [None]:
print("\ttime\tduration\tnotes")
pprint.pprint(loadNoteData(data))

In [96]:
#File loading Test
for song,jam in inputFiles:
    if song[:-13] != jam[:-10]:
        print("error with:",song,jam)

In [None]:
#Test XData padding
for x in XData:
    if len(x) < 4096:
        print("Error with Xdata padding")

In [137]:
#Check to see if XData and YData arrays are being appended properly
if notecounter != len(XData):
    print("Error: XData and YData arrays are not being apppended properly")

In [205]:
# From the predict() function above to get the note and time info we can see what note and freq it represents

# Reference: https://pythonhosted.org/audiolazy/lazy_midi.html

print('MIDI \tNote  \tFreq \t\t\tTime')
for note, time in midi_info:
    if note != 0.0:
        print(note,'\t',al.midi2str(note),'\t',al.midi2freq(note),'\t',time)

MIDI 	Note  	Freq 			Time
50.0 	 D3 	 146.8323839587038 	 1.8575963718820863
60.0 	 C4 	 261.6255653005986 	 1.9504761904761905
60.0 	 C4 	 261.6255653005986 	 3.9706122448979593
50.0 	 D3 	 146.8323839587038 	 5.828208616780046
60.0 	 C4 	 261.6255653005986 	 7.894784580498866
60.0 	 C4 	 261.6255653005986 	 20.456780045351476
60.0 	 C4 	 261.6255653005986 	 21.91963718820862
69.0 	 A4 	 440.0 	 22.709115646258503
