In [1]:
!pip3 install jams
!pip install matplotlib
!pip install audiolazy
!pip install librosa
!pip3 install music21
!pip3 install MIDIUtil

Collecting audiolazy
  Using cached audiolazy-0.6-py2.py3-none-any.whl (121 kB)
Installing collected packages: audiolazy
Successfully installed audiolazy-0.6


In [1]:
import numpy as np
import os
import pprint

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE

import scipy.io.wavfile as wav
import jams

import librosa
import audiolazy as al

import music21
from midiutil import MIDIFile


## Processing Data Input (Guitar Dataset)

In [2]:
# Function to extract note/time information from JAM file
def loadNoteData(f):
    """Collect the note events of a JAMS annotation file, ordered by onset.

    Reads the first six "note_midi" annotations of the file and flattens
    their observations into one list.

    Args:
        f: path (or file object) accepted by ``jams.load``.

    Returns:
        A list of ``[field0, field1, field2]`` lists (the first three fields
        of every observation), sorted ascending by ``field0``.
        songProcess() unpacks these as (time, duration, value).
    """
    jam = jams.load(f)
    note_annotations = jam.annotations["note_midi"]

    collected = []
    for string_index in range(6):
        for observation in note_annotations[string_index]["data"]:
            collected.append([observation[0], observation[1], observation[2]])

    # Stable sort on the first field (onset), same ordering as timeKey().
    collected.sort(key=lambda entry: entry[0])
    return collected

# Helper function for JAM file processing
def timeKey(t):
    """Sort key: return the first element of a note entry ``t``."""
    first_field = t[0]
    return first_field

In [3]:
# Function to handle loading audio files
def load_audio_file(song_path, scale = False):
    """Read a WAV file and optionally peak-normalise it.

    Args:
        song_path: path of the WAV file, passed to ``scipy.io.wavfile.read``.
        scale: when True, convert the samples to float32 and divide by the
            largest absolute sample so the result lies in [-1, 1].

    Returns:
        ``(source_audio, srate)``: the sample array and the sample rate.
        With ``scale=False`` the samples keep the dtype stored in the file.
    """
    srate, source_audio = wav.read(song_path)
    if scale:
        # Convert to float BEFORE taking absolute values: abs() of the most
        # negative integer sample (e.g. -32768 for int16) overflows in the
        # integer dtype and would yield a wrong peak.
        source_audio = source_audio.astype(np.float32)
        # np.abs(...).max() also works for multi-channel (2-D) data, where the
        # builtin max()/min() would fail on array-valued comparisons.
        peak = np.abs(source_audio).max() if source_audio.size else 0.0
        if peak > 0:  # leave pure silence untouched instead of dividing by zero
            source_audio = source_audio / peak

    return source_audio, srate

In [43]:
# Function to process the raw audio data (x data)
# local - uses sklearn MinMaxScaler fitted on the whole matrix, range (-1,1)
# global - entire x dataset scaled by largest absolute value to -1,1
def x_data_process(raw_data, scale = 'local'):
    """Scale a frame matrix (rows = frames, columns = sample positions).

    Args:
        raw_data: 2-D array-like of frames (a list of equal-length arrays is
            accepted).
        scale: ``'local'``  - ``MinMaxScaler((-1, 1)).fit_transform``.
               NOTE(review): MinMaxScaler rescales each *column* (feature)
               independently, not each frame/row - confirm this is intended.
               ``'global'`` - divide everything by the largest absolute value.
               anything else - return ``raw_data`` unchanged.

    Returns:
        The scaled data as an ndarray, or ``raw_data`` itself when no scaling
        mode matches.
    """
    if scale == 'local':
        scaler = preprocessing.MinMaxScaler((-1,1))
        return scaler.fit_transform(raw_data)

    if scale == 'global':
        data = np.asarray(raw_data)
        # Work in floating point: abs() on integer samples overflows for the
        # most negative value (e.g. int16 -32768), giving a wrong peak.
        if not np.issubdtype(data.dtype, np.floating):
            data = data.astype(np.float64)
        if data.size == 0:
            return data
        peak = np.abs(data).max()
        if peak > 0:
            return data / peak
        # All-zero input: nothing to scale, and dividing would produce NaNs.
        return data.copy()

    return raw_data

In [5]:
# Function to pad x data so all frames are same length
def pad(proper_size, frame):
    """Zero-pad ``frame`` along its first axis up to ``proper_size`` entries.

    Args:
        proper_size: target length of the first axis.
        frame: 1-D array of samples, or N-D array whose first axis is time.

    Returns:
        ``frame`` followed by zeros so that ``len(result) == proper_size``.
        A frame that is already long enough is returned unchanged.
    """
    frame = np.asarray(frame)
    # Use the requested size, not a notebook-level global window size.
    padsize = proper_size - len(frame)
    if padsize <= 0:
        return frame

    # Pad along axis 0 only; any trailing dimensions (e.g. channels) are kept.
    filler = np.zeros((padsize,) + frame.shape[1:])
    return np.concatenate((frame, filler))

In [6]:
# Function to create x data and y data lists from audio files and jam files
def songProcess(song_audio,notes,sr,hopSize = 1024,winSize = 4096,features = [],labels = []):
    """Cut a song into frames and label each frame with a MIDI note.

    Args:
        song_audio: sample array of the song.
        notes: iterable of ``(time, duration, value)`` entries, times in
            seconds (as produced by loadNoteData()).
        sr: sample rate used to convert note times to sample positions.
        hopSize: distance in samples between consecutive frame starts.
        winSize: frame length in samples; shorter trailing frames are
            zero-padded with pad().
        features: list the frames are appended to.
        labels: list the per-frame labels are appended to.

    Returns:
        ``(features, labels)`` - the same list objects that were passed in.
        A label of 0 means no note was found for the frame.

    NOTE: the defaults for ``features``/``labels`` are shared list objects, so
    calls that omit them keep appending to the same lists across calls (the
    driver loop in this notebook accumulates songs that way). Pass fresh lists
    explicitly to get independent results.
    """
    offsets = np.arange(0, len(song_audio), hopSize)
    for o in offsets:
        frame_end = o + winSize
        # Slice the audio that was passed in, not a notebook-level global.
        frame = song_audio[o:frame_end]

        note = 0
        tiebreak = []

        # Create labels from jams file data.
        for j, (time, duration, value) in enumerate(notes):
            note_start = time * sr
            note_end = (time + duration) * sr
            if o <= note_start < frame_end:                     # note starts in frame
                tiebreak.append(j)
            elif o <= note_end < frame_end:                     # note ends in frame
                tiebreak.append(j)
            elif note_start < o and frame_end <= note_end:      # note spans whole frame
                note = value

        # Notes starting or ending inside the frame take precedence; with
        # several candidates keep the one sounding longest inside the frame.
        if len(tiebreak) == 1:
            note = notes[tiebreak[0]][2]
        elif len(tiebreak) > 1:
            max_dur = 0
            max_note = 0
            for index in tiebreak:
                note_start = notes[index][0] * sr
                note_duration = notes[index][1] * sr
                if note_start < o:
                    # started before the frame: count only the part after o
                    frame_dur = note_duration + note_start - o
                elif note_start + note_duration > frame_end:
                    # runs past the frame: count only the part before its end
                    frame_dur = frame_end - note_start
                else:
                    frame_dur = note_duration

                if frame_dur > max_dur:
                    max_dur = frame_dur
                    max_note = notes[index][2]
            note = max_note

        # pad feature matrix
        if len(frame) < winSize:
            frame = pad(winSize, frame)

        # append to feature and labels
        features.append(frame)
        labels.append(round(note))  # quantize to the nearest midi value

    return features,labels

In [7]:
# Function to perform T-SNE dimension reduction
def tsneFit(X, comps):
    """Embed ``X`` with t-SNE and return the embedding.

    Args:
        X: data matrix handed to ``TSNE.fit_transform``.
        comps: first positional argument of ``TSNE`` (number of components).

    Returns:
        The array produced by ``fit_transform``.
    """
    return TSNE(comps, learning_rate='auto', init='pca').fit_transform(X)

## Predict Function

In [47]:
# Function to use input song and a trained model, to predict song's note/time information in MIDI
def predict(song_audio, sr, model, hopSize, winSize, scale='global'):
    """Predict one note per frame of ``song_audio`` with a trained model.

    Args:
        song_audio: sample array of the song.
        sr: sample rate, used to convert frame offsets to seconds.
        model: fitted estimator exposing ``predict``.
        hopSize: distance in samples between consecutive frame starts.
        winSize: frame length in samples (shorter last frames are padded).
        scale: mode forwarded to x_data_process(). Defaults to ``'global'``
            (the previous hard-coded behaviour); pass the mode the model was
            trained with so training and prediction inputs match.

    Returns:
        Array of shape ``(n_frames, 3)``: column 0 is the predicted note,
        column 1 the frame start time in seconds, column 2 the mean absolute
        amplitude of the (processed) frame.
    """
    offsets = np.arange(0, len(song_audio), hopSize)

    x_data = []
    for o in offsets:
        frame = song_audio[o:o+winSize]
        # pad
        if len(frame) < winSize:
            frame = pad(winSize, frame)
        # append to features
        x_data.append(frame)

    # preprocess data
    x_data = x_data_process(x_data, scale=scale)

    raw_results = model.predict(x_data)

    midi_info = np.zeros((len(offsets), 3))
    midi_info[:, 0] = raw_results
    midi_info[:, 1] = offsets / sr   # offsets[i] == hopSize * i
    midi_info[:, 2] = np.absolute(np.array(x_data)).mean(axis=1)  # amplitude of each frame

    return midi_info

### Sheet Music

In [30]:
# Function to create sheet music from the note/time information output from predict()
# TODO: Condense note/time input into min 8th notes (merge runs of equal pitch;
#       an amplitude jump between equal-pitch frames could mark a new onset)
#       Save stream to midi file
def to_midi_sheet(midi_info,outfile,bpm=120):
    """Render predicted notes as sheet music with music21.

    Every row of ``midi_info`` becomes one quarter-length symbol: a rest when
    the note value is 0, otherwise a note at that MIDI pitch.

    Args:
        midi_info: rows whose first column is the note value.
        outfile: currently unused (reserved for saving the stream, see TODO).
        bpm: currently unused (reserved for duration quantisation, see TODO).
    """
    s = music21.stream.Stream([music21.clef.TrebleClef()])

    for row in midi_info:
        # The pitch must be read from the row; it was previously never
        # assigned, so the function failed with a NameError.
        pitch = int(row[0])
        if pitch == 0:
            s.append(music21.note.Rest(quarterLength=1))
        else:
            s.append(music21.note.Note(pitch,quarterLength=1))

    s.show('musicxml.png')

In [21]:
# Scratch check of music21 rendering: note 70, two rests, note 72 (one quarter each).
s = music21.stream.Stream()
for _element in (
    music21.note.Note(70,quarterLength = 1),
    music21.note.Rest(quarterLength = 1),
    music21.note.Rest(quarterLength = 1),
    music21.note.Note(72,quarterLength = 1),
):
    s.append(_element)
s.show('musicxml.png')

## Driver: load the data and choose the number of songs

In [12]:
# LOAD SONG AND JAM FILES
song_path = r'DataSets/audio_mono-mic'
jam_path = r'DataSets/annotation'
MODE = 'solo'

# os.listdir() returns names in arbitrary order, so sort both listings before
# zipping; otherwise an audio file can be paired with another song's annotation.
# NOTE(review): assumes matching audio/annotation names sort identically - confirm.
song_files = sorted(x for x in os.listdir(song_path) if MODE in x)
jam_files = sorted(x for x in os.listdir(jam_path) if MODE in x)
inputFiles = list(zip(song_files, jam_files))

In [13]:
##Set up features and labels for ML
numSongs = 5          # number of songs to load
counter = numSongs

hopSize = 1024
winSize = 1024

XData,YData = [],[]
for song_file,jam_file in inputFiles:
    song = os.path.join(song_path,song_file)
    jam = os.path.join(jam_path,jam_file)

    audio,sr = load_audio_file(song)
    note_info = loadNoteData(jam)

    # Pass the accumulators explicitly so every song is appended to the lists
    # created above. Relying on songProcess's default lists would keep data
    # from earlier runs of this cell and duplicate frames on a re-run.
    XData,YData = songProcess(audio,note_info,sr, hopSize, winSize, XData, YData)

    counter -= 1
    if counter == 0:
        break

## Neural Network Model Testing

In [14]:
def neural_network(XTrain, XTest, YTrain, YTest, hidden_size, max_iter, solver, activation):
    """Fit an MLPClassifier and score it on the test split.

    alpha (0.0001) and learning_rate_init (0.001) are fixed; they are the
    values carried over from A1 and still need tuning for this dataset.

    Returns:
        ``(accuracy, model)``: accuracy of the fitted model on
        ``XTest``/``YTest`` and the fitted classifier.
    """
    classifier = MLPClassifier(
        hidden_layer_sizes=hidden_size,
        max_iter=max_iter,
        alpha=0.0001,
        learning_rate_init=0.001,
        solver=solver,
        activation=activation,
    )
    classifier.fit(XTrain, YTrain)

    test_predictions = classifier.predict(XTest)
    return accuracy_score(YTest, test_predictions), classifier

In [48]:
# NN Using No Scaling
# Trains on the raw frames in XData, then predicts the notes of one song.
# NOTE(review): predict() scales its frames with the 'global' mode while this
# model is trained on unscaled XData - confirm the mismatch is intended.

XTrain,XTest,YTrain,YTest = train_test_split(XData,YData,test_size=0.2)
hidden_size=(60,120)
max_iter=100000
solver='adam'
activation='tanh'
accuracy, trained_model = neural_network(XTrain,XTest,YTrain,YTest,hidden_size, max_iter, solver, activation)
print("NN Model Test Accuracy:",accuracy)

# Song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

# Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict; midi_info and bpm are reused by the following cells
midi_info = predict(predict_song_audio,predict_song_sr, trained_model,hopSize,winSize)

NN Model Test Accuracy: 0.5311355311355311


In [55]:
# Relative amplitude increase between two consecutive frames of the SAME
# predicted note that is reported below (candidate re-attack of the note).
# The threshold was previously hard-coded as 0.25 in the condition while an
# unused `error = 0.15` was defined; one named value is used now.
error = 0.25

for i in range(1,len(midi_info)):
    if midi_info[i][0] == midi_info[i-1][0] and midi_info[i][2] > (1+error)* midi_info[i-1][2]:
        # note, previous frame amplitude, current frame amplitude
        print(midi_info[i][0],
              midi_info[i-1][2],
              midi_info[i][2])

0.0 0.01389017516934801 0.017814734977924275
0.0 0.010834799693056731 0.0242384064654651
0.0 0.020743831838937944 0.03468657387806943
0.0 0.016221190992802708 0.03017326512791823
54.0 0.1287523294801621 0.2149207693238176
62.0 0.030298126209628642 0.039167397483972424
66.0 0.09823176862828112 0.12572967540371355
54.0 0.11123385901778153 0.23328113659731464
62.0 0.030673772604935284 0.03891590340359259
62.0 0.05491135217581952 0.07475398704941333
0.0 0.03706389579805249 0.06261210327960566
54.0 0.15372950718973027 0.21352000990383452
0.0 0.013277564412725292 0.024828809196201763
54.0 0.24957119609592354 0.3350823728755897
59.0 0.04642138925849764 0.059240263434438126
67.0 0.06211596653108746 0.07812370059423007
0.0 0.039511385629611706 0.10927801707844442
0.0 0.0071130653123865975 0.018916276687431958
0.0 0.009802244617152536 0.020144215140014514
0.0 0.004697588113886536 0.006246597919438731
0.0 0.006047552581045119 0.008880020903895005
0.0 0.006371104617757349 0.010532156276460626
0.0 

In [23]:

# Convert to sheet music
# Uses the notebook-level midi_info and bpm left by the last prediction cell that was run.
to_midi_sheet(midi_info,outfile="unscaled_nn_sheet_music",bpm=round(bpm))

In [None]:
# NN Using local Scaling
# Trains on XData after x_data_process(..., scale='local').
# NOTE(review): predict() scales its frames with the 'global' mode, not
# 'local' - confirm the training/prediction mismatch is intended.

XData_local = x_data_process(XData, scale='local')

XTrain,XTest,YTrain,YTest = train_test_split(XData_local,YData,test_size=0.2)
hidden_size=(60,120)
max_iter=100000
solver='adam'
activation='tanh'
accuracy, trained_model = neural_network(XTrain,XTest,YTrain,YTest,hidden_size, max_iter, solver, activation)
print("NN Model Test Accuracy:",accuracy)

# Song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

# Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict and render
midi_info = predict(predict_song_audio,predict_song_sr,trained_model,hopSize,winSize)
to_midi_sheet(midi_info,outfile="local_nn_sheet_music",bpm=round(bpm))

In [None]:
# NN Using global Scaling
# Trains on XData divided by its largest absolute value; predict() applies the
# same 'global' mode to the song being predicted.
XData_global = x_data_process(XData,scale='global')

XTrain,XTest,YTrain,YTest = train_test_split(XData_global,YData,test_size=0.2)
hidden_size=(60,120)
max_iter=10000
solver='adam'
activation='tanh'
accuracy, trained_model = neural_network(XTrain,XTest,YTrain,YTest,hidden_size, max_iter, solver, activation)
print("NN Model Test Accuracy:",accuracy)

# song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

# Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict and render
midi_info = predict(predict_song_audio,predict_song_sr,trained_model,hopSize,winSize)
to_midi_sheet(midi_info,outfile="global_nn_sheet_music",bpm=round(bpm))

In [None]:
# NN Using TSNE Reduction with local scaling
# NOTE(review): the model is fitted on the 2-component output of tsneFit(),
# but predict() feeds it frames of winSize samples - the feature counts differ,
# so the predict() call below looks like it cannot work as written. TODO confirm.
XData_local = x_data_process(XData,scale='local')
XDataTSNE = tsneFit(XData_local, 2)

XTrain,XTest,YTrain,YTest = train_test_split(XDataTSNE,YData,test_size=0.2)
hidden_size=(60,120)
max_iter=10000
solver='adam'
activation='tanh'
accuracy, trained_model = neural_network(XTrain,XTest,YTrain,YTest,hidden_size, max_iter, solver, activation)
print("NN Model Test Accuracy:",accuracy)

# song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

#Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict and render
midi_info = predict(predict_song_audio,predict_song_sr,trained_model,hopSize,winSize)
to_midi_sheet(midi_info,outfile="tsne_local_nn_sheet_music",bpm=round(bpm))

## Support Vector Machine Model Testing

In [None]:
def gaussianSVM(XTrain, XTest, YTrain, YTest):
    """Fit an RBF-kernel SVC and score it on the test split.

    Hyperparameters (gamma=0.01, C=150) are carried over from A2 and still
    need tuning for this dataset.

    Returns:
        ``(accuracy, model)`` for the fitted classifier.
    """
    svm = SVC(kernel = 'rbf', gamma = 0.01, C = 150, max_iter=100000)
    svm.fit(XTrain, YTrain)

    test_predictions = svm.predict(XTest)
    return accuracy_score(YTest, test_predictions), svm

In [None]:
def polySVM(XTrain, XTest, YTrain, YTest):
    """Fit a polynomial-kernel SVC and score it on the test split.

    Hyperparameters (gamma=0.01, C=150) are carried over from A2 and still
    need tuning for this dataset.

    Returns:
        ``(accuracy, model)`` for the fitted classifier.
    """
    svm = SVC(kernel = 'poly', gamma = 0.01, C = 150, max_iter=100000)
    svm.fit(XTrain, YTrain)

    test_predictions = svm.predict(XTest)
    return accuracy_score(YTest, test_predictions), svm

In [None]:
def sigSVM(XTrain, XTest, YTrain, YTest):
    """Fit a sigmoid-kernel SVC and score it on the test split.

    Hyperparameters (gamma=0.01, C=150) are carried over from A2 and still
    need tuning for this dataset.

    Returns:
        ``(accuracy, model)`` for the fitted classifier.
    """
    svm = SVC(kernel = 'sigmoid', gamma = 0.01, C = 150, max_iter=100000)
    svm.fit(XTrain, YTrain)

    test_predictions = svm.predict(XTest)
    return accuracy_score(YTest, test_predictions), svm


In [None]:
def preSVM(XTrain, XTest, YTrain, YTest):
    """Fit an SVC with ``kernel='precomputed'`` and score it on the test split.

    C=150 is carried over from A2 and still needs tuning for this dataset.
    NOTE(review): with a precomputed kernel the inputs are presumably expected
    to be kernel matrices rather than raw feature rows - verify at call sites.

    Returns:
        ``(accuracy, model)`` for the fitted classifier.
    """
    svm = SVC(kernel = 'precomputed', C = 150, max_iter=100000)
    svm.fit(XTrain, YTrain)

    test_predictions = svm.predict(XTest)
    return accuracy_score(YTest, test_predictions), svm

### RBF Kernel - Gaussian SVM Testing

In [None]:
# RBF Kernel - No Scaling
# Trains on the raw frames in XData.
# NOTE(review): predict() scales its frames with the 'global' mode while this
# model is trained on unscaled XData - confirm the mismatch is intended.

XTrain,XTest,YTrain,YTest = train_test_split(XData,YData,test_size=0.2)
accuracy, trained_model = gaussianSVM(XTrain,XTest,YTrain,YTest)
print("Gaussian SVM Model Test Accuracy:",accuracy)

# song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

#Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict and render
midi_info = predict(predict_song_audio,predict_song_sr,trained_model,hopSize,winSize)
to_midi_sheet(midi_info,outfile="unscaled_rbf_sheet_music",bpm=round(bpm))

In [None]:
# RBF Kernel - Local Scaling
# The scaled data must be bound to XData_local, the name used by the split
# below; it was previously assigned to a differently-cased name, so this cell
# only ran when an earlier cell had already defined XData_local.
XData_local = x_data_process(XData,scale='local')

XTrain,XTest,YTrain,YTest = train_test_split(XData_local,YData,test_size=0.2)
accuracy, trained_model = gaussianSVM(XTrain,XTest,YTrain,YTest)
print("Gaussian SVM Model Test Accuracy:",accuracy)

# song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

#Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict and render
midi_info = predict(predict_song_audio,predict_song_sr,trained_model,hopSize,winSize)
to_midi_sheet(midi_info,outfile="local_rbf_sheet_music", bpm=round(bpm))

In [None]:
# RBF Kernel - Global Scaling
# Trains on XData divided by its largest absolute value; predict() applies the
# same 'global' mode to the song being predicted.
XData_global = x_data_process(XData,scale='global')

XTrain,XTest,YTrain,YTest = train_test_split(XData_global,YData,test_size=0.2)
accuracy, trained_model = gaussianSVM(XTrain,XTest,YTrain,YTest)
print("Gaussian SVM Model Test Accuracy:",accuracy)

#song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

#Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict and render
midi_info = predict(predict_song_audio,predict_song_sr,trained_model,hopSize,winSize)
to_midi_sheet(midi_info,outfile="global_rbf_sheet_music",bpm=round(bpm))

In [None]:
# RBF Kernel - TSNE reduction with local scaling
# NOTE(review): the model is fitted on the 2-component output of tsneFit(),
# but predict() feeds it frames of winSize samples - the feature counts differ,
# so the predict() call below looks like it cannot work as written. TODO confirm.
XData_local = x_data_process(XData,scale='local')
XDataTSNE = tsneFit(XData_local, 2)

XTrain,XTest,YTrain,YTest = train_test_split(XDataTSNE,YData,test_size=0.2)
accuracy, trained_model = gaussianSVM(XTrain,XTest,YTrain,YTest)
print("Gaussian SVM Model Test Accuracy:",accuracy)

# song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

#Get bpm
# Load with load_audio_file(), the loader used for the training data, so the
# samples keep the file's own sample rate and scale. librosa.load() would
# resample and convert to float, giving frames unlike the training frames.
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict and render
midi_info = predict(predict_song_audio,predict_song_sr,trained_model,hopSize,winSize)
to_midi_sheet(midi_info,outfile="tsne_local_rbf_sheet_music",bpm=round(bpm))

### Poly Kernel - Polynomial SVM Testing

In [None]:
%%time
# Poly Kernel - No Scaling
# Trains on the raw frames in XData.

XTrain,XTest,YTrain,YTest = train_test_split(XData,YData,test_size=0.2)
PSVM_1_accuracy, PSVM_1 = polySVM(XTrain,XTest,YTrain,YTest)
print("Poly SVM Model Test Accuracy:",PSVM_1_accuracy)

# Song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

# Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict with the model trained in THIS cell. The notebook-level
# `trained_model` is whatever an earlier NN/RBF cell left behind.
midi_info = predict(predict_song_audio,predict_song_sr, PSVM_1,hopSize,winSize)

# Convert to sheet music
to_midi_sheet(midi_info,outfile="unscaled_poly_sheet_music",bpm=round(bpm))

In [None]:
%%time
# Poly Kernel - Local Scaling
XData_local = x_data_process(XData,scale='local')

XTrain,XTest,YTrain,YTest = train_test_split(XData_local,YData,test_size=0.2)
PSVM_2_accuracy, PSVM_2 = polySVM(XTrain,XTest,YTrain,YTest)
print("Poly SVM Model Test Accuracy:",PSVM_2_accuracy)

# Song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

# Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict with the model trained in THIS cell. The notebook-level
# `trained_model` is whatever an earlier NN/RBF cell left behind.
midi_info = predict(predict_song_audio,predict_song_sr, PSVM_2,hopSize,winSize)

# Convert to sheet music
to_midi_sheet(midi_info,outfile="local_poly_sheet_music",bpm=round(bpm))

In [None]:
%%time
# Poly Kernel - Global Scaling
XData_global = x_data_process(XData,scale='global')

XTrain,XTest,YTrain,YTest = train_test_split(XData_global,YData,test_size=0.2)
PSVM_3_accuracy, PSVM_3 = polySVM(XTrain,XTest,YTrain,YTest)
print("Poly SVM Model Test Accuracy:",PSVM_3_accuracy)

# Song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

# Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict with the model trained in THIS cell. The notebook-level
# `trained_model` is whatever an earlier NN/RBF cell left behind.
midi_info = predict(predict_song_audio,predict_song_sr, PSVM_3,hopSize,winSize)

# Convert to sheet music
to_midi_sheet(midi_info,outfile="global_poly_sheet_music",bpm=round(bpm))

In [None]:
%%time
# Poly Kernel - TSNE reduction with local scaling
# NOTE(review): the model is fitted on the 2-component output of tsneFit(),
# but predict() feeds it frames of winSize samples - the feature counts differ,
# so the predict() call below looks like it cannot work as written. TODO confirm.
XData_local = x_data_process(XData, scale='local')
XDataTSNE = tsneFit(XData_local, 2)

XTrain,XTest,YTrain,YTest = train_test_split(XDataTSNE,YData,test_size=0.2)
PSVM_4_accuracy, PSVM_4 = polySVM(XTrain,XTest,YTrain,YTest)
print("Poly SVM Model Test Accuracy:",PSVM_4_accuracy)

# Song to predict
song = 'DataSets/audio_mono-mic/00_BN3-119-G_solo_mic.wav'

# Get bpm (first element of the beat_track result)
predict_song_audio,predict_song_sr = load_audio_file(song)
bpm = librosa.beat.beat_track(y=predict_song_audio.astype(float), sr=predict_song_sr)[0]

# Predict with the model trained in THIS cell. The notebook-level
# `trained_model` is whatever an earlier NN/RBF cell left behind.
midi_info = predict(predict_song_audio,predict_song_sr, PSVM_4,hopSize,winSize)

# Convert to sheet music
to_midi_sheet(midi_info,outfile="tsne_local_poly_sheet_music",bpm=round(bpm))

### Sigmoid Kernel - Sigmoid SVM Testing