In [381]:
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats import norm
import math

In [382]:
def MFCCnormalizer(mfcc):
    """Standardize a vector of MFCC values and drop its first coefficient.

    The values are shifted by their overall mean and divided by their overall
    (population) standard deviation; the element at index 0 is then removed.

    Parameters
    ----------
    mfcc : sequence of float or numeric str
        MFCC values for one recording.

    Returns
    -------
    numpy.ndarray
        1-D float array holding ``len(mfcc) - 1`` standardized values
        (empty when ``mfcc`` is empty).
    """
    # np.float was removed from NumPy; the builtin float selects the same dtype.
    mfcc = np.array(mfcc).astype(float).ravel()
    if mfcc.size == 0:
        return mfcc
    eps = 2**-30
    stdev = np.std(mfcc)
    # A constant vector has zero spread: divide by eps instead so the result is
    # all zeros rather than NaN. Non-degenerate inputs are scaled by stdev as-is.
    scale = stdev if stdev > eps else eps
    output = (mfcc - np.mean(mfcc)) / scale
    # Discard the first coefficient.
    return np.delete(output, 0)

In [393]:
def fileProcessor(F, trainingSize, random_state=None):
    """Parse feature-vector XML files and split each one into train/test sets.

    Every ``data_set`` element directly under a file's root yields one sample.
    Its label is the second-to-last ``/``-separated component of the
    ``data_set_id`` text, and its data is a list of ``[feature_name, values]``
    pairs for the selected features. Each file is split on its own, so every
    file contributes the same train/test ratio to the combined result even
    though the individual samples are picked at random.

    Parameters
    ----------
    F : iterable
        File names of the XML files to read.
    trainingSize : float or int
        Forwarded to ``train_test_split`` as ``test_size``: despite its name
        it controls how much of each file is held out for TESTING.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. ``None`` (the default) gives a
        different split on every call; pass an int for a reproducible split.

    Returns
    -------
    tuple
        ``(training, testing)``; each is ``[x, y]`` with ``x`` the list of
        per-sample feature lists and ``y`` the matching list of labels.
    """
    # Features to extract. Other candidates that were tried:
    # 'Power Spectrum Overall Average', 'Spectral Flux Overall Average',
    # 'Beat Histogram Overall Average'.
    selected = ('MFCC Overall Average',)

    x_train, x_test = [], []   # training / testing data
    y_train, y_test = [], []   # training / testing labels
    for file in F:
        x = []   # data for this file
        y = []   # labels for this file
        root = ET.parse(file).getroot()
        for child in root.findall('data_set'):
            # 'data_set_id' is a path whose parent directory is the label
            # (i.e. the composer name).
            fileID = child.find('data_set_id').text
            y.append(fileID.split('/')[-2])

            FeatureList = []
            for feature in child.iter('feature'):
                name = feature.find('name').text
                if name not in selected:
                    continue
                v = [i.text for i in feature.findall('v')]
                if name == 'MFCC Overall Average':
                    v = MFCCnormalizer(v)
                FeatureList.append([name, v])
            x.append(FeatureList)

        # Split per file so each file keeps the same train/test ratio.
        x_trainC, x_testC, y_trainC, y_testC = train_test_split(
            x, y, test_size=trainingSize, random_state=random_state)
        x_train.extend(x_trainC)
        x_test.extend(x_testC)
        y_train.extend(y_trainC)
        y_test.extend(y_testC)

    training = [x_train, y_train]
    testing = [x_test, y_test]
    return (training, testing)

In [394]:
def featureProcessor(data, labels=('Beethoven', 'Schubert')):
    """Estimate per-composer Gaussian parameters and class priors.

    Parameters
    ----------
    data : sequence
        ``[x_train, y_train]`` where ``x_train[i]`` is a list of
        ``[feature_name, values]`` pairs for song ``i`` and ``y_train[i]`` is
        that song's composer label.
    labels : sequence of str, optional
        Composer labels to model; the outputs follow this order. Songs with
        any other label are ignored.

    Returns
    -------
    tuple
        ``(dataTrain, priors)``. ``dataTrain[c]`` is a list of
        ``[feature_name, [[mean, stdev], ...]]`` entries for ``labels[c]``,
        with one ``[mean, stdev]`` pair per feature value computed across that
        composer's songs. ``priors[c]`` is the share of the selected songs
        that carry ``labels[c]``.

    Raises
    ------
    ValueError
        If one of ``labels`` has no training songs.
    """
    x_train, y_train = data[0], data[1]
    songs_by_label = [
        [song for song, label in zip(x_train, y_train) if label == wanted]
        for wanted in labels
    ]
    for wanted, songs in zip(labels, songs_by_label):
        if not songs:
            raise ValueError('no training songs labelled %r' % (wanted,))

    total = sum(len(songs) for songs in songs_by_label)
    priors = [len(songs) / total for songs in songs_by_label]

    dataTrain = []
    for songs in songs_by_label:
        featureList = []
        # The first song defines which features exist and how many values
        # each one has; every other song is read at the same positions.
        for feature_idx, (name, first_values) in enumerate(songs[0]):
            featureData = []
            for value_idx in range(len(first_values)):
                # np.float was removed from NumPy; builtin float is equivalent.
                column = np.array(
                    [song[feature_idx][1][value_idx] for song in songs]
                ).astype(float)
                featureData.append([np.mean(column), np.std(column)])
            featureList.append([name, featureData])
        dataTrain.append(featureList)
    return (dataTrain, priors)

In [395]:
def likelihood(x, mean, stdev):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Computes ``exp(-(x - mean)**2 / (2 * stdev**2)) / sqrt(2 * pi * stdev**2)``.

    Parameters
    ----------
    x : float
        Point at which to evaluate the density.
    mean : float
        Mean of the distribution.
    stdev : float
        Standard deviation of the distribution (not the variance).

    Returns
    -------
    float
        Density of ``N(mean, stdev**2)`` at ``x``.
    """
    # Floor the spread so a value that never varied in training (stdev == 0)
    # yields a finite density instead of a division by zero.
    stdev = max(float(stdev), 2**-30)
    # The density is parameterised by the variance, i.e. the squared stdev.
    variance = stdev * stdev
    exponent = -((x - mean) ** 2) / (2.0 * variance)
    return math.exp(exponent) / math.sqrt(2.0 * math.pi * variance)

In [386]:
def _gaussian_log_density(x, mean, stdev, min_stdev=2**-30):
    """Return the natural log of the normal density N(mean, stdev**2) at ``x``."""
    # Floor the spread so a value that never varied in training (stdev == 0)
    # cannot cause a division by zero.
    stdev = max(float(stdev), min_stdev)
    z = (x - mean) / stdev
    return -0.5 * z * z - math.log(stdev) - 0.5 * math.log(2.0 * math.pi)


def NaiveBayesTesting(trainingData, processedDataD, testingData,
                      labels=('Beethoven', 'Schubert')):
    """Classify the test songs with Gaussian naive Bayes and print the results.

    For every song and composer the score is ``log(prior)`` plus the sum of
    the per-value Gaussian log-densities (i.e. the log of the product of
    likelihoods). The composer with the highest score is predicted; on a tie
    the earlier entry of ``labels`` wins, so every song gets exactly one
    prediction. Comparing these scores picks the same winner as comparing the
    normalised posteriors, without the underflow or zero denominator that
    multiplying raw densities can produce.

    Parameters
    ----------
    trainingData : any
        Unused; kept so existing calls keep working.
    processedDataD : sequence
        ``(stats, priors)``: ``stats[c]`` is a list of
        ``[feature_name, [[mean, stdev], ...]]`` entries for composer ``c``
        and ``priors[c]`` is that composer's prior probability.
    testingData : sequence
        ``[x_test, y_test]`` where ``x_test[i]`` is a list of
        ``[feature_name, values]`` pairs and ``y_test[i]`` the true label.
    labels : sequence of str, optional
        Name reported for each composer, in the same order as ``stats``.

    Returns
    -------
    list of str
        Predicted label for each song in ``x_test``.

    Raises
    ------
    ValueError
        If ``labels`` and ``stats`` differ in length.
    """
    processedData, priors = processedDataD[0], processedDataD[1]
    x_test, y_test = testingData[0], testingData[1]
    if len(labels) != len(processedData):
        raise ValueError('expected %d labels, got %d'
                         % (len(processedData), len(labels)))

    results = []
    for song in x_test:
        scores = []
        for composerStats, prior in zip(processedData, priors):
            # A zero prior rules the composer out entirely.
            score = math.log(prior) if prior > 0 else float('-inf')
            for feature, featureStats in zip(song, composerStats):
                for value, (mean, stdev) in zip(feature[1], featureStats[1]):
                    score += _gaussian_log_density(float(value), mean, stdev)
            scores.append(score)
        # list.index returns the first maximum, which settles ties.
        results.append(labels[scores.index(max(scores))])

    print('Result | True')
    for predicted, actual in zip(results, y_test):
        print(predicted, actual)
    return results

In [392]:
# XML feature files to load, one entry per file.
v = ['beethovenValues.xml', 'schubertValues.xml']
# d = (training, testing); 0.33 is passed through to train_test_split as test_size.
d = fileProcessor(v, 0.33)
# p = (per-composer feature statistics, priors), computed from the training split only.
p = featureProcessor(d[0])
# Classify the held-out split and print predicted vs. true labels.
NaiveBayesTesting(d[0], p, d[1])

Result | True
Beethoven Beethoven
Beethoven Beethoven
Beethoven Beethoven
Beethoven Beethoven
Beethoven Beethoven
Beethoven Beethoven
Beethoven Beethoven
Beethoven Beethoven
Beethoven Beethoven
Beethoven Beethoven
Beethoven Schubert
Beethoven Schubert
Beethoven Schubert
Beethoven Schubert
Beethoven Schubert
Beethoven Schubert
Beethoven Schubert
Beethoven Schubert
Beethoven Schubert
Beethoven Schubert


hi bye
