In [324]:
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy.stats import norm
import math
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [424]:
def standardize(v):
    """Convert a sequence of numeric values into a float column vector.

    Despite its name, this function does not rescale anything; it only
    converts the values to floats and arranges them as a single column.

    Parameters
    ----------
    v : sequence
        Values convertible to float (for example the text of XML ``<v>``
        elements).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, 1)`` where ``n`` is the number of values.
    """
    # ``np.float`` was merely an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin.
    v = np.array(v).astype(float)
    v = v.reshape(1, -1)
    return v.T

In [529]:
def fileProcessor(F, featName, trainingSize, random_state=None):
    """Build scaled train/test sets from ONE named feature across XML files.

    Every ``data_set`` element directly under a file's root contributes one
    sample: its label is derived from the second-to-last ``/``-separated
    component of ``data_set_id`` and its feature vector from the ``<v>``
    values of the ``feature`` element whose ``name`` equals ``featName``.
    Each file is split separately with ``train_test_split``; the per-file
    partitions are then concatenated and standardized with a
    ``StandardScaler`` fitted on the training partition only.

    Parameters
    ----------
    F : iterable
        XML files to read (anything ``ET.parse`` accepts).
    featName : str
        Exact name of the feature to extract.
    trainingSize : float or int
        Forwarded to ``train_test_split`` as ``test_size``. Note that, despite
        the parameter's name, it controls the size of the *test* partition.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an int for a reproducible split
        (the same seed is used for every file).

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. The ``x`` arrays have one row
        per sample and are scaled; the ``y`` arrays hold the integer labels
        (0 Beethoven, 1 Schubert, 2 Mozart, 3 anything else).

    Raises
    ------
    ValueError
        If ``F`` is empty, a file contains no ``data_set`` element, or a
        ``data_set`` does not contain exactly one feature named ``featName``.
    """
    composer_labels = {'Beethoven': 0, 'Schubert': 1, 'Mozart': 2}
    other_label = 3  # every composer not listed above

    x_train_parts, x_test_parts = [], []
    y_train_parts, y_test_parts = [], []

    for file in F:
        rows = []  # one flat feature vector per data_set
        y = []     # one label per data_set
        root = ET.parse(file).getroot()
        for child in root:
            if child.tag != 'data_set':
                continue
            # 'data_set_id' is a path whose parent directory is the composer.
            fileID = child.find('data_set_id').text
            composerName = fileID.split('/')[-2]
            y.append(composer_labels.get(composerName, other_label))

            matches = [feature for feature in child.iter('feature')
                       if feature.find('name').text == featName]
            # Exactly one match keeps labels and samples aligned; previously a
            # missing feature in one data_set and a duplicate in another could
            # cancel out and silently pair samples with the wrong labels.
            if len(matches) != 1:
                raise ValueError(
                    'data_set {!r} in {!r} contains {} features named {!r}; '
                    'expected exactly one'.format(fileID, file, len(matches), featName))
            values = [value.text for value in matches[0].findall('v')]
            rows.append(standardize(values).ravel())

        if not rows:
            raise ValueError('no data_set elements found in {!r}'.format(file))

        x = np.vstack(rows)  # shape: (samples, feature values)
        y = np.array(y)
        # Split per file so every file contributes to both partitions.
        x_trainC, x_testC, y_trainC, y_testC = train_test_split(
            x, y, test_size=trainingSize, random_state=random_state)
        x_train_parts.append(x_trainC)
        x_test_parts.append(x_testC)
        y_train_parts.append(y_trainC)
        y_test_parts.append(y_testC)

    if not x_train_parts:
        raise ValueError('F must contain at least one file')

    x_train = np.concatenate(x_train_parts, axis=0)
    x_test = np.concatenate(x_test_parts, axis=0)
    y_train = np.concatenate(y_train_parts)
    y_test = np.concatenate(y_test_parts)

    # Fit the scaler on the training data only and reuse it for the test data.
    scaleX = StandardScaler()
    x_train = scaleX.fit_transform(x_train)
    x_test = scaleX.transform(x_test)

    return (x_train, x_test, y_train, y_test)

In [530]:
def fileProcessor2(F, featName, trainingSize, random_state=None):
    """Build scaled train/test sets from SEVERAL named features across XML files.

    Every ``data_set`` element directly under a file's root contributes one
    sample: its label is derived from the second-to-last ``/``-separated
    component of ``data_set_id`` and its feature vector is the concatenation
    of the ``<v>`` values of every ``feature`` element whose ``name`` is in
    ``featName``, in the order the features appear in the XML. Each file is
    split separately with ``train_test_split``; the per-file partitions are
    then concatenated and standardized with a ``StandardScaler`` fitted on
    the training partition only.

    Parameters
    ----------
    F : iterable
        XML files to read (anything ``ET.parse`` accepts).
    featName : str or collection of str
        Names of the features to extract. A single string is treated as one
        feature name (exact match).
    trainingSize : float or int
        Forwarded to ``train_test_split`` as ``test_size``. Note that, despite
        the parameter's name, it controls the size of the *test* partition.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an int for a reproducible split
        (the same seed is used for every file).

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. The ``x`` arrays have one row
        per sample and are scaled; the ``y`` arrays hold the integer labels
        (0 Beethoven, 1 Schubert, 2 Mozart, 3 anything else).

    Raises
    ------
    ValueError
        If ``F`` is empty, a file contains no ``data_set`` element, a
        ``data_set`` contains none of the requested features, or the samples
        do not all have the same number of values.
    """
    # ``text in 'some string'`` is a substring test, which would match partial
    # feature names; wrap a lone string so matching is always by equality.
    wanted = (featName,) if isinstance(featName, str) else featName

    composer_labels = {'Beethoven': 0, 'Schubert': 1, 'Mozart': 2}
    other_label = 3  # every composer not listed above

    x_train_parts, x_test_parts = [], []
    y_train_parts, y_test_parts = [], []

    for file in F:
        rows = []  # one concatenated feature vector per data_set
        y = []     # one label per data_set
        root = ET.parse(file).getroot()
        for child in root:
            if child.tag != 'data_set':
                continue
            # 'data_set_id' is a path whose parent directory is the composer.
            fileID = child.find('data_set_id').text
            composerName = fileID.split('/')[-2]
            y.append(composer_labels.get(composerName, other_label))

            pieces = []
            for feature in child.iter('feature'):
                if feature.find('name').text in wanted:
                    values = [value.text for value in feature.findall('v')]
                    pieces.append(standardize(values))
            if not pieces:
                raise ValueError(
                    'data_set {!r} in {!r} contains none of the requested '
                    'features'.format(fileID, file))
            # Stack the column vectors end to end into one flat sample.
            rows.append(np.concatenate(pieces, axis=0).ravel())

        if not rows:
            raise ValueError('no data_set elements found in {!r}'.format(file))

        x = np.vstack(rows)  # shape: (samples, concatenated feature values)
        y = np.array(y)
        # Split per file so every file contributes to both partitions.
        x_trainC, x_testC, y_trainC, y_testC = train_test_split(
            x, y, test_size=trainingSize, random_state=random_state)
        x_train_parts.append(x_trainC)
        x_test_parts.append(x_testC)
        y_train_parts.append(y_trainC)
        y_test_parts.append(y_testC)

    if not x_train_parts:
        raise ValueError('F must contain at least one file')

    x_train = np.concatenate(x_train_parts, axis=0)
    x_test = np.concatenate(x_test_parts, axis=0)
    y_train = np.concatenate(y_train_parts)
    y_test = np.concatenate(y_test_parts)

    # Fit the scaler on the training data only and reuse it for the test data.
    scaleX = StandardScaler()
    x_train = scaleX.fit_transform(x_train)
    x_test = scaleX.transform(x_test)

    return (x_train, x_test, y_train, y_test)

In [531]:
def NaiveBayes(data):
    """Estimate per-class Gaussian statistics for a naive Bayes classifier.

    Parameters
    ----------
    data : sequence
        ``(x_train, x_test, y_train, y_test)``. ``x_train`` is indexed as
        ``[sample, feature]`` and ``y_train`` holds one label per sample.

    Returns
    -------
    tuple
        ``(composers, composer_prior, var, mean, x_test, y_test)`` where
        ``composers`` are the sorted unique training labels, ``composer_prior``
        their relative frequencies, and ``var`` / ``mean`` are
        ``(classes, features)`` arrays of per-class statistics. ``x_test`` and
        ``y_test`` are passed through untouched.
    """
    x_train, x_test, y_train, y_test = (data[k] for k in range(4))

    # Small variance floor, proportional to the largest per-feature variance,
    # added below so that no class variance is exactly zero.
    smoothing = (1e-9) * np.var(x_train, axis=0).max()

    n_features = x_train.shape[1]
    labels = np.unique(y_train)
    n_classes = len(labels)

    class_mean = np.zeros((n_classes, n_features))
    class_var = np.zeros((n_classes, n_features))
    class_count = np.zeros(n_classes, dtype=np.float64)

    # np.unique returns sorted, distinct labels, so the enumeration index is
    # the row each label occupies in the statistics arrays.
    for row, label in enumerate(labels):
        members = x_train[y_train == label, :]
        class_mean[row, :] = np.mean(members, axis=0)
        class_var[row, :] = np.var(members, axis=0)
        class_count[row] += members.shape[0]

    class_var[:, :] += smoothing

    class_prior = class_count / class_count.sum()
    return (labels, class_prior, class_var, class_mean, x_test, y_test)

In [532]:
def testing(data):
    """Classify the test samples with the fitted model and print the confusion matrix.

    Parameters
    ----------
    data : sequence
        ``(composers, composer_prior, var, mean, X, y_test)`` as returned by
        ``NaiveBayes``: class labels, class priors, per-class variances and
        means indexed as ``[class, feature]``, the test samples and their
        true labels.

    Returns
    -------
    None
        The confusion matrix of ``y_test`` against the predictions is printed,
        not returned.
    """
    composers, composer_prior, var, mean, X, y_test = (data[k] for k in range(6))

    joint_log_likelihood = []
    for k in range(np.size(composers)):
        log_prior = np.log(composer_prior[k])
        # Log of the Gaussian normalisation constant, summed over features.
        log_norm = - 0.5 * np.sum(np.log(2. * np.pi * var[k, :]))
        # Variance-weighted squared distance of every sample to the class mean.
        weighted_sq_dist = 0.5 * np.sum(((X - mean[k, :]) ** 2) /
                                        (var[k, :]), 1)
        joint_log_likelihood.append(log_prior + (log_norm - weighted_sq_dist))

    # One row per sample, one column per class; pick the most likely class.
    scores = np.array(joint_log_likelihood).T
    best_class = np.argmax(scores, axis=1)
    y_pred = composers[best_class].T

    print(confusion_matrix(y_test, y_pred))

In [536]:
# XML feature files to evaluate (presumably one per composer -- the labels
# themselves are read from each file's 'data_set_id' entries).
v = [
    'beethovenValues.xml',
    'schubertValues.xml',
    'mozartValues.xml',
    'chopinValues.xml',
]
features = [
    'Power Spectrum Overall Average',
    'Spectral Flux Overall Average',
    'Beat Histogram Overall Average',
    'MFCC Overall Average',
]

# Evaluate each feature on its own. The 0.33 is handed to train_test_split as
# test_size by fileProcessor, i.e. about a third of each file is held out.
for i, featureName in enumerate(features):
    print(featureName)
    d = fileProcessor(v, featureName, 0.33)
    n = NaiveBayes(d)
    testing(n)
    print('')

# Evaluate all features combined into a single vector per piece.
print('All Features')
d = fileProcessor2(v, features, 0.33)
n = NaiveBayes(d)
testing(n)

Power Spectrum Overall Average
[[6 0 4 0]
 [5 1 3 1]
 [0 0 7 0]
 [4 2 6 4]]

Spectral Flux Overall Average
[[4 2 4 0]
 [1 0 6 3]
 [0 3 4 0]
 [2 1 7 6]]

Beat Histogram Overall Average
[[4 1 3 2]
 [3 1 3 3]
 [3 0 3 1]
 [5 1 2 8]]

MFCC Overall Average
[[4 4 1 1]
 [1 7 1 1]
 [1 0 5 1]
 [2 7 2 5]]

All Features
[[5 2 2 1]
 [6 1 2 1]
 [0 1 5 1]
 [6 1 2 7]]
