In [None]:
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy.stats import norm
import math
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [None]:
def fileProcessor(F, featName, trainingSize):
    """Build standardized train/test matrices from feature-vector XML files.

    Every file in ``F`` is parsed. Each ``data_set`` element directly under
    the root yields one sample per ``section`` element it contains, except
    its first section, which is skipped. A sample is the concatenation, in
    document order, of the ``v`` values of every ``feature`` whose ``name``
    text is in ``featName``. Each file is split into train/test parts on its
    own, the parts are pooled across files, and a ``StandardScaler`` fitted
    on the pooled training samples is applied to both parts.

    Parameters
    ----------
    F : iterable of str
        Paths of the XML files to read. Each path is printed when read.
    featName : collection of str
        Feature names to keep (exact match against the ``name`` text).
    trainingSize : float or int
        Forwarded to ``train_test_split`` as ``test_size``. Despite the
        name, it therefore sizes the held-out *test* part.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. The ``x`` arrays hold one
        row per sample; the ``y`` arrays hold the integer composer id of each
        sample (0 Beethoven, 1 Schubert, 2 Mozart, 3 anything else), taken
        from the second-to-last ``/``-separated part of ``data_set_id``.

    Raises
    ------
    ValueError
        If ``F`` is empty, a file yields no samples, a section contains none
        of the requested features, or samples differ in length.
    """
    composer_ids = {'Beethoven': 0, 'Schubert': 1, 'Mozart': 2}

    # Per-file splits are collected in lists and joined once at the end.
    train_parts, test_parts = [], []
    train_labels, test_labels = [], []

    for file in F:
        print(file)
        root = ET.parse(file).getroot()

        samples = []   # one (1, n_values) row per usable section
        labels = []    # composer id of each row in `samples`
        for data_set in root:
            if data_set.tag != 'data_set':
                continue
            fileID = data_set.find('data_set_id').text
            composerID = composer_ids.get(fileID.split('/')[-2], 3)

            # The first section of every data_set is skipped.
            for section in list(data_set.iter('section'))[1:]:
                # Every matching feature's values are appended to the same
                # vector, so a sample always maps to exactly one label.
                values = [
                    value.text
                    for feature in section.iter('feature')
                    if feature.find('name').text in featName
                    for value in feature.findall('v')
                ]
                if not values:
                    raise ValueError(
                        'section without any requested feature in ' + str(file))
                # Builtin float: the np.float alias no longer exists in
                # current NumPy releases.
                samples.append(np.array(values).astype(float).reshape(1, -1))
                labels.append(composerID)

        if not samples:
            raise ValueError('no usable sections found in ' + str(file))

        x = np.concatenate(samples, axis=0)   # raises ValueError on ragged rows
        y = np.array(labels)
        x_trainC, x_testC, y_trainC, y_testC = train_test_split(
            x, y, test_size=trainingSize)
        train_parts.append(x_trainC)
        test_parts.append(x_testC)
        train_labels.append(y_trainC)
        test_labels.append(y_testC)

    if not train_parts:
        raise ValueError('no input files given')

    x_train = np.concatenate(train_parts, axis=0)
    x_test = np.concatenate(test_parts, axis=0)
    y_train = np.concatenate(train_labels)
    y_test = np.concatenate(test_labels)

    # Fit the scaler on training data only, then reuse it for the test data.
    scaleX = StandardScaler()
    x_train = scaleX.fit_transform(x_train)
    x_test = scaleX.transform(x_test)

    return(x_train, x_test, y_train, y_test)

In [None]:
def LogisticRegression(data, learningRate=0.1, iter_n=100):
    """One-vs-rest logistic regression trained with batch gradient steps.

    For every distinct label in ``y_train`` a binary model (that label vs.
    everything else) is fitted, starting from all-ones weights. A test row
    is assigned the label whose model gives the largest linear score; on an
    exact tie the largest label wins.

    Parameters
    ----------
    data : sequence
        ``(x_train, x_test, y_train, y_test)``; the ``x`` entries are 2-D
        with one row per sample.
    learningRate : float, optional
        Step size of each weight update (default 0.1).
    iter_n : int, optional
        Number of update passes per binary model (default 100).

    Returns
    -------
    tuple
        ``(y_test, pred)`` where ``y_test`` is passed through unchanged and
        ``pred`` is a list with one predicted label per test row.
    """
    x_train, x_test, y_train, y_test = data[0], data[1], data[2], data[3]
    # Allow plain lists of labels; `==` below must compare element-wise.
    y_train = np.asarray(y_train)

    # Prepend a constant-1 column so that w[0] acts as the intercept.
    x_train = np.insert(x_train, 0, 1, axis=1)
    x_test = np.insert(x_test, 0, 1, axis=1)
    m = x_train.shape[0]

    weights = []
    for label in np.unique(y_train):
        target = np.where(y_train == label, 1, 0)
        w = np.ones(x_train.shape[1])
        for _ in range(iter_n):
            # Residual between the 0/1 target and the sigmoid of the score.
            errors = target - 1 / (1 + np.exp(-x_train.dot(w)))
            w += learningRate / m * errors.dot(x_train)
        weights.append((w, label))

    # Tuples compare by score first, then by label, which settles ties.
    pred = [max((row.dot(w), label) for w, label in weights)[1] for row in x_test]
    return(y_test, pred)

In [None]:
def NaiveBayes(data):
    """Gaussian naive Bayes classifier fitted and applied in one call.

    ``data`` is ``(x_train, x_test, y_train, y_test)``. For every distinct
    label in ``y_train`` the per-feature mean and variance of its training
    rows are estimated, together with a prior equal to the label's share of
    the training rows. Each test row receives the label with the highest
    log prior plus Gaussian log-likelihood summed over the features.

    Returns ``(y_test, y_pred)``: ``y_test`` is passed through unchanged and
    ``y_pred`` is an array with one predicted label per test row.
    """
    x_train, x_test, y_train, y_test = data[0], data[1], data[2], data[3]

    composers = np.unique(y_train)
    n_classes = len(composers)
    n_features = x_train.shape[1]

    # Small variance floor, scaled by the largest feature variance, so the
    # division and logarithm below never see a zero variance.
    epsilon = (1e-9)*np.var(x_train, axis=0).max()

    mean = np.zeros((n_classes, n_features))
    var = np.zeros((n_classes, n_features))
    composer_count = np.zeros(n_classes, dtype=np.float64)

    # `composers` is sorted and unique, so the enumeration index is the row
    # that belongs to each label.
    for row, label in enumerate(composers):
        members = x_train[y_train == label, :]
        mean[row, :] = np.mean(members, axis=0)
        var[row, :] = np.var(members, axis=0)
        composer_count[row] += members.shape[0]

    var += epsilon
    composer_prior = composer_count / composer_count.sum()

    likelihood = []
    for row in range(n_classes):
        log_prior = np.log(composer_prior[row])
        log_norm = - 0.5 * np.sum(np.log(2. * np.pi * var[row, :]))
        sq_dist = 0.5 * np.sum(((x_test - mean[row, :]) ** 2) / (var[row, :]), 1)
        likelihood.append(log_prior + (log_norm - sq_dist))

    # One row per test sample, one column per label.
    scores = np.array(likelihood).T
    y_pred = composers[np.argmax(scores, axis=1)]

    return(y_test, y_pred)

In [None]:
def plot(data):
    """Score a set of predictions; nothing is drawn here.

    ``data`` is a ``(y_test, y_pred)`` pair as returned by the classifiers
    in this notebook. Returns ``(cm, accuracy)`` where ``cm`` is the result
    of ``confusion_matrix(y_test, y_pred)`` and ``accuracy`` is the
    normalized accuracy score multiplied by 100.
    """
    actual, predicted = data[0], data[1]
    cm = confusion_matrix(actual, predicted)
    accuracy_pct = accuracy_score(actual, predicted, normalize=True)*100
    return(cm, accuracy_pct)

In [None]:
def printM(matrix):
    """Show a confusion matrix as an annotated colour grid.

    ``matrix`` must expose ``.shape`` and ``matrix[row][col]`` indexing. It
    is read in the layout produced by ``confusion_matrix(y_true, y_pred)``:
    rows are the actual classes and columns the predicted ones. ``imshow``
    draws rows along the vertical axis, so the y-axis is labelled 'Actual'
    and the x-axis 'Prediction'.

    The tick labels are the four fixed composer names, in the order of the
    integer ids assigned by ``fileProcessor`` (0..3).
    """
    targets = ['Beethoven', 'Schubert', 'Mozart', 'Chopin']
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Pastel1)
    plt.title('Confusion matrix')
    tick_marks = np.arange(len(targets))
    plt.xticks(tick_marks, targets, rotation=90)
    plt.yticks(tick_marks, targets)

    # Write each count into its cell; xy is (column, row) in image coordinates.
    n_rows, n_cols = matrix.shape
    for row in range(n_rows):
        for col in range(n_cols):
            plt.annotate(str(matrix[row][col]), xy=(col, row),
                        horizontalalignment = 'center',
                        verticalalignment= 'center')

    plt.ylabel('Actual')
    plt.xlabel('Prediction')
    # Run the layout pass after the labels exist so they are accounted for.
    plt.tight_layout()
    plt.show()

In [None]:
# Input files (one per composer) and the feature names to evaluate.
v = ['beethovenValues.xml', 'schubertValues.xml', 'mozartValues.xml', 'chopinValues.xml']
features = [
    'Spectral Flux',
    'Compactness',
    'Spectral Variability',
    'Root Mean Square',
    'Zero Crossings',
    'Strongest Frequency Via Zero Crossings',
    'Strongest Frequency Via Spectral Centroid',
    'Strongest Frequency Via FFT Maximum',
    'MFCC',
    'LPC',
    'Method of Moments',
    'Partial Based Spectral Centroid',
    'Partial Based Spectral Flux',
    'Peak Based Spectral Smoothness',
    'Relative Difference Function',
    'Area Method of Moments',
    'Area Method of Moments of MFCCs',
]


def _report(title, classifier, data):
    """Print ``title``, run ``classifier(data)``, then show its confusion
    matrix and print its accuracy. Returns the classifier's result."""
    print(title)
    result = classifier(data)
    # Score once and reuse both parts instead of calling plot() twice.
    cm, accuracy = plot(result)
    printM(cm)
    print('Accuracy = '+str(accuracy))
    return result


# --- One split per single feature: confusion matrix + accuracy per model ---
for feature in features:
    print(feature)
    d = fileProcessor(v, [feature], 0.33)
    n = _report('Naive Bayes', NaiveBayes, d)
    print('')
    l = _report('Logistic Regression', LogisticRegression, d)
    print('')
    print('')

# --- One split using every feature at once ---
print('All Features')
d = fileProcessor(v, features, 0.33)
n = _report('Naive Bayes', NaiveBayes, d)
l = _report('Logistic Regression', LogisticRegression, d)
print('')

# --- Mean accuracy per single feature over `nm` independent splits ---
# Every iteration calls fileProcessor again, i.e. re-reads and re-splits all
# files, so this loop dominates the run time.
# NOTE(review): no random seed is set anywhere in this notebook, so the splits
# (and therefore all reported numbers) presumably differ between runs.
nm = 1000
print('Performing Methods Over '+str(nm)+' Iterations')
for feature in features:
    print(feature)
    sumN = 0
    sumL = 0
    for _ in range(nm):
        d1 = fileProcessor(v, [feature], 0.33)
        n1 = NaiveBayes(d1)
        sumN = sumN + plot(n1)[1]
        l1 = LogisticRegression(d1)
        sumL = sumL + plot(l1)[1]
    print('Naive Bayes Accuracy')
    print(sumN/nm)
    print('')
    print('Logistic Regression Accuracy')
    print(sumL/nm)
    print('')
    
    
# print('Over All Features')
# sumN = 0
# sumL = 0
# for k in range(nm):
#     dj = fileProcessor(v, features, 0.33)
#     nj = NaiveBayes(dj)
#     sumN = sumN + plot(nj)[1]
#     lj = LogisticRegression(dj)
#     sumL = sumL + plot(lj)[1]
# print('Naive Bayes Accuracy')
# print(sumN/nm)
# print('')
# print('Logistic Regression Accuracy')
# print(sumL/nm)
# print('')