In [14]:
# import the necessary packages
import numpy as np
import tensorflow as tf
import pandas as pd
import math
from random import shuffle
import os
import sox
import librosa

In [15]:
# defining some constants
# width of one training example (samples per audio segment); also the autoencoder input/output size
input_dim = 500
# number of rows preallocated by load_data (presumably 4 genres x 100 clips x 20 segments -- TODO confirm)
numexamples = 8000
# number of genres, i.e. the width of the one-hot label vectors and of the classifier output
num_classes = 4
# learning rate handed to the Adam optimizer in train()
alpha = 0.0001
# number of passes over the training set in train()
num_epochs = 200
# default mini-batch size used by get_batches()
batch_size = 512
# factor applied to the cross-entropy term before it is added to the reconstruction error
classificationweight = 0.1

In [16]:
# one-hot encodes a genre index as a single-row array
def get_one_hot(label_num, num_classes = 4):
    """Return a ``(1, num_classes)`` float array with a 1 in column ``int(label_num)``.

    Args:
        label_num: class index; it is passed through ``int()`` before use.
        num_classes: length of the encoded row.

    Returns:
        A 2-D NumPy array of zeros with a single 1.

    Raises:
        IndexError: if the index is outside the row.
    """
    # Selecting one row of the identity matrix with a list index yields a fresh (1, num_classes) array.
    return np.eye(num_classes)[[int(label_num)]]

In [17]:
# loads the audio clips and preprocesses them into fixed-length, labelled segments
def load_data():
	"""Read the genre ``.wav`` files and cut each clip into labelled segments.

	For every ``.wav`` file under ``./genres_original/<genre>`` the first
	600000 samples are averaged in groups of 40 (15000 values), and the first
	``numsplit`` windows of ``input_dim`` values are stored as separate
	examples labelled with the one-hot encoding of the genre's index in
	``allgenres``. Only the first ``numsplit * input_dim`` averaged values of
	a clip are used; the rest are discarded.

	Clips shorter than 600000 samples are skipped with a message instead of
	crashing the reshape, and rows that were never filled are dropped from the
	result rather than being returned as all-zero examples with all-zero labels.

	Returns:
		(songs, onehotlabels): two pandas DataFrames with one row per stored
		segment, ``input_dim`` and ``num_classes`` columns wide respectively.

	Raises:
		IndexError: if more than ``numexamples`` segments are produced.
	"""
	print('Reading data...')
	songs = np.zeros((numexamples, input_dim))
	onehotlabels = np.zeros((numexamples, num_classes))
	counter = 0
	# used four classes
	# jazz removed due to corrupt files in the dataset
	allgenres = ['classical', 'country', 'metal', 'pop']

	rawlength = 600000  # samples kept from the start of every clip
	poolsize = 40       # consecutive samples averaged into one value
	# each segment is input_dim averaged values long
	# (about 0.9 s if the clip was loaded at librosa's default 22050 Hz -- TODO confirm)
	numsplit = 20
	sizesplit = input_dim
	for index, genre in enumerate(allgenres):
		genredir = './genres_original/' + genre
		# sorted() makes the row order independent of the filesystem's listing order
		for filename in sorted(os.listdir(genredir)):
			if not filename.endswith(".wav"):
				continue
			audio, _ = librosa.core.load(genredir + '/' + filename)
			if len(audio) < rawlength:
				# the reshape below needs exactly rawlength samples
				print('Skipping ' + filename + ': only ' + str(len(audio)) + ' samples')
				continue
			# downsample by averaging every poolsize consecutive samples
			audio = audio[:rawlength].reshape(rawlength // poolsize, poolsize)
			audio = np.mean(audio, axis=1)

			for j in range(numsplit):
				songs[counter] = audio[(sizesplit * j) : (sizesplit * (j + 1))]
				# pass num_classes explicitly so the label width always matches onehotlabels
				onehotlabels[counter] = get_one_hot(index, num_classes)
				counter += 1
	# keep only the rows that were actually written
	songs = pd.DataFrame(songs[:counter])
	onehotlabels = pd.DataFrame(onehotlabels[:counter])
	print('Data reading done :)')
	return songs, onehotlabels

In [18]:
# get_placeholders builds the input, label and dropout keep-probability placeholders of the network
def get_placeholders():
	"""Create the graph inputs and register the data placeholders in collections.

	Returns:
		(inputs_placeholder, labels_placeholder, keep_prob) where the first two
		are float32 placeholders of shape (None, input_dim) and
		(None, num_classes), and keep_prob is a scalar placeholder that
		evaluates to 1.0 when it is not fed.
	"""
	v1 = tf.compat.v1
	inputs_placeholder = v1.placeholder(tf.float32, (None, input_dim))
	labels_placeholder = v1.placeholder(tf.float32, (None, num_classes))
	# stored under these collection keys so the tensors can be looked up by name later
	registered = (
		('inputs_placeholder', inputs_placeholder),
		('labels_placeholder', labels_placeholder),
	)
	for key, tensor in registered:
		v1.add_to_collection(key, tensor)
	keep_prob = v1.placeholder_with_default(1.0, shape=(), name='keep_prob')
	return inputs_placeholder, labels_placeholder, keep_prob

In [19]:
# creates every weight matrix and bias vector of the network and returns them in one dict
def add_parameters():
	"""Create the encoder, decoder and classifier variables.

	Layer widths:
		encoder:    input_dim -> 256 -> 192 -> 128 -> 64
		decoder:    64 -> 128 -> 192 -> 256 -> input_dim
		classifier: 64 -> 32 -> 16 -> num_classes

	Weight matrices use a VarianceScaling initializer (scale=1.0, fan_avg,
	uniform); biases start at zero with shape (1, width).

	Returns:
		dict mapping names such as "W1_encoder" / "b1_encoder" to variables.
	"""
	def scaled_matrix(name, shape):
		# a new initializer object is built for every variable
		return tf.compat.v1.get_variable(name=name, shape = shape, initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))

	def zero_bias(name, width):
		return tf.compat.v1.get_variable(name=name, initializer = tf.zeros((1, width)))

	stacks = (
		("encoder", (input_dim, 256, 192, 128, 64)),
		("decoder", (64, 128, 192, 256, input_dim)),
	)
	classifier_widths = (64, 32, 16, num_classes)

	weights = {}
	# weight matrices of the encoder, then of the decoder
	for part, widths in stacks:
		for layer in range(1, len(widths)):
			key = "W%d_%s" % (layer, part)
			weights[key] = scaled_matrix(key, (widths[layer - 1], widths[layer]))

	# bias rows of the encoder, then of the decoder
	for part, widths in stacks:
		for layer in range(1, len(widths)):
			key = "b%d_%s" % (layer, part)
			weights[key] = zero_bias(key, widths[layer])

	# softmax classifier parameters, matrix and bias created layer by layer
	for layer in range(1, len(classifier_widths)):
		w_key = "W%d_softmax" % layer
		b_key = "b%d_softmax" % layer
		weights[w_key] = scaled_matrix(w_key, (classifier_widths[layer - 1], classifier_widths[layer]))
		weights[b_key] = zero_bias(b_key, classifier_widths[layer])
	return weights

In [20]:
# encoder maps a batch of inputs to the output of the last encoder layer
def encoder(inputs_batch, weights, keep_prob):
	"""Apply the four encoder layers and return the final activation.

	Activations are sigmoid, tanh (followed by dropout), relu, relu.

	Args:
		inputs_batch: tensor multiplied by weights["W1_encoder"].
		weights: dict holding "W<k>_encoder" / "b<k>_encoder" for k = 1..4.
		keep_prob: keep probability; dropout is applied with rate 1 - keep_prob.

	Returns:
		The activation of the fourth encoder layer.
	"""
	def affine(x, layer):
		return tf.add(tf.matmul(x, weights["W%d_encoder" % layer]), weights["b%d_encoder" % layer])

	hidden = tf.nn.sigmoid(affine(inputs_batch, 1))
	hidden = tf.nn.tanh(affine(hidden, 2))
	# dropout is applied only after the second layer
	hidden = tf.nn.dropout(hidden, rate=1 - (keep_prob))
	hidden = tf.nn.relu(affine(hidden, 3))
	return tf.nn.relu(affine(hidden, 4))

In [21]:
# decoder maps an encoding back to a reconstruction of the input
def decoder(inputs_batch, weights, keep_prob):
	"""Apply the four decoder layers and return the final activation.

	Activations are sigmoid, sigmoid (followed by dropout), relu, relu.

	Args:
		inputs_batch: tensor multiplied by weights["W1_decoder"] (the encoding).
		weights: dict holding "W<k>_decoder" / "b<k>_decoder" for k = 1..4.
		keep_prob: keep probability; dropout is applied with rate 1 - keep_prob.

	Returns:
		The activation of the fourth decoder layer.
	"""
	def affine(x, layer):
		return tf.add(tf.matmul(x, weights["W%d_decoder" % layer]), weights["b%d_decoder" % layer])

	hidden = tf.nn.sigmoid(affine(inputs_batch, 1))
	hidden = tf.nn.sigmoid(affine(hidden, 2))
	# dropout is applied only after the second layer
	hidden = tf.nn.dropout(hidden, rate=1 - (keep_prob))
	hidden = tf.nn.relu(affine(hidden, 3))
	# NOTE(review): the output goes through relu, so the reconstruction is never negative;
	# confirm the inputs being reconstructed are non-negative as well.
	return tf.nn.relu(affine(hidden, 4))

In [22]:
# softmax classifier head: returns the class logits (Convention of h for hidden layers of classifier)
def softmaxclassifier(inputs_batch, weights, keep_prob):
	"""Compute classifier logits from an encoding.

	Two tanh layers are followed by a linear layer. No softmax is applied
	here, so the result is un-normalised logits.

	Args:
		inputs_batch: tensor multiplied by weights["W1_softmax"].
		weights: dict holding "W<k>_softmax" / "b<k>_softmax" for k = 1..3.
		keep_prob: accepted for a uniform signature; not used by this function.

	Returns:
		The output of the third (linear) classifier layer.
	"""
	def affine(x, layer):
		return tf.add(tf.matmul(x, weights["W%d_softmax" % layer]), weights["b%d_softmax" % layer])

	h_1 = tf.nn.tanh(affine(inputs_batch, 1))
	h_2 = tf.nn.tanh(affine(h_1, 2))
	# last layer stays linear (no softmax)
	return affine(h_2, 3)

In [23]:
# get_batches cuts a sequence into consecutive slices of at most `size` items
def get_batches(seq, size=batch_size):
    """Return the list of slices ``seq[0:size]``, ``seq[size:2*size]``, ...

    The last slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    batches = []
    for start in range(0, len(seq), size):
        batches.append(seq[start:start + size])
    return batches

In [24]:
# trains the network; prints, logs and checkpoints once per epoch (nothing is returned)
def train(X, Y, X_dev, Y_dev):
	"""Build the autoencoder + classifier graph and train it on (X, Y).

	The loss is the mean squared reconstruction error plus the mean of
	``classificationweight`` times the softmax cross-entropy of the classifier
	logits. Training runs for ``num_epochs`` epochs with Adam (learning rate
	``alpha``) and a dropout keep probability of 0.8; the dev set is evaluated
	in one batch with keep probability 1.0.

	After every epoch the train/dev accuracy is printed, a checkpoint is
	written to ``./modelWeights/softmaxautoencoder-<epoch>`` (the saver keeps
	the 5 most recent) and the accuracies and mean batch loss are written as
	summaries to ``./tensorboardlogs/softmaxautoencoder``.

	Args:
		X: pandas DataFrame of training inputs (reordered with ``.iloc``).
		Y: pandas DataFrame of one-hot training labels, row-aligned with X.
		X_dev: dev inputs, fed to the graph as a single batch.
		Y_dev: one-hot dev labels, row-aligned with X_dev.

	Returns:
		None.
	"""
	def count_correct(logits, onehot):
		# Plain NumPy on the fetched values: building tf ops here would add new
		# nodes to the graph on every batch and make it grow without bound.
		predicted = np.argmax(logits, axis=1)
		actual = np.argmax(np.asarray(onehot), axis=1)
		return int(np.count_nonzero(predicted == actual))

	tf.compat.v1.reset_default_graph()
	inputs_batch, labels_batch, keep_prob = get_placeholders()
	weights = add_parameters()
	encoding = encoder(inputs_batch, weights, keep_prob)
	decoding = decoder(encoding, weights, keep_prob)
	tf.compat.v1.add_to_collection("encoding", encoding)
	tf.compat.v1.add_to_collection("decoding", decoding)
	y_hat = softmaxclassifier(encoding, weights, keep_prob)
	tf.compat.v1.add_to_collection("y_hat", y_hat)
	# y_hat holds un-normalised logits; the cross-entropy op applies the softmax itself
	reconstruction_loss = tf.reduce_mean(input_tensor=tf.pow(decoding - inputs_batch, 2))
	classification_loss = tf.reduce_mean(input_tensor=(classificationweight * tf.nn.softmax_cross_entropy_with_logits(labels=tf.stop_gradient(labels_batch), logits=y_hat)))
	loss = reconstruction_loss + classification_loss
	tf.compat.v1.add_to_collection("loss", loss)
	optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate = alpha).minimize(loss)
	init = tf.compat.v1.global_variables_initializer()
	saver = tf.compat.v1.train.Saver(max_to_keep=5)
	# make sure the checkpoint directory exists before the first save
	# (the saver is expected to fail when the parent directory is missing)
	os.makedirs('./modelWeights', exist_ok=True)
	with tf.compat.v1.Session() as sess:
		summary_writer = tf.compat.v1.summary.FileWriter('./tensorboardlogs/softmaxautoencoder', sess.graph)
		try:
			sess.run(init)
			# Shuffling training set (once, before the first epoch)
			ind_list = list(range(X.shape[0]))
			shuffle(ind_list)
			X = X.iloc[ind_list]
			Y = Y.iloc[ind_list]
			# the batches are the same in every epoch, so they are built once
			inputs_batches = get_batches(X)
			labels_batches = get_batches(Y)

			for iteration in range(num_epochs):
				cost_list = []
				currnumcorrect = 0
				for batch, batchlabel in zip(inputs_batches, labels_batches):
					preds, _, curr_loss = sess.run([y_hat, optimizer, loss], feed_dict={inputs_batch: batch, labels_batch: batchlabel, keep_prob : 0.8})
					# a prediction is correct when the largest logit sits at the index of the label's 1
					currnumcorrect += count_correct(preds, batchlabel)
					cost_list.append(curr_loss)

				accuracy = currnumcorrect / float(X.shape[0])
				print("Epoch " + str(iteration+1) + ", Train Accuracy: " + str(accuracy))
				preds = sess.run(y_hat, feed_dict={inputs_batch : X_dev, labels_batch : Y_dev, keep_prob : 1.0})
				devaccuracy = count_correct(preds, Y_dev) / float(X_dev.shape[0])
				print("Epoch " + str(iteration+1) + ", Dev Accuracy: " + str(devaccuracy))
				# mean of the per-batch losses of this epoch
				train_smoothed_cost = float(sum(cost_list)) / len(cost_list)
				saver.save(sess, './modelWeights/softmaxautoencoder', global_step = (iteration+1))
				objectives_summary = tf.compat.v1.Summary()
				objectives_summary.value.add(tag='train_accuracy', simple_value=accuracy)
				objectives_summary.value.add(tag='dev_accuracy', simple_value=devaccuracy)
				objectives_summary.value.add(tag='train_smoothed_cost', simple_value=train_smoothed_cost)
				summary_writer.add_summary(objectives_summary, iteration+1)
				summary_writer.flush()
		finally:
			# release the event file even when training is interrupted
			summary_writer.close()

In [25]:
# train() builds its model with placeholders and a Session, so eager execution is switched off first
tf.compat.v1.disable_eager_execution()
# read and segment the audio; load_data returns the segments and their one-hot labels as DataFrames
songs, labels = load_data()

Reading data...
Data reading done :)


In [26]:
# shuffles the data: songs and labels are reordered with the same permutation so rows stay paired
ind_list = list(range(songs.shape[0]))
shuffle(ind_list)
songs, labels = songs.iloc[ind_list], labels.iloc[ind_list]

In [27]:
# splits the data into a training set (first 6000 rows) and a dev set (all remaining rows)
songs_train, songs_dev = songs.iloc[:6000], songs.iloc[6000:]
labels_train, labels_dev = labels.iloc[:6000], labels.iloc[6000:]

In [28]:
# writes the dev split (inputs and labels) to csv files in the working directory;
# the row index is not written, and the training split is not saved here
songs_dev.to_csv('songs_dev.csv', index = False)
labels_dev.to_csv('labels_dev.csv', index = False)

In [29]:
# trains the model on the training set, evaluating on the dev set after every epoch;
# train() saves the model weights after each epoch under ./modelWeights
train(songs_train, labels_train, songs_dev, labels_dev)

2022-12-01 12:39:31.271391: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-01 12:39:31.272490: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-01 12:39:31.272570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-12-01 12:39:31.272711: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-12-01 12:39:31.273009: W tensorflow/c

Epoch 1, Train Accuracy: 0.2505
Epoch 1, Dev Accuracy: 0.2815
Epoch 2, Train Accuracy: 0.2355
Epoch 2, Dev Accuracy: 0.258
Epoch 3, Train Accuracy: 0.24966666666666668
Epoch 3, Dev Accuracy: 0.25
Epoch 4, Train Accuracy: 0.25783333333333336
Epoch 4, Dev Accuracy: 0.2365
Epoch 5, Train Accuracy: 0.24616666666666667
Epoch 5, Dev Accuracy: 0.241
Epoch 6, Train Accuracy: 0.253
Epoch 6, Dev Accuracy: 0.2485
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Epoch 7, Train Accuracy: 0.25266666666666665
Epoch 7, Dev Accuracy: 0.2245
Epoch 8, Train Accuracy: 0.25283333333333335
Epoch 8, Dev Accuracy: 0.2135
Epoch 9, Train Accuracy: 0.25433333333333336
Epoch 9, Dev Accuracy: 0.246
Epoch 10, Train Accuracy: 0.24983333333333332
Epoch 10, Dev Accuracy: 0.2455
Epoch 11, Train Accuracy: 0.25833333333333336
Epoch 11, Dev Accuracy: 0.255
Epoch 12, Train Accuracy: 0.25933333333333336
Epoch 12, Dev Accuracy: 0.2575
Epoch 13, Train Accuracy: 0.25633333333333336
Epoch 13, 