In [1]:
import numpy as np
from sklearn import preprocessing
raw_csv_data = np.loadtxt('1.1 Audiobooks_data.csv', delimiter=',')


In [49]:
unscaled_data = raw_csv_data[:,1:-1]
targets_all=raw_csv_data[:,-1] 
targets_all.shape

(14084,)

# Balancing the data

In [4]:
num_one_targets= int(np.sum(targets_all))
zeroes_target_counter=0
indices_to_remove=[]

for i in range(targets_all.shape[0]):
    if targets_all[i]==0:
        zeroes_target_counter+=1
        if zeroes_target_counter > num_one_targets:
            indices_to_remove.append(i)
            
unscaled_data_equal_prior=np.delete(unscaled_data, indices_to_remove, axis=0)
targets_all_equal_prior = np.delete(targets_all, indices_to_remove, axis=0)

# Standardising the data

In [5]:
scaled_inputs=preprocessing.scale(unscaled_data_equal_prior)

targets_all_equal_prior.shape

(4474,)

# Shuffle the data

In [6]:
shuffled_indices= np.arange(targets_all_equal_prior.shape[0])
np.random.shuffle(shuffled_indices)
print(shuffled_indices)

[1416 1405  498 ...  467 2528 2740]


In [53]:
shuffled_inputs=scaled_inputs[shuffled_indices]
shuffled_targets=targets_all_equal_prior[shuffled_indices]

(4474,)

# Split the data set to Traning, validation, Test

In [11]:
sample_count=int(shuffled_inputs.shape[0])

training_sample_count=int(sample_count*0.8)
validation_sample_count= int(sample_count*0.1)
test_sample_count = sample_count - training_sample_count - validation_sample_count

training_inputs = shuffled_inputs[: training_sample_count, :]
traning_targets = shuffled_targets[: training_sample_count]

validation_inputs = shuffled_inputs[training_sample_count:training_sample_count+ validation_sample_count]
validation_targets = shuffled_targets[training_sample_count:training_sample_count + validation_sample_count]

test_inputs =shuffled_inputs[training_sample_count + validation_sample_count:]
test_targets = shuffled_targets[training_sample_count + validation_sample_count:]


In [12]:
print(training_inputs.shape[0],traning_targets.shape[0])

3579 3579


In [13]:
print(validation_inputs.shape[0], validation_targets.shape[0])

447 447


In [55]:
print(test_inputs.shape[0], test_targets.shape[0])
print(test_targets.shape, test_inputs.shape)

448 448
(448,) (448, 10)


# saving the data to npz file

In [15]:
np.savez('Audiobooks_data_train', inputs=training_inputs, targets=traning_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets )

# Creating the class to load the data

In [39]:
import numpy as np
class Audiobooks_Data_Reader():
       # Dataset is a mandatory arugment, while the batch_size is optional\n",
       # If you don't input batch_size, it will automatically take the value: None\n",
    
        def __init__(self, dataset, batch_size = None):
            # The dataset that loads is one of \"train\", \"validation\", \"test\".\n",
            # e.g. if I call this class with x('train',5), it will load 'Audiobooks_data_train.npz' with a batch size of 5.\n",
            npz = np.load('Audiobooks_data_{0}.npz'.format(dataset))
            # Two variables that take the values of the inputs and the targets. Inputs are floats, targets are integers\n",
            self.inputs, self.targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int),
            # Counts the batch number, given the size you feed it later\n",
            # If the batch size is None, we are either validating or testing, so we want to take the data in a single batch\n",
            if batch_size is None:
                self.batch_size = self.inputs.shape[0]
            else:
                self.batch_size = batch_size
            self.curr_batch = 0
            self.batch_count = self.inputs.shape[0] // self.batch_size
            

        # A method which loads the next batch\n",
        def __next__(self):
            if self.curr_batch >= self.batch_count:
                self.curr_batch = 0,
                raise StopIteration()
                
                
            # You slice the dataset in batches and then the \"next\" function loads them one after the other\n",
            batch_slice = slice(self.curr_batch * self.batch_size, (self.curr_batch + 1) * self.batch_size)
            inputs_batch = self.inputs[batch_slice]
            targets_batch = self.targets[batch_slice]
            self.curr_batch += 1
            
            # One-hot encode the targets. In this example it's a bit superfluous since we have a 0/1 column \n",
            # as a target already but we're giving you the code regardless, as it will be useful for any \n",
            # classification task with more than one target column\n",
            classes_num = 2
            targets_one_hot = np.zeros((targets_batch.shape[0], classes_num))
            targets_one_hot[range(targets_batch.shape[0]), targets_batch] = 1
            
            # The function will return the inputs batch and the one-hot encoded targets\n",
            
            return inputs_batch, targets_one_hot
        # A method needed for iterating over the batches, as we will put them in a loop\n",
        # This tells Python that the class we're defining is iterable, i.e. that we can use it like:\n",
        # for input, output in data: \n",
            # do things\n",
        # An iterator in Python is a class with a method __next__ that defines exactly how to iterate through its objects\n",
        def __iter__(self):
            return self

In [40]:
import tensorflow as tf
input_size = 10
output_size = 2
hidden_layer_size = 50
tf.reset_default_graph()
inputs = tf.placeholder(tf.float32, [None, input_size])
targets = tf.placeholder(tf.int32, [None, output_size])
weights_1 = tf.get_variable("weights_1", [input_size, hidden_layer_size])
biases_1 = tf.get_variable("biases_1", [hidden_layer_size])
outputs_1 = tf.nn.relu(tf.matmul(inputs,weights_1) + biases_1)
weights_2 = tf.get_variable("weights_2", [hidden_layer_size, hidden_layer_size])
biases_2 = tf.get_variable("biases_2",[hidden_layer_size])
outputs_2 = tf.nn.relu(tf.matmul(outputs_1,weights_2) + biases_2)
weights_3 = tf.get_variable("weights_3", [hidden_layer_size, output_size])
biases_3 = tf.get_variable("biases_3", [output_size])
outputs = tf.matmul(outputs_2, weights_3) + biases_3
loss = tf.nn.softmax_cross_entropy_with_logits(logits = outputs, labels = targets)

mean_loss = tf.reduce_mean(loss)

optimize = tf.train.AdamOptimizer(learning_rate=0.001).minimize(mean_loss)

out_equals_target = tf.equal(tf.argmax(outputs,1), tf.argmax(targets,1))

accuracy = tf.reduce_mean(tf.cast(out_equals_target, tf.float32))
sess = tf.InteractiveSession()

initializer = tf.global_variables_initializer()

sess.run(initializer)

batch_size = 100

max_epochs = 50
prev_validation_loss = 9999999.






In [47]:

for epoch_counter in range(max_epochs):
        curr_epoch_loss = 0.
        train_data = Audiobooks_Data_Reader('train', 100)
        validation_data = Audiobooks_Data_Reader ('validation')
        for input_batch, target_batch in train_data:
            _ , batch_loss = sess.run([optimize, mean_loss],feed_dict={inputs: input_batch,targets:target_batch})
            curr_epoch_loss += batch_loss
        curr_epoch_loss /= train_data.batch_count
        validation_loss = 0.
        validation_accuracy = 0
        for input_batch, target_batch in validation_data:
            validation_loss, validation_accuracy = sess.run([mean_loss, accuracy],
            feed_dict={inputs: input_batch, targets: target_batch})
    
        print('Epoch '+str(epoch_counter+1)+
              '. Training loss: '+'{0:.3f}'.format(curr_epoch_loss)+
              '. Validation loss: '+'{0:.3f}'.format(validation_loss)+
              '. Validation accuracy: '+'{0:.2f}'.format(validation_accuracy * 100.)+'%')
        if validation_loss > prev_validation_loss:
            break
        prev_validation_loss = validation_loss
print('End of training.')

Epoch 1. Training loss: 0.316. Validation loss: 0.317. Validation accuracy: 83.89%
End of training.


# Testing the module

In [57]:
# Load the test data, following the same logic as we did for the train_data and validation data\n",
test_data = Audiobooks_Data_Reader('test')
for inputs_batch, targets_batch in test_data:
    test_accuracy = sess.run([accuracy],
                     feed_dict={inputs: inputs_batch, targets: targets_batch})
test_accuracy_percent = test_accuracy[0] * 100.
   
print('Test accuracy: '+'{0:.2f}'.format(test_accuracy_percent)+'%')

Test accuracy: 82.37%
