In [307]:
import numpy as np
from sklearn import preprocessing
import tensorflow as tf

raw_csv_data = np.loadtxt('Audiobooks_data.csv', delimiter = ',')

unscaled_input_all = raw_csv_data[:, 1:-1]
targets_all = raw_csv_data[:,-1]

In [308]:
num_one_targets = int(np.sum(targets_all))
zero_targets_counter = 0
indices_to_remove = []
#num_one_targets

for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)

unscaled_inputs_equal_priors = np.delete(unscaled_input_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

In [309]:
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

In [310]:
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

In [311]:
samples_count = shuffled_inputs.shape[0]

train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

validation_inputs = shuffled_inputs[train_samples_count: train_samples_count+validation_samples_count]
validation_targets = shuffled_targets[train_samples_count: train_samples_count+validation_samples_count]

test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]

print(np.sum(train_targets), train_samples_count, np.sum(train_targets)/train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets)/validation_samples_count)
print(np.sum(test_targets),test_samples_count, np.sum(test_targets)/test_samples_count)

1785.0 3579 0.49874266554903607
230.0 447 0.5145413870246085
222.0 448 0.4955357142857143


In [312]:
np.savez('Audiobook_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobook_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobook_data_test', inputs=test_inputs, targets=test_targets)

### Batching

In [313]:
# A class that will do the batching for the algorithm
class Audiobooks_Data_Reader():
    # Dataset is a mendatory argurment, while batch_size is optional.
    # If you dont input batch_size, it will automatically take the value: None
    def __init__(self, dataset, batch_size=None):
        
        # The dataset that loads is one of 'train', 'validation', 'test'
        # eg-> if i call this class with x('train', 5), it will load 'Audiobooks_data_train.npz' with a batch size of 5.
        npz = np.load('Audiobook_data_{0}.npz'.format(dataset))
        self.inputs, self.targets = npz['inputs'].astype(float), npz['targets'].astype(int)
        if batch_size is None:
            self.batch_size = self.inputs.shape[0]
        else:
            self.batch_size = batch_size
        self.curr_batch = 0
        self.batch_count = self.inputs.shape[0] // self.batch_size

    def __next__(self):
        if self.curr_batch >= self.batch_count:
            self.curr_batch = 0
            raise StopIteration()

        # You slice the dataset in batches and then the "next" function loads them one after the other.
        batch_slice = slice(self.curr_batch * self.batch_size, (self.curr_batch + 1) * self.batch_size)
        inputs_batch = self.inputs[batch_slice]
        targets_batch = self.targets[batch_slice]
        self.curr_batch += 1

        # One-hot encode the targets.
        classes_num = 2
        #targets_one_hot = np.zeros((targets_batch.size[0], classes_num))
        #targets_one_hot[range(targets_batch.size[0]), targets_batch] = 1
        targets_one_hot = np.zeros((len(targets_batch), classes_num))
        targets_one_hot[np.arange(len(targets_batch)), targets_batch] = 1

        return inputs_batch, targets_one_hot

    def __iter__(self):
        return self

### Machine Learning Algo

In [314]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [315]:
input_size = 10
output_size = 2
hidden_layer_size = 50

tf.compat.v1.reset_default_graph()#It will let us train our model from scratch. Simply by rerunning the cell.

inputs = tf.placeholder(tf.float32, [None, input_size])
targets = tf.placeholder(tf.int32, [None, output_size])

weights_1 = tf.get_variable("weights_1", [input_size, hidden_layer_size])
biases_1 = tf.get_variable("biases_1", [hidden_layer_size])

outputs_1 = tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)

weights_2 = tf.get_variable("weights_2", [hidden_layer_size, hidden_layer_size])
biases_2 = tf.get_variable("biases_2", [hidden_layer_size])

outputs_2 = tf.nn.relu(tf.matmul(outputs_1, weights_2) + biases_2)

weights_3 = tf.get_variable("weights_3", [hidden_layer_size, output_size])
biases_3 = tf.get_variable("biases_3", [output_size])

#outputs_3 = tf.nn.relu(tf.matmul(outputs_2, weights_3) + biases_3)

#weights_4 = tf.get_variable("weights_4", [hidden_layer_size, output_size])
#biases_4 = tf.get_variable("biases_4", [output_size])

outputs = tf.matmul(outputs_2, weights_3 + biases_3)

#tf.nn.softmax_cross_entropy_with_logits()-> is a function that applies a softmax activation and calculates a cross entropy loss simultaniously.
#Logits here have a meaning of unscaled probability
loss = tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=targets)
mean_loss = tf.reduce_mean(loss) #is a method which find the mean of the elements of a tensor across a dimension.

optimize = tf.train.AdamOptimizer(learning_rate=0.001).minimize(mean_loss)

out_equals_target = tf.equal(tf.arg_max(outputs, 1), tf.arg_max(targets, 1))

accuracy = tf.reduce_mean(tf.cast(out_equals_target, tf.float32)) #cast-> changes the data type

sess = tf.InteractiveSession()

initializer = tf.global_variables_initializer()

sess.run(initializer)

batch_size = 100

max_epochs = 50

prev_validation_loss = 9999999.

train_data = Audiobooks_Data_Reader('train', batch_size)
validation_data = Audiobooks_Data_Reader('validation')

for epoch_counter in range(max_epochs):
    curr_epoch_loss = 0.
    #Training
    for input_batch, target_batch in train_data:
        #we optimize the mean loss by feeding the algo with data from train data in batched.
        _, batch_loss = sess.run([optimize, mean_loss],
                                feed_dict={inputs: input_batch, targets: target_batch})
        curr_epoch_loss += batch_loss #Record the sum of all batch losses

    curr_epoch_loss /= train_data.batch_count

    #Validation
    validation_loss = 0.
    validation_accuracy = 0.
    for input_batch, target_batch in validation_data:
        #forward propogate
        validation_loss, validation_accuracy = sess.run([mean_loss, accuracy],
                                feed_dict={inputs: input_batch, targets: target_batch})
    print('Epoch' + str(epoch_counter+1)+
         '. Training loss: ' + '{0: .3f}'.format(curr_epoch_loss)+
         '. Validation loss: ' + '{0: .3f}'.format(validation_loss)+
         '. Validation accuracy' + '{0: .3f}'.format(validation_accuracy * 100.)+'%')

    if validation_loss > prev_validation_loss:
        break

    prev_validation_loss = validation_loss

print('End of training')



Epoch1. Training loss:  1.254. Validation loss:  0.732. Validation accuracy 51.454%
Epoch2. Training loss:  0.648. Validation loss:  0.601. Validation accuracy 61.298%
Epoch3. Training loss:  0.554. Validation loss:  0.545. Validation accuracy 69.575%
Epoch4. Training loss:  0.498. Validation loss:  0.503. Validation accuracy 73.154%
Epoch5. Training loss:  0.455. Validation loss:  0.472. Validation accuracy 74.497%
Epoch6. Training loss:  0.423. Validation loss:  0.447. Validation accuracy 76.510%
Epoch7. Training loss:  0.400. Validation loss:  0.429. Validation accuracy 78.076%
Epoch8. Training loss:  0.383. Validation loss:  0.418. Validation accuracy 78.076%
Epoch9. Training loss:  0.371. Validation loss:  0.409. Validation accuracy 78.076%
Epoch10. Training loss:  0.362. Validation loss:  0.401. Validation accuracy 78.523%
Epoch11. Training loss:  0.355. Validation loss:  0.396. Validation accuracy 78.523%
Epoch12. Training loss:  0.350. Validation loss:  0.392. Validation accura

### Test the model

In [316]:
test_data = Audiobooks_Data_Reader('test')

for input_batch, target_batch in test_data:
    test_accuracy = sess.run([accuracy],
                            feed_dict = {inputs: input_batch, targets: target_batch})

test_accuracy_percentage = test_accuracy[0]*100
print('Test accuracy: ' + '{0: .2f}'.format(test_accuracy_percentage)+'%')

Test accuracy:  81.92%
