# Importing the libraries

In [1]:
import numpy as np
from sklearn import preprocessing

# Importing the data

In [2]:
raw_csv = np.loadtxt('Audiobooks_data.csv', delimiter = ',')
type(raw_csv)

numpy.ndarray

In [3]:
raw_csv.shape

(14084, 12)

In [4]:
unscaled_inputs_all = raw_csv[:,1:-1]
unscaled_inputs_all.shape

(14084, 10)

In [5]:
targets_all = raw_csv[:,-1]
targets_all

array([1., 1., 1., ..., 0., 0., 0.])

# Shuffle the data

In [6]:
shuffled_indices = np.arange(unscaled_inputs_all.shape[0])
#print(shuffled_indices)
np.random.shuffle(shuffled_indices)
#print(shuffled_indices)

# Use the shuffled indices to shuffle the inputs and targets.
unscaled_inputs_all = unscaled_inputs_all[shuffled_indices]
targets_all = targets_all[shuffled_indices]

# Balance the dataset

In [7]:
onesCount = int(np.sum(targets_all))
onesCount

2237

In [8]:
zero_targets_counter = 0 
indices_to_remove = []
for i in range(targets_all.shape[0]):
    if int(targets_all[i]) == 0:
        zero_targets_counter += 1
        if zero_targets_counter > onesCount:
            indices_to_remove.append(i)

In [9]:
# Create two new variables, one that will contain the inputs, and one that will contain the targets.
# We delete all indices that we marked "to remove" in the loop above.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
unscaled_inputs_equal_priors.shape

(4474, 10)

In [10]:
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)
targets_equal_priors.shape

(4474,)

# Standardize the inputs

In [11]:
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)
scaled_inputs.shape

(4474, 10)

# Shuffle the scaled data

In [12]:
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

# Split the data into train, validation and test

In [13]:
sample_count = shuffled_indices.shape[0]

train_sample_count = int(0.8 * sample_count)
vaidation_sample_count = int(0.1 * sample_count)
test_sample_count = sample_count - train_sample_count - vaidation_sample_count
print(train_sample_count, vaidation_sample_count, test_sample_count)

3579 447 448


In [14]:
train_inputs = shuffled_inputs[:train_sample_count]
train_targets = shuffled_targets[:train_sample_count]

validation_inputs = shuffled_inputs[train_sample_count:train_sample_count+vaidation_sample_count]
validation_targets = shuffled_targets[train_sample_count:train_sample_count+vaidation_sample_count]

test_inputs = shuffled_inputs[train_sample_count+vaidation_sample_count:]
test_targets = shuffled_targets[train_sample_count+vaidation_sample_count:]

In [15]:
print(np.sum(train_targets), train_sample_count, np.sum(train_targets) / train_sample_count)
print(np.sum(validation_targets), vaidation_sample_count, np.sum(validation_targets) / vaidation_sample_count)
print(np.sum(test_targets), test_sample_count, np.sum(test_targets) / test_sample_count)

1790.0 3579 0.5001397038278849
235.0 447 0.5257270693512305
212.0 448 0.4732142857142857


# Saving the data in .npz file

In [16]:
np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)

# Creating methods for batching

In [17]:
class Audiobooks_Data_Reader():
    
    def __init__(self, dataset, batch_size = None):
    
        # The dataset that loads is one of "train", "validation", "test".
        # e.g. if I call this class with x('train',5), it will load 'Audiobooks_data_train.npz' with a batch size of 5.
        npz = np.load('Audiobooks_data_{0}.npz'.format(dataset))
        self.inputs, self.targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
        
        # Counts the batch number, given the size you feed it later
        # If the batch size is None, we are either validating or testing, so we want to take the data in a single batch
        if batch_size is None:
            self.batch_size = self.inputs.shape[0]
        else:
            self.batch_size = batch_size
        self.curr_batch = 0
        self.batch_count = self.inputs.shape[0] // self.batch_size
    
    # A method which loads the next batch
    def __next__(self):
        if self.curr_batch >= self.batch_count:
            self.curr_batch = 0
            raise StopIteration()
            
        # You slice the dataset in batches and then the "next" function loads them one after the other
        batch_slice = slice(self.curr_batch * self.batch_size, (self.curr_batch + 1) * self.batch_size)
        inputs_batch = self.inputs[batch_slice]
        targets_batch = self.targets[batch_slice]
        self.curr_batch += 1
        
        # One-hot encode the targets. In this example it's a bit superfluous since we have a 0/1 column 
        # as a target already but we're giving you the code regardless, as it will be useful for any 
        # classification task with more than one target column
        classes_num = 2
        targets_one_hot = np.zeros((targets_batch.shape[0], classes_num))
        targets_one_hot[range(targets_batch.shape[0]), targets_batch] = 1
        
        # The function will return the inputs batch and the one-hot encoded targets
        return inputs_batch, targets_one_hot
    
        
    # A method needed for iterating over the batches, as we will put them in a loop
    # This tells Python that the class we're defining is iterable, i.e. that we can use it like:
    # for input, output in data: 
        # do things
    # An iterator in Python is a class with a method __next__ that defines exactly how to iterate through its objects
    def __iter__(self):
        return self

# Create the machine learning algorithm


In [18]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
#import input_data

Instructions for updating:
non-resource variables are not supported in the long term


In [19]:
input_size = 10
output_size = 2
hidden_layer_size = 50

# Reset any variables left in memory from previous runs.
tf.reset_default_graph()

# As in the previous example - declare placeholders where the data will be fed into.
inputs = tf.placeholder(tf.float32, [None, input_size])
targets = tf.placeholder(tf.int32, [None, output_size])


# -----------------------------------     2 Hidden Layers    -------------------------------------------
weights_1 = tf.get_variable("weights_1", [input_size, hidden_layer_size])
biases_1 = tf.get_variable("biases_1", [hidden_layer_size])
outputs_1 = tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)

weights_2 = tf.get_variable("weights_2", [hidden_layer_size, hidden_layer_size])
biases_2 = tf.get_variable("biases_2", [hidden_layer_size])
outputs_2 = tf.nn.sigmoid(tf.matmul(outputs_1, weights_2) + biases_2)

weights_3 = tf.get_variable("weights_3", [hidden_layer_size, output_size])
biases_3 = tf.get_variable("biases_3", [output_size])
outputs = tf.matmul(outputs_2, weights_3) + biases_3

# ------------------------------------------------------------------------------------------------------


loss = tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=targets)
mean_loss = tf.reduce_mean(loss)

# Get a 0 or 1 for every input indicating whether it output the correct answer
out_equals_target = tf.equal(tf.argmax(outputs, 1), tf.argmax(targets, 1))
accuracy = tf.reduce_mean(tf.cast(out_equals_target, tf.float32))

# Optimize with Adam
optimize = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(mean_loss)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



# Creating Session

In [20]:
sess = tf.InteractiveSession()
initializer = tf.global_variables_initializer()      # Initialize the variables. Default initializer is Xavier.
sess.run(initializer)

In [21]:
batch_size = 100
max_epochs = 1000
prev_validation_loss = 9999999.

# Loading the data

In [22]:
train_data = Audiobooks_Data_Reader('train', batch_size)
validation_data = Audiobooks_Data_Reader('validation')

In [23]:
# Create the loop for epochs 
for epoch_counter in range(max_epochs):
    
    # Set the epoch loss to 0, and make it a float
    curr_epoch_loss = 0.
    
    # Iterate over the training data 
    # Since train_data is an instance of the Audiobooks_Data_Reader class,
    # we can iterate through it by implicitly using the __next__ method we defined above.
    # As a reminder, it batches samples together, one-hot encodes the targets, and returns
    # inputs and targets batch by batch
    for input_batch, target_batch in train_data:
        _, batch_loss = sess.run([optimize, mean_loss], 
            feed_dict={inputs: input_batch, targets: target_batch})
        
        #Record the batch loss into the current epoch loss
        curr_epoch_loss += batch_loss
    
    # Find the mean curr_epoch_loss
    # batch_count is a variable, defined in the Audiobooks_Data_Reader class
    curr_epoch_loss /= train_data.batch_count
    
    # Set validation loss and accuracy for the epoch to zero
    validation_loss = 0.
    validation_accuracy = 0.
    
    # Use the same logic of the code to forward propagate the validation set
    # There will be a single batch, as the class was created in this way
    for input_batch, target_batch in validation_data:
        validation_loss, validation_accuracy = sess.run([mean_loss, accuracy],
            feed_dict={inputs: input_batch, targets: target_batch})
    
    # Print statistics for the current epoch
    print('Epoch '+str(epoch_counter+1)+
          '. Training loss: '+'{0:.3f}'.format(curr_epoch_loss)+
          '. Validation loss: '+'{0:.3f}'.format(validation_loss)+
          '. Validation accuracy: '+'{0:.2f}'.format(validation_accuracy * 100.)+'%')
    
    # Trigger early stopping if validation loss begins increasing.
    if validation_loss > prev_validation_loss:
        print("BREAKING")
        print(validation_loss, prev_validation_loss)
        break
        
    # Store this epoch's validation loss to be used as previous in the next iteration.
    prev_validation_loss = validation_loss
    
print('End of training.')

Epoch 1. Training loss: 1.034. Validation loss: 1.024. Validation accuracy: 47.43%
Epoch 2. Training loss: 0.942. Validation loss: 0.932. Validation accuracy: 47.43%
Epoch 3. Training loss: 0.864. Validation loss: 0.856. Validation accuracy: 47.43%
Epoch 4. Training loss: 0.802. Validation loss: 0.795. Validation accuracy: 47.20%
Epoch 5. Training loss: 0.754. Validation loss: 0.749. Validation accuracy: 47.43%
Epoch 6. Training loss: 0.718. Validation loss: 0.715. Validation accuracy: 48.99%
Epoch 7. Training loss: 0.692. Validation loss: 0.690. Validation accuracy: 51.68%
Epoch 8. Training loss: 0.674. Validation loss: 0.671. Validation accuracy: 56.82%
Epoch 9. Training loss: 0.660. Validation loss: 0.657. Validation accuracy: 61.74%
Epoch 10. Training loss: 0.650. Validation loss: 0.646. Validation accuracy: 66.67%
Epoch 11. Training loss: 0.641. Validation loss: 0.637. Validation accuracy: 67.56%
Epoch 12. Training loss: 0.634. Validation loss: 0.629. Validation accuracy: 69.57%
E

Epoch 100. Training loss: 0.397. Validation loss: 0.392. Validation accuracy: 80.76%
Epoch 101. Training loss: 0.396. Validation loss: 0.392. Validation accuracy: 80.76%
Epoch 102. Training loss: 0.395. Validation loss: 0.391. Validation accuracy: 80.76%
Epoch 103. Training loss: 0.395. Validation loss: 0.391. Validation accuracy: 80.76%
Epoch 104. Training loss: 0.394. Validation loss: 0.390. Validation accuracy: 80.76%
Epoch 105. Training loss: 0.394. Validation loss: 0.390. Validation accuracy: 80.54%
Epoch 106. Training loss: 0.393. Validation loss: 0.389. Validation accuracy: 80.54%
Epoch 107. Training loss: 0.392. Validation loss: 0.389. Validation accuracy: 80.31%
Epoch 108. Training loss: 0.392. Validation loss: 0.388. Validation accuracy: 80.31%
Epoch 109. Training loss: 0.391. Validation loss: 0.388. Validation accuracy: 80.31%
Epoch 110. Training loss: 0.391. Validation loss: 0.387. Validation accuracy: 80.31%
Epoch 111. Training loss: 0.390. Validation loss: 0.387. Validati

Epoch 198. Training loss: 0.363. Validation loss: 0.366. Validation accuracy: 80.98%
Epoch 199. Training loss: 0.363. Validation loss: 0.366. Validation accuracy: 80.98%
Epoch 200. Training loss: 0.363. Validation loss: 0.366. Validation accuracy: 80.98%
Epoch 201. Training loss: 0.362. Validation loss: 0.366. Validation accuracy: 80.98%
Epoch 202. Training loss: 0.362. Validation loss: 0.366. Validation accuracy: 80.98%
Epoch 203. Training loss: 0.362. Validation loss: 0.366. Validation accuracy: 80.98%
Epoch 204. Training loss: 0.362. Validation loss: 0.366. Validation accuracy: 80.98%
Epoch 205. Training loss: 0.362. Validation loss: 0.366. Validation accuracy: 80.98%
Epoch 206. Training loss: 0.362. Validation loss: 0.366. Validation accuracy: 80.98%
Epoch 207. Training loss: 0.361. Validation loss: 0.365. Validation accuracy: 80.98%
Epoch 208. Training loss: 0.361. Validation loss: 0.365. Validation accuracy: 80.98%
Epoch 209. Training loss: 0.361. Validation loss: 0.365. Validati

Epoch 297. Training loss: 0.351. Validation loss: 0.358. Validation accuracy: 80.98%
Epoch 298. Training loss: 0.351. Validation loss: 0.358. Validation accuracy: 80.98%
Epoch 299. Training loss: 0.351. Validation loss: 0.358. Validation accuracy: 80.98%
Epoch 300. Training loss: 0.351. Validation loss: 0.358. Validation accuracy: 80.98%
Epoch 301. Training loss: 0.351. Validation loss: 0.358. Validation accuracy: 80.98%
Epoch 302. Training loss: 0.351. Validation loss: 0.357. Validation accuracy: 80.98%
Epoch 303. Training loss: 0.351. Validation loss: 0.357. Validation accuracy: 80.98%
Epoch 304. Training loss: 0.351. Validation loss: 0.357. Validation accuracy: 80.98%
Epoch 305. Training loss: 0.351. Validation loss: 0.357. Validation accuracy: 80.98%
Epoch 306. Training loss: 0.351. Validation loss: 0.357. Validation accuracy: 80.98%
Epoch 307. Training loss: 0.351. Validation loss: 0.357. Validation accuracy: 80.98%
Epoch 308. Training loss: 0.350. Validation loss: 0.357. Validati

Epoch 396. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 397. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 398. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 399. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 400. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 401. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 402. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 403. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 404. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 405. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 406. Training loss: 0.345. Validation loss: 0.353. Validation accuracy: 81.66%
Epoch 407. Training loss: 0.345. Validation loss: 0.353. Validati

# Testing the model

In [24]:
test_data = Audiobooks_Data_Reader('test')


In [25]:
for input_batch, target_batch in validation_data:
    test_accuracy = sess.run([accuracy],feed_dict={inputs: input_batch, targets: target_batch})
    
test_accuracy_percent = test_accuracy[0] * 100
print('Test Accuracy: '+'{0:.2f}'.format(test_accuracy_percent) + '%')

Test Accuracy: 81.66%


# Model II

In [26]:
npz = np.load('Audiobooks_data_train.npz')
train_inputs = npz['inputs'].astype(np.float)
train_targets = npz['targets'].astype(np.int)

npz = np.load('Audiobooks_data_test.npz')
test_inputs = npz['inputs'].astype(np.float)
test_targets = npz['targets'].astype(np.int)

In [27]:
input_size = 10
output_size = 2
hidden_layer_size = 50

model = tf.keras.Sequential([
    
    
    # tf.keras.layers.Dense is basically implementing: output = activation(dot(input, weight) + bias)
    # it takes several arguments, but the most important ones for us are the hidden_layer_size and the activation function
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 1st hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 2nd hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 3rd hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 4th hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 5th hidden layer
    
    
    # the final layer is no different, we just make sure to activate it with softmax
    tf.keras.layers.Dense(output_size, activation= 'softmax')
])

In [28]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [29]:
NUM_EPOCHS = 10
batch_size = 100
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)


# note that this time the train, validation and test data are not iterable
model.fit(train_inputs, # train inputs
          train_targets, # train targets
          batch_size=batch_size, # batch size
          epochs=NUM_EPOCHS, # epochs that we will train for (assuming early stopping doesn't kick in)
          # callbacks are functions called by a task when a task is completed
          # task here is to check if val_loss is increasing
          callbacks=[early_stopping], # early stopping
          validation_data=(validation_inputs, validation_targets), # validation data
          #validation_steps=10,
          verbose = 2 # making sure we get enough information about the training process
          )  


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Train on 3579 samples, validate on 447 samples
Epoch 1/10
3579/3579 - 0s - loss: 0.5707 - acc: 0.7220 - val_loss: 0.4871 - val_acc: 0.7562
Epoch 2/10
3579/3579 - 0s - loss: 0.4332 - acc: 0.7695 - val_loss: 0.4152 - val_acc: 0.7987
Epoch 3/10
3579/3579 - 0s - loss: 0.4043 - acc: 0.7930 - val_loss: 0.4181 - val_acc: 0.7897
Epoch 4/10
3579/3579 - 0s - loss: 0.3888 - acc: 0.7985 - val_loss: 0.3816 - val_acc: 0.7942
Epoch 5/10
3579/3579 - 0s - loss: 0.3796 - acc: 0.7966 - val_loss: 0.3759 - val_acc: 0.8009
Epoch 6/10
3579/3579 - 0s - loss: 0.3784 - acc: 0.7963 - val_loss: 0.3839 - val_acc: 0.7830
Epoch 7/10
3579/3579 - 0s - loss: 0.3682 - acc: 0.8053 - val_loss: 0.3871 - val_acc: 0.7897


<tensorflow.python.keras.callbacks.History at 0x127dd05c0>

In [30]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [31]:
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.39. Test accuracy: 79.91%


In [32]:
prediction = model.predict(test_inputs)

In [33]:
for i in range(len(prediction)):
    print("Predcited", np.argmax(prediction[i]), "Expected", test_targets[i])

Predcited 1 Expected 1
Predcited 1 Expected 1
Predcited 1 Expected 1
Predcited 1 Expected 0
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 0 Expected 1
Predcited 1 Expected 1
Predcited 0 Expected 0
Predcited 0 Expected 1
Predcited 1 Expected 1
Predcited 0 Expected 1
Predcited 0 Expected 1
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 0 Expected 1
Predcited 1 Expected 1
Predcited 1 Expected 1
Predcited 1 Expected 1
Predcited 1 Expected 1
Predcited 0 Expected 0
Predcited 0 Expected 1
Predcited 0 Expected 0
Predcited 1 Expected 1
Predcited 1 Expected 1
Predcited 1 Expected 1
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 1 Expected 1
Predcited 1 Expected 1
Predcited 1 Expected 1
Predcited 0 Expected 1
Predcited 0 Expected 1
Predcited 0 Expected 0
Predcited 1 Expected 1
Predcited 0 Expected 1
Predcited 1 Expected 0
Predcited 0 Expected 1
Predcited 0

Predcited 1 Expected 1
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 0 Expected 1
Predcited 0 Expected 1
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 1 Expected 1
Predcited 0 Expected 0
Predcited 0 Expected 0
Predcited 1 Expected 1
Predcited 1 Expected 1
Predcited 0 Expected 0
Predcited 1 Expected 1
Predcited 1 Expected 1
