# Audiobooks business case



## Preprocess the data. Balance the dataset. Create 3 datasets: training, validation, and test. Save the newly created sets in a tensor friendly format (e.g. *.npz)

Since we are dealing with real-life data, we need to preprocess it first. The relevant code below is not hard to follow, but it is crucial to creating a good model.

If you want to know how this is done, go through the code. In any case, the same approach should work for most supervised learning datasets that are organized as many input columns followed by a single column containing the targets. Keep in mind that a specific problem may require additional preprocessing.

Note that we have removed the header row, which contains the names of the categories. We simply want the data.

This code does not include comments - it is the same as the one in the lesson. Please refer to the other file if you want the code with comments.

In [1]:
import numpy as np
from sklearn import preprocessing
import tensorflow as tf

In [2]:
# Read the comma-separated file into a single 2-D NumPy array.
raw_csv_data = np.loadtxt('Audiobooks_data.csv', delimiter = ',')
# Display (rows, columns) as a sanity check on what was loaded.
raw_csv_data.shape


(14084, 12)

In [3]:
# The inputs are every column except the first and the last.
# NOTE(review): presumably the first column is a customer ID with no predictive value — confirm against the CSV.
unscaled_inputs_all = raw_csv_data[:,1:-1]
# Display (rows, columns) of the input matrix.
unscaled_inputs_all.shape


(14084, 10)

In [4]:
# The targets are the last column of the raw data.
targets_all = raw_csv_data[:,-1]
# Display the shape of the 1-D target vector.
targets_all.shape

(14084,)

In [5]:
# Number of target values, i.e. one per row of the dataset.
targets_all.shape[0]

14084

# Shuffle the data

Shuffle the data before balancing

In [6]:
# Shape of the inputs before shuffling, for comparison with the shape afterwards.
unscaled_inputs_all.shape

(14084, 10)

In [7]:
# Build the row indices 0..N-1 and shuffle them in place.
# No random seed is set in the visible code, so the order differs between runs.
shuffled_indices = np.arange(unscaled_inputs_all.shape[0])
np.random.shuffle(shuffled_indices)

# Use the shuffled indices to shuffle the inputs and targets.
# Indexing both arrays with the same permutation keeps every input row paired with its target.
unscaled_inputs_all = unscaled_inputs_all[shuffled_indices]
targets_all = targets_all[shuffled_indices]

In [8]:
# Shape of the inputs after shuffling; it should match the shape from before the shuffle.
unscaled_inputs_all.shape

(14084, 10)

# Balance the dataset

In [9]:
# Sum of the target column, converted to int.
# Assumes the targets are only 0s and 1s, in which case this is the number of 1-targets — TODO confirm.
num_one_targets = int(np.sum(targets_all))
# Display the count.
num_one_targets

2237

In [10]:
# Shape of the target vector that the balancing loop below iterates over.
targets_all.shape

(14084,)

In [11]:
# Balance the classes: keep the first `num_one_targets` rows whose target is 0
# and mark every later 0-target row for removal.
zero_target_positions = np.flatnonzero(targets_all == 0)

# Total number of 0-targets found in the dataset.
zero_targets_counter = int(zero_target_positions.size)

# Positions of the surplus zeros, as a plain list of Python ints.
# max(..., 0) guards the slice start so a negative count cannot wrap around.
indices_to_remove = zero_target_positions[max(num_one_targets, 0):].tolist()




In [12]:
# Build the balanced input matrix by dropping every row marked in indices_to_remove
# (axis=0 deletes whole rows). np.delete returns a new array, so unscaled_inputs_all is unchanged.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)

In [13]:
# Shape of the balanced input matrix.
unscaled_inputs_equal_priors.shape

(4474, 10)

In [14]:
# Remove the same indices from the targets so they stay aligned with the balanced inputs.
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)
# Display the number of targets that remain.
targets_equal_priors.shape[0]

4474

# Standardize the input

In [15]:
# Display the balanced inputs before scaling, to see the raw value ranges.
unscaled_inputs_equal_priors

array([[2160.  , 2160.  ,    5.54, ...,  680.4 ,    0.  ,  222.  ],
       [1620.  , 1620.  ,    5.69, ...,    0.  ,    0.  ,   11.  ],
       [2160.  , 2160.  ,    6.58, ...,    0.  ,    0.  ,  257.  ],
       ...,
       [1656.  , 4968.  ,    5.33, ...,    0.  ,    0.  ,  284.  ],
       [1620.  , 1620.  ,    5.33, ...,  567.  ,    0.  ,    5.  ],
       [2160.  , 2160.  ,    6.39, ...,    0.  ,    0.  ,   29.  ]])

In [16]:
# Standardize the balanced inputs with sklearn's preprocessing.scale.
# NOTE(review): the scaling is computed on the whole balanced dataset, before the
# train/validation/test split further down — confirm this is acceptable for the evaluation.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

In [17]:
# Display the standardized inputs.
scaled_inputs

array([[ 1.21733222,  0.38110245, -0.30843237, ...,  2.0904448 ,
        -0.18915319,  1.619826  ],
       [ 0.1387235 , -0.23404019, -0.27973802, ..., -0.45101755,
        -0.18915319, -0.64569851],
       [ 1.21733222,  0.38110245, -0.1094849 , ..., -0.45101755,
        -0.18915319,  1.9956239 ],
       ...,
       [ 0.21063074,  3.57984418, -0.34860445, ..., -0.45101755,
        -0.18915319,  2.28552514],
       [ 0.1387235 , -0.23404019, -0.34860445, ...,  1.66686774,
        -0.18915319, -0.710121  ],
       [ 1.21733222,  0.38110245, -0.14583107, ..., -0.45101755,
        -0.18915319, -0.45243101]])

# Shuffle the data

In [18]:
# Shuffle again after balancing. The balancing step removed only the later
# 0-target rows, so the rows towards the end of the balanced arrays are
# dominated by 1-targets; a fresh shuffle mixes the classes before splitting.
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)
# Apply the same permutation to inputs and targets so the pairs stay aligned.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

# Split the data into train, validation and test

In [19]:
# Total number of samples available for the train/validation/test split.
samples_count = shuffled_inputs.shape[0]
# Display the total.
samples_count

4474

In [20]:
# 80% of the samples go to training and 10% to validation (both truncated to
# whole numbers). The test set takes whatever remains, so the three counts
# always add up to samples_count.
train_samples_count, validation_samples_count = (
    int(fraction * samples_count) for fraction in (0.8, 0.1)
)
test_samples_count = samples_count - (train_samples_count + validation_samples_count)

# Show the three split sizes followed by the total.
print(train_samples_count, validation_samples_count, test_samples_count, samples_count)

3579 447 448 4474


In [21]:
# Row boundaries of the three consecutive slices: [0, train_end) is training,
# [train_end, validation_end) is validation, and the rest is test.
train_end = train_samples_count
validation_end = train_end + validation_samples_count

train_inputs, validation_inputs, test_inputs = (
    shuffled_inputs[:train_end],
    shuffled_inputs[train_end:validation_end],
    shuffled_inputs[validation_end:],
)

# The targets are cut at exactly the same boundaries so they stay aligned with the inputs.
train_targets, validation_targets, test_targets = (
    shuffled_targets[:train_end],
    shuffled_targets[train_end:validation_end],
    shuffled_targets[validation_end:],
)

In [22]:
# For each split, print the sum of its targets, its size, and their ratio.
# A ratio close to 0.5 means the split is roughly balanced.
for split_targets, split_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    positives = np.sum(split_targets)
    print(positives, split_size, positives / split_size)

1768.0 3579 0.49399273540095
242.0 447 0.5413870246085011
227.0 448 0.5066964285714286


# Save the data in .npz file

In [23]:
# Write each split to its own .npz archive holding two arrays, 'inputs' and
# 'targets'. np.savez appends the .npz extension to the given file name.
for file_name, split_inputs, split_targets in (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
):
    np.savez(file_name, inputs=split_inputs, targets=split_targets)

# MACHINE LEARNING ALGORITHMS

# Data

In [24]:
# Reload the three splits saved earlier. Inputs are cast to float64 and targets
# to int64.
#
# The original code used the `np.float` / `np.int` aliases, which were
# deprecated in NumPy 1.20 and removed in 1.24, so explicit dtypes are used
# instead. Each archive is opened in a `with` block so its file handle is
# closed as soon as the arrays have been copied out by `astype`.
with np.load('Audiobooks_data_train.npz') as npz:
    train_inputs = npz['inputs'].astype(np.float64)
    train_targets = npz['targets'].astype(np.int64)

with np.load('Audiobooks_data_validation.npz') as npz:
    validation_inputs = npz['inputs'].astype(np.float64)
    validation_targets = npz['targets'].astype(np.int64)

with np.load('Audiobooks_data_test.npz') as npz:
    test_inputs = npz['inputs'].astype(np.float64)
    test_targets = npz['targets'].astype(np.int64)

# Model

In [25]:
# Network dimensions. input_size is kept for reference only; it is not passed
# to any layer in this cell.
input_size = 10
output_size = 2
hidden_layer_size = 50

# Five identical fully connected hidden layers. Each Dense layer computes
# output = activation(dot(input, weight) + bias), here with ReLU activation.
hidden_layers = [
    tf.keras.layers.Dense(hidden_layer_size, activation='relu')
    for _ in range(5)
]

# The output layer has one unit per class; softmax turns the outputs into
# probabilities that sum to 1.
output_layer = tf.keras.layers.Dense(output_size, activation='softmax')

model = tf.keras.Sequential(hidden_layers + [output_layer])

In [26]:
# Configure training: Adam optimizer, accuracy as the reported metric, and
# sparse categorical cross-entropy, which takes integer class labels (not one-hot vectors).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [27]:
# Upper bound on the number of epochs, and the mini-batch size.
NUM_EPOCHS = 100
batch_size = 100

# Stop training early once the monitored quantity (validation loss by default)
# has not improved for 2 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# Training options gathered in one place. The data is passed as plain arrays
# rather than as an iterable dataset.
fit_options = dict(
    batch_size=batch_size,
    epochs=NUM_EPOCHS,  # only reached if early stopping never triggers
    callbacks=[early_stopping],
    validation_data=(validation_inputs, validation_targets),
    verbose=2,  # one summary line per epoch
)

model.fit(train_inputs, train_targets, **fit_options)


Train on 3579 samples, validate on 447 samples
Epoch 1/100
3579/3579 - 1s - loss: 0.6525 - accuracy: 0.6351 - val_loss: 0.5556 - val_accuracy: 0.7360
Epoch 2/100
3579/3579 - 0s - loss: 0.4591 - accuracy: 0.7678 - val_loss: 0.4426 - val_accuracy: 0.7427
Epoch 3/100
3579/3579 - 0s - loss: 0.4056 - accuracy: 0.7784 - val_loss: 0.4216 - val_accuracy: 0.7651
Epoch 4/100
3579/3579 - 0s - loss: 0.3853 - accuracy: 0.7935 - val_loss: 0.4700 - val_accuracy: 0.7405
Epoch 5/100
3579/3579 - 0s - loss: 0.3774 - accuracy: 0.7988 - val_loss: 0.4328 - val_accuracy: 0.7450


<tensorflow.python.keras.callbacks.History at 0x130fad710>

# Testing the model

In [28]:
# Evaluate on the held-out test set. The two returned values are unpacked as
# the loss and the accuracy metric requested in model.compile.
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [29]:
# Report the test loss and the test accuracy (as a percentage), both with two decimals.
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.35. Test accuracy: 81.47%


In [30]:
# Run the model on the test inputs. Each row holds the softmax outputs for one
# test sample, i.e. one probability per class.
prediction = model.predict(test_inputs)

In [31]:
# Print each sample's predicted class probabilities next to its true label.
for class_probabilities, true_label in zip(prediction, test_targets):
    print(class_probabilities, true_label)

[0.460979  0.5390209] 1
[0.49729523 0.50270474] 1
[0.99863225 0.00136767] 0
[0.60345536 0.3965446 ] 1
[9.999988e-01 1.178492e-06] 0
[0.5533525  0.44664752] 0
[0.30026925 0.6997308 ] 0
[9.992901e-01 7.098777e-04] 0
[0.00106896 0.998931  ] 1
[9.9999404e-01 5.9963077e-06] 0
[0.5361681 0.4638319] 1
[0.00770489 0.99229515] 1
[1.0000000e+00 1.0339983e-09] 0
[0.56504256 0.43495747] 0
[0.9978053 0.0021947] 0
[0.24591367 0.7540863 ] 1
[0.25366023 0.74633974] 1
[0.51285034 0.48714963] 0
[0.78317934 0.21682064] 0
[0.59253895 0.4074611 ] 1
[9.9984229e-01 1.5774026e-04] 0
[0.28123963 0.7187604 ] 1
[0.3339983 0.6660018] 0
[9.9996555e-01 3.4422726e-05] 0
[0.96837425 0.03162568] 0
[0.43178368 0.5682163 ] 0
[1.1108751e-05 9.9998891e-01] 1
[0.5799369  0.42006305] 0
[0.48199505 0.51800495] 0
[0.51799643 0.48200363] 0
[0.99612635 0.00387369] 0
[0.5726402  0.42735988] 0
[2.3316195e-05 9.9997663e-01] 1
[0.2808846  0.71911544] 1
[0.36898017 0.63101983] 0
[0.6617531 0.3382469] 0
[0.00100311 0.9989969 ] 1
[0.0

[2.0025812e-04 9.9979979e-01] 1
[0.00413616 0.9958638 ] 1
[0.38089585 0.61910415] 1
[0.0176383  0.98236173] 1
[0.67670393 0.32329604] 1
[0.01906239 0.98093766] 1
[0.6409063 0.3590938] 0
[0.55460477 0.44539523] 1
[0.56504256 0.43495747] 0
[0.00255662 0.99744344] 1
[0.46125382 0.5387462 ] 1
[0.47334802 0.5266519 ] 1
[0.00119071 0.99880934] 1
[0.5586287 0.4413714] 0
[0.01816665 0.9818333 ] 1
[0.56504256 0.43495747] 0
[0.98991245 0.01008763] 0
[0.32500154 0.6749985 ] 0
[0.00101681 0.9989832 ] 1
[0.50336725 0.49663273] 0
[0.9982849  0.00171508] 0
[0.3644401  0.63555986] 0
[0.04362514 0.9563749 ] 1
[0.46870708 0.531293  ] 0
[2.3033557e-04 9.9976963e-01] 1
[0.06199101 0.938009  ] 1
[0.48697305 0.51302695] 1
[0.0842142 0.9157858] 1
[0.95494986 0.04505016] 0
[0.9987778  0.00122216] 0
[0.6025654  0.39743462] 1
[0.56504256 0.43495747] 0
[0.5981697 0.4018303] 0
[0.53056836 0.4694317 ] 1
[1.000000e+00 8.932945e-09] 0
[9.9999201e-01 7.9842675e-06] 0
[6.8165816e-04 9.9931836e-01] 1
[0.01316018 0.9868

In [36]:
# The predicted class is the index of the largest probability in each row.
# Print it next to the true label for every test sample.
predicted_classes = np.argmax(prediction, axis=1)
for predicted_class, true_label in zip(predicted_classes, test_targets):
    print(predicted_class, true_label)

1 1
1 1
0 0
0 1
0 0
0 0
1 0
0 0
1 1
0 0
0 1
1 1
0 0
0 0
0 0
1 1
1 1
0 0
0 0
0 1
0 0
1 1
1 0
0 0
0 0
1 0
1 1
0 0
1 0
0 0
0 0
0 0
1 1
1 1
1 0
0 0
1 1
1 1
0 0
0 0
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 1
1 1
0 0
0 0
1 1
0 0
0 0
1 1
1 1
0 0
0 0
0 1
1 1
0 0
1 1
1 1
0 1
0 0
0 1
0 0
0 0
1 1
0 0
1 1
0 0
0 1
0 0
0 0
0 0
1 0
0 0
1 1
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
1 1
1 1
1 0
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
0 0
1 0
0 0
0 0
0 1
1 0
0 0
1 1
0 0
0 0
1 1
0 1
1 0
1 1
0 0
0 1
1 1
1 1
0 1
1 1
0 0
1 1
1 1
0 0
0 1
0 0
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 1
0 0
0 0
0 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 1
0 1
0 0
0 0
0 0
1 1
0 0
1 1
1 1
0 0
0 0
1 1
1 1
1 1
0 1
1 1
1 1
1 0
0 1
1 1
0 0
1 0
0 0
0 0
0 0
1 1
0 0
1 0
1 0
1 1
0 0
0 0
1 1
0 0
0 0
0 0
1 1
1 1
0 0
1 1
1 1
0 1
0 0
0 1
1 1
0 0
1 1
1 1
1 1
0 1
1 1
0 0
1 1
1 1
0 0
0 0
1 1
1 1
1 1
0 1
0 0
0 0
1 0
0 0
1 1
0 0
1 1
0 1
0 1
1 1
1 1
1 0
0 0
0 0
1 0
1 1
0 1
0 0
0 0
1 1
1 1
0 0
0 1
