## Create the machine learning algorithm

### Libraries

In [1]:
import numpy as np
import tensorflow as tf

### Data

In [2]:
# Create a temporary variable npz, with each of the three Audiobooks datasets
npz = np.load('Audiobooks_data_train.npz')

# Extract the inputs using the keyword under which it was saved
# Ensure that they are all floats
train_inputs = npz['inputs'].astype(np.float)
# targets must be int because of sparse_categorical_crossentropy
train_targets = npz['targets'].astype(np.int)

# Load the validation data in the temporary variable
npz = np.load('Audiobooks_data_validation.npz')
# Load the inputs and the targets in the same line
validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

# Load the test data in the temporary variable
npz = np.load('Audiobooks_data_test.npz')
# Create 2 variables that will contain the test inputs and the test targets
test_inputs, test_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

### Model

In [3]:
# Optionally set the input size. It won't be using in that case, but in some cases it is beneficial
# input_size = 10
# Set the output size
output_size = 2
# Use same hidden layer size for both hidden layers.
hidden_layer_size = 50
    
# Define how the model will look like
model = tf.keras.Sequential([
    # tf.keras.layers.Dense is basically implementing: output = activation(dot(input, weight) + bias)
    # it takes several arguments, but the most important ones for us are the hidden_layer_size and the activation function
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 1st hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 2nd hidden layer
    # the final layer is no different, make sure to activate it with softmax
    tf.keras.layers.Dense(output_size, activation='softmax') # output layer
])


# Choose the optimizer and the loss function
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training
# Set the batch size
batch_size = 100

# Set a maximum number of training epochs
max_epochs = 100

# Set an early stopping mechanism
# Set patience=2, to be a bit tolerant against random validation loss increases
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# Fit the model
# note that this time the train, validation and test data are not iterable
model.fit(train_inputs, # train inputs
          train_targets, # train targets
          batch_size=batch_size, # batch size
          epochs=max_epochs, # epochs that we will train for (assuming early stopping doesn't kick in)
          # callbacks are functions called by a task when a task is completed
          # task here is to check if val_loss is increasing
          callbacks=[early_stopping], # early stopping
          validation_data=(validation_inputs, validation_targets), # validation data
          verbose = 2 # making sure we get enough information about the training process
          )

Train on 3579 samples, validate on 447 samples
Epoch 1/100
3579/3579 - 1s - loss: 0.5316 - accuracy: 0.7818 - val_loss: 0.3664 - val_accuracy: 0.9060
Epoch 2/100
3579/3579 - 0s - loss: 0.3630 - accuracy: 0.8698 - val_loss: 0.2780 - val_accuracy: 0.9105
Epoch 3/100
3579/3579 - 0s - loss: 0.3261 - accuracy: 0.8751 - val_loss: 0.2526 - val_accuracy: 0.9150
Epoch 4/100
3579/3579 - 0s - loss: 0.3072 - accuracy: 0.8832 - val_loss: 0.2392 - val_accuracy: 0.9195
Epoch 5/100
3579/3579 - 0s - loss: 0.2946 - accuracy: 0.8849 - val_loss: 0.2329 - val_accuracy: 0.9217
Epoch 6/100
3579/3579 - 0s - loss: 0.2831 - accuracy: 0.8899 - val_loss: 0.2236 - val_accuracy: 0.9284
Epoch 7/100
3579/3579 - 0s - loss: 0.2784 - accuracy: 0.8910 - val_loss: 0.2218 - val_accuracy: 0.9329
Epoch 8/100
3579/3579 - 0s - loss: 0.2697 - accuracy: 0.8952 - val_loss: 0.2250 - val_accuracy: 0.9239
Epoch 9/100
3579/3579 - 0s - loss: 0.2642 - accuracy: 0.8972 - val_loss: 0.2124 - val_accuracy: 0.9329
Epoch 10/100
3579/3579 - 0

<tensorflow.python.keras.callbacks.History at 0x133d85c50>

## Test the model

After training on the training data and validating on the validation data, test the final prediction power of model by running it on the test dataset that the algorithm has NEVER seen before.

It is very important to realize that fiddling with the hyperparameters overfits the validation dataset. 

The test is the absolute final instance.

If you adjust your model after testing, you will start overfitting the test dataset, which will defeat its purpose.

In [4]:
# Declare two variables that are going to contain the two outputs from the evaluate function
# they are the loss (which is there by default) and whatever was specified in the 'metrics' argument when fitting the model
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [5]:
# Print the result in neatly formatted
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.25. Test accuracy: 90.85%


### Obtain the probability for a customer to convert

In [6]:
# Predict the probability of each class using the 'predict' method
# The output comes in a scientific format
# Uncomment the round() method to achive a rounded result (not scientific notation)
model.predict(test_inputs)#.round(2)

array([[6.10031843e-01, 3.89968127e-01],
       [1.00000000e+00, 2.32860470e-13],
       [1.00000000e+00, 3.48767934e-16],
       [6.18820131e-01, 3.81179929e-01],
       [7.06007719e-01, 2.93992281e-01],
       [1.71381548e-01, 8.28618407e-01],
       [9.20913666e-02, 9.07908618e-01],
       [1.54565737e-01, 8.45434248e-01],
       [4.62374592e-04, 9.99537706e-01],
       [8.24483752e-01, 1.75516218e-01],
       [1.54005095e-01, 8.45994890e-01],
       [1.40965417e-01, 8.59034538e-01],
       [1.01247795e-01, 8.98752213e-01],
       [9.93918419e-01, 6.08153036e-03],
       [8.60991716e-01, 1.39008224e-01],
       [9.44926023e-01, 5.50739579e-02],
       [9.99934316e-01, 6.56858174e-05],
       [1.17736310e-02, 9.88226354e-01],
       [1.03848435e-01, 8.96151602e-01],
       [9.95944560e-01, 4.05547814e-03],
       [9.76306975e-01, 2.36930232e-02],
       [9.98183906e-01, 1.81603699e-03],
       [7.36958325e-01, 2.63041705e-01],
       [2.08558583e-12, 1.00000000e+00],
       [1.642642

In [7]:
# Alternatively, get only the second column
# The main idea is that we are often interested in ONLY ONE of the two outcomes
# In this case we would like to know if the customer will convert again
# Round to 0 digits, to achieve only 0s or 1s
model.predict(test_inputs)[:,1].round(0)

array([0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1.,
       1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1.,
       0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1.,
       0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0.,
       0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0.,
       0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1.,
       1., 1., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1.,
       1., 0., 0., 1., 0.

In [8]:
# A much better approach here would be to use argmax (arguments of the maxima)
# Argmax indicates the position of the highest argument row-wise or column-wise
# In our case, we want ot know which COLUMN has the higher argument (probability), therefore we set axis=1 (for columns)
# The output would be the column ID with the highest argument for each observation (row)
# For instance, the first observation (in our output) was [0.93,0.07]
# np.argmax([0.93,0.07], axis=1) would find that 0.91 is the higher argument (higher probability) and return 0
# This method is great for multi-class problems as it is independent of the number of classes
np.argmax(model.predict(test_inputs),axis=1)

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

### Save the model

In [9]:
# Save the model using the built-in method TensorFlow method
# Since the HDF format is optimal for large numerical objects, that's our preferred choice here
# The proper extension is .h5 to indicate HDF, version 5
model.save('audiobooks_model.h5')