# Audiobooks - Building a model to predict whether a customer will buy again or not

## 1. Preprocessing

* Preprocess the data
* Balance the dataset
* Create training, validation and test datasets
* Save the data in npz format

##### Imports

In [289]:
import numpy as np
from sklearn import preprocessing
import tensorflow as tf

##### Data

In [290]:
# Read the whole CSV into a single 2-D array (comma-separated values).
raw_data = np.loadtxt('data/Audiobooks_data.csv', delimiter=',')

# Split the columns into features and target in one step:
#   - inputs: every column except the first and the last
#     (the first column is skipped -- presumably a customer ID; TODO confirm)
#   - output: the last column, i.e. the target
raw_inputs, raw_output = raw_data[:, 1:-1], raw_data[:, -1]

##### Balancing the dataset

* There are far more zeros than ones in the output column.
* The problem is that a model could reach a whopping ~84% accuracy simply by always predicting 0.
* That would make accuracy misleading, so the dataset needs to be balanced.

In [291]:
# Class counts, displayed as (number of 0 targets, number of 1 targets)
tuple((raw_output == label).sum() for label in (0, 1))

(11847, 2237)

In [292]:
# Boolean masks for the two classes
zero_mask = raw_output == 0
one_mask = raw_output == 1

# Row positions of each class (np.where on a 1-D mask returns a 1-tuple)
indices_zero = np.where(zero_mask)[0]
indices_one = np.where(one_mask)[0]

# Undersample the zeros: draw as many zero rows as there are one rows,
# without replacement so that no row is picked twice
chosen_indices_zero = np.random.choice(
    indices_zero,
    size=indices_one.shape[0],
    replace=False,
)

# All ones first, then the sampled zeros (the order is randomized later,
# in the shuffling step)
balanced_indices = np.concatenate([indices_one, chosen_indices_zero])

In [293]:
# Balanced output: the targets at the selected (balanced) row indices
balanced_output = raw_output[balanced_indices]
# Shown as the cell result to confirm how many samples were kept
balanced_output.shape

(4474,)

In [294]:
# Balanced input: the feature rows at the same indices as balanced_output,
# so inputs and targets stay aligned row by row
balanced_input = raw_inputs[balanced_indices]
# Shown as the cell result: (number of samples, number of features)
balanced_input.shape

(4474, 10)

In [295]:
# Sanity check after balancing: both classes should now have the same count
tuple((balanced_output == label).sum() for label in (0, 1))

(2237, 2237)

##### Standardizing

In [296]:
# Standardize each input column (sklearn's `scale` centers to zero mean and
# scales to unit variance along axis 0 by default).
# NOTE(review): the statistics are computed over ALL balanced rows, including
# rows that later end up in the validation and test splits. Fitting the scaler
# on the training rows only would avoid this leakage -- consider reordering.
scaled_inputs = preprocessing.scale(balanced_input)

##### Shuffling

In [297]:
# The balanced rows are ordered "all ones, then the sampled zeros", so they
# must be shuffled before splitting. One random row order is drawn and applied
# to inputs and targets alike, so every sample keeps its label.
shuffled_indices = np.arange(len(scaled_inputs))
np.random.shuffle(shuffled_indices)  # shuffles in place

shuffled_inputs, shuffled_output = (
    scaled_inputs[shuffled_indices],
    balanced_output[shuffled_indices],
)


##### Splitting

In [298]:
# Total number of samples available for splitting (rows of the input matrix)
samples_count = len(shuffled_inputs)
samples_count

4474

In [299]:
# Training share: 80% of all samples (int() drops the fractional part)
train_count = int(samples_count * 0.8)
train_count

3579

In [300]:
# Validation share: 10% of all samples (int() drops the fractional part)
validation_count = int(samples_count * 0.1)
validation_count

447

In [301]:
# Test = all - (train + validation): whatever remains after the two
# truncated counts, so the three splits always add up to samples_count
test_count = samples_count - (train_count + validation_count)
test_count

448

In [302]:
# Consecutive, non-overlapping row ranges of the shuffled data:
#   [0, train_count)               -> train
#   [train_count, validation_end)  -> validation
#   [validation_end, end)          -> test
validation_end = train_count + validation_count
train_rows = slice(None, train_count)
validation_rows = slice(train_count, validation_end)
test_rows = slice(validation_end, None)

# The same row range is applied to inputs and targets so they stay aligned
train_inputs, train_output = shuffled_inputs[train_rows], shuffled_output[train_rows]
validation_inputs, validation_output = (
    shuffled_inputs[validation_rows],
    shuffled_output[validation_rows],
)
test_inputs, test_output = shuffled_inputs[test_rows], shuffled_output[test_rows]

In [303]:
# Shapes of the three input splits, in train / validation / test order
tuple(part.shape for part in (train_inputs, validation_inputs, test_inputs))

((3579, 10), (447, 10), (448, 10))

In [304]:
# Shapes of the three target splits, in train / validation / test order
tuple(part.shape for part in (train_output, validation_output, test_output))

((3579,), (447,), (448,))

##### Save npzs

In [305]:
import os

# np.savez does not create missing directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
os.makedirs('npz', exist_ok=True)

# Each split is stored as one archive with two named arrays, 'inputs' and
# 'output'. np.savez appends the '.npz' extension to the given filename.

# Train npz
np.savez('npz/Audiobooks_data_train', inputs=train_inputs, output=train_output)
# Validation npz
np.savez('npz/Audiobooks_data_validation', inputs=validation_inputs, output=validation_output)
# Test npz
np.savez('npz/Audiobooks_data_test', inputs=test_inputs, output=test_output)

## 2. Model

##### Load the data

* (The arrays are still in memory at this point, but reloading them from the npz files lets this section run on its own.)

In [306]:
# Load npzs
# np.load on an .npz archive returns an NpzFile that keeps the underlying file
# open. Using it as a context manager closes the handle as soon as the arrays
# have been read. .astype() returns new in-memory arrays, so the data stays
# valid after the archive is closed.

# Inputs are cast to float and targets to int (integer class labels, as
# expected by the sparse categorical cross-entropy loss used below).
with np.load('npz/Audiobooks_data_train.npz') as npz_train:
    train_inputs = npz_train['inputs'].astype(float)
    train_output = npz_train['output'].astype(int)

with np.load('npz/Audiobooks_data_validation.npz') as npz_validation:
    validation_inputs = npz_validation['inputs'].astype(float)
    validation_output = npz_validation['output'].astype(int)

with np.load('npz/Audiobooks_data_test.npz') as npz_test:
    test_inputs = npz_test['inputs'].astype(float)
    test_output = npz_test['output'].astype(int)


##### Model

In [307]:
# Network dimensions
input_size = 10          # number of feature columns in the prepared inputs
output_size = 2          # one output unit per class (0 and 1)
hidden_layer_size = 50   # units in each hidden layer

model = tf.keras.Sequential([
    # Declare the input shape explicitly so the model is built right away and
    # rejects inputs with the wrong number of features (input_size was
    # previously defined but never used).
    tf.keras.Input(shape=(input_size,)),
    # 2 hidden layers
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    # Output layer: softmax turns the two outputs into class probabilities
    tf.keras.layers.Dense(output_size, activation='softmax'),
])

In [308]:
# Adam optimizer; sparse categorical cross-entropy because the targets are
# integer class labels rather than one-hot vectors; report accuracy as well.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

##### Early stopping

In [309]:
# Stop training once the validation loss has not improved for 3 consecutive
# epochs, then roll the model back to the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)

##### Training

In [310]:
# Samples per gradient update, and the upper bound on training epochs
# (early stopping may end training sooner).
batch_size, max_epochs = 100, 100

In [311]:
# Train on the training split and evaluate on the validation split after every
# epoch; the early-stopping callback decides when to stop before max_epochs.
# The call is left as the cell's last expression so the History object is shown.
model.fit(
    x=train_inputs,
    y=train_output,
    validation_data=(validation_inputs, validation_output),
    epochs=max_epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


<keras.src.callbacks.History at 0x179b86e2fe0>

## 3. Testing

In [313]:
# Final check on the held-out test split, which was never used during
# training or early stopping. Returns the loss followed by the compiled
# metric (accuracy).
test_loss, test_accuracy = model.evaluate(x=test_inputs, y=test_output)



In [314]:
# Report both numbers with two decimals; accuracy is shown as a percentage
report_template = 'Test loss: {0:.2f}\nTest accuracy: {1:.2f}%'
print(report_template.format(test_loss, test_accuracy*100.0))

Test loss: 0.34
Test accuracy: 82.59%


* 82.59% Accuracy

* Decent, but I was honestly hoping for >90% accuracy. I might do some fine-tuning later.