# Audiobooks Customer Return Prediction

# Information about Input data

The CSV file has no header row, and some pre-processing has already been done.
For instance, missing values in the rating field have been filled with the mean value (8.91).

The columns are as follows (in order)

1. Customer ID
2. Total minutes of all the audio books purchased
3. Average minutes of all the audio books purchased
4. Total price
5. Average price
6. Review ( 0 = Did not submit review, 1 = Submitted review )
7. Review out of 10
8. Minutes listened
9. Completion percent
10. Support requests made
11. ( Last visited date - Purchase date )
12. Customer returned to buy new audiobook (Target value) ( 1 = Customer returned, 0 = Customer did not return )

## Imports

In [29]:
import numpy as np
from sklearn import preprocessing
import tensorflow as tf

## Preprocessing

1. Shuffling the data to remove any time bias.
2. Balancing the target data as 50/50 to remove the learning bias.
3. Standardizing the inputs
4. Shuffling again to make sure train, validation and test data set are unbiased

Excluding the ID column, there are 10 input fields and 1 target field.

In [22]:
# Extract: the CSV has no header row, so every row is a data sample.
raw_csv_data = np.loadtxt('../input/audiobook-store-customer/Audiobooks_data.csv', delimiter=',')


# Shuffle the rows so that any ordering present in the file does not carry over.
shuffled_indices = np.arange(raw_csv_data.shape[0])
np.random.shuffle(shuffled_indices)
shuffled_inputs = raw_csv_data[shuffled_indices]


# Separate inputs and targets: column 0 (customer ID) is dropped,
# the last column is the target, everything in between is an input feature.
inputs_all = shuffled_inputs[:, 1:-1]
targets_all = shuffled_inputs[:, -1]


# Balance the data set by removing the excess 0-targets, as they would bias the learning of the model.
# The first `one_targets_count` zero-target rows are kept; every later zero-target row is dropped.
one_targets_count = int(np.sum(targets_all))
zero_target_indices = np.flatnonzero(targets_all == 0)
zero_targets_count = int(zero_target_indices.size)
indices_to_remove = zero_target_indices[one_targets_count:]

balanced_inputs_all = np.delete(inputs_all, indices_to_remove, axis=0)
balanced_targets_all = np.delete(targets_all, indices_to_remove, axis=0)


# Standardize the input values.
# NOTE(review): scaling happens before the train/validation/test split, so the
# scaling statistics are computed over all rows, including future test rows — confirm this is intended.
std_inputs = preprocessing.scale(balanced_inputs_all)


# Shuffle again. The index range is taken from `std_inputs`, which exists at this point;
# `inputs` is only assigned below, so reading it here would fail on a fresh kernel.
shuffled_indices = np.arange(std_inputs.shape[0])
np.random.shuffle(shuffled_indices)

inputs = std_inputs[shuffled_indices]
targets = balanced_targets_all[shuffled_indices]

# Split the dataset into Train, Validation, and Test sets

The dataset is split 80/10/10 into train, validation, and test sets.

In [25]:
# Total samples
samples = inputs.shape[0]

# Split count
train_samples_count = int(0.8 * samples)
validation_samples_count = int(0.1 * samples)
test_samples_count = samples - train_samples_count - validation_samples_count

# Creating train set
train_inputs = inputs[:train_samples_count]
train_targets = targets[:train_samples_count]

# Creating validation set
validation_inputs = inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets = targets[train_samples_count:train_samples_count+validation_samples_count]

# Creating test set
test_inputs = inputs[train_samples_count+validation_samples_count:]
test_targets = targets[train_samples_count+validation_samples_count:]

# Print the number of targets that are 1s, the total number of samples, and the proportion for training, validation, and test.
print("---Train---")
print(np.sum(train_targets), train_samples_count, np.sum(train_targets) / train_samples_count)
print("---Validation---")
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets) / validation_samples_count)
print("---Test---")
print(np.sum(test_targets), test_samples_count, np.sum(test_targets) / test_samples_count)

---Train---
1799.0 3579 0.5026543727298128
---Validation---
212.0 447 0.4742729306487696
---Test---
226.0 448 0.5044642857142857


# Create *.npz for model

In [24]:
# Write each partition to its own archive with the arrays stored under the keys 'inputs' and 'targets'.
for npz_path, split_inputs, split_targets in (
    ('./Audiobooks_data_train', train_inputs, train_targets),
    ('./Audiobooks_data_validation', validation_inputs, validation_targets),
    ('./Audiobooks_data_test', test_inputs, test_targets),
):
    np.savez(npz_path, inputs=split_inputs, targets=split_targets)

# Load npz files

In [26]:
# Train set
npz = np.load('Audiobooks_data_train.npz')
train_inputs, train_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

# Validation set
npz = np.load('Audiobooks_data_validation.npz')
validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

# Test set
npz = np.load('Audiobooks_data_test.npz')
test_inputs, test_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

# Model 

In [48]:
input_size = 10
output_size = 2

# Network and training configuration: two hidden layers of equal width.
hidden_layer_size = 50
batch_size = 120
max_epochs = 100

# Early stopping with patience=2: training is halted once the monitored quantity has gone
# 2 epochs without improving, instead of always running all `max_epochs` epochs.
# NOTE(review): the monitored quantity is left at the Keras default (presumably the validation loss) — confirm.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# Two ReLU hidden layers followed by a softmax layer with one unit per class.
hidden_layers = [
    tf.keras.layers.Dense(hidden_layer_size, activation='relu')
    for _ in range(2)
]
output_layer = tf.keras.layers.Dense(output_size, activation='softmax')

model = tf.keras.Sequential(hidden_layers + [output_layer])

# Targets are integer class labels, hence the sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train; the validation set is evaluated after every epoch and feeds the early-stopping callback.
model.fit(
    train_inputs,
    train_targets,
    validation_data=(validation_inputs, validation_targets),
    epochs=max_epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
    verbose=2,
)

Epoch 1/100
30/30 - 1s - loss: 0.6141 - accuracy: 0.6605 - val_loss: 0.5405 - val_accuracy: 0.7539
Epoch 2/100
30/30 - 0s - loss: 0.5019 - accuracy: 0.7580 - val_loss: 0.4645 - val_accuracy: 0.7629
Epoch 3/100
30/30 - 0s - loss: 0.4477 - accuracy: 0.7714 - val_loss: 0.4212 - val_accuracy: 0.7852
Epoch 4/100
30/30 - 0s - loss: 0.4195 - accuracy: 0.7882 - val_loss: 0.4019 - val_accuracy: 0.8076
Epoch 5/100
30/30 - 0s - loss: 0.4036 - accuracy: 0.7974 - val_loss: 0.3865 - val_accuracy: 0.8098
Epoch 6/100
30/30 - 0s - loss: 0.3920 - accuracy: 0.8047 - val_loss: 0.3769 - val_accuracy: 0.8098
Epoch 7/100
30/30 - 0s - loss: 0.3865 - accuracy: 0.8055 - val_loss: 0.3708 - val_accuracy: 0.8143
Epoch 8/100
30/30 - 0s - loss: 0.3783 - accuracy: 0.8108 - val_loss: 0.3715 - val_accuracy: 0.8166
Epoch 9/100
30/30 - 0s - loss: 0.3757 - accuracy: 0.8145 - val_loss: 0.3605 - val_accuracy: 0.8166
Epoch 10/100
30/30 - 0s - loss: 0.3708 - accuracy: 0.8136 - val_loss: 0.3591 - val_accuracy: 0.8188
Epoch 11/

<tensorflow.python.keras.callbacks.History at 0x7f7c4846b390>

# Testing the model

In [49]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets,verbose=0)
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.39. Test accuracy: 80.58%
