<a href="https://colab.research.google.com/github/SobowaleAhmed/gomycode/blob/main/audiobooks_dl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### We want to build a deep learning model to predict whether a customer of an audiobook platform will return
1. Preprocessing the data
    * Balance the dataset - Avoid Bias
    * Divide into three parts: training, validation and test - prevent overfitting
    * Save the data in a tensor format
2. Create the Deep Learning algorithm
3. Evaluate the model

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

## Load the Data and preprocess it

data

In [None]:
# Read the comma-separated dataset into a NumPy array (np.loadtxt parses
# values as floats by default).
path = 'Audiobooks_data.csv'
raw_data = np.loadtxt(fname=path, delimiter=',')

In [None]:
# Inspect the loaded array (NumPy abbreviates the repr of large arrays).
raw_data

array([[9.9400e+02, 1.6200e+03, 1.6200e+03, ..., 5.0000e+00, 9.2000e+01,
        0.0000e+00],
       [1.1430e+03, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.0590e+03, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 3.8800e+02,
        0.0000e+00],
       ...,
       [3.1134e+04, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [3.2832e+04, 1.6200e+03, 1.6200e+03, ..., 0.0000e+00, 9.0000e+01,
        0.0000e+00],
       [2.5100e+02, 1.6740e+03, 3.3480e+03, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00]], shape=(14084, 12))

In [None]:
# Split the raw array column-wise into features and target:
#   * the last column is the target,
#   * every column between the first and the last is a feature,
#   * column 0 is dropped (presumably a customer ID -- TODO confirm).
features = raw_data[:, 1:-1]
target = raw_data[:, -1]

### Balance the dataset
* We will count the number of targets that are 1
* We will keep as many 0s as 1s
* Delete the others

In [None]:
# Balance the classes by marking surplus zero-target rows for removal.
# Summing the target gives the number of ones (assumes targets are 0/1).
one_target_counter = int(np.sum(target))

# Positions of every zero-target row, in file order.
zero_row_positions = [row for row in range(target.shape[0]) if target[row] == 0]
zero_target_counter = len(zero_row_positions)

# Keep the first `one_target_counter` zeros; every later zero is dropped.
indices_to_remove = zero_row_positions[one_target_counter:]



In [None]:
# Number of rows whose target is 1.
one_target_counter

2237

In [None]:
# Number of rows whose target is 0 (before balancing).
zero_target_counter

11847

In [None]:
# How many zero-target rows are queued for removal.
len(indices_to_remove)

9610

In [None]:
# Drop the surplus zero-target rows from both arrays so they stay row-aligned.
# NOTE(review): `features` / `target` are rebound to the trimmed arrays, so this
# cell is not idempotent -- re-running it applies the same indices to data that
# has already been trimmed. Re-run from the feature/target split cell instead.
features, target = (
    np.delete(rows, indices_to_remove, axis=0) for rows in (features, target)
)

In [None]:
# Tally both classes after balancing; any target that is not 0 counts as "one".
zero = sum(1 for label in target if label == 0)
one = len(target) - zero



In [None]:
# Show the two class counts, ones first, one per line.
print(one, zero, sep="\n")

2237
2237


In [None]:
# Inspect the balanced target vector.
target

array([0., 0., 0., ..., 1., 1., 1.], shape=(4474,))

In [None]:
# Peek at the first 11 feature rows before scaling.
features[:11,:]

array([[1.6200e+03, 1.6200e+03, 1.9730e+01, 1.9730e+01, 1.0000e+00,
        1.0000e+01, 9.9000e-01, 1.6038e+03, 5.0000e+00, 9.2000e+01],
       [2.1600e+03, 2.1600e+03, 5.3300e+00, 5.3300e+00, 0.0000e+00,
        8.9100e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [2.1600e+03, 2.1600e+03, 5.3300e+00, 5.3300e+00, 0.0000e+00,
        8.9100e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 3.8800e+02],
       [1.6200e+03, 1.6200e+03, 5.9600e+00, 5.9600e+00, 0.0000e+00,
        8.9100e+00, 4.2000e-01, 6.8040e+02, 1.0000e+00, 1.2900e+02],
       [2.1600e+03, 2.1600e+03, 5.3300e+00, 5.3300e+00, 0.0000e+00,
        8.9100e+00, 2.2000e-01, 4.7520e+02, 0.0000e+00, 3.6100e+02],
       [2.1600e+03, 2.1600e+03, 4.6100e+00, 4.6100e+00, 0.0000e+00,
        8.9100e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       [2.1600e+03, 2.1600e+03, 5.3300e+00, 5.3300e+00, 0.0000e+00,
        8.9100e+00, 4.0000e-02, 8.6400e+01, 0.0000e+00, 3.6600e+02],
       [6.4800e+02, 6.4800e+02, 5.3300e+0

In [None]:
# Standardise every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on all rows, before the notebook's later
# train/validation/test split, so held-out rows contribute to the scaling
# statistics. Consider fitting on the training rows only.
scaler = StandardScaler().fit(features)
features = scaler.transform(features)

In [None]:
# Peek at the first 11 feature rows after scaling.
features[:11, :]

array([[ 2.10533868e-01, -1.88885170e-01,  1.97823887e+00,
         1.43819814e+00,  2.10780037e+00,  1.49110879e+00,
         4.24286158e+00,  4.80955413e+00,  1.18382842e+01,
         9.41504310e-02],
       [ 1.27894497e+00,  4.16467444e-01, -3.90824747e-01,
        -5.08367103e-01, -4.74428231e-01,  1.42655343e-03,
        -4.26477846e-01, -4.15699224e-01, -2.01834807e-01,
        -8.02558517e-01],
       [ 1.27894497e+00,  4.16467444e-01, -3.90824747e-01,
        -5.08367103e-01, -4.74428231e-01,  1.42655343e-03,
        -4.26477846e-01, -4.15699224e-01, -2.01834807e-01,
         2.97921400e+00],
       [ 2.10533868e-01, -1.88885170e-01, -2.87178214e-01,
        -4.23204874e-01, -4.74428231e-01,  1.42655343e-03,
         1.55445403e+00,  1.80107493e+00,  2.20618899e+00,
         4.54783377e-01],
       [ 1.27894497e+00,  4.16467444e-01, -3.90824747e-01,
        -5.08367103e-01, -4.74428231e-01,  1.42655343e-03,
         6.11153137e-01,  1.13252399e+00, -2.01834807e-01,
         2.

### Shuffling the Data

In [None]:
# Quick demonstration: np.random.shuffle reorders an array in place
# (it returns None, so the array itself is displayed afterwards).
test = np.arange(10)
np.random.shuffle(test)
test

array([7, 1, 4, 0, 2, 9, 8, 3, 6, 5])

In [None]:
# Shuffle features and targets with one shared permutation of the row indices
# so that every row keeps its own label.
#
# A locally seeded Generator is used instead of the unseeded global RNG, so the
# permutation -- and therefore which rows land in the train/validation/test
# sets -- is identical on every "Restart & Run All".
shuffle_rng = np.random.default_rng(42)
shuffled_indices = shuffle_rng.permutation(features.shape[0])

shuffled_features = features[shuffled_indices]
shuffled_targets = target[shuffled_indices]

In [None]:
# First 11 targets after shuffling.
shuffled_targets[:11]

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.])

In [None]:
# First 11 targets in the original (unshuffled) order, for comparison.
target[:11]

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])

### Split the Dataset into Train, Validation and Test set
* Training set 80%
* Validation set 10%
* Test set 10%

In [None]:
# Partition the shuffled rows 80% / 10% / 10%. The first two counts are
# floored; the test set takes whatever rows remain.
sample_count = shuffled_features.shape[0]
train_sample_count = int(0.8 * sample_count)
val_sample_count = int(0.1 * sample_count)
test_sample_count = sample_count - train_sample_count - val_sample_count

# Three consecutive, non-overlapping row ranges.
train_rows = slice(None, train_sample_count)
val_rows = slice(train_sample_count, train_sample_count + val_sample_count)
test_rows = slice(train_sample_count + val_sample_count, None)

training_features, training_target = shuffled_features[train_rows], shuffled_targets[train_rows]
val_features, val_target = shuffled_features[val_rows], shuffled_targets[val_rows]
test_features, test_target = shuffled_features[test_rows], shuffled_targets[test_rows]

In [None]:
# Number of rows in the training split.
len(training_features)

3579

In [None]:
# Number of rows in the test split.
len(test_features)

448

### Save the three datasets into .npz format

In [None]:
# Persist each split as an archive holding an 'input' and a 'targets' array.
for archive_name, split_inputs, split_targets in (
    ("Audio_train_data", training_features, training_target),
    ("Audio_validation_data", val_features, val_target),
    ("Audio_test_data", test_features, test_target),
):
    np.savez(archive_name, input=split_inputs, targets=split_targets)

## Reload the data

In [None]:
# Reload the three saved splits, casting inputs to float and targets to int.
#
# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open, so each archive is read inside a `with` block: the arrays are
# copied out by .astype(...) and the file handle is released straight away.
with np.load('Audio_train_data.npz') as train_npz:
    train_inputs = train_npz['input'].astype(float)
    train_targets = train_npz['targets'].astype(int)

with np.load('Audio_validation_data.npz') as val_npz:
    val_inputs = val_npz['input'].astype(float)
    # The singular name `val_target` is kept: the training cell reads it.
    val_target = val_npz['targets'].astype(int)

with np.load("Audio_test_data.npz") as test_npz:
    test_inputs = test_npz['input'].astype(float)
    test_targets = test_npz['targets'].astype(int)

## Model: Outline, Optimizers, Loss, early stopping and training

In [None]:
# Make weight initialisation and batch shuffling repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Architecture hyperparameters
input_size = 10             # one input per feature column
output_size = 2             # two classes: target 0 and target 1
output_szie = output_size   # original (misspelled) name, kept for backward compatibility
hidden_layer_size = 50

# Three ReLU hidden layers followed by a softmax over the two classes.
# Declaring the input shape up front builds the model immediately and makes a
# feature-count mismatch fail at fit time with a clear shape error.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(input_size,)),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax'),
])

# Integer class labels + softmax output -> sparse categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training settings
batch_size = 100
epoch = 100  # upper bound; early stopping normally ends training much sooner

# Stop once the validation loss has not improved for 2 consecutive epochs and
# roll the model back to the best epoch's weights; without
# restore_best_weights the model would keep the weights of the final epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)

# Keep the History object so the learning curves can be inspected later
# (and so the cell does not end with a bare History repr).
history = model.fit(
    train_inputs,
    train_targets,
    batch_size=batch_size,
    epochs=epoch,
    callbacks=[early_stopping],
    validation_data=(val_inputs, val_target),
    verbose=2,
)

Epoch 1/100
36/36 - 4s - 106ms/step - accuracy: 0.6681 - loss: 0.5917 - val_accuracy: 0.7517 - val_loss: 0.4873
Epoch 2/100
36/36 - 0s - 8ms/step - accuracy: 0.7678 - loss: 0.4433 - val_accuracy: 0.8031 - val_loss: 0.3877
Epoch 3/100
36/36 - 0s - 8ms/step - accuracy: 0.7871 - loss: 0.3912 - val_accuracy: 0.7987 - val_loss: 0.3638
Epoch 4/100
36/36 - 0s - 8ms/step - accuracy: 0.8047 - loss: 0.3686 - val_accuracy: 0.8210 - val_loss: 0.3469
Epoch 5/100
36/36 - 0s - 8ms/step - accuracy: 0.8027 - loss: 0.3585 - val_accuracy: 0.8121 - val_loss: 0.3407
Epoch 6/100
36/36 - 0s - 8ms/step - accuracy: 0.8078 - loss: 0.3538 - val_accuracy: 0.8233 - val_loss: 0.3475
Epoch 7/100
36/36 - 0s - 8ms/step - accuracy: 0.8064 - loss: 0.3476 - val_accuracy: 0.8143 - val_loss: 0.3330
Epoch 8/100
36/36 - 0s - 7ms/step - accuracy: 0.8136 - loss: 0.3388 - val_accuracy: 0.8277 - val_loss: 0.3324
Epoch 9/100
36/36 - 0s - 8ms/step - accuracy: 0.8111 - loss: 0.3422 - val_accuracy: 0.8009 - val_loss: 0.3342
Epoch 10

<keras.src.callbacks.history.History at 0x2aaa448bec0>

## Test The Model

In [None]:
# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by the metric configured at compile time (accuracy).
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8438 - loss: 0.3206 


In [None]:
# Report the test loss and the test accuracy (as a percentage), two decimals each.
report_template = "Test Loss: {0:.2f}. \nTest accuracy: {1:.2f}%"
print(report_template.format(test_loss, test_accuracy * 100))

Test Loss: 0.32. 
Test accuracy: 84.38%
