### Preprocessing the data

Here i balance the data to enable a equal distribution of the dataset for good ML predicton. Create 3 dataset, the training, test, validation. then save the dataset as npz format

In [1]:
import numpy as np
from  sklearn import preprocessing

### Extract the data

In [2]:
raw_csv_data = np.loadtxt('Audiobooks_data.csv.csv', delimiter=',')
                          
unscaled_features = raw_csv_data[:, 1:-1]
targets_all = raw_csv_data[:,-1]

### Balance the dataset

1. i will counts the number of targets that are 1s
2. keep as many 0s as 1s(delete the others)



In [3]:
num_one_targets = int(np.sum(targets_all)) 
zero_targets_counter = 0
indices_to_remove = []
"""
in the loop, we want to increase the zeroes counter by 1, if the target is 0.
if the target at the position i is 0, and the number of 0s is bigger then the number of 1s, i will know the indices of all data points to be removed.
"""
for i in range(targets_all.shape[0]):
   
    if targets_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)

prior_unscaled_features_equal = np.delete(unscaled_features, indices_to_remove, axis=0)
prior_targets_equal = np.delete(targets_all, indices_to_remove, axis=0)

### Standardize the inputs


In [4]:
scaled_features = preprocessing.scale(prior_unscaled_features_equal)

###  Shuffle the data
since i will be batching, the model will confuse if there homogeneity in the dataset. It be be great to  shuffle to enable randomness for proper effecitveness of SDG.

In [5]:
shuffled_indices= np.arange(scaled_features.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_features = scaled_features[shuffled_indices]
shuffled_targets = prior_targets_equal[shuffled_indices]

### Split the dataset into train, validation and test

In [6]:
sample_counts = shuffled_features.shape[0]

train_samples_count = int(0.8*sample_counts)
validation_samples_count = int(0.1*sample_counts)
test_samples_count = sample_counts - train_samples_count + validation_samples_count

train_features = shuffled_features[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

validation_features = shuffled_features[train_samples_count:train_samples_count+validation_samples_count] 
validation_targets = shuffled_targets[train_samples_count:train_samples_count+validation_samples_count] 

test_features = shuffled_features[train_samples_count+validation_samples_count:] 
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]


print(np.sum(train_targets), train_samples_count, np.sum(train_targets)/train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets)/validation_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets)/test_samples_count)


1767.0 3579 0.49371332774518023
237.0 447 0.5302013422818792
233.0 1342 0.1736214605067064


### save preprocess data to .npz format

In [7]:
np.savez('Audio_data_train', inputs=train_features, targets=train_targets)
np.savez('Audio_data_validation', inputs=validation_features, targets=validation_targets)
np.savez('Audio_data_test', inputs=test_features, targets=test_targets)

###  Creating the model


In [8]:
import tensorflow as tf

In [9]:
# train data
npz = np.load('Audio_data_train.npz')
train_features= npz['inputs'].astype(float)
train_targets = npz['targets'].astype(int)

# validation data
npz = np.load('Audio_data_validation.npz')
validation_features= npz['inputs'].astype(float)
validation_targets = npz['targets'].astype(int)

# test data
npz = np.load('Audio_data_test.npz')
test_features= npz['inputs'].astype(float)
test_targets = npz['targets'].astype(int)

In [10]:
input_size = 10
output_size = 2
hidden_layer = 50

model = tf.keras.Sequential([
                            tf.keras.layers.Dense(hidden_layer, activation='relu'),#hidden layer,
                            tf.keras.layers.Dense(hidden_layer, activation='relu'),#hidden layer,
                            tf.keras.layers.Dense(hidden_layer, activation='relu'),#hidden layer,
                            tf.keras.layers.Dense(output_size, activation='softmax'),#outlayer layer(returns probability),
                     
                            ])

### choosing optimizer and loss function


In [11]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [12]:
BATCH_SIZE = 200
MAX_EPOCH = 200
EARLY_STOPPING = tf.keras.callbacks.EarlyStopping(patience=2)
model.fit(train_features, 
          train_targets,
          callbacks=[EARLY_STOPPING],
          batch_size=BATCH_SIZE, 
          epochs=MAX_EPOCH, 
          validation_data=(validation_features, validation_targets),
          verbose=2)

Epoch 1/200
18/18 - 2s - loss: 0.5673 - accuracy: 0.7737 - val_loss: 0.4651 - val_accuracy: 0.8591 - 2s/epoch - 85ms/step
Epoch 2/200
18/18 - 0s - loss: 0.3896 - accuracy: 0.8782 - val_loss: 0.3455 - val_accuracy: 0.8680 - 87ms/epoch - 5ms/step
Epoch 3/200
18/18 - 0s - loss: 0.3202 - accuracy: 0.8854 - val_loss: 0.3152 - val_accuracy: 0.8770 - 78ms/epoch - 4ms/step
Epoch 4/200
18/18 - 0s - loss: 0.2937 - accuracy: 0.8910 - val_loss: 0.3010 - val_accuracy: 0.8770 - 81ms/epoch - 4ms/step
Epoch 5/200
18/18 - 0s - loss: 0.2780 - accuracy: 0.8994 - val_loss: 0.3022 - val_accuracy: 0.8859 - 84ms/epoch - 5ms/step
Epoch 6/200
18/18 - 0s - loss: 0.2705 - accuracy: 0.8977 - val_loss: 0.2850 - val_accuracy: 0.8904 - 79ms/epoch - 4ms/step
Epoch 7/200
18/18 - 0s - loss: 0.2610 - accuracy: 0.9014 - val_loss: 0.2785 - val_accuracy: 0.8881 - 78ms/epoch - 4ms/step
Epoch 8/200
18/18 - 0s - loss: 0.2559 - accuracy: 0.9036 - val_loss: 0.2821 - val_accuracy: 0.8949 - 94ms/epoch - 5ms/step
Epoch 9/200
18/18

<keras.callbacks.History at 0x273f2b244c0>

### Test the model

In [13]:
test_loss, test_accuracy = model.evaluate(test_features, test_targets)



In [14]:
print('Test loss: {:.2f}, Test accuray:{:.2f}%'.format(test_loss, test_accuracy*100.0))

Test loss: 0.32, Test accuray:88.17%
