### Importing relevant libraries

In [10]:
import numpy as np
import tensorflow as tf
from sklearn import preprocessing

### Loading data

In [11]:
# Parse the comma-separated file into one NumPy array; every later cell
# derives its data from this array.
raw_data = np.loadtxt(fname='Audiobooks_data.csv', delimiter=',')

# Bare trailing expression so the notebook renders the array repr.
raw_data

array([[8.7300e+02, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [6.1100e+02, 1.4040e+03, 2.8080e+03, ..., 0.0000e+00, 1.8200e+02,
        1.0000e+00],
       [7.0500e+02, 3.2400e+02, 3.2400e+02, ..., 1.0000e+00, 3.3400e+02,
        1.0000e+00],
       ...,
       [2.8671e+04, 1.0800e+03, 1.0800e+03, ..., 0.0000e+00, 2.9000e+01,
        0.0000e+00],
       [3.1134e+04, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [3.2832e+04, 1.6200e+03, 1.6200e+03, ..., 0.0000e+00, 9.0000e+01,
        0.0000e+00]])

### Separating inputs and targets

In [3]:
# The last column is split off as the target vector; the inputs are every
# column between the first and the last (column 0 is deliberately skipped).
# NOTE(review): column 0 is presumably a record/customer ID with no
# predictive value -- confirm against the CSV schema.
targets_raw_data = raw_data[:, -1]
inputs_raw_data = raw_data[:, 1:-1]

### Balancing the dataset

In [4]:
# Balance the classes by dropping surplus target-0 rows.
# The sum of the target column is used as the number of 1-targets
# (assumes targets are encoded strictly as 0/1 -- TODO confirm).
num_one_targets = int(np.sum(targets_raw_data))

# Row positions of every target-0 sample, in their original order. The first
# `num_one_targets` of them are kept; every later one is surplus. This is the
# vectorised equivalent of walking the array with a running counter.
zero_target_indices = np.flatnonzero(targets_raw_data == 0)
indices_to_remove = zero_target_indices[num_one_targets:]

# The same row indices are deleted from inputs and targets so they stay aligned.
# Rows whose target is not 0 are never removed.
balanced_inputs = np.delete(inputs_raw_data, indices_to_remove, axis=0)
balanced_targets = np.delete(targets_raw_data, indices_to_remove, axis=0)

### Standardizing the dataset

In [5]:
# Standardise each input column (axis=0) to zero mean and unit variance.
# The keyword arguments spell out sklearn's defaults for readability.
# NOTE(review): the scaling statistics are computed on the full balanced set,
# i.e. before the train/validation/test split performed later in this
# notebook, so held-out rows influence the scaling. Consider fitting the
# scaler on the training split only.
standardized_inputs = preprocessing.scale(
    balanced_inputs,
    axis=0,
    with_mean=True,
    with_std=True,
)

### Shuffling the dataset

In [6]:
# Shuffle rows so that the splits taken afterwards are not affected by the
# original row order of the file.
# A fixed seed with a local generator makes "Restart & Run All" produce the
# same permutation (and therefore the same splits and saved files) every
# time, without touching NumPy's global random state.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# permutation(n) returns the integers 0..n-1 in random order.
shuffled_indices = shuffle_rng.permutation(standardized_inputs.shape[0])

# Index inputs and targets with the same permutation to keep rows aligned.
shuffled_inputs = standardized_inputs[shuffled_indices]
shuffled_targets = balanced_targets[shuffled_indices]

### Splitting the dataset

In [8]:
num_samples = shuffled_inputs.shape[0]

# 80% / 10% split sizes are truncated to whole rows; the test split takes
# whatever is left so that the three sizes always add up to num_samples.
train_data_num_samples = int(0.8 * num_samples)
validation_data_num_samples = int(0.1 * num_samples)
test_data_num_samples = num_samples - train_data_num_samples - validation_data_num_samples

# Slice boundaries: [0, train_end) -> train, [train_end, validation_end) -> validation,
# [validation_end, end) -> test.
train_end = train_data_num_samples
validation_end = train_end + validation_data_num_samples

train_input_data = shuffled_inputs[:train_end]
validation_input_data = shuffled_inputs[train_end:validation_end]
test_input_data = shuffled_inputs[validation_end:]

train_target_data = shuffled_targets[:train_end]
validation_target_data = shuffled_targets[train_end:validation_end]
test_target_data = shuffled_targets[validation_end:]

# Balance check per split: prints the sum of the targets, the split size and
# their ratio. A ratio close to 0.5 means the split is roughly half 1-targets.
for split_targets, split_size in (
    (train_target_data, train_data_num_samples),
    (validation_target_data, validation_data_num_samples),
    (test_target_data, test_data_num_samples),
):
    split_positive_sum = np.sum(split_targets)
    print(split_positive_sum, split_size, split_positive_sum / split_size)

1789.0 3579 0.49986029617211514
227.0 447 0.5078299776286354
221.0 448 0.49330357142857145


### Saving the dataset in npz format

In [12]:
# Persist each split as its own npz archive so it can be loaded for the
# TensorFlow model; every archive stores its arrays under the keys
# 'inputs' and 'targets'.
for npz_name, split_inputs, split_targets in (
    ('Audiobooks_train_data_npz', train_input_data, train_target_data),
    ('Audiobooks_validation_data_npz', validation_input_data, validation_target_data),
    ('Audiobooks_test_data_npz', test_input_data, test_target_data),
):
    np.savez(npz_name, inputs=split_inputs, targets=split_targets)

# The model has been built in another file: AudiobooksBusinessCaseStudy_TF_model.ipynb