In [1]:
import tensorflow as tf
import numpy as np

In [9]:
from sklearn import preprocessing

Loading the Data

In [8]:
# Read the comma-separated file into a single NumPy array.
# Later cells index the result as (rows, columns).
raw_data = np.loadtxt(
    'Audiobooks_data.csv',
    delimiter=',',
)

Shuffling the Raw Data

In [26]:
# Seed the global NumPy RNG before the first random call so that every
# np.random shuffle in a top-to-bottom run yields the same ordering, and
# therefore the same split and the same saved files.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Shuffle a vector of row indices (in place) instead of the rows themselves,
# then reorder the data in one indexing pass. Indexing with an index array
# returns a new array, so raw_data itself keeps its original row order.
raw_shuffled_indices = np.arange(raw_data.shape[0])
np.random.shuffle(raw_shuffled_indices)
raw_data_shuffled = raw_data[raw_shuffled_indices]

Extracting inputs and targets

In [27]:
# The last column holds the targets.
raw_shuffled_targets_all = raw_data_shuffled[:, -1]

# Every column between the first and the last is a model input.
# Column 0 is deliberately left out (presumably an ID -- confirm against the CSV).
raw_shuffled_inputs_all = raw_data_shuffled[:, 1:-1]

Standardising the inputs

In [31]:
# Standardise each input column independently (axis=0 is the library default,
# spelled out here for clarity).
# NOTE(review): the scaling statistics are computed over all rows, before the
# train/validation/test split further down -- confirm this is acceptable, as
# it lets held-out rows influence the scaling of the training data.
scaled_inputs_all = preprocessing.scale(raw_shuffled_inputs_all, axis=0)

Balancing the Dataset

In [38]:
# Balance the two classes by dropping surplus rows of whichever class is
# larger, so both classes end up with the same number of samples.
# Within the larger class the rows that appear first are the ones kept.
# Assumes targets are strictly 0/1, so their sum equals the number of ones.
num_of_ones = int(np.sum(raw_shuffled_targets_all))

# Row positions of each class, in their current order.
zero_positions = np.flatnonzero(raw_shuffled_targets_all == 0)
one_positions = np.flatnonzero(raw_shuffled_targets_all != 0)
zeroes_counter = int(zero_positions.size)

# Keep the first `samples_per_class` rows of each class; everything past that
# point is surplus. For the smaller class the slice below is simply empty.
samples_per_class = min(num_of_ones, zeroes_counter)
indices_to_remove = np.concatenate(
    (zero_positions[samples_per_class:], one_positions[samples_per_class:])
).tolist()

# np.delete returns new arrays; the scaled inputs and raw targets are untouched.
balanced_inputs = np.delete(scaled_inputs_all, indices_to_remove, axis=0)
balanced_targets = np.delete(raw_shuffled_targets_all, indices_to_remove, axis=0)

Shuffle after Balancing

In [39]:
# Balancing drops rows only from the tail of the data, which skews the class
# mix along the row order; shuffle again before slicing into splits.
balanced_shuffled_indices = np.arange(len(balanced_targets))
np.random.shuffle(balanced_shuffled_indices)

# Apply the same permutation to targets and inputs so the rows stay aligned.
balanced_shuffled_targets = balanced_targets[balanced_shuffled_indices]
balanced_shuffled_inputs = balanced_inputs[balanced_shuffled_indices, :]

Splitting into Train, Validation, Test

In [41]:
sample_count = len(balanced_shuffled_targets)

# 80% / 10% of the rows (truncated to whole samples) go to training and
# validation; the test set takes the remainder so that no row is lost.
train_sample_count = int(0.8 * sample_count)
validation_sample_count = int(0.1 * sample_count)
test_sample_count = sample_count - (train_sample_count + validation_sample_count)

# Cut points of the three consecutive slices.
train_end = train_sample_count
validation_end = train_end + validation_sample_count

train_inputs = balanced_shuffled_inputs[:train_end]
validation_inputs = balanced_shuffled_inputs[train_end:validation_end]
test_inputs = balanced_shuffled_inputs[validation_end:]

train_targets = balanced_shuffled_targets[:train_end]
validation_targets = balanced_shuffled_targets[train_end:validation_end]
test_targets = balanced_shuffled_targets[validation_end:]


# Balance check per split: sum of targets, split size, and their ratio
# (a ratio close to 0.5 means the split is balanced).
for split_targets, split_size in (
    (train_targets, train_sample_count),
    (validation_targets, validation_sample_count),
    (test_targets, test_sample_count),
):
    positives = np.sum(split_targets)
    print(positives, split_size, positives / split_size)

1793.0 3579 0.5009779267951942
215.0 447 0.4809843400447427
229.0 448 0.5111607142857143


Save as npz

In [42]:
# Write each split to its own archive, storing the two arrays under the
# keys 'inputs' and 'targets'.
for archive_name, split_inputs, split_targets in (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
):
    np.savez(archive_name, inputs=split_inputs, targets=split_targets)