In [22]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

import tensorflow as tf

In [23]:
data = pd.read_csv("Audiobooks_data.csv", header=None)
raw_data = data.values
raw_data

array([[8.7300e+02, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [6.1100e+02, 1.4040e+03, 2.8080e+03, ..., 0.0000e+00, 1.8200e+02,
        1.0000e+00],
       [7.0500e+02, 3.2400e+02, 3.2400e+02, ..., 1.0000e+00, 3.3400e+02,
        1.0000e+00],
       ...,
       [2.8671e+04, 1.0800e+03, 1.0800e+03, ..., 0.0000e+00, 2.9000e+01,
        0.0000e+00],
       [3.1134e+04, 2.1600e+03, 2.1600e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [3.2832e+04, 1.6200e+03, 1.6200e+03, ..., 0.0000e+00, 9.0000e+01,
        0.0000e+00]])

In [24]:
unscaled_inputs = raw_data[:, 1:-1]
unscaled_inputs

array([[2.160e+03, 2.160e+03, 1.013e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.404e+03, 2.808e+03, 6.660e+00, ..., 0.000e+00, 0.000e+00,
        1.820e+02],
       [3.240e+02, 3.240e+02, 1.013e+01, ..., 0.000e+00, 1.000e+00,
        3.340e+02],
       ...,
       [1.080e+03, 1.080e+03, 6.550e+00, ..., 0.000e+00, 0.000e+00,
        2.900e+01],
       [2.160e+03, 2.160e+03, 6.140e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.620e+03, 1.620e+03, 5.330e+00, ..., 0.000e+00, 0.000e+00,
        9.000e+01]])

In [25]:
targets = raw_data[:, -1]
targets

array([1., 1., 1., ..., 0., 0., 0.])

## Balance the dataset

In [26]:
sum_one = targets.sum()
sum_one = int(sum_one)
sum_one

2237

In [27]:
index_delete = []
count_zero = 0
for i in range(targets.shape[0]):
    if(targets[i] == 0):
        count_zero += 1
    if(count_zero > sum_one):
        index_delete.append(i)

inputs_balance = np.delete(unscaled_inputs, index_delete, axis=0)
targets_balance = np.delete(targets, index_delete, axis = 0)

In [28]:
targets_balance, inputs_balance

(array([1., 1., 1., ..., 0., 0., 0.]),
 array([[2.160e+03, 2.160e+03, 1.013e+01, ..., 0.000e+00, 0.000e+00,
         0.000e+00],
        [1.404e+03, 2.808e+03, 6.660e+00, ..., 0.000e+00, 0.000e+00,
         1.820e+02],
        [3.240e+02, 3.240e+02, 1.013e+01, ..., 0.000e+00, 1.000e+00,
         3.340e+02],
        ...,
        [2.160e+03, 2.160e+03, 1.013e+01, ..., 2.592e+02, 0.000e+00,
         1.400e+01],
        [2.160e+03, 2.160e+03, 8.300e+00, ..., 2.592e+02, 0.000e+00,
         9.300e+01],
        [2.160e+03, 2.160e+03, 8.000e+00, ..., 2.592e+02, 0.000e+00,
         2.400e+01]]))

## Standardize the inputs

In [29]:
standard_inputs = preprocessing.scale(inputs_balance)
standard_inputs

array([[ 1.18956512,  0.36398846,  0.67728889, ..., -0.8635056 ,
        -0.20536617, -0.77240946],
       [-0.33022754,  1.10843845, -0.08841391, ..., -0.8635056 ,
        -0.20536617,  1.16499791],
       [-2.50135991, -1.74528653,  0.67728889, ..., -0.8635056 ,
         2.23179102,  2.78305242],
       ...,
       [ 1.18956512,  0.36398846,  0.67728889, ..., -0.20129479,
        -0.20536617, -0.62337812],
       [ 1.18956512,  0.36398846,  0.27347444, ..., -0.20129479,
        -0.20536617,  0.21758442],
       [ 1.18956512,  0.36398846,  0.20727535, ..., -0.20129479,
        -0.20536617, -0.51692717]])

## Shuffle the data

In [36]:
shuffle_index = np.arange(targets_balance.shape[0])
np.random.shuffle(shuffle_index)
shuffle_index
shuffle_inputs = inputs_balance[shuffle_index]
shuffle_targets = targets_balance[shuffle_index]

### Split the dataset into train, validation, and test

In [41]:
sample_counts = shuffle_targets.shape[0]

train_sample_counts = int(0.8 * sample_counts)
validation_sample_counts = int(0.1 * sample_counts)
test_sample_counts = int(0.1 * sample_counts)
train_sample_counts, validation_sample_counts, test_sample_counts

(3579, 447, 447)

In [49]:
train_inputs = shuffle_inputs[:train_sample_counts, :]
train_targets  = shuffle_targets[:train_sample_counts]

validation_inputs = shuffle_inputs[train_sample_counts:train_sample_counts + validation_sample_counts, :]
validation_targets = shuffle_targets[train_sample_counts:train_sample_counts + validation_sample_counts]

test_inputs = shuffle_inputs[sample_counts - test_sample_counts:, :]
test_targets = shuffle_targets[sample_counts - test_sample_counts:]

print(np.sum(train_targets), train_sample_counts, np.sum(train_targets) / train_sample_counts)
print(np.sum(validation_targets), validation_sample_counts, np.sum(validation_targets) / validation_sample_counts)
print(np.sum(test_targets), test_sample_counts, np.sum(test_targets) / test_sample_counts)

1785.0 3579 0.49874266554903607
230.0 447 0.5145413870246085
222.0 447 0.4966442953020134


In [50]:
# Save the three datasets in *.npz.
# In the next lesson, you will see that it is extremely valuable to name them in such a coherent way!

np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)