#### Import ML and Data Analysis Libraries

In [1]:
import numpy as np
from sklearn import preprocessing

#### Import the dataset

The dataset has no header row (no column titles), so we load it directly into an array with the `np.loadtxt` function.

In [2]:
# Read the header-less, comma-separated file straight into a NumPy array.
raw_data = np.loadtxt("Audiobooks_data.csv", delimiter=",")

# Features are every column except the first and the last one.
# NOTE(review): the first column is presumably a customer ID — confirm against the data source.
unscaled_inputs_all = raw_data[:, 1:-1]
unscaled_inputs_all

array([[1620.  , 1620.  ,   19.73, ..., 1603.8 ,    5.  ,   92.  ],
       [2160.  , 2160.  ,    5.33, ...,    0.  ,    0.  ,    0.  ],
       [2160.  , 2160.  ,    5.33, ...,    0.  ,    0.  ,  388.  ],
       ...,
       [2160.  , 2160.  ,    6.14, ...,    0.  ,    0.  ,    0.  ],
       [1620.  , 1620.  ,    5.33, ...,  615.6 ,    0.  ,   90.  ],
       [1674.  , 3348.  ,    5.33, ...,    0.  ,    0.  ,    0.  ]])

In [3]:
# The last column of the raw data holds the targets (one value per row).
targets_all = raw_data[..., -1]
np.shape(targets_all)

(14084,)

#### Shuffle the dataset once

We would like to shuffle the data before the balancing happens.

If the original dataset was ordered in a particular way, we want to remove that ordering so that the result has as little bias towards it as possible.

In [4]:
# Total number of samples: there is exactly one target per sample.
num_data = len(targets_all)
num_data

14084

In [5]:
# Seed NumPy's global random generator so that "Restart Kernel -> Run All"
# reproduces the same permutation. Every later np.random.* call in this
# notebook draws from the same seeded stream, so the balanced subset and the
# train/validation/test split become repeatable as well.
SEED = 42
np.random.seed(SEED)

# Start from the identity ordering 0..num_data-1 and shuffle it in place.
# The resulting index array is used to reorder inputs and targets identically.
shuffled_indices = np.arange(num_data)
np.random.shuffle(shuffled_indices)
print(shuffled_indices)

[   40  6977  8498 ...  5158  6568 13349]


In [6]:
# Reorder inputs and targets with the same permutation so that every row
# stays paired with its own target.
unscaled_inputs_all, targets_all = (
    unscaled_inputs_all[shuffled_indices],
    targets_all[shuffled_indices],
)

In [7]:
# Display the targets after the shuffle above.
targets_all

array([0., 0., 0., ..., 0., 1., 0.])

#### Balancing the dataset

Far more customers did not return to buy another audiobook than did, so the two classes are heavily imbalanced.

We have to balance the dataset so that the model is not biased towards one type of customer. To do this, we keep an equal number of samples from customers who returned and from customers who didn't.

In [8]:
# Balance the classes: keep every sample whose target is not 0 and at most as
# many target-0 samples as there are ones.
# Assumes the targets are 0/1, so their sum is the number of ones.
num_one_targets = int(np.sum(targets_all))

# Row positions of all target-0 samples, in their current order.
zero_target_indices = np.flatnonzero(targets_all == 0)

# The first `num_one_targets` zeros are kept; every zero after that is surplus.
# E.g. with num_one_targets == 100, the 101st zero onwards is removed.
# If there are no more zeros than ones, this is empty and nothing is removed.
indices_to_remove = zero_target_indices[num_one_targets:]

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

#### Standardize The Data

In [9]:
# Standardize the balanced feature matrix with scikit-learn's preprocessing.scale.
# NOTE(review): scaling is applied to all balanced samples here, before the
# train/validation/test split further down, so the scaling statistics also
# depend on validation and test rows — confirm this is acceptable.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors) # "preprocessing.scale" is a function that standardizes the given data
scaled_inputs # each element along the first axis is one row, i.e. the data of one customer

array([[ 1.20635076,  0.37673481, -0.38128345, ..., -0.02161991,
        -0.19531834, -0.6015741 ],
       [ 0.13031187, -0.23881977, -0.38128345, ..., -0.37483335,
        -0.19531834, -0.75347362],
       [-0.94572702, -0.85437435, -0.38128345, ..., -0.37483335,
        -0.19531834,  0.08197375],
       ...,
       [ 0.13031187, -0.23881977, -0.38128345, ..., -0.37483335,
        -0.19531834,  0.75467163],
       [ 0.13031187, -0.23881977, -0.3218385 , ..., -0.37483335,
        -0.19531834, -0.75347362],
       [-0.73051925, -0.73126343, -0.38128345, ..., -0.37483335,
        -0.19531834, -0.73177369]])

#### Shuffle the data

In [10]:
# Identity ordering 0..N-1, with one index per standardized sample.
shuffled_indices = np.arange(len(scaled_inputs))
shuffled_indices

array([   0,    1,    2, ..., 4471, 4472, 4473])

In [11]:
# Shuffle the index array in place (np.random.shuffle modifies its argument
# rather than returning a new array), then display the new ordering.
np.random.shuffle(shuffled_indices)
shuffled_indices

array([1161, 4205, 1993, ..., 3573, 3311, 2474])

In [12]:
# Reorder inputs and targets with the same index array so that every row keeps
# its own target.
shuffled_inputs = scaled_inputs[shuffled_indices] # indexing with an integer array selects the rows in that order
shuffled_targets = targets_equal_priors[shuffled_indices]

#### Manage training, validation, and test data

In [13]:
# 80-10-10 split of the shuffled samples into training, validation and test sets.
samples_count = shuffled_inputs.shape[0]

train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
# Whatever remains after the two rounded-down counts goes to the test set.
test_samples_count = samples_count - (train_samples_count + validation_samples_count)

# Consecutive, non-overlapping row ranges for the three subsets.
validation_end = train_samples_count + validation_samples_count
train_rows = slice(None, train_samples_count)
validation_rows = slice(train_samples_count, validation_end)
test_rows = slice(validation_end, None)

train_inputs, train_targets = shuffled_inputs[train_rows], shuffled_targets[train_rows]
validation_inputs, validation_targets = shuffled_inputs[validation_rows], shuffled_targets[validation_rows]
test_inputs, test_targets = shuffled_inputs[test_rows], shuffled_targets[test_rows]

In [14]:
# Sanity check: after balancing, the share of returned customers (target == 1)
# should be close to 0.5 in each subset.
split_summaries = [
    ("Ratio of returned customers to all, in the training data:", train_targets, train_samples_count),
    ("Ratio of returned customers to all, in the validation data:", validation_targets, validation_samples_count),
    ("Ratio of returned customers to all, in the test data:", test_targets, test_samples_count),
]
for label, split_targets, split_size in split_summaries:
    print(label, np.sum(split_targets) / split_size)

Ratio of returned customers to all, in the training data: 0.49594858899133837
Ratio of returned customers to all, in the validation data: 0.5078299776286354
Ratio of returned customers to all, in the test data: 0.5245535714285714


#### Save the datasets into .npz format

In [15]:
# Write each subset to its own archive with np.savez, storing the arrays under
# the keys "inputs" and "targets".
datasets_to_save = [
    ("audiobooks_data_train", train_inputs, train_targets),
    ("audiobooks_data_validation", validation_inputs, validation_targets),
    ("audiobooks_data_test", test_inputs, test_targets),
]
for file_name, inputs_array, targets_array in datasets_to_save:
    np.savez(file_name, inputs=inputs_array, targets=targets_array)