# Import libraries 

In [41]:
import numpy as np
from sklearn import preprocessing

# Load the raw data

In [42]:
# Opened in a plain-text editor (rather than Excel) the file shows its values separated by commas.
# np.loadtxt reads the file as text, so the comma has to be passed explicitly as the delimiter.
# NOTE(review): this is an absolute, machine-specific path - point it at your own copy of the CSV.
csv_path = 'F:\\schulich\\python\\Udemy Data course\\required data files\\Audiobooks_data.csv'
raw_csv_data = np.loadtxt(csv_path, delimiter=',')

# Preprocessing

## 1- Extract the raw input

In [43]:
# Column layout of the raw array. A NumPy array carries no column names (unlike a pandas
# DataFrame), so the columns are addressed purely by position:
#   - first column : an arbitrary customer ID that carries no useful information -> dropped
#   - last column  : the target
#   - the columns in between : the raw (unscaled) inputs
input_columns = slice(1, -1)  # everything except the first and the last column

targets_all = raw_csv_data[:, -1]
unscaled_inputs_all = raw_csv_data[:, input_columns]

targets_all

array([1., 1., 1., ..., 0., 0., 0.])

## 2- Balance the data set

In [44]:
# Balance the data set by undersampling the 0 class: every observation whose target is 1 is
# kept, together with only the first `number_of_targets_1` observations whose target is 0.
# All later 0-observations are removed.

# Number of observations whose target is 1 (meaning that the customer did convert).
# Counting the matches, rather than summing the column, does not rely on the labels being
# exactly 0/1; int() keeps the result a plain Python integer.
number_of_targets_1 = int(np.count_nonzero(targets_all == 1))

# Row positions of all target-0 observations, in their original order.
zero_target_indices = np.flatnonzero(targets_all == 0)

# Total number of target-0 observations in the full data set.
zero_targets_counter = int(zero_target_indices.size)

# Indices of the rows to drop: every 0-observation after the first `number_of_targets_1` ones.
# The slice is simply empty when there are not more 0s than 1s, in which case nothing is removed.
indices_to_remove = zero_target_indices[number_of_targets_1:].tolist()

# Remove the same rows from the inputs and from the targets so that every remaining input
# row stays paired with its own target.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

# unscaled_inputs_equal_priors / targets_equal_priors now hold the balanced data set

# now we have a balanced dataset including both balanced input data and target data

## 3- Standardize the input

In [56]:
# The balanced inputs are still on their raw scales; standardizing them generally helps training.
# This is the only place where sklearn is used: preprocessing.scale standardizes the inputs
# column by column (axis=0, which is also its default).
# At the end of the business case you can try running the algorithm WITHOUT this step -
# the result will be interesting.
# NOTE(review): the scaling statistics are computed over all balanced observations, i.e. before
# the train/validation/test split performed further down.
scaled_input = preprocessing.scale(unscaled_inputs_equal_priors, axis=0)
len(scaled_input)

4474

## 4- Shuffle the data including inputs and the targets


In [58]:
# The original data set may be ordered in some way (for example by date). Because the data will
# be fed in batches, it is shuffled so that no batch reflects such an ordering.
#
# Inputs and targets must be shuffled identically so that every target stays attached to its
# input. Instead of shuffling the two arrays themselves, the row indices 0..N-1 are shuffled
# once and both arrays are then reordered with that same index permutation.
# Note: shuffling could equally be done at the very start of preprocessing, or right after balancing.
observation_count = scaled_input.shape[0]
shuffled_indices = np.arange(observation_count)
np.random.shuffle(shuffled_indices)  # shuffles in place; no seed is set in this section

shuffled_inputs = np.take(scaled_input, shuffled_indices, axis=0)
shuffled_targets = np.take(targets_equal_priors, shuffled_indices, axis=0)

shuffled_indices

array([ 830, 3337, 1518, ..., 1811, 2758, 2373])

## 5- Split data into train, validation, and test

In [59]:
# Number of observations that remain after balancing.
sample_count = shuffled_inputs.shape[0]

# Split proportions: 80% train, 10% validation, and the remainder (about 10%) test.
train_percent = 0.8
validation_percent = 0.1

# int() truncates, so the sizes are whole numbers; the test set receives whatever is left over,
# which guarantees that the three sizes add up to sample_count.
train_sample_count = int(train_percent * sample_count)
validation_sample_count = int(validation_percent * sample_count)
test_sample_count = sample_count - (train_sample_count + validation_sample_count)

# Consecutive, non-overlapping row ranges of the shuffled data.
validation_end = train_sample_count + validation_sample_count
train_rows = slice(None, train_sample_count)
validation_rows = slice(train_sample_count, validation_end)
test_rows = slice(validation_end, None)

# Apply the same row ranges to inputs and targets.
train_inputs, train_targets = shuffled_inputs[train_rows], shuffled_targets[train_rows]
validation_inputs, validation_targets = shuffled_inputs[validation_rows], shuffled_targets[validation_rows]
test_inputs, test_targets = shuffled_inputs[test_rows], shuffled_targets[test_rows]

sample_count


4474

In [62]:
# The full data set was balanced 50-50 (targets 0 and 1), but train, validation and test were
# cut from a shuffled copy, so each subset should be checked for balance as well.
# The shuffle is random, so these numbers change every time the notebook is rerun.
# Normally you preprocess ONCE and do not rerun this code afterwards; rerunning the whole
# sheet overwrites the saved .npz files with the newly preprocessed data.

# For train, validation and test (in that order) print:
# the number of targets equal to 1, the subset size, and the proportion of 1s.
for split_targets, split_size in (
    (train_targets, train_sample_count),
    (validation_targets, validation_sample_count),
    (test_targets, test_sample_count),
):
    ones_in_split = np.sum(split_targets)
    print(ones_in_split, split_size, ones_in_split / split_size)

1773.0 3579 0.4953897736797988
228.0 447 0.5100671140939598
236.0 448 0.5267857142857143


## 6- Save the three datasets in *.npz
The .npz format is compatible with the ML library (TensorFlow) that we are going to use.

In [64]:
# Write each subset to its own *.npz archive, storing the arrays under the keys
# 'inputs' and 'targets'.
# The file names are deliberately semantic and consistent - the next lesson shows how
# valuable that coherent naming is.
# The names are relative, so the files are written to the current working directory
# (on the original author's machine: C:\Users\sasan\Udemy data science bootcamp 2020).
for npz_name, split_inputs, split_targets in (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
):
    np.savez(npz_name, inputs=split_inputs, targets=split_targets)