# Business case: Outlining the solution

### 1. Preprocess the data
##### 1.1 Balance the dataset
##### 1.2 Divide the dataset into training, validation, and test sets
##### 1.3 Save the data in a tensor friendly format

### 2. Create the machine learning algorithm

## Import the relevant libraries

In [1]:
import numpy as np
from sklearn import preprocessing

## Load the data

In [2]:
# Read the whole comma-separated file into a single 2-D numeric array.
csv_path = r'C:\Users\HP\Downloads\Audiobooks_data.csv'
raw_csv_data = np.loadtxt(csv_path, delimiter=',')

# The last column is the target; the inputs are every column between the
# first and the last (column 0 is skipped — presumably an ID, TODO confirm).
targets_all = raw_csv_data[:, -1]
unscaled_inputs_all = raw_csv_data[:, 1:-1]

## Balance the dataset

In [3]:
# Undersample the zero class: keep every row with a non-zero target and only
# the first `num_one_targets` rows with a zero target, in their original order.
num_one_targets = int(np.sum(targets_all))  # sum of the targets, i.e. the number of 1s for 0/1 targets

zero_positions = np.flatnonzero(targets_all == 0)  # ascending row indices of the zero targets
zero_targets_counter = int(zero_positions.size)  # total number of zero targets

# Every zero target beyond the first `num_one_targets` is surplus and is dropped.
# max(..., 0) keeps "drop all zeros" semantics if the sum were ever non-positive.
indices_to_remove = zero_positions[max(num_one_targets, 0):].tolist()

# Remove the surplus rows from both the inputs and the targets so they stay aligned.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

## Standardizing the inputs

In [4]:
# Standardize the balanced inputs with sklearn's preprocessing.scale
# (with its defaults: each column is centered and scaled to unit variance).
# NOTE(review): the scaling statistics are computed over the whole balanced
# dataset, before the train/validation/test split further down, so held-out
# rows influence the scaling — confirm this is acceptable.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

## Shuffle the data

In [5]:
np.random.seed(1)  # fixed seed so the same permutation is produced on every run

# Build the row indices 0..n-1 and shuffle them in place.
row_count = scaled_inputs.shape[0]
shuffled_indices = np.arange(row_count)
np.random.shuffle(shuffled_indices)

# Reorder inputs and targets with the same permutation so rows stay paired.
shuffled_targets = targets_equal_priors[shuffled_indices]
shuffled_inputs = scaled_inputs[shuffled_indices]

## Split the data into training, validation, and test sets

In [6]:
samples_count = shuffled_inputs.shape[0]  # total number of samples

# Split sizes (row counts): 80% train, 10% validation, and the remainder as test
# so that rounding never loses a sample.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Row positions at which the shuffled arrays are cut.
train_end = train_samples_count
validation_end = train_end + validation_samples_count

# Cut inputs and targets at the same two positions: [0, train_end),
# [train_end, validation_end) and [validation_end, end).
train_inputs, validation_inputs, test_inputs = np.split(shuffled_inputs, [train_end, validation_end])
train_targets, validation_targets, test_targets = np.split(shuffled_targets, [train_end, validation_end])

## Checking how balanced our dataset is 

In [7]:
# For the train, validation and test splits (in that order) print:
# the sum of the targets, the split size, and their ratio.
for split_targets, split_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    target_sum = np.sum(split_targets)
    print(target_sum, split_size, target_sum / split_size)

1797.0 3579 0.5020955574182733
220.0 447 0.49217002237136465
220.0 448 0.49107142857142855


## Save the datasets

In [8]:
# Write each split to its own archive with np.savez, storing the arrays under
# the keys 'inputs' and 'targets'. The names are relative, so the files land
# in the current working directory.
for archive_name, split_inputs, split_targets in (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
):
    np.savez(archive_name, inputs=split_inputs, targets=split_targets)

In [9]:
import os

# Show the current working directory.
current_dir = os.getcwd()
print(current_dir)

C:\Users\HP
