# Audiobooks Business Case

## 1. Preprocessing

* Preprocess the data
* Balance the dataset
* Create training, validation and test datasets
* Save the data in npz format

##### Imports

In [2]:
import numpy as np
from sklearn import preprocessing

##### Data

In [3]:
# Read the whole CSV into a 2-D float array
raw_data = np.loadtxt('data/Audiobooks_data.csv', delimiter=',')

# Inputs: every column except the first and the last
# (the first column is skipped -- presumably a customer ID, TODO confirm).
# Output: the last column, i.e. the target.
raw_inputs, raw_output = raw_data[:, 1:-1], raw_data[:, -1]

##### Balancing the dataset

* There are far more zeros than ones in the output column.
* The problem is that a model could reach a whopping ~84% accuracy just by always predicting 0, without learning anything useful.
* This would make accuracy a misleading metric, so the dataset needs to be balanced.

In [8]:
# Class counts in the raw target: (number of 0s, number of 1s)
tuple((raw_output == label).sum() for label in (0, 1))

(11847, 2237)

In [11]:
# Seed NumPy's global random state so the random undersampling below picks
# the same rows on every Restart & Run All (np.random.choice draws from it).
np.random.seed(42)

# Row positions of each class in the raw target
indices_zero = np.where(raw_output == 0)[0]
indices_one = np.where(raw_output == 1)[0]

# Undersample the zeros: draw, without replacement, exactly as many
# zero rows as there are one rows
chosen_indices_zero = np.random.choice(indices_zero, size=len(indices_one), replace=False)

# All one rows followed by the sampled zero rows.
# The result is still grouped by class at this point.
balanced_indices = np.concatenate((indices_one, chosen_indices_zero))

In [12]:
# Targets of the rows selected by balanced_indices
balanced_output = np.take(raw_output, balanced_indices)
balanced_output.shape

(4474,)

In [14]:
# Feature rows selected by balanced_indices (same order as balanced_output)
balanced_input = np.take(raw_inputs, balanced_indices, axis=0)
balanced_input.shape

(4474, 10)

In [13]:
# Class counts after balancing: (number of 0s, number of 1s)
tuple((balanced_output == label).sum() for label in (0, 1))

(2237, 2237)

##### Standardizing

In [15]:
# Standardize each feature column (axis=0): subtract its mean and divide by
# its standard deviation. The keyword values are scale()'s defaults, spelled out.
# NOTE(review): the statistics are computed on the full balanced set, i.e.
# before the train/validation/test split below -- confirm this is intended.
scaled_inputs = preprocessing.scale(
    balanced_input,
    axis=0,
    with_mean=True,
    with_std=True,
    copy=True,
)

##### Shuffling

In [16]:
# Random ordering of the row positions 0 .. n-1
shuffled_indices = np.random.permutation(scaled_inputs.shape[0])

# Apply the same ordering to features and targets so the rows stay paired
shuffled_inputs, shuffled_output = (
    scaled_inputs[shuffled_indices],
    balanced_output[shuffled_indices],
)


##### Splitting

In [18]:
# Total number of samples (rows) available for splitting
samples_count = len(shuffled_inputs)
samples_count

4474

In [20]:
# Training set size: 80% of all samples, truncated to an integer
train_count = int(samples_count * 0.8)
train_count

3579

In [23]:
# Validation set size: 10% of all samples, truncated to an integer
validation_count = int(samples_count * 0.1)
validation_count

447

In [25]:
# Test set size: whatever remains, i.e. all - (train + validation)
test_count = samples_count - (train_count + validation_count)
test_count

448

In [None]:
# Slice boundaries over the shuffled rows:
#   [0, train_end)               -> training
#   [train_end, validation_end)  -> validation
#   [validation_end, end)        -> test
train_end = train_count
validation_end = train_count + validation_count

# Train inputs and output
# (targets come from shuffled_output, not shuffled_inputs)
train_inputs = shuffled_inputs[:train_end]
train_output = shuffled_output[:train_end]

# Validation inputs and output
validation_inputs = shuffled_inputs[train_end:validation_end]
validation_output = shuffled_output[train_end:validation_end]

# Test inputs and output: everything after the validation slice
test_inputs = shuffled_inputs[validation_end:]
test_output = shuffled_output[validation_end:]

# Sanity checks: every split has one target per input row and the expected size
assert len(train_inputs) == len(train_output) == train_count
assert len(validation_inputs) == len(validation_output) == validation_count
assert len(test_inputs) == len(test_output) == test_count

# TODO (in progress): save the three splits in npz format