In [1]:
import numpy as np 
import pandas as pd 
import scipy 
import pickle

In [2]:
# Load the audiobooks dataset once; `df` and `raw_csv_data` are two names for
# the same DataFrame object (no copy is made).
raw_csv_data = pd.read_csv("Audiobooks_data.csv")
df = raw_csv_data

In [3]:
# Peek at the first three rows to check the column layout.
df.head(n=3)

Unnamed: 0,ID,Book Length (mins)_overall,Book Length(mins)_avg,Price_Overall,Price_avg,Review,Review 10/10,Minutes Listened,Completion,Support Requests,Last visited minus Purchase date,Targets
0,873,2160.0,2160,10.13,10.13,0,8.91,0.0,0.0,0,0,1
1,611,1404.0,2808,6.66,13.33,1,6.5,0.0,0.0,0,182,1
2,705,324.0,324,10.13,10.13,1,9.0,0.0,0.0,1,334,1


In [4]:
from sklearn.preprocessing import StandardScaler 

In [5]:
# Model inputs: every column except the first one (ID in the preview above)
# and the last one (Targets).
unscaled_inputs_all = raw_csv_data.iloc[:, 1:-1]

In [6]:
# Display the selected input columns to verify the slice taken above.
unscaled_inputs_all

Unnamed: 0,Book Length (mins)_overall,Book Length(mins)_avg,Price_Overall,Price_avg,Review,Review 10/10,Minutes Listened,Completion,Support Requests,Last visited minus Purchase date
0,2160.0,2160,10.13,10.13,0,8.91,0.00,0.0,0,0
1,1404.0,2808,6.66,13.33,1,6.50,0.00,0.0,0,182
2,324.0,324,10.13,10.13,1,9.00,0.00,0.0,1,334
3,1620.0,1620,15.31,15.31,0,9.00,0.00,0.0,0,183
4,432.0,1296,7.11,21.33,1,9.00,0.00,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...
14079,2160.0,2160,7.99,7.99,0,8.91,0.00,0.0,0,54
14080,1620.0,1620,5.33,5.33,1,9.00,0.61,0.0,0,4
14081,1080.0,1080,6.55,6.55,1,6.00,0.29,0.0,0,29
14082,2160.0,2160,6.14,6.14,0,8.91,0.00,0.0,0,0


In [7]:
# Labels: the last column of the raw frame (named "Targets" in the preview).
targets_all = raw_csv_data.iloc[:, -1]

In [8]:
# Display the target column to confirm it was picked up as expected.
targets_all

0        1
1        1
2        1
3        1
4        1
        ..
14079    0
14080    0
14081    0
14082    0
14083    0
Name: Targets, Length: 14084, dtype: int64

### Balancing the dataset

In [9]:
# Number of rows labelled 1; this is the per-class size used when balancing.
num_one_targets = int(targets_all.sum())

In [10]:
# Counter for rows whose target is 0, used when balancing the classes.
zero_targets_counter = 0

In [11]:
# Balance the two classes: keep every row whose target is 1 and only the first
# `num_one_targets` rows (in file order) whose target is 0; any further 0 rows
# are dropped.
#
# Everything is converted to plain NumPy arrays first, so that
#   * row selection is positional (it does not rely on the Series having a
#     default 0..n-1 index), and
#   * the results are always ndarrays, independent of how np.delete treats
#     pandas containers in the installed NumPy/pandas versions.
targets_all_array = targets_all.to_numpy()
unscaled_inputs_all_array = unscaled_inputs_all.to_numpy()

# Positions of all 0-labelled rows, in ascending order.
zero_positions = np.flatnonzero(targets_all_array == 0)

# Total number of 0-labelled rows. It is recomputed here instead of being
# accumulated onto a value left over from another cell, so re-running this cell
# always yields the same balanced dataset.
zero_targets_counter = int(zero_positions.size)

# Surplus 0 rows: everything after the first `num_one_targets` zeros.
indices_to_remove = zero_positions[num_one_targets:].tolist()

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all_array, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all_array, indices_to_remove, axis=0)

In [12]:
# Standardise the balanced inputs: first learn the per-column statistics, then
# apply them to the same data.
# NOTE(review): the scaler is fitted on all balanced rows, i.e. before the
# train/validation/test split performed later in this notebook - confirm that
# this is intended.
scaler_deep_learning = StandardScaler().fit(unscaled_inputs_equal_priors)
scaled_inputs = scaler_deep_learning.transform(unscaled_inputs_equal_priors)

### Shuffling the dataset

In [13]:
# Seed for the shuffle below. It is fixed so that re-running the notebook
# reproduces the same row order, and therefore the same train/validation/test
# split and exported CSV files.
SHUFFLE_SEED = 42

# One random permutation of the row positions, applied to inputs and targets
# alike so every input row stays aligned with its target.
shuffled_indices = np.random.default_rng(SHUFFLE_SEED).permutation(scaled_inputs.shape[0])

# np.asarray guarantees positional (not label-based) indexing even if either
# object happens to be a pandas container rather than a NumPy array.
shuffled_inputs = np.asarray(scaled_inputs)[shuffled_indices]
shuffled_targets = np.asarray(targets_equal_priors)[shuffled_indices]

### Splitting the dataset

In [14]:
# 80 / 10 / 10 split of the shuffled data; the test set receives whatever is
# left after the (truncated) train and validation counts.
samples_count = shuffled_inputs.shape[0]
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Row positions at which the train and validation slices end.
train_end = train_samples_count
validation_end = train_samples_count + validation_samples_count

train_inputs, train_targets = (
    shuffled_inputs[:train_end],
    shuffled_targets[:train_end],
)
validation_inputs, validation_targets = (
    shuffled_inputs[train_end:validation_end],
    shuffled_targets[train_end:validation_end],
)
test_inputs, test_targets = (
    shuffled_inputs[validation_end:],
    shuffled_targets[validation_end:],
)

In [15]:
# For each split print: number of 1-targets, split size, share of 1-targets.
for split_targets, split_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    print(np.sum(split_targets), split_size, np.sum(split_targets) / split_size)

1811 3579 0.50600726459905
217 447 0.4854586129753915
209 448 0.46651785714285715


In [16]:

# Wrap every split in a DataFrame (the original names are rebound, as before).
train_inputs, train_targets = pd.DataFrame(train_inputs), pd.DataFrame(train_targets)
validation_inputs, validation_targets = pd.DataFrame(validation_inputs), pd.DataFrame(validation_targets)
test_inputs, test_targets = pd.DataFrame(test_inputs), pd.DataFrame(test_targets)

# Output file -> frame, listed in the order the files are written.
csv_outputs = {
    'Audiobooks_data_train_inputs.csv': train_inputs,
    'Audiobooks_data_train_targets.csv': train_targets,
    'Audiobooks_data_validation_inputs.csv': validation_inputs,
    'Audiobooks_data_validation_targets.csv': validation_targets,
    'Audiobooks_data_test_inputs.csv': test_inputs,
    'Audiobooks_data_test_targets.csv': test_targets,
}

# Write each split without the DataFrame index column.
for csv_path, split_frame in csv_outputs.items():
    split_frame.to_csv(csv_path, index=False)


### Saving the scaler

In [17]:
# Persist the fitted scaler so the same standardisation can be re-applied
# later. The `with` block closes the file handle as soon as the dump finishes
# (also if pickling raises), instead of leaving it open until garbage
# collection.
with open('scaler_deep_learning.pickle', 'wb') as scaler_file:
    pickle.dump(scaler_deep_learning, scaler_file)