# Business case - Classify customers

## preprocessing

In [111]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing

In [112]:
# Read the whole CSV into a single 2-D numeric array.
row_csv_data = np.loadtxt(fname='Audiobooks-data.csv', delimiter=',')

# Column layout: column 0 is skipped (presumably a customer ID — confirm),
# the last column holds the targets, and everything in between is a feature.
unscaled_input_all, targets_all = row_csv_data[:, 1:-1], row_csv_data[:, -1]

## shuffle the data

In [113]:
# Quick look at the unscaled feature matrix before it is shuffled
# (NumPy abbreviates the repr of large arrays with "...").
unscaled_input_all

array([[1620.  , 1620.  ,   19.73, ..., 1603.8 ,    5.  ,   92.  ],
       [2160.  , 2160.  ,    5.33, ...,    0.  ,    0.  ,    0.  ],
       [2160.  , 2160.  ,    5.33, ...,    0.  ,    0.  ,  388.  ],
       ...,
       [2160.  , 2160.  ,    6.14, ...,    0.  ,    0.  ,    0.  ],
       [1620.  , 1620.  ,    5.33, ...,  615.6 ,    0.  ,   90.  ],
       [1674.  , 3348.  ,    5.33, ...,    0.  ,    0.  ,    0.  ]])

In [114]:
# Build one random permutation of the row indices and apply it to BOTH the
# inputs and the targets, so every row stays paired with its own label.
shuffled_indices = np.arange(unscaled_input_all.shape[0])
print(shuffled_indices)   # indices in their original order
np.random.shuffle(shuffled_indices)   # in-place shuffle
print(shuffled_indices)   # indices after shuffling

# Fix: the original indexed `uscaled_input_all` (missing "n"), a name that is
# never defined and therefore raised NameError on a fresh kernel.
shuffled_inputs_all = unscaled_input_all[shuffled_indices]
shuffled_targets_all = targets_all[shuffled_indices]

[    0     1     2 ... 14081 14082 14083]
[11463 12873  5661 ...  6034   201  2776]


In [115]:
# Confirm the rows now appear in shuffled order (repr is abbreviated by NumPy).
shuffled_inputs_all

array([[1.620e+03, 1.620e+03, 8.610e+00, ..., 8.910e+02, 1.000e+00,
        1.800e+01],
       [2.160e+03, 2.160e+03, 6.070e+00, ..., 9.288e+02, 0.000e+00,
        1.950e+02],
       [1.620e+03, 1.620e+03, 5.330e+00, ..., 4.536e+02, 0.000e+00,
        1.850e+02],
       ...,
       [1.620e+03, 1.620e+03, 1.013e+01, ..., 6.480e+01, 0.000e+00,
        2.000e+01],
       [2.160e+03, 2.160e+03, 8.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.160e+03, 2.160e+03, 5.330e+00, ..., 4.320e+01, 0.000e+00,
        0.000e+00]])

## balance dataset


In [116]:
# Balance the classes: keep as many zero-target rows as there are one-target
# rows and drop the surplus zeros. (If there are fewer zeros than ones, nothing
# is removed and the data stays unbalanced.)

# Number of positive samples. np.sum replaces the builtin sum(), which iterates
# the array element by element in Python; int() makes it usable as a slice bound.
num_one_tragets = int(np.sum(shuffled_targets_all))

# Positions of every zero-target row, in their (already shuffled) order.
zero_target_indices = np.flatnonzero(shuffled_targets_all == 0)
zero_targets_counter = zero_target_indices.size

# The first `num_one_tragets` zeros are kept; every later zero is surplus.
# This selects exactly the rows the original counting loop selected.
indices_to_remove = zero_target_indices[num_one_tragets:]

unscaled_inputs_equal_priors = np.delete(shuffled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(shuffled_targets_all, indices_to_remove, axis=0)
        



## standardize the inputs

In [117]:
# Standardize every feature column; the keyword arguments below spell out the
# defaults of preprocessing.scale (centre on the mean, divide by the std).
# NOTE(review): the statistics are computed on the full balanced set, before the
# train/validation/test split, so held-out rows influence the scaling — consider
# fitting the scaler on the training split only.
scaled_inputs = preprocessing.scale(
    unscaled_inputs_equal_priors,
    axis=0,
    with_mean=True,
    with_std=True,
)

## shuffle the data

In [118]:
# Balancing only dropped zeros that appeared late in the array, so the tail of
# the balanced data is dominated by ones. Reshuffle so that the contiguous
# splits taken afterwards each receive a mix of both classes.
shuffled_indices = np.arange(len(scaled_inputs))
np.random.shuffle(shuffled_indices)

# Apply the same permutation to inputs and targets to keep rows aligned.
shuffled_inputs, shuffled_targets = (
    scaled_inputs[shuffled_indices],
    targets_equal_priors[shuffled_indices],
)

## split the dataset into train, validation and test

In [119]:
# Partition sizes: 80% train, 10% validation, and whatever is left for test
# (so truncation by int() never loses a sample).
sample_count = len(targets_equal_priors)
train_samples_count = int(sample_count * 0.8)
validation_samples_count = int(sample_count * 0.1)
test_samples_count = sample_count - (train_samples_count + validation_samples_count)

In [124]:
# Cut the shuffled data into three contiguous partitions:
#   [0, train_samples_count)               -> train
#   [train_samples_count, validation_end)  -> validation
#   [validation_end, end)                  -> test
# Fix: the original slices used `validation_sample_count`, which is never
# defined (the computed size is `validation_samples_count`), so the cell raised
# NameError on a fresh kernel.
validation_end = train_samples_count + validation_samples_count

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

validation_inputs = shuffled_inputs[train_samples_count:validation_end]
validation_targets = shuffled_targets[train_samples_count:validation_end]

test_inputs = shuffled_inputs[validation_end:]
test_targets = shuffled_targets[validation_end:]

In [125]:
# Sanity check: for each split print the number of ones, the split size and the
# fraction of ones — each fraction should be close to 0.5 after balancing.
for split_targets, split_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    ones_in_split = np.sum(split_targets)
    print(ones_in_split, split_size, ones_in_split / split_size)

1800.0 3579 0.5029337803855826
213.0 447 0.47651006711409394
224.0 448 0.5


### Save the three datasets in *.npz

In [126]:
# Persist each split as its own .npz archive (np.savez appends the ".npz"
# extension) with two arrays stored under the keys "inputs" and "targets".
splits_to_save = (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
)
for npz_path, split_inputs, split_targets in splits_to_save:
    np.savez(npz_path, inputs=split_inputs, targets=split_targets)

## load the data

In [132]:
# Reload the three splits from disk, casting inputs to float and targets to int
# (integer class labels are what the sparse categorical loss expects).
#
# Fixes relative to the original cell:
#   * `np.float` / `np.int` were deprecated aliases that no longer exist in
#     current NumPy (AttributeError); explicit np.float64 / np.int64 are used.
#   * Each archive is opened in a `with` block so its file handle is closed;
#     `.astype` returns a fresh array, so the data outlives the archive.
with np.load('Audiobooks_data_train.npz') as npz:
    train_inputs = npz['inputs'].astype(np.float64)
    train_targets = npz['targets'].astype(np.int64)

with np.load('Audiobooks_data_validation.npz') as npz:
    validation_inputs = npz['inputs'].astype(np.float64)
    validation_targets = npz['targets'].astype(np.int64)

with np.load('Audiobooks_data_test.npz') as npz:
    test_inputs = npz['inputs'].astype(np.float64)
    test_targets = npz['targets'].astype(np.int64)

## model

In [140]:
# Network dimensions.
# NOTE: `input_size` is not referenced anywhere below; no input shape is passed
# to the layers, so the input width is taken from the data when fitting.
input_size = 10
output_size = 2
hidden_layer_size = 50

# Training hyper-parameters.
batch_size = 100
max_epochs = 100

# Two ReLU hidden layers of equal width followed by a softmax output layer
# with one unit per class.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(output_size, activation='softmax'))

# Sparse categorical cross-entropy works directly on integer class labels.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop training once the monitored quantity (Keras default) has failed to
# improve for 2 consecutive epochs, rather than always running `max_epochs`.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# Kept as the cell's last expression so the returned History object is shown.
model.fit(
    x=train_inputs,
    y=train_targets,
    batch_size=batch_size,
    epochs=max_epochs,
    callbacks=[early_stopping],
    validation_data=(validation_inputs, validation_targets),
    verbose=2,
)

Train on 3579 samples, validate on 447 samples
Epoch 1/100
3579/3579 - 0s - loss: 0.5808 - accuracy: 0.6588 - val_loss: 0.5041 - val_accuracy: 0.7696
Epoch 2/100
3579/3579 - 0s - loss: 0.4803 - accuracy: 0.7617 - val_loss: 0.4394 - val_accuracy: 0.7785
Epoch 3/100
3579/3579 - 0s - loss: 0.4370 - accuracy: 0.7804 - val_loss: 0.4092 - val_accuracy: 0.8076
Epoch 4/100
3579/3579 - 0s - loss: 0.4117 - accuracy: 0.7904 - val_loss: 0.3869 - val_accuracy: 0.8031
Epoch 5/100
3579/3579 - 0s - loss: 0.4008 - accuracy: 0.7955 - val_loss: 0.3690 - val_accuracy: 0.8277
Epoch 6/100
3579/3579 - 0s - loss: 0.3900 - accuracy: 0.8002 - val_loss: 0.3656 - val_accuracy: 0.8166
Epoch 7/100
3579/3579 - 0s - loss: 0.3830 - accuracy: 0.8011 - val_loss: 0.3607 - val_accuracy: 0.8233
Epoch 8/100
3579/3579 - 0s - loss: 0.3790 - accuracy: 0.8030 - val_loss: 0.3549 - val_accuracy: 0.8233
Epoch 9/100
3579/3579 - 0s - loss: 0.3781 - accuracy: 0.8013 - val_loss: 0.3446 - val_accuracy: 0.8300
Epoch 10/100
3579/3579 - 0

<tensorflow.python.keras.callbacks.History at 0x7f721863c7d0>

In [142]:
# Score the trained model once on the held-out test split; with the single
# 'accuracy' metric compiled above, evaluate() yields (loss, accuracy).
test_loss, test_accuracy = model.evaluate(x=test_inputs, y=test_targets)



In [143]:
# Report the test metrics; accuracy is a fraction, so scale it to a percentage.
test_accuracy_percent = test_accuracy * 100.
print('Test loss : {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy_percent))

Test loss : 0.40. Test accuracy: 76.79%
