In [1]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
import os

from tensorflow.keras import Sequential

from tensorflow.keras.layers import *

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


### Read the data

In [2]:
train_data_df = pd.read_csv("sales_data_training.csv")
test_data_df = pd.read_csv("sales_data_test.csv")

In [3]:
train_data_df.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,3.5,1,0,1,0,1,0,0,132717,59.99
1,4.5,0,0,0,0,1,1,0,83407,49.99
2,3.0,0,0,0,0,1,1,0,62423,49.99
3,4.5,1,0,0,0,0,0,1,69889,39.99
4,4.0,1,0,1,0,1,0,1,161382,59.99


In [4]:
test_data_df.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,3.5,1,1,1,0,1,0,1,247537,59.99
1,2.5,0,0,0,1,1,0,0,73960,59.99
2,3.5,0,0,0,0,1,1,0,82671,59.99
3,4.0,1,1,0,0,1,0,0,137456,39.99
4,2.0,1,0,1,0,1,0,0,89639,59.99


### Scaling the data

We want to scale all the numbers in each column to be between zero and one. If the numbers in one column are large, but the numbers in another column are small, the training process won't work very well. We can do this using the Min Max Scaler object 

In [5]:
Scaler = MinMaxScaler(feature_range = (0, 1))

In [6]:
scaled_training = Scaler.fit_transform(train_data_df)
scaled_test = Scaler.transform(test_data_df)

In [7]:
mul_factor = Scaler.scale_[8]
add_factor = Scaler.min_[8]

print("Note:\n")
print("Total_earnings values were scaled by first multiplying: {:.10f}".format(mul_factor))
print("After that adding: {:.6f}".format(add_factor))

Note:

Total_earnings values were scaled by first multiplying: 0.0000036968
After that adding: -0.115913


In [8]:
scaled_train_df = pd.DataFrame(scaled_training, columns = train_data_df.columns.values)
scaled_test_df = pd.DataFrame(scaled_test, columns = test_data_df.columns.values)

### Save the scaled training and scaled testing data in different csv

In [9]:
scaled_train_df.to_csv("scaled_training_data.csv")
scaled_test_df.to_csv("scaled_testing_data.csv")

In [10]:
scaled_train_data = pd.read_csv("scaled_training_data.csv").drop("Unnamed: 0", axis = 1)
scaled_test_data = pd.read_csv("scaled_testing_data.csv").drop("Unnamed: 0", axis = 1)

In [11]:
scaled_train_data.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,0.5,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.374714,1.0
1,0.833333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.192425,0.5
2,0.333333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.114852,0.5
3,0.833333,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.142452,0.0
4,0.666667,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.480682,1.0


In [12]:
scaled_test_data.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,0.5,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.799179,1.0
1,0.166667,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.157502,1.0
2,0.5,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.189704,1.0
3,0.666667,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.392233,0.0
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.215464,1.0


### Create a model

In [13]:
X_train = scaled_train_data.drop("total_earnings", axis = 1).values
Y_train = scaled_train_data[["total_earnings"]].values

In [14]:
model = tf.keras.Sequential()

In [15]:
model.add(Dense(50, input_dim = 9, activation = "relu", name = "layer_1"))
model.add(Dense(100, activation = "relu", name = "layer_2"))
model.add(Dense(200, activation = "relu", name = "layer_3"))
model.add(Dense(300, activation = "relu", name = "layer_4"))
model.add(Dense(200, activation = "relu", name = "layer_5"))
model.add(Dense(100, activation = "relu", name = "layer_6"))
model.add(Dense(50, activation = "relu", name = "layer_7"))
model.add(Dense(1, activation = "linear", name = "output_layer"))

model.compile(loss = "mean_squared_error", optimizer = "adam")


### Create a tensorboard logger

In [16]:
logger = keras.callbacks.TensorBoard(
    log_dir = "logs",
    write_graph = True,
    histogram_freq = 5
)

### Let's train the neural network

In [17]:
model.fit(X_train, 
          Y_train, 
          epochs = 100, 
          shuffle = True, 
          verbose = 2,
          callbacks = [logger]
         )

Epoch 1/100
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
32/32 - 0s - loss: 0.0166
Epoch 2/100
32/32 - 0s - loss: 0.0011
Epoch 3/100
32/32 - 0s - loss: 0.0012
Epoch 4/100
32/32 - 0s - loss: 4.5608e-04
Epoch 5/100
32/32 - 0s - loss: 2.0519e-04
Epoch 6/100
32/32 - 0s - loss: 1.4791e-04
Epoch 7/100
32/32 - 0s - loss: 1.8709e-04
Epoch 8/100
32/32 - 0s - loss: 1.0508e-04
Epoch 9/100
32/32 - 0s - loss: 8.0618e-05
Epoch 10/100
32/32 - 0s - loss: 1.1614e-04
Epoch 11/100
32/32 - 0s - loss: 6.4168e-05
Epoch 12/100
32/32 - 0s - loss: 5.4713e-05
Epoch 13/100
32/32 - 0s - loss: 5.8982e-05
Epoch 14/100
32/32 - 0s - loss: 4.1868e-05
Epoch 15/100
32/32 - 0s - loss: 1.3599e-04
Epoch 16/100
32/32 - 0s - loss: 2.7302e-04
Epoch 17/100
32/32 - 0s - loss: 9.4190e-05
Epoch 18/100
32/32 - 0s - loss: 9.9650e-05
Epoch 19/100
32/32 - 0s - loss: 8.3016e-05
Epoch 20/100
32/32 - 0s - loss: 4.4786e-05
Epoch 21/100
32/32 - 0s - loss: 5.7752e-05
Epoch 22/100
32/32 - 0s - loss: 9.7764e-05
Epo

<tensorflow.python.keras.callbacks.History at 0x15e21cbd0>

In [18]:
X_test = scaled_test_data.drop("total_earnings", axis = 1).values
Y_test = scaled_test_data[["total_earnings"]].values

In [19]:
test_error_rate = model.evaluate(X_test, Y_test, verbose = 0)

In [20]:
print("The mean-square-error for the test data is {} %".format(np.round(test_error_rate*100, 5)))

The mean-square-error for the test data is 0.00461 %


### Let's predict the data for newly proposed values

In [21]:
predict_X = pd.read_csv("proposed_new_product.csv").values

In [22]:
prediction = model.predict(predict_X)

In [23]:
prediction = prediction[0][0]

### We need to re-scale the prediction, since it will be in the similar ( 0, 1 ) range, for our scaled dataset

In [24]:
prediction = prediction + abs(add_factor)
prediction = prediction / mul_factor

In [25]:
print("Earning prediction for proposed product: ${}".format(prediction))

Earning prediction for proposed product: $265344.2622062564


_____