In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
import matplotlib.pyplot as plt

## Load Data

In [2]:
npz = np.load('Cars_train.npz')
train_inputs, train_targets = npz['inputs'], npz['targets']

npz = np.load('Cars_test.npz')
test_inputs, test_targets = npz['inputs'], npz['targets']


In [3]:
print(train_inputs.shape)
print(train_targets.shape)
print(test_inputs.shape)
print(test_targets.shape)

(3443, 19)
(3443,)
(383, 19)
(383,)


## Model

Outline, loss function, optimizer, early stopping and training

In [4]:
input_size = 19
hidden_layer_size = 50
output_size = 1

model = tf.keras.Sequential([tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
                             tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
                             tf.keras.layers.Dense(output_size)])

custom_optimiser = tf.keras.optimizers.Adam(learning_rate = 0.001)
model.compile(optimizer=custom_optimiser,loss='mean_squared_error',metrics=['mse'])

BATCH_SIZE = 50
max_epochs = 200
early_stopping = tf.keras.callbacks.EarlyStopping(patience=1)

model.fit(train_inputs,train_targets,batch_size=BATCH_SIZE,epochs=max_epochs,verbose=2,
          callbacks=[early_stopping],
          validation_split=0.1)


Train on 3098 samples, validate on 345 samples
Epoch 1/200
3098/3098 - 2s - loss: 41.5726 - mse: 41.5726 - val_loss: 6.0638 - val_mse: 6.0638
Epoch 2/200
3098/3098 - 0s - loss: 2.9932 - mse: 2.9932 - val_loss: 1.6890 - val_mse: 1.6890
Epoch 3/200
3098/3098 - 0s - loss: 1.2462 - mse: 1.2462 - val_loss: 1.0192 - val_mse: 1.0192
Epoch 4/200
3098/3098 - 0s - loss: 0.7864 - mse: 0.7864 - val_loss: 0.6763 - val_mse: 0.6763
Epoch 5/200
3098/3098 - 0s - loss: 0.5550 - mse: 0.5550 - val_loss: 0.5155 - val_mse: 0.5155
Epoch 6/200
3098/3098 - 0s - loss: 0.4275 - mse: 0.4275 - val_loss: 0.4164 - val_mse: 0.4164
Epoch 7/200
3098/3098 - 0s - loss: 0.3466 - mse: 0.3466 - val_loss: 0.3568 - val_mse: 0.3568
Epoch 8/200
3098/3098 - 0s - loss: 0.2904 - mse: 0.2904 - val_loss: 0.3069 - val_mse: 0.3069
Epoch 9/200
3098/3098 - 0s - loss: 0.2480 - mse: 0.2480 - val_loss: 0.2683 - val_mse: 0.2683
Epoch 10/200
3098/3098 - 0s - loss: 0.2150 - mse: 0.2150 - val_loss: 0.2411 - val_mse: 0.2411
Epoch 11/200
3098/30

<tensorflow.python.keras.callbacks.History at 0x7f85637c7e10>

In [5]:
pred = model.predict(test_inputs)

In [6]:
# price was transformed by converting to log (np.log)
# To convert back, use np.exp

In [7]:
prediction = np.exp(pred)

In [8]:
prediction

array([[ 11275.044  ],
       [  2624.11   ],
       [  2754.401  ],
       [ 12694.1    ],
       [  7090.2783 ],
       [  5682.7705 ],
       [  6510.9053 ],
       [ 11095.33   ],
       [  3424.2263 ],
       [ 14262.795  ],
       [  7618.1562 ],
       [  3784.7976 ],
       [ 42776.652  ],
       [ 13717.402  ],
       [ 11680.822  ],
       [ 23417.777  ],
       [  8180.1226 ],
       [  8016.78   ],
       [ 15516.898  ],
       [ 21838.74   ],
       [  4701.9624 ],
       [ 28454.01   ],
       [ 11248.999  ],
       [ 10384.897  ],
       [ 26386.775  ],
       [ 18785.55   ],
       [  8983.713  ],
       [ 20186.307  ],
       [  4253.646  ],
       [  6215.137  ],
       [ 13060.084  ],
       [  3704.7144 ],
       [ 12791.61   ],
       [ 37389.145  ],
       [ 49645.832  ],
       [ 18493.707  ],
       [ 12600.227  ],
       [  7905.2974 ],
       [ 20863.729  ],
       [  6298.4585 ],
       [ 28087.293  ],
       [  3184.7092 ],
       [ 53514.434  ],
       [ 20

In [9]:
np.reshape(prediction,prediction.shape[0])

Series(np.reshape(prediction,prediction.shape[0]))

0      11275.043945
1       2624.110107
2       2754.400879
3      12694.099609
4       7090.278320
           ...     
378     9009.554688
379    10121.314453
380     3403.091797
381     7275.454590
382     6131.801270
Length: 383, dtype: float32

In [10]:
summary = DataFrame()
summary['Prediction'] = Series(np.reshape(prediction,prediction.shape[0]))
summary['Actual'] = np.exp(test_targets)
summary['Residual'] = summary['Actual']-summary['Prediction']
summary['% Difference'] = np.abs((summary['Residual']/summary['Actual'])*100)

In [11]:
pd.set_option('display.max_row',None)
summary.sort_values('% Difference',ascending=False,inplace=True)
summary.round(2)

Unnamed: 0,Prediction,Actual,Residual,% Difference
127,9590.230469,2850.0,-6740.23,236.5
202,38909.679688,12000.0,-26909.68,224.25
78,10900.929688,3999.0,-6901.93,172.59
51,54933.800781,21300.0,-33633.8,157.91
306,4506.709961,1850.0,-2656.71,143.61
93,79372.710938,33950.0,-45422.71,133.79
194,8288.639648,3650.0,-4638.64,127.09
205,24635.509766,11200.0,-13435.51,119.96
278,31670.470703,14500.0,-17170.47,118.42
120,46302.378906,24900.0,-21402.38,85.95


In [12]:
summary.describe()

Unnamed: 0,Prediction,Actual,Residual,% Difference
count,383.0,383.0,383.0,383.0
mean,19640.480469,18684.436762,-956.047376,22.537261
std,20367.751953,18009.43108,8371.410163,27.886165
min,786.659241,1389.0,-71711.109375,0.038841
25%,7885.187012,7750.0,-2560.75,6.62649
50%,12812.697266,12400.0,-202.922607,14.5468
75%,24589.337891,23600.0,1394.514526,28.555978
max,169711.109375,125000.0,24987.929687,236.499246
