# Build your own neural network

Now it is all up to you to build your own neural network from scratch and apply a hyperparameter search.

## The data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,doors,bodystyle,drive,enginelocation,wheelbase,...,enginesize,fuelsystem,bore,stroke,compression,hp,rpm,citympg,highwaympg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [2]:
df = df.fillna(0)

y = df[['price']]
X = df.drop(['price'],axis=1)

In [3]:
for column in X.columns:
    if X[column].dtype == np.object:
        print('Converting ', column)
        X = pd.concat([X,pd.get_dummies(X[column], prefix=column, drop_first=True)],axis=1).drop([column],axis=1)

Converting  make
Converting  fuel-type
Converting  aspiration
Converting  doors
Converting  bodystyle
Converting  drive
Converting  enginelocation
Converting  enginetype
Converting  numcyl
Converting  fuelsystem


## Create the model

Creating train and test set and standardising:

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(X)
y = StandardScaler().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

### Baselines

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train,y_train)
prediction = lr.predict(X_test)

print('RMSE:', np.sqrt(mse(y_test,prediction)))

rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train,y_train)
prediction = rf.predict(X_test)

print('RMSE:', np.sqrt(mse(y_test,prediction)))

RMSE: 0.6375896829908441
RMSE: 0.2838598182648678


  # This is added back by InteractiveShellApp.init_path()


### Create your neural network

Incorporate the following:
- 2 different kernels in hidden layer.
- 2 different output kernels.
- Different sizes for the hidden layers.
- Different number of hidden layers.
- Use 10 epochs.

Here, create your model compatible with Keras:

In [6]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import Adam

def nn_model(no_neurons,no_layers,kernel,output_kernel):
    model = Sequential()
    model.add(Dense(no_neurons,input_dim=input_dim))
    model.add(Activation(kernel))

    # Extra hidden layers
    for _ in range(0,no_layers):
        model.add(Dense(no_neurons))
        model.add(Activation(kernel))

    # Output
    model.add(Dense(output_dim))
    model.add(Activation(output_kernel))
    model.compile(optimizer=Adam(),loss='mean_squared_error',metrics=['mean_squared_error'])
        
    return model

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Here, run your grid search (10 epochs):

In [7]:
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV

input_dim = X.shape[1]
output_dim = 1

parameters = {'no_neurons':[50,100],'kernel':['relu','linear'],'output_kernel':['linear','sigmoid'],'no_layers':[1,2],'verbose':[0]} 

grid_search = GridSearchCV(KerasRegressor(nn_model), parameters, cv=5,scoring='neg_mean_squared_error')
grid_search.fit(X, y,epochs=10)

means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']

print('Mean RMSE (+/- standard deviation), for parameters')
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/- %0.03f) for %r"
          # The MSE is return as a negative, so we multiple it with -1 before squaring it
          % (np.sqrt(-1*mean), np.sqrt(std), params))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Mean RMSE (+/- standard deviation), for parameters
0.859 (+/- 0.700) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 50, 'output_kernel': 'linear', 'verbose': 0}
0.884 (+/- 0.666) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 50, 'output_kernel': 'sigmoid', 'verbose': 0}
0.803 (+/- 0.647) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 100, 'output_kernel': 'linear', 'verbose': 0}
0.840 (+/- 0.617) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 100, 'output_kernel': 'sigmoid', 'verbose': 0}
0.770 (+/- 0.603) for {'kernel': 'relu', 'no_layers': 2, 'no_neurons': 50, 'output_kernel': 'linear', 'verbose': 0}
0.887 (+/- 0.705) for {'kernel': 'relu', 'no_layers': 2, 'no_neurons': 50, 'output_kernel': 'sigmoid', 'verbose': 0}
0.799 (+/- 0.682) for {'kernel': 'relu', 'no_layers': 2, 'no_neurons': 100, 'output_kernel': 'linear', 'verbose': 0}
0.827 (+/- 

It seems that there is quite some difference between using the sigmiod and linear output kernel. The former gives a lower RMSE. The better results are obtained using ReLU as the activation function in the hidden layers.

Smaller hidden layers:

In [9]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

input_dim = X.shape[1]
output_dim = 1

parameters = {'no_neurons':[10,20],'kernel':['relu','linear'],'output_kernel':['linear','sigmoid'],'no_layers':[0,1],'verbose':[0]} 

grid_search = GridSearchCV(KerasClassifier(nn_model), parameters, cv=5,scoring='neg_mean_squared_error')
grid_search.fit(X, y,epochs=10)

means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']

print('Mean RMSE (+/- standard deviation), for parameters')
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/- %0.03f) for %r"
          # The MSE is return as a negative, so we multiple it with -1 before squaring it
          % (np.sqrt(-1*mean), np.sqrt(std), params))

Mean RMSE (+/- standard deviation), for parameters
1.694 (+/- 1.085) for {'kernel': 'relu', 'no_layers': 0, 'no_neurons': 10, 'output_kernel': 'linear', 'verbose': 0}
1.605 (+/- 0.993) for {'kernel': 'relu', 'no_layers': 0, 'no_neurons': 10, 'output_kernel': 'sigmoid', 'verbose': 0}
1.657 (+/- 0.974) for {'kernel': 'relu', 'no_layers': 0, 'no_neurons': 20, 'output_kernel': 'linear', 'verbose': 0}
1.511 (+/- 1.084) for {'kernel': 'relu', 'no_layers': 0, 'no_neurons': 20, 'output_kernel': 'sigmoid', 'verbose': 0}
1.590 (+/- 1.064) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 10, 'output_kernel': 'linear', 'verbose': 0}
1.472 (+/- 1.117) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 10, 'output_kernel': 'sigmoid', 'verbose': 0}
1.400 (+/- 1.046) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 20, 'output_kernel': 'linear', 'verbose': 0}
1.405 (+/- 1.050) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 20, 'output_kernel': 'sigmoid', 'verbose': 0}
1.674 (+/- 1.024)

We clearly have a higher error rate, meaning that a bigger network pays off in this case.

Let's try more epochs for the smaller hidden layers:

In [8]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

input_dim = X.shape[1]
output_dim = 1

parameters = {'no_neurons':[10,20],'kernel':['relu','linear'],'output_kernel':['linear','sigmoid'],'no_layers':[0,1],'verbose':[0]} 

grid_search = GridSearchCV(KerasClassifier(nn_model), parameters, cv=5,scoring='neg_mean_squared_error')
grid_search.fit(X, y,epochs=100)

means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']

print('Mean RMSE (+/- standard deviation), for parameters')
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/- %0.03f) for %r"
          # The MSE is return as a negative, so we multiple it with -1 before squaring it
          % (np.sqrt(-1*mean), np.sqrt(std), params))

Mean RMSE (+/- standard deviation), for parameters
1.492 (+/- 1.105) for {'kernel': 'relu', 'no_layers': 0, 'no_neurons': 10, 'output_kernel': 'linear', 'verbose': 0}
1.493 (+/- 1.057) for {'kernel': 'relu', 'no_layers': 0, 'no_neurons': 10, 'output_kernel': 'sigmoid', 'verbose': 0}
1.485 (+/- 1.073) for {'kernel': 'relu', 'no_layers': 0, 'no_neurons': 20, 'output_kernel': 'linear', 'verbose': 0}
1.460 (+/- 1.061) for {'kernel': 'relu', 'no_layers': 0, 'no_neurons': 20, 'output_kernel': 'sigmoid', 'verbose': 0}
1.408 (+/- 1.080) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 10, 'output_kernel': 'linear', 'verbose': 0}
1.394 (+/- 1.046) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 10, 'output_kernel': 'sigmoid', 'verbose': 0}
1.394 (+/- 1.046) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 20, 'output_kernel': 'linear', 'verbose': 0}
1.394 (+/- 1.046) for {'kernel': 'relu', 'no_layers': 1, 'no_neurons': 20, 'output_kernel': 'sigmoid', 'verbose': 0}
1.719 (+/- 0.994)

No difference here except for the reLU kernel based ones. we might be overfitting in the other cases.