In [2]:
#Import libraries and modules
#boston_housing dataset is already available in the datasets module under Keras
#scikit-learn (sklearn) is a simple and efficient tools for predictive data analysis

import pandas
from keras.datasets import boston_housing

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
#Loading the dataset
#Note that there are 13 features in X_train per sample
# X_train[0] – tuple – vector containing all 13 values
# Y_train[0] – 15.2 is the expected output

(X_train, y_train), (X_test, y_test) = boston_housing.load_data()

# let us view on sample from the features
for i in range(5):
  print(X_train[i], y_train[i])

#output
#[  1.23247   0.        8.14      0.        0.538     6.142    91.7
#   3.9769    4.      307.       21.      396.9      18.72   ] 15.2



In [None]:
#X_train.dtype
#dtype('float64')
X_train.view

In [None]:
print(X_test[0], y_test[0])

# **Define baseline model**

Sequential() function will return an empty model. (no layers added)
Dense(13 - no of neurons ,Input_dim – 13 dimensional vector as input ,Activation = relu function is used
No activation function is used for the output layer because it is a regression problem, and you are interested in predicting numerical values directly without transformation.

The efficient ADAM optimization algorithm is used, and a mean squared error loss function is optimized. This will be the same metric you will use to evaluate the performance of the model. It is a desirable metric because taking the square root gives an error value you can directly understand in the context of the problem (thousands of dollars).

In [7]:

def baseline_model():
 # create model
 model = Sequential()
 model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
 model.add(Dense(1, kernel_initializer='normal'))
 # Compile model
 model.compile(loss='mean_squared_error', optimizer='adam')
 return model


In [None]:
baseline_model().summary()

'''Model: "sequential" - default name given
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 dense_2 (Dense)             (None, 13)                182

 dense_3 (Dense)             (None, 1)                 14

=================================================================
Total params: 196
Trainable params: 196
Non-trainable params: 0
_________________________________________________________________'''

#Evaluate model
•	KFold The training set is split into k smaller sets
•	The following procedure is followed for each of the k “folds”:
*   A model is trained using k-1 of the folds as training data
*   the resulting model is validated on the remaining part of the data
*   Out of 10 sets 9 will be used for training and 1 for validation ,for each of 100 iteration 1 set is left out

•	As we fine tune the model we need the value to decrease(The values don’t have any meaning as such but we need them to decrease)

The Keras wrapper object used in scikit-learn as a regression estimator is called **KerasRegressor**. You create an instance and pass it both the name of the function to create the neural network model and some parameters to pass along to the fit() function of the model later, such as the number of epochs and batch size. Both of these are set to sensible defaults.

In [None]:
# evaluate model

estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)
kfold = KFold(n_splits=10)
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

''' estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)
Baseline: -24.74 (15.01) MSE'''

Running this code gives you an estimate of the model’s performance on the problem for unseen data.

Note: Your results may vary given the stochastic nature of the algorithm or evaluation procedure, or differences in numerical precision.

The result reports the mean squared error, including the average and standard deviation (average variance) across all ten folds of the cross validation evaluation.

# Evaluate model with standardized dataset
•	Standardizing is considered part of pre-processing
•	standardize is the process of normalizing the different features to a uniform scale.
•	Typically this is done by subtracting the mean and dividing by the standard deviation
•	z = (x - u) / s
•	Standardization of a dataset is a common requirement for many machine learning estimators:
they might behave badly if the individual features do not more or less look like standard normally distributed data
•	Though the results might vary, on average, a standardized model usually performs better.
•	Keras automatically standardize. Then train the model(similar steps as before)

A further extension of this section would be to similarly apply a rescaling to the output variable, such as normalizing it to the range of 0-1 and using a Sigmoid or similar activation function on the output layer to narrow output predictions to the same range.

In [None]:
# evaluate model with standardized dataset

estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

#Baseline: -24.74 (15.01) MSE
#Standardized: -22.58 (10.71) MSE

**Tuning with deeper model**

•	Tuning is the process of varying the hyperparameters to arrive at a better model
•	One of the approaches of tuning is by increasing the number of layers
•	WE are adding a layer in between (6 neurons) then rest is same
•	Then how far to go? Till we are satisfied with our error – more layers will increase cost so

In [12]:
#13 inputs -> [13 -> 6] -> 1 output
def larger_model():
 # create model
 model = Sequential()
 model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
 model.add(Dense(6, kernel_initializer='normal', activation='relu'))
 model.add(Dense(1, kernel_initializer='normal'))
 # Compile model
 model.compile(loss='mean_squared_error', optimizer='adam')
 return model

In [None]:
larger_model().summary()

"""Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 dense_42 (Dense)            (None, 13)                182

 dense_43 (Dense)            (None, 6)                 84

 dense_44 (Dense)            (None, 1)                 7

=================================================================
Total params: 273 ---increased count as we have added one more layer
Trainable params: 273
Non-trainable params: 0
_________________________________________________________________"""

In [None]:
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
print("Larger: %.2f (%.2f) MSE" % (results.mean(), results.std()))

#Baseline: -24.74 (15.01) MSE
#Standardized: -22.58 (10.71) MSE
#Larger: -17.75 (9.03) MSE

**Tuning with wider model**

Tuning is the process of varying the hyperparameters to arrive at a better model
One of the approaches of tuning is by increasing the number of neurons in a layer
The added lwer has 20 neurons rest is same
Error is further reduced.


In [16]:
#13 inputs -> [20] -> 1 output
def wider_model():
 # create model
 model = Sequential()
 model.add(Dense(20, input_dim=13, kernel_initializer='normal', activation='relu'))
 model.add(Dense(1, kernel_initializer='normal'))
 # Compile model
 model.compile(loss='mean_squared_error', optimizer='adam')
 return model

In [None]:
wider_model().summary()

"""Model: "sequential_33"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 dense_78 (Dense)            (None, 20)                280

 dense_79 (Dense)            (None, 1)                 21

=================================================================
Total params: 301
Trainable params: 301
Non-trainable params: 0
_________________________________________________________________"""

In [None]:
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
print("wider: %.2f (%.2f) MSE" % (results.mean(), results.std()))

#Baseline: -24.74 (15.01) MSE
#Standardized: -22.58 (10.71) MSE
#Larger: -17.75 (9.03) MSE
#wider: -16.54 (7.91) MSE

#Building the model reveals a further drop in error to about 16 thousand squared dollars. This is not a bad result for this problem.