# Step A. Baseline model

Install required libraries

In [1]:
# All Libraries required for this notebook are listed below.
# If you doesn't have the libraries preinstalled, you may need to uncomment and install certain libraries.

#!pip install numpy
#!pip install pandas
#!pip install keras
#!pip install sklearn

## Preparation stage
Import required libraries

In [2]:
import pandas as pd
import numpy as np

Download the concrete dataset


In [3]:
concrete_data = pd.read_csv('https://cocl.us/concrete_data')
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


Let's see the dataset description and check for any missing values


In [4]:
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [5]:
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

There is no any missing data


# Create model
Import all needed packages from Keras


In [6]:
from keras.models import Sequential
from keras.layers import InputLayer
from keras.layers import Dense

Define a model. We define a function that describes our regression model so that we can conveniently call it to create our model.

The function create a model that: 
- Has one hidden layer of 10 nodes, and a ReLU activation function
- Use the adam optimizer and the mean squared error as the loss function.


In [7]:
# define regression model
def regression_model(input_shape):
    # create model
    model = Sequential()
    model.add(InputLayer(shape=input_shape))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['root_mean_squared_error'])
    return model

# 1. Split data
a) Split data to features (predictors) and Y (targets)

In [8]:
concrete_data_columns = concrete_data.columns

features = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
Y = concrete_data['Strength'] # Strength column

b) Split data into Train and Test datasets

We randomly split the data into a training and test sets by holding 30% of the data for testing, using the *train_test_split* helper function from Scikit-learn


In [9]:
from sklearn.model_selection import train_test_split # load train_test_split from preinstalled sklearn library
x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size=0.3)

Let's save the number of predictors to *n_cols* since we will need this number when building our network.


In [10]:
n_cols = features.shape[1] # number of features
print(n_cols)

8


## 2. Train model

Call the function to create the model.

In [11]:
# build the model
model = regression_model(input_shape=(n_cols,))

Train and validate the model at the same time using the *fit* method with **train** and **test** data

In [12]:
# fit the model
history = model.fit(x_train, y_train, epochs=50, verbose=2)

Epoch 1/50
23/23 - 1s - 26ms/step - loss: 3819.1353 - root_mean_squared_error: 62.1540
Epoch 2/50
23/23 - 0s - 1ms/step - loss: 1847.8258 - root_mean_squared_error: 42.9452
Epoch 3/50
23/23 - 0s - 1ms/step - loss: 1476.3329 - root_mean_squared_error: 38.6611
Epoch 4/50
23/23 - 0s - 1ms/step - loss: 1275.6495 - root_mean_squared_error: 35.8348
Epoch 5/50
23/23 - 0s - 1ms/step - loss: 1133.7610 - root_mean_squared_error: 33.5106
Epoch 6/50
23/23 - 0s - 1ms/step - loss: 987.6344 - root_mean_squared_error: 31.5225
Epoch 7/50
23/23 - 0s - 1ms/step - loss: 889.8820 - root_mean_squared_error: 29.7332
Epoch 8/50
23/23 - 0s - 1ms/step - loss: 791.3735 - root_mean_squared_error: 28.1654
Epoch 9/50
23/23 - 0s - 1ms/step - loss: 708.0656 - root_mean_squared_error: 26.5678
Epoch 10/50
23/23 - 0s - 1ms/step - loss: 628.2115 - root_mean_squared_error: 25.1703
Epoch 11/50
23/23 - 0s - 1ms/step - loss: 574.6365 - root_mean_squared_error: 23.9895
Epoch 12/50
23/23 - 0s - 1ms/step - loss: 510.5694 - root

# 3. Model evaluation and show metrics  (RMSE)

The evaluation result (*score*) is array with two elements in our case - the *loss_value* (score[0]) and the *root_mean_squared_error* value (score[1])

In [13]:
score = model.evaluate(x_test, y_test, verbose=0)
print('Evaluation Root Mean Squared Error:', score[1])

Evaluation Root Mean Squared Error: 11.94777774810791


# 4.Repeating steps 1...3 for 50 times
Repeat steps 1...3 and collect metrics in one list

In [14]:
# build the model
model = regression_model(input_shape=(n_cols,))

rmse_list = []
print('Iterations calculation started')
for step in range(50): # Cycle of iterations for splitting, learning and evaluation stages
    # print('Iteration: ', step, end='\r')
    x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size=0.3) 
    # fit the model
    history = model.fit(x_train, y_train, epochs=50, verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    print('Evaluation Root Mean Squared Error', 'for iteration', step+1, '=', score[1], end='\r')
    rmse_list.append(score[1]) # adding to the RMSE list current iteration metric
# print(rmse_list) #print list of calculated RMSE

Iterations calculation started
Evaluation Root Mean Squared Error for iteration 50 = 6.3188023567199715

Calculate mean and standard deviation for the calculated metrics

In [15]:
mean_rmse = np.mean(np.array(rmse_list))
mean_std = np.std(np.array(rmse_list))
print('Mean RMSE = ', mean_rmse)
print('Standard deviation of RMSE = ', mean_std)

Mean RMSE =  7.178126182556152
Standard deviation of RMSE =  1.2556304301368935
