# Building a Regression Model using Keras

### Data cleaning and Preprocessing

Importing Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("concrete_data.csv")

#let's show the dataframe
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


Checking dimensions of data

In [3]:
df.shape

(1030, 9)

let's see satestical summary of the data

In [4]:
df.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


Checking for null values

In [5]:

df.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

This shows there are no null values in the dataframe. Now let's seprate columns for predictors and target
dataframes.

In [6]:
predict_columns = df.columns
x_data = df[predict_columns[df.columns != 'Strength']]

#let's see the predict data
x_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [7]:
y_data = df['Strength']

y_data.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

Now the x and y data is ready let's split the data into training and testing data. For this we will use train_test_split of sci-kit learn module

In [8]:
#importing the module
from sklearn.model_selection import train_test_split

def data_split(x_data,y_data):
    x_train, x_test, y_train, y_test = train_test_split(x_data,y_data, test_size = 0.3, random_state = 1)
    return (x_train,x_test,y_train,y_test)
#let's see the shape of training and testing data
#print("training data: ",x_train.shape, y_train.shape)
#print("testing data: ",x_test.shape,y_test.shape)

### Building the Model

Importing Keras module

In [9]:
import keras

In [10]:
#importing functions
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error

#Building Nueral Network model

def reg_model():
    model = Sequential()

    n_cols = x_train.shape[1]

    model.add(Dense(10,activation ='relu', input_shape=(n_cols,)))
    model.add(Dense(1))
    model.compile(optimizer='adam',loss='mean_squared_error')
    return model

Training the model

In [11]:
SME = np.zeros(50)

#loop for running model 50 times
for i in range(50):
    print("model running for {} time".format(i+1))
    #spliting data
    x_train,x_test,y_train,y_test = data_split(x_data,y_data)
    #creating network model instance
    model = reg_model()
    model.fit(x_train,y_train,epochs = 50,verbose = 0)
    y_predict = model.predict(x_test)
    SME[i] = mean_squared_error(y_test,y_predict)
    print(SME[i])

SME

model running for 1 time
122.21848527612622
model running for 2 time
157.99596009961155
model running for 3 time
317.2813765391262
model running for 4 time
687.2800809159489
model running for 5 time
128.85124071428174
model running for 6 time
327.742447818412
model running for 7 time
771.0152451619343
model running for 8 time
227.76964288956145
model running for 9 time
191.76653623766902
model running for 10 time
226.92531252299477
model running for 11 time
305.1660697738106
model running for 12 time
163.8296008215652
model running for 13 time
144.22750113715645
model running for 14 time
162.70023099153389
model running for 15 time
128.20637118959257
model running for 16 time
984.0200383913998
model running for 17 time
178.7232038500214
model running for 18 time
2114.5514644915247
model running for 19 time
111.99387944619879
model running for 20 time
127.44378079958487
model running for 21 time
137.5522647283722
model running for 22 time
242.5761866866437
model running for 23 time
174.

array([ 122.21848528,  157.9959601 ,  317.28137654,  687.28008092,
        128.85124071,  327.74244782,  771.01524516,  227.76964289,
        191.76653624,  226.92531252,  305.16606977,  163.82960082,
        144.22750114,  162.70023099,  128.20637119,  984.02003839,
        178.72320385, 2114.55146449,  111.99387945,  127.4437808 ,
        137.55226473,  242.57618669,  174.90752759,  130.0799341 ,
       1289.40569896,  133.26848668,  186.30299202,  127.26791191,
        522.36004786,  629.80240772,  177.37394619,  259.27105786,
        147.56791766,  172.19334449,  183.4727379 ,  460.52733278,
        805.09837131, 1407.91322782,  396.98876535, 1737.80114506,
        504.09569824,  118.52126901,  121.23029888,  156.15909888,
        149.97770934,  148.37343627,  129.922688  ,  184.97424971,
        111.37266219,  504.56642074])

Let's find out the mean and standard deviation of the mean_squared_error

In [14]:
SME_mean = SME.sum()/SME.shape[0]
SME_mean

SME_std = np.sqrt(pow((SME -np.array(SME_mean)),2).sum()/SME.shape[0])
print("Error of the model run 50 times")
print("Mean of Squared Mean Error calculated 50 times: ",SME_mean,"\nStandard Deviation of Squared Mean Error calculated 50 times: ",SME_std)

Error of the model run 50 times
Mean of Squared Mean Error calculated 50 times:  380.6126661002473 
Standard Deviation of Squared Mean Error calculated 50 times:  431.55417510806063
