# Building a Regression Model using Keras

### Data cleaning and Preprocessing

Importing Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("concrete_data.csv")

#let's show the dataframe
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


Checking dimensions of data

In [3]:
df.shape

(1030, 9)

let's see satestical summary of the data

In [4]:
df.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


Checking for null values

In [5]:
df.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

This shows there are no null values in the dataframe. Now let's seprate columns for predictors and target
dataframes.

In [6]:
predict_columns = df.columns
x_data = df[predict_columns[df.columns != 'Strength']]

#let's see the predict data
x_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [7]:
y_data = df['Strength']

y_data.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

### Normalizing Data

In [8]:
import math
#col = x_data.columns


for i,col in enumerate(x_data.columns):
    col_mean = x_data[col].sum()/x_data[col].shape[0]
    col_std = math.sqrt(pow((x_data[[col]]- col_mean),2).sum()/x_data[[col]].shape[0])
    print(i+1,"col : ",col,col_mean)
    x_data[col] = (x_data[col] - col_mean)/col_std
x_data.head()

1 col :  Cement 281.16786407766995
2 col :  Blast Furnace Slag 73.89582524271846
3 col :  Fly Ash 54.18834951456311
4 col :  Water 181.56728155339806
5 col :  Superplasticizer 6.204660194174758
6 col :  Coarse Aggregate 972.9189320388349
7 col :  Fine Aggregate 773.5804854368932
8 col :  Age 45.662135922330094


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.477915,-0.856888,-0.847144,-0.916764,-0.620448,0.863154,-1.21767,-0.279733
1,2.477915,-0.856888,-0.847144,-0.916764,-0.620448,1.056164,-1.21767,-0.279733
2,0.491425,0.795526,-0.847144,2.175461,-1.039143,-0.526517,-2.240917,3.553066
3,0.491425,0.795526,-0.847144,2.175461,-1.039143,-0.526517,-2.240917,5.057677
4,-0.790459,0.678408,-0.847144,0.488793,-1.039143,0.070527,0.647884,4.978487


Now the x and y data is ready let's split the data into training and testing data. For this we will use train_test_split of sci-kit learn module

In [9]:
#importing the module
from sklearn.model_selection import train_test_split

def data_split(x_data,y_data):
    x_train, x_test, y_train, y_test = train_test_split(x_data,y_data, test_size = 0.3, random_state = 1)
    return (x_train,x_test,y_train,y_test)
#let's see the shape of training and testing data
#print("training data: ",x_train.shape, y_train.shape)
#print("testing data: ",x_test.shape,y_test.shape)

### Building the Model

Importing Keras module

In [10]:
import keras

In [11]:
#importing functions
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error


def reg_model():
    model = Sequential()

    n_cols = x_train.shape[1]

    model.add(Dense(10,activation ='relu', input_shape=(n_cols,)))
    model.add(Dense(1))
    model.compile(optimizer='adam',loss='mean_squared_error')
    return model

Training the model

In [12]:
x_train,x_test,y_train,y_test = data_split(x_data,y_data)
print("training data: ",x_train.shape, y_train.shape)
print("testing data: ",x_test.shape,y_test.shape)

training data:  (721, 8) (721,)
testing data:  (309, 8) (309,)


In [13]:
#With Normalized Data and 100 epochs
SME = np.zeros(50)

#loop for running model 50 times
for i in range(50):
    print("model running for {} time".format(i+1))
    #spliting data
    x_train,x_test,y_train,y_test = data_split(x_data,y_data)
    #creating network model instance
    model = reg_model()
    model.fit(x_train,y_train,epochs = 100,verbose = 0)
    y_predict = model.predict(x_test)
    SME[i] = mean_squared_error(y_test,y_predict)
    print(SME[i])

SME

model running for 1 time
170.05297193525297
model running for 2 time
206.4548529303638
model running for 3 time
180.0403500064819
model running for 4 time
227.94718074115087
model running for 5 time
192.88679242828263
model running for 6 time
178.19289018085132
model running for 7 time
192.67279418839945
model running for 8 time
182.47075950056978
model running for 9 time
183.92385225581194
model running for 10 time
187.72929493214107
model running for 11 time
178.0832533647785
model running for 12 time
218.0539027548268
model running for 13 time
175.65612382875415
model running for 14 time
186.83069341185654
model running for 15 time
188.75632999825083
model running for 16 time
192.59574536128744
model running for 17 time
194.6686619269033
model running for 18 time
178.77694650298105
model running for 19 time
179.6005819013249
model running for 20 time
187.38402372311043
model running for 21 time
180.89942200253927
model running for 22 time
179.03907168912454
model running for 23 time

array([170.05297194, 206.45485293, 180.04035001, 227.94718074,
       192.88679243, 178.19289018, 192.67279419, 182.4707595 ,
       183.92385226, 187.72929493, 178.08325336, 218.05390275,
       175.65612383, 186.83069341, 188.75633   , 192.59574536,
       194.66866193, 178.7769465 , 179.6005819 , 187.38402372,
       180.899422  , 179.03907169, 187.5533109 , 189.52762037,
       171.23250546, 173.88256808, 179.01133392, 172.90784022,
       177.6682722 , 190.7107972 , 209.91105032, 180.18062963,
       185.88336748, 185.39219425, 192.23744618, 208.43639919,
       179.24338686, 189.03194135, 182.57109845, 188.21512359,
       188.45023178, 186.54158645, 179.66255909, 170.3491458 ,
       184.36383846, 190.15241405, 184.23595053, 179.83535649,
       186.53424171, 191.00890455])

Let's find out the mean and standard deviation of the mean_squared_error

In [16]:
SME_mean = SME.sum()/SME.shape[0]
SME_mean

SME_std = np.sqrt(pow((SME -np.array(SME_mean)),2).sum()/SME.shape[0])
print("Error of the Model using normalization and 100 epochs")
print("Mean of Squared Mean Error calculated 50 times: ",SME_mean,"\nStandard Deviation of Squared Mean Error calculated 50 times: ",SME_std)

Error of the Model using normalization and 100 epochs
Mean of Squared Mean Error calculated 50 times:  186.54895220249438 
Standard Deviation of Squared Mean Error calculated 50 times:  11.320946698270642


# Comparizon with Part-B Conclusion

There is a very significant difference between the means of the SME of Part-B(with normalization and 50 epochs) and Part-C(with normalization and 100 epochs). The value of mean in Part-C is much less than the value of mean in Part-B

There is very significant difference between the Standard Deviations of the SME of Part-B and Part-C. In Part-C, where the data is normalized the Standard deviation value is hugely less than the value calculated in Part-B.