# Building a Regression Model using Keras

### Data cleaning and Preprocessing

Importing Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("concrete_data.csv")

#let's show the dataframe
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


Checking dimensions of data

In [3]:
df.shape

(1030, 9)

let's see satestical summary of the data

In [4]:
df.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


Checking for null values

In [5]:
df.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

This shows there are no null values in the dataframe. Now let's seprate columns for predictors and target
dataframes.

In [6]:
predict_columns = df.columns
x_data = df[predict_columns[df.columns != 'Strength']]

#let's see the predict data
x_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [7]:
y_data = df['Strength']

y_data.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

### Normalizing Data

In [8]:
import math
#col = x_data.columns


for i,col in enumerate(x_data.columns):
    col_mean = x_data[col].sum()/x_data[col].shape[0]
    col_std = math.sqrt(pow((x_data[[col]]- col_mean),2).sum()/x_data[[col]].shape[0])
    print(i+1,"col : ",col,col_mean)
    x_data[col] = (x_data[col] - col_mean)/col_std
x_data.head()

1 col :  Cement 281.16786407766995
2 col :  Blast Furnace Slag 73.89582524271846
3 col :  Fly Ash 54.18834951456311
4 col :  Water 181.56728155339806
5 col :  Superplasticizer 6.204660194174758
6 col :  Coarse Aggregate 972.9189320388349
7 col :  Fine Aggregate 773.5804854368932
8 col :  Age 45.662135922330094


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.477915,-0.856888,-0.847144,-0.916764,-0.620448,0.863154,-1.21767,-0.279733
1,2.477915,-0.856888,-0.847144,-0.916764,-0.620448,1.056164,-1.21767,-0.279733
2,0.491425,0.795526,-0.847144,2.175461,-1.039143,-0.526517,-2.240917,3.553066
3,0.491425,0.795526,-0.847144,2.175461,-1.039143,-0.526517,-2.240917,5.057677
4,-0.790459,0.678408,-0.847144,0.488793,-1.039143,0.070527,0.647884,4.978487


Now the x and y data is ready let's split the data into training and testing data. For this we will use train_test_split of sci-kit learn module

In [9]:
#importing the module
from sklearn.model_selection import train_test_split

def data_split(x_data,y_data):
    x_train, x_test, y_train, y_test = train_test_split(x_data,y_data, test_size = 0.3, random_state = 1)
    return (x_train,x_test,y_train,y_test)
#let's see the shape of training and testing data
#print("training data: ",x_train.shape, y_train.shape)
#print("testing data: ",x_test.shape,y_test.shape)

### Building the Model

Importing Keras module

In [10]:
import keras

In [11]:
#importing functions
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error


def reg_model():
    model = Sequential()

    n_cols = x_train.shape[1]

    model.add(Dense(10,activation ='relu', input_shape=(n_cols,)))
    model.add(Dense(1))
    model.compile(optimizer='adam',loss='mean_squared_error')
    return model

Training the model

In [12]:
x_train,x_test,y_train,y_test = data_split(x_data,y_data)
print("training data: ",x_train.shape, y_train.shape)
print("testing data: ",x_test.shape,y_test.shape)

training data:  (721, 8) (721,)
testing data:  (309, 8) (309,)


In [13]:
#With Normalized Data
SME = np.zeros(50)

#loop for running model 50 times
for i in range(50):
    print("model running for {} time".format(i+1))
    #spliting data
    x_train,x_test,y_train,y_test = data_split(x_data,y_data)
    #creating network model instance
    model = reg_model()
    model.fit(x_train,y_train,epochs = 50,verbose = 0)
    y_predict = model.predict(x_test)
    SME[i] = mean_squared_error(y_test,y_predict)
    print(SME[i])

SME

model running for 1 time
286.73304002065913
model running for 2 time
439.08421618039114
model running for 3 time
398.3129983148268
model running for 4 time
321.2460331015323
model running for 5 time
669.5502771106605
model running for 6 time
391.4967519416527
model running for 7 time
434.3772782445964
model running for 8 time
324.8506942274787
model running for 9 time
297.26971372484593
model running for 10 time
330.86942659428627
model running for 11 time
332.7506054562174
model running for 12 time
304.6579530904853
model running for 13 time
642.9793719118446
model running for 14 time
288.1329925281759
model running for 15 time
391.6363846438675
model running for 16 time
265.3809888101341
model running for 17 time
324.26185608889904
model running for 18 time
259.0538741957459
model running for 19 time
454.01274406873813
model running for 20 time
454.6455642034139
model running for 21 time
292.9195141150618
model running for 22 time
398.37186797951
model running for 23 time
337.6080110

array([286.73304002, 439.08421618, 398.31299831, 321.2460331 ,
       669.55027711, 391.49675194, 434.37727824, 324.85069423,
       297.26971372, 330.86942659, 332.75060546, 304.65795309,
       642.97937191, 288.13299253, 391.63638464, 265.38098881,
       324.26185609, 259.0538742 , 454.01274407, 454.6455642 ,
       292.91951412, 398.37186798, 337.60801101, 417.3562713 ,
       296.01282497, 298.68703155, 443.16927878, 293.57082338,
       259.68996589, 400.56042926, 401.70922861, 276.47913382,
       349.20181034, 387.95732258, 342.81745548, 302.08462661,
       354.32660599, 783.05438556, 268.50866512, 265.42237171,
       342.17053311, 266.92022799, 375.97553288, 285.55851802,
       385.97476228, 305.05682033, 315.15116383, 479.60322034,
       330.83164509, 280.58987231])

Let's find out the mean and standard deviation of the mean_squared_error

In [20]:
SME_mean = SME.sum()/SME.shape[0]
SME_mean

SME_std = np.sqrt(pow((SME -np.array(SME_mean)),2).sum()/SME.shape[0])
print("Error of Model with Normalization\n")
print("Mean of Squared Mean Error calculated 50 times: ",SME_mean,"\nStandard Deviation of Squared Mean Error calculated 50 times: ", SME_std)

Error of Model with Normalization

Mean of Squared Mean Error calculated 50 times:  362.97285369382263 
Standard Deviation of Squared Mean Error calculated 50 times:  104.20791021314665


# Comparizon with Part-A Conclusion

There is not any significant difference between the means of the SME of Part-A(without normalization) and Part-B(with normalization). The values are similar in both parts

There is very significant difference between the Standard Deviations of the SME of Part-A and Part-B. In Part-B, where the data is normalized the Standard deviation value is hugely less than the value calculated in Part-A.