### First load required libraries:

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, LSTM, TimeDistributed, RepeatVector

### Load CSV File From URL

In [58]:
url=r"https://cocl.us/concrete_data"
concrete_data=pd.read_csv(url)
concrete_data.head(5)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


The dataset is about the compressive strength of different samples of concrete based on the volumes of the different ingredients that were used to make them. Ingredients include:
1. Cement
2. Blast Furnace Slag
3. Fly Ash
4. Water
5. Superplasticizer
6. Coarse Aggregate
7. Fine Aggregate

In [59]:
concrete_data.shape

(1030, 9)

Check each columns datatype of dataset 

In [60]:
concrete_data.dtypes

Cement                float64
Blast Furnace Slag    float64
Fly Ash               float64
Water                 float64
Superplasticizer      float64
Coarse Aggregate      float64
Fine Aggregate        float64
Age                     int64
Strength              float64
dtype: object

Check each columns whether exist missing value of dataset

In [61]:
pd.isna(concrete_data).sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

The data looks no missing valye and is ready to be used to build our model.

In [62]:
X=concrete_data[concrete_data.columns[:8]]
y=pd.DataFrame(concrete_data["Strength"])
Xcol=X.columns
ycol=y.columns

## Normalize Data

Repeat Part A but use a normalized version of the data

In [63]:
stdard_x=preprocessing.StandardScaler()
stdard_y=preprocessing.StandardScaler()
X= stdard_x.fit_transform(X)
X=pd.DataFrame(X,columns=Xcol)
X[0:5]
y= stdard_y.fit_transform(y)
y=pd.DataFrame(y,columns=ycol)
print("Independent variable of dataset:\n {}\n\n".format(X.head()))
print("Dependent variable of dataset:\n {}".format(y.head()))

Independent variable of dataset:
      Cement  Blast Furnace Slag   Fly Ash     Water  Superplasticizer  \
0  2.477915           -0.856888 -0.847144 -0.916764         -0.620448   
1  2.477915           -0.856888 -0.847144 -0.916764         -0.620448   
2  0.491425            0.795526 -0.847144  2.175461         -1.039143   
3  0.491425            0.795526 -0.847144  2.175461         -1.039143   
4 -0.790459            0.678408 -0.847144  0.488793         -1.039143   

   Coarse Aggregate  Fine Aggregate       Age  
0          0.863154       -1.217670 -0.279733  
1          1.056164       -1.217670 -0.279733  
2         -0.526517       -2.240917  3.553066  
3         -0.526517       -2.240917  5.057677  
4          0.070527        0.647884  4.978487  


Dependent variable of dataset:
    Strength
0  2.645408
1  1.561421
2  0.266627
3  0.313340
4  0.507979


1.Use the Keras library to build a neural network with the following:  
2.One hidden layer of 10 nodes, and a ReLU activation function  
3.Use the adam optimizer and the mean squared error  as the loss function.

In [64]:
model=Sequential()
input_number=X.shape[1]
model.add(Dense(units=10, input_shape=(input_number,), activation='relu', kernel_initializer='normal')) 
model.add(Dense(units=1, kernel_initializer='normal')) 

model.compile(loss='mean_squared_error',
              optimizer='adam')

1.Randomly split the data into a training and test sets by holding 30% of the data for testing.

In [65]:
X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=0.3, random_state=1)
print("Traning data sample size:{}".format(X_train.shape[0]))
print("Validation data sample size:{}".format(X_val.shape[0]))

Traning data sample size:721
Validation data sample size:309


2.Train the model on the training data using 50 epochs.

In [66]:
model.fit(X_train,y_train, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x2287709fac8>

3.Evaluate the model on the test data and compute the mean squared error between the predicted concrete strength and the actual concrete strength.

In [67]:
loss_val = model.evaluate(X_val, y_val)
loss_val #mean_squared_error



0.20906905882952667

4.Repeat steps 1 - 3, 50 times.

In [68]:
MSE=[]
repeat=50
for i in range(repeat):
    X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=0.3, random_state=i)
    train_model =model.fit(X_train,y_train, epochs=50, batch_size=len(X_train), verbose=0, shuffle=True)
    prediction=model.predict(X_val)
    prediction_inverse=stdard_y.inverse_transform(prediction)
    y_val_inverse=stdard_y.inverse_transform(y_val)
    error=np.array(y_val_inverse).reshape(-1,1)-np.array(prediction_inverse).reshape(-1,1)
    mse=1/len(error)*(error.T.dot(error))
    print('Run{} MSE:{}'.format(i,mse[0][0]))
    MSE.append(mse)

Run0 MSE:45.542641512136534
Run1 MSE:57.010992885339554
Run2 MSE:43.77906412640339
Run3 MSE:46.56935153276122
Run4 MSE:47.20620462624733
Run5 MSE:50.71808637990092
Run6 MSE:51.20799419970516
Run7 MSE:39.56296089112595
Run8 MSE:45.370109849982484
Run9 MSE:44.62346802964511
Run10 MSE:43.52588749477351
Run11 MSE:41.52573911718265
Run12 MSE:48.8542242514173
Run13 MSE:49.1517722721104
Run14 MSE:42.01570317297936
Run15 MSE:37.01650294540703
Run16 MSE:39.654176043081435
Run17 MSE:39.545741401521646
Run18 MSE:37.52179336782
Run19 MSE:39.48707996853255
Run20 MSE:36.250420947752424
Run21 MSE:37.76000567629153
Run22 MSE:33.65813370133623
Run23 MSE:38.1890770228628
Run24 MSE:37.57086969307518
Run25 MSE:40.247177423409624
Run26 MSE:35.49162030111694
Run27 MSE:34.61197802842294
Run28 MSE:40.006524513912595
Run29 MSE:38.20432614011748
Run30 MSE:33.88631475724986
Run31 MSE:35.16874040785519
Run32 MSE:34.55236770788876
Run33 MSE:37.31104271565255
Run34 MSE:37.60615116817233
Run35 MSE:43.56526037691111


5.Report the mean and the standard deviation of the mean squared errors.

In [72]:
Mean_MSE=np.mean(MSE)
STD_MSE=np.std(MSE)
print("Below is the mean and standard deviation of {i} mean squared errors with normalized data.\n\
Total number of epochs for each training is: {i}".format(i=repeat))
print("mean of MSE:{}".format(Mean_MSE))
print("Std of MSE:{}".format(STD_MSE))

Below is the mean and standard deviation of 50 mean squared errors with normalized data.
Total number of epochs for each training is: 50
mean of MSE:39.877576763176904
Std of MSE:5.334894408546133


How does the mean of the mean squared errors compare to that from Step A?

In [80]:
compare_result=pd.DataFrame({"Normalized":["without","with"],"Mean_MSE":[110.004,39.877],"Std_MSE":[8.508,5.335]})
print("Comparesion table with and without Normalized")
print(compare_result)

Comparesion table with and without Normalized
   Mean_MSE Normalized  Std_MSE
0   110.004    without    8.508
1    39.877       with    5.335


Mean of the mean squared errors compare to that from Step A(without Normalized)  
We can see the mean_MSE and Std_MSE significantly reduced after proceed Normalized. 