Download and Clean Data

In [1]:
#import neccessary libraries
import pandas as pd
import numpy as np

Let's download the data and read it into a pandas dataframe.

In [5]:
concrete_data = pd.read_csv('https://cocl.us/concrete_data')

In [6]:
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


Check how many data points we have.

In [7]:
concrete_data.shape

(1030, 9)

Check for the statistical data of our dataset

In [8]:
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


Check for missing values

In [9]:
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

Split data into predictors and target

In [11]:
concrete_data_columns = concrete_data.columns

predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
target = concrete_data['Strength'] # Strength column

Let's do a quick sanity check of the predictors and the target dataframes.

In [13]:
predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [14]:
target.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

 Normalize the data by substracting the mean and dividing by the standard deviation.

In [15]:
predictors_norm = (predictors - predictors.mean()) / predictors.std()
predictors_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


In [16]:
#Save the number of predictors to n_cols since we will need this number when building our network.

n_cols = predictors_norm.shape[1] # number of predictors

 import the Keras library


In [18]:
import keras
from keras.models import Sequential
from keras.layers import Dense

## Build a Neural Network


 # Build a baseline model (5 marks) 

Use the Keras library to build a neural network with the following:

- One hidden layer of 10 nodes, and a ReLU activation function

- Use the adam optimizer and the mean squared error  as the loss function.

In [19]:
#define regression model
n_cols = predictors_norm.shape[1]
def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model



# build the model


In [20]:
model = regression_model()







1. Randomly split the data into a training and test sets by holding 30% of the data for testing. You can use the train_test_splithelper function from Scikit-learn.

2. Train the model on the training data using 50 epochs.

3. Evaluate the model on the test data and compute the mean squared error between the predicted concrete strength and the actual concrete strength. You can use the mean_squared_error function from Scikit-learn.

4. Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.

5. Report the mean and the standard deviation of the mean squared errors.

In [24]:
from sklearn.model_selection import train_test_split
list_of_mean_squared_error = []
for cycle in range(50):
    #Randomly split the data into a training set (70%) and a test set (30%):  
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3)
    #Train and test the model at the same time
    res = model.fit(X_train, y_train, epochs=50, verbose=0, validation_data=(X_test, y_test))
    #Find mean_squared_error as last value in history.
    mean_squared_error = res.history['val_loss'][-1]
    #Add value of mean_squared_error for every cycle in list.
    list_of_mean_squared_error.append(mean_squared_error)
    print('Cycle #{}: mean_squared_error {}'.format(cycle+1, mean_squared_error))

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'






2023-01-03 13:24:56.767000: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
2023-01-03 13:24:56.774679: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2593900000 Hz
2023-01-03 13:24:56.775582: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56050cd57f50 executing computations on platform Host. Devices:
2023-01-03 13:24:56.775651: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>


Cycle #1: mean_squared_error 122.80638534891567
Cycle #2: mean_squared_error 117.93663787841797
Cycle #3: mean_squared_error 104.05118037350356
Cycle #4: mean_squared_error 109.39853875613908
Cycle #5: mean_squared_error 116.84879954810282
Cycle #6: mean_squared_error 127.17034680017761
Cycle #7: mean_squared_error 107.89928149868369
Cycle #8: mean_squared_error 104.63978971709712
Cycle #9: mean_squared_error 129.09735798758595
Cycle #10: mean_squared_error 122.40755620666306
Cycle #11: mean_squared_error 119.56229857410814
Cycle #12: mean_squared_error 109.07729939581121
Cycle #13: mean_squared_error 99.57946347727359
Cycle #14: mean_squared_error 111.95750827233768
Cycle #15: mean_squared_error 120.76868852288206
Cycle #16: mean_squared_error 112.08090728463479
Cycle #17: mean_squared_error 109.03682782812027
Cycle #18: mean_squared_error 114.23226005282602
Cycle #19: mean_squared_error 106.7397953021102
Cycle #20: mean_squared_error 115.1193970615424
Cycle #21: mean_squared_error 12

## B. Normalize the data 

Repeat Part A but use a normalized version of the data. Recall that one way to normalize the data is by subtracting the mean from the individual predictors and dividing by the standard deviation.

In [25]:
predictors_norm = (predictors - predictors.mean())/predictors.std()
predictors_norm.head(10)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069
5,-0.145138,0.464818,-0.846733,2.174405,-1.038638,-0.526262,-1.291914,0.701883
6,0.945704,0.244603,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
7,0.945704,0.244603,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,-0.279597
8,-0.145138,0.464818,-0.846733,2.174405,-1.038638,-0.526262,-1.291914,-0.279597
9,1.85474,-0.856472,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,-0.279597


In [26]:
#Build the model:

n_cols = predictors_norm.shape[1]
def regression_model2():
    model2 = Sequential()
    model2.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model2.add(Dense(1))
    
    model2.compile(optimizer='adam', loss='mean_squared_error')
    return model2

model2 = regression_model2()

Train and test the model at the same time using the fit-method. We will leave out 30% of the data for validation and we will train the model for 50 epochs. And use predictors_norm instead of predictors.



In [27]:
list_of_mean_squared_error = []
for cycle in range(50):
    #Randomly split the data into a training set (70%) and a test set (30%):  
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3)
    #Train and test the model at the same time
    res = model2.fit(X_train, y_train, epochs=50, verbose=0, validation_data=(X_test, y_test))
    #Find mean_squared_error as last value in history.
    mean_squared_error = res.history['val_loss'][-1]
    #Add value of mean_squared_error for every cycle in list.
    list_of_mean_squared_error.append(mean_squared_error)
    print('Cycle #{}: mean_squared_error {}'.format(cycle+1, mean_squared_error))

Cycle #1: mean_squared_error 400.35807192904275
Cycle #2: mean_squared_error 155.15586319710445
Cycle #3: mean_squared_error 109.77933326894025
Cycle #4: mean_squared_error 66.5478954870724
Cycle #5: mean_squared_error 62.23923136887041
Cycle #6: mean_squared_error 50.17901590341118
Cycle #7: mean_squared_error 46.50980532748028
Cycle #8: mean_squared_error 44.4597361172673
Cycle #9: mean_squared_error 46.475097211819254
Cycle #10: mean_squared_error 44.276822951233505
Cycle #11: mean_squared_error 36.41177732350371
Cycle #12: mean_squared_error 36.334193498185535
Cycle #13: mean_squared_error 33.32784205810152
Cycle #14: mean_squared_error 38.16383208734704
Cycle #15: mean_squared_error 40.76581047576608
Cycle #16: mean_squared_error 35.08059235606765
Cycle #17: mean_squared_error 30.66311375763038
Cycle #18: mean_squared_error 34.2715177937233
Cycle #19: mean_squared_error 33.69269516089973
Cycle #20: mean_squared_error 30.25648192989016
Cycle #21: mean_squared_error 34.5158963743537

Find the mean and the standard deviation of the mean squared errors:



In [29]:
print('Mean of mean squared errors: {}'.format(np.mean(list_of_mean_squared_error)))
print('Standard deviation of mean squared errors: {}'.format(np.std(list_of_mean_squared_error)))

Mean of mean squared errors: 45.93277439166813
Standard deviation of mean squared errors: 54.921107446377626


### The Average and the standard deviation of the mean squared errors in case A is lower than in case B, but the difference is small. In my opinion, it is not useful to compare the results of two poor neutral networks with one hidden layer. Data normalization does not have significant impact on the results. Both case A and case B have large errors.

# C. Increate the number of epochs (5 marks)

Repeat Part B but use 100 epochs this time for training.

How does the mean of the mean squared errors compare to that from Step B?

In [35]:
#Build the model:

def regression_model3():
    model3 = Sequential()
    model3.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model3.add(Dense(1))
    
    model3.compile(optimizer='adam', loss='mean_squared_error')
    return model3

model3 = regression_model3() 

In [36]:
# fit the model
list_of_mean_squared_error = []
for cycle in range(50):
    #Randomly split the data into a training set (70%) and a test set (30%):  
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3)
    #Train and test the model at the same time
    res = model3.fit(X_train, y_train, epochs=100, verbose=0, validation_data=(X_test, y_test))
    #Find mean_squared_error as last value in history.
    mean_squared_error = res.history['val_loss'][-1]
    #Add value of mean_squared_error for every cycle in list.
    list_of_mean_squared_error.append(mean_squared_error)
    print('Cycle #{}: mean_squared_error {}'.format(cycle+1, mean_squared_error))

Cycle #1: mean_squared_error 156.5311630644073
Cycle #2: mean_squared_error 98.96681814594947
Cycle #3: mean_squared_error 57.842272835642
Cycle #4: mean_squared_error 43.29311820218478
Cycle #5: mean_squared_error 40.41866359278608
Cycle #6: mean_squared_error 43.77438711271317
Cycle #7: mean_squared_error 38.04487750599685
Cycle #8: mean_squared_error 36.94570010533997
Cycle #9: mean_squared_error 41.893891541317444
Cycle #10: mean_squared_error 40.37571354591345
Cycle #11: mean_squared_error 42.059506141637904
Cycle #12: mean_squared_error 35.3779560694031
Cycle #13: mean_squared_error 35.991734217671514
Cycle #14: mean_squared_error 38.16725350660799
Cycle #15: mean_squared_error 33.162761274664916
Cycle #16: mean_squared_error 40.10676926548041
Cycle #17: mean_squared_error 33.82246095231436
Cycle #18: mean_squared_error 36.3585885489257
Cycle #19: mean_squared_error 35.827501315515015
Cycle #20: mean_squared_error 40.26225137556255
Cycle #21: mean_squared_error 34.29345342642281


Find the mean and the standard deviation of the mean squared errors:



In [37]:
print('The mean of the mean squared errors: {}'.format(np.mean(list_of_mean_squared_error)))
print('The standard deviation of the mean squared errors: {}'.format(np.std(list_of_mean_squared_error)))

The mean of the mean squared errors: 40.621481474719
The standard deviation of the mean squared errors: 19.218660553708315


### The mean squared error in case C is larger than in case B, and both errors are quite significant. Comparing the results of two poorly-performing neural networks with only one hidden layer is not particularly useful. The number of epochs does not improve the outcome.

## D. Increase the number of hidden layers (5 marks)
Repeat part B but use a neural network with the following instead:

Three hidden layers, each of 10 nodes and ReLU activation function.
How does the mean of the mean squared errors compare to that from Step B?

Create a new model with three hidden layers, each of 10 nodes and ReLU activation function.

In [38]:
def regression_model4():
    model4 = Sequential()
    model4.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model4.add(Dense(10, activation='relu'))
    model4.add(Dense(10, activation='relu'))
    model4.add(Dense(1))
    
    model4.compile(optimizer='adam', loss='mean_squared_error')
    return model4

Build a new model with 3 hidden layers:



In [39]:
model4 = regression_model4()

Train and test the model at the same time using the fit-method. We will leave out 30% of the data (data after normalization) for validation and we will train the model for 50 epochs and use three hidden layers, each of 10 nodes and ReLU activation function.

In [40]:
list_of_mean_squared_error = []
for cycle in range(50):
    #Randomly split the data into a training set (70%) and a test set (30%):  
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3)
    #Train and test the model at the same time
    res = model4.fit(X_train, y_train, epochs=50, verbose=0, validation_data=(X_test, y_test))
    #Find mean_squared_error as last value in history.
    mean_squared_error = res.history['val_loss'][-1]
    #Add value of mean_squared_error for every cycle in list.
    list_of_mean_squared_error.append(mean_squared_error)
    print('Cycle #{}: mean_squared_error {}'.format(cycle+1, mean_squared_error))

Cycle #1: mean_squared_error 104.72520199871371
Cycle #2: mean_squared_error 82.36005305318
Cycle #3: mean_squared_error 64.5602930309703
Cycle #4: mean_squared_error 57.55586790807039
Cycle #5: mean_squared_error 49.13276553848415
Cycle #6: mean_squared_error 38.595694582053376
Cycle #7: mean_squared_error 37.53847286384854
Cycle #8: mean_squared_error 38.68952650545484
Cycle #9: mean_squared_error 34.17565351319544
Cycle #10: mean_squared_error 29.477356537260285
Cycle #11: mean_squared_error 39.58683784262648
Cycle #12: mean_squared_error 33.296815779602646
Cycle #13: mean_squared_error 32.60351004492504
Cycle #14: mean_squared_error 32.04151560959307
Cycle #15: mean_squared_error 39.17422777935139
Cycle #16: mean_squared_error 35.83408926065686
Cycle #17: mean_squared_error 30.33158486869343
Cycle #18: mean_squared_error 33.44070420373219
Cycle #19: mean_squared_error 33.31405895196119
Cycle #20: mean_squared_error 30.26710223225714
Cycle #21: mean_squared_error 33.68267664863068
C

Find the mean and the standard deviation of the mean squared errors:

In [41]:
print('The mean of the mean squared errors: {}'.format(np.mean(list_of_mean_squared_error)))
print('The standard deviation of the mean squared errors: {}'.format(np.std(list_of_mean_squared_error)))

The mean of the mean squared errors: 34.52927302289549
The standard deviation of the mean squared errors: 14.448781645678157


### The mean squared error in case D is lower than in cases A, B, and C, and is the only case where the error is relatively small. This suggests that adding layers to the neural network is more important than other factors. The comparison between poorly-performing neural networks with one hidden layer in previous cases was not reliable and produced unpredictable results.