# Part A. Building the baseline Model

In [53]:
# Download concrete_data.csv directly from its URL
! wget -O concrete_data.csv https://cocl.us/concrete_data

--2024-03-15 09:03:25--  https://cocl.us/concrete_data
Resolving cocl.us (cocl.us)... 23.209.46.163, 23.209.46.160, 2600:1413:b000:6::17d5:2bca, ...
Connecting to cocl.us (cocl.us)|23.209.46.163|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv [following]
--2024-03-15 09:03:25--  https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv
Resolving s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)... 67.228.254.196
Connecting to s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)|67.228.254.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58988 (58K) [text/csv]
Saving to: ‘concrete_data.csv’


2024-03-15 09:03:26 (162 KB/s) - ‘concrete_data.csv’ saved [58988/58988]



## Let's import all necessary modules


In [54]:
import pandas as pd    # for data analysis
from sklearn.model_selection import train_test_split   #for splitting the data into training and testing sets
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error    # for calculating mean_square error

## Let's read the data from the CSV file and analyze it using pandas

In [55]:
df = pd.read_csv("concrete_data.csv")    # reading the data from csv file

In [56]:
df.shape  # it has 1030 rows and 9 columns

(1030, 9)

### Let's look at the first 10 rows of the data

In [57]:
df.head(n=10)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3
5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90,47.03
6,380.0,95.0,0.0,228.0,0.0,932.0,594.0,365,43.7
7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28,36.45
8,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,28,39.29


### Statistics summary of the data

In [58]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Cement,1030.0,281.167864,104.506364,102.0,192.375,272.9,350.0,540.0
Blast Furnace Slag,1030.0,73.895825,86.279342,0.0,0.0,22.0,142.95,359.4
Fly Ash,1030.0,54.18835,63.997004,0.0,0.0,0.0,118.3,200.1
Water,1030.0,181.567282,21.354219,121.8,164.9,185.0,192.0,247.0
Superplasticizer,1030.0,6.20466,5.973841,0.0,0.0,6.4,10.2,32.2
Coarse Aggregate,1030.0,972.918932,77.753954,801.0,932.0,968.0,1029.4,1145.0
Fine Aggregate,1030.0,773.580485,80.17598,594.0,730.95,779.5,824.0,992.6
Age,1030.0,45.662136,63.169912,1.0,7.0,28.0,56.0,365.0
Strength,1030.0,35.817961,16.705742,2.33,23.71,34.445,46.135,82.6


### Check for null or missing values in the data

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cement              1030 non-null   float64
 1   Blast Furnace Slag  1030 non-null   float64
 2   Fly Ash             1030 non-null   float64
 3   Water               1030 non-null   float64
 4   Superplasticizer    1030 non-null   float64
 5   Coarse Aggregate    1030 non-null   float64
 6   Fine Aggregate      1030 non-null   float64
 7   Age                 1030 non-null   int64  
 8   Strength            1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [60]:
df.isnull().sum()
# there are no null or missing values

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

## Prepare the data: features as input and target as output

In [61]:
# Select the target by name and treat every other column as a feature, so the
# split does not rely on 'Strength' happening to be the last column.
features = df.drop(columns=['Strength'])
target = df['Strength']

In [62]:
features.shape[1]

8

## Let's define the regression model function
### With one hidden layer of 10 nodes and a ReLU activation function
### Using the adam optimizer and the mean squared error as the loss function

In [63]:
def regression_model(n_features=None, hidden_units=10):
  """Build and compile the baseline regression network.

  The network is a Sequential model with one ReLU hidden layer followed by
  a single-unit output layer, compiled with the adam optimizer and the
  mean squared error loss.

  Args:
    n_features: Number of input columns. When None (the default), the
      width is read from the notebook-level `features` DataFrame, so
      existing zero-argument calls keep working.
    hidden_units: Number of nodes in the hidden layer (default 10).

  Returns:
    The compiled, untrained Sequential model.
  """
  if n_features is None:
    # Fall back to the global feature frame only when the caller did not
    # state the input width explicitly.
    n_features = features.shape[1]

  model = Sequential()

  # Hidden layer: `hidden_units` ReLU nodes fed by `n_features` inputs.
  model.add(Dense(hidden_units, activation='relu', input_shape=(n_features,)))
  # Output layer: a single node with no activation argument.
  model.add(Dense(1))

  model.compile(optimizer='adam', loss='mean_squared_error')
  return model

## Split the data into training and testing sets using train_test_split from sklearn with a 30% test size

In [64]:
# Hold out 30% of the rows for testing; random_state=4 pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=4,
)

## Training the model on training data X_train and y_train using 50 epochs

In [65]:
# let's call the function
model = regression_model()

In [None]:
# Fit the model on the training split for 50 epochs
# (verbose=2 logs one line per epoch).
model.fit(
    X_train,
    y_train,
    epochs=50,
    verbose=2,
)

## Predict on the testing data X_test

In [67]:
# Run the fitted model on the held-out test features to get its predictions
predictions = model.predict(X_test)



## Evaluate the mean squared error between the predictions and y_test using scikit-learn

In [68]:
# scikit-learn's signature is mean_squared_error(y_true, y_pred): pass the
# ground-truth test targets first and the model predictions second.
mse = mean_squared_error(y_test, predictions)
print(f"The mean squared error is: {mse}")

The mean squared error is: 163.33872352483553


In [4]:
# 50 mean squared errors, recorded by re-running the session with
# random_state set to 4. These are hard-coded readings; this cell does not
# recompute them.
value = [
    160.84122, 143.27948, 161.1131, 179.2361, 163.86484,
    144.74873, 136.40836, 139.37009, 144.01689, 163.53163,
    166.57637, 156.02597, 189.23009, 180.05835, 145.57138,
    145.50436, 142.16441, 143.66316, 137.07188, 155.097,
    152.60661, 152.45369, 141.35791, 182.08801, 170.13863,
    134.88678, 140.94058, 167.86838, 136.20494, 156.0176,
    190.61133, 131.90657, 165.57019, 171.02113, 143.85733,
    138.08606, 140.21332, 132.46246, 142.89754, 152.66281,
    189.99655, 166.25871, 173.85088, 138.61688, 143.67609,
    166.4588, 171.01712, 146.07324, 161.75612, 170.36243,
]

In [5]:
# Summarise the recorded MSE values by their mean and standard deviation.
# np.std keeps its default ddof=0, i.e. the population standard deviation.
import numpy as np
mean_mse, SD_mse = np.mean(value), np.std(value)
print(f"The mean and standard deviation of mean square errors is {mean_mse} and {SD_mse}")

The mean and standard deviation of mean square errors is 155.38584200000003 and 16.173423764254863
