In [1]:
# Module importations
import os

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [2]:
# Import dataset
# The workbook location can be overridden through the CONCRETE_DATA_PATH
# environment variable so the notebook is not tied to one machine's folder
# layout; when the variable is unset or empty the original path is used.
# NOTE: despite its name, `csv_string` points at an Excel workbook (.xls),
# which is why it is read with `pd.read_excel`. The name is kept unchanged in
# case other cells refer to it.
csv_string = (
    os.environ.get('CONCRETE_DATA_PATH')
    or r'C:\Developer\scratch-pad-python\Datasets\Concrete_Data.xls'
)
df = pd.read_excel(csv_string)
df.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [3]:
# Print the frame's summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Cement (component 1)(kg in a m^3 mixture)              1030 non-null   float64
 1   Blast Furnace Slag (component 2)(kg in a m^3 mixture)  1030 non-null   float64
 2   Fly Ash (component 3)(kg in a m^3 mixture)             1030 non-null   float64
 3   Water  (component 4)(kg in a m^3 mixture)              1030 non-null   float64
 4   Superplasticizer (component 5)(kg in a m^3 mixture)    1030 non-null   float64
 5   Coarse Aggregate  (component 6)(kg in a m^3 mixture)   1030 non-null   float64
 6   Fine Aggregate (component 7)(kg in a m^3 mixture)      1030 non-null   float64
 7   Age (day)                                              1030 non-null   int64  
 8   Concrete compressive strength(MPa, megapascals)  

In [4]:
# Summary statistics, transposed so each column of `df` becomes one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Cement (component 1)(kg in a m^3 mixture),1030.0,281.165631,104.507142,102.0,192.375,272.9,350.0,540.0
Blast Furnace Slag (component 2)(kg in a m^3 mixture),1030.0,73.895485,86.279104,0.0,0.0,22.0,142.95,359.4
Fly Ash (component 3)(kg in a m^3 mixture),1030.0,54.187136,63.996469,0.0,0.0,0.0,118.27,200.1
Water (component 4)(kg in a m^3 mixture),1030.0,181.566359,21.355567,121.75,164.9,185.0,192.0,247.0
Superplasticizer (component 5)(kg in a m^3 mixture),1030.0,6.203112,5.973492,0.0,0.0,6.35,10.16,32.2
Coarse Aggregate (component 6)(kg in a m^3 mixture),1030.0,972.918592,77.753818,801.0,932.0,968.0,1029.4,1145.0
Fine Aggregate (component 7)(kg in a m^3 mixture),1030.0,773.578883,80.175427,594.0,730.95,779.51,824.0,992.6
Age (day),1030.0,45.662136,63.169912,1.0,7.0,28.0,56.0,365.0
"Concrete compressive strength(MPa, megapascals)",1030.0,35.817836,16.705679,2.331808,23.707115,34.442774,46.136287,82.599225


In [5]:
# Create baseline model from un-augmented dataset
# Work on a copy so that popping the target column leaves `df` intact.
X = df.copy()
# The key is reproduced exactly as in the dataset lookup, trailing space included.
y = X.pop('Concrete compressive strength(MPa, megapascals) ')

# Train & score baseline model
# 'absolute_error' is the current spelling of the criterion formerly called
# 'mae' (renamed in scikit-learn 1.0; the old alias was removed in 1.2).
baseline = RandomForestRegressor(criterion = 'absolute_error', random_state = 0)

# One score per fold; the scorer reports *negated* MAE, so values are <= 0.
baseline_fold_scores = cross_val_score(
    baseline, X, y, cv = 5, scoring = 'neg_mean_absolute_error'
)

# Average the folds and flip the sign to get a conventional positive MAE.
baseline_score = -1 * baseline_fold_scores.mean()

print(f'MAE Baseline Score: {baseline_score:.4}')

MAE Baseline Score: 8.397


In [6]:
# Engineer a Feature based on ingredients ratio
# Start again from a fresh copy so this cell does not depend on the state
# left in `X` by the baseline cell.
X = df.copy()
y = X.pop('Concrete compressive strength(MPa, megapascals) ')

# Pull out the ingredient columns once. The keys are copied verbatim; note
# that the Water and Coarse Aggregate names contain a double space.
fine_agg = X['Fine Aggregate (component 7)(kg in a m^3 mixture)']
coarse_agg = X['Coarse Aggregate  (component 6)(kg in a m^3 mixture)']
cement = X['Cement (component 1)(kg in a m^3 mixture)']
water = X['Water  (component 4)(kg in a m^3 mixture)']

# Create synthetic features
X['FCRatio'] = fine_agg / coarse_agg                   # fine : coarse aggregate
X['AggCntRatio'] = (fine_agg + coarse_agg) / cement    # total aggregate : cement
X['WtrCmtRatio'] = water / cement                      # water : cement

# Train and score model on dataset with additional ratio features
# 'absolute_error' is the current spelling of the criterion formerly called
# 'mae' (renamed in scikit-learn 1.0; the old alias was removed in 1.2).
model = RandomForestRegressor(criterion = 'absolute_error', random_state = 0)

# One score per fold; the scorer reports *negated* MAE, so values are <= 0.
fold_scores = cross_val_score(
    model, X, y, cv = 5, scoring = 'neg_mean_absolute_error'
)

# Average the folds and flip the sign to get a conventional positive MAE.
score = -1 * fold_scores.mean()

print(f'MAE score with ratio features: {score:.4}')