In [1]:
# importing libraries
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from pathlib import Path
from warnings import simplefilter
# Install a blanket "ignore" filter: every warning category is silenced from
# here on. NOTE(review): this also hides deprecation warnings from
# pandas/scikit-learn -- consider narrowing it to a specific category.
simplefilter('ignore')

In [2]:
# read concrete.csv from the data directory into a DataFrame
data_dir = Path("../input/fe-course-data")
conc = pd.read_csv(data_dir.joinpath("concrete.csv"))
# preview the first five rows
conc.head(n=5)

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,CompressiveStrength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
# count the missing entries in each column
conc.isna().sum()

Cement                 0
BlastFurnaceSlag       0
FlyAsh                 0
Water                  0
Superplasticizer       0
CoarseAggregate        0
FineAggregate          0
Age                    0
CompressiveStrength    0
dtype: int64

In [4]:
# separate the predictors from the target, leaving `conc` itself untouched
X = conc.drop(columns="CompressiveStrength")
y = conc["CompressiveStrength"].copy()

# baseline: random forest on the original columns, 5-fold cross-validation
baseline = RandomForestRegressor(criterion="absolute_error", random_state=0)
# the scorer yields negated MAE per fold, so negate the fold average
baseline_score = -cross_val_score(
    baseline, X, y, cv=5, scoring="neg_mean_absolute_error"
).mean()
print(f'MAE Baseline Score: {baseline_score:.4}')

MAE Baseline Score: 8.232


In [5]:
# preview the source frame again before deriving new features
conc.head(n=5)

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,CompressiveStrength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [6]:
# Rebuild the feature matrix from a fresh copy of `conc`, add three ratio
# features, and score the same random-forest setup used for the baseline.
X = conc.copy()
y = X.pop("CompressiveStrength")

# create synthetic ratio features
# NOTE(review): assumes CoarseAggregate and Cement are never 0 -- a zero
# denominator would yield inf/NaN in these columns; confirm against the data.
X['FCRatio'] = X['FineAggregate'] / X['CoarseAggregate']
X['AggCmtRatio'] = (X['CoarseAggregate'] + X['FineAggregate']) / (X['Cement'])
X['WtrCmtRatio'] = X['Water'] / X['Cement']

# train and score model on dataset with additional ratio features
model = RandomForestRegressor(criterion = 'absolute_error', random_state = 0)
score = cross_val_score(model, X, y, cv=5, scoring = 'neg_mean_absolute_error')
# the scorer returns negated MAE per fold; negate the mean to report a positive error
score = -1 * score.mean()
# label fixed from "MEA" to "MAE" so it matches the metric and the baseline cell
print(f'MAE score with Ratio features: {score:.4}')

MEA score with Ratio features: 7.948
