# Adding features to data to improve model performance

## Importing modules and data

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Load the concrete dataset (path is relative to the notebook's working
# directory) and preview the first rows to confirm the columns parsed.
concrete_csv = "../input/fe-course-data/concrete.csv"
df = pd.read_csv(concrete_csv)
df.head()

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,CompressiveStrength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


## Creating a baseline model
- A baseline is essential at the start of feature engineering: it gives you a reference score for deciding whether your new features are worth keeping.

In [10]:
# Separate the target column from the predictors. Both X and y are new
# objects, so `df` itself is left unmodified.
y = df["CompressiveStrength"].copy()
X = df.drop(columns=["CompressiveStrength"])
X, y

(      Cement  BlastFurnaceSlag  FlyAsh  Water  Superplasticizer  \
 0      540.0               0.0     0.0  162.0               2.5   
 1      540.0               0.0     0.0  162.0               2.5   
 2      332.5             142.5     0.0  228.0               0.0   
 3      332.5             142.5     0.0  228.0               0.0   
 4      198.6             132.4     0.0  192.0               0.0   
 ...      ...               ...     ...    ...               ...   
 1025   276.4             116.0    90.3  179.6               8.9   
 1026   322.2               0.0   115.6  196.0              10.4   
 1027   148.5             139.4   108.6  192.7               6.1   
 1028   159.1             186.7     0.0  175.6              11.3   
 1029   260.9             100.5    78.3  200.6               8.6   
 
       CoarseAggregate  FineAggregate  Age  
 0              1040.0          676.0   28  
 1              1055.0          676.0   28  
 2               932.0          594.0  270  
 3

In [11]:

# Train and score baseline model
# 5-fold cross-validation of a random forest on the features currently in X;
# the resulting MAE is the reference that engineered features are judged against.
# NOTE: the criterion is spelled "absolute_error" because the older "mae" alias
# was deprecated in scikit-learn 1.0 and removed in 1.2.
baseline = RandomForestRegressor(criterion="absolute_error", random_state=0)
baseline_score = cross_val_score(
    baseline, X, y, cv=5, scoring="neg_mean_absolute_error"
)
# The scorer returns negated errors (so that higher is better); flip the sign
# of the fold average to report a conventional positive MAE.
baseline_score = -1 * baseline_score.mean()

print(f"MAE Baseline Score: {baseline_score:.4}")


MAE Baseline Score: 8.232


## Adding new features to data

In [13]:
# Create synthetic features
# Three ratio columns derived from the raw mix quantities, appended to X in a
# single step: fine/coarse aggregate, total aggregate/cement, and water/cement.
X = X.assign(
    FCRatio=X["FineAggregate"] / X["CoarseAggregate"],
    AggCmtRatio=(X["CoarseAggregate"] + X["FineAggregate"]) / X["Cement"],
    WtrCmtRatio=X["Water"] / X["Cement"],
)
X

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,FCRatio,AggCmtRatio,WtrCmtRatio
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,0.650000,3.177778,0.300000
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,0.640758,3.205556,0.300000
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,0.637339,4.589474,0.685714
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,0.637339,4.589474,0.685714
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,0.843724,9.083082,0.966767
...,...,...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,0.883002,5.927641,0.649783
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,0.994498,5.063004,0.608318
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,0.874048,11.261953,1.297643
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,0.797191,11.178504,1.103708


In [16]:
# Train and score model on dataset with additional ratio features
# Same estimator settings, CV scheme and metric as the baseline cell, so the
# two MAE values are directly comparable.
# NOTE: the criterion is spelled "absolute_error" because the older "mae" alias
# was deprecated in scikit-learn 1.0 and removed in 1.2.
model = RandomForestRegressor(criterion="absolute_error", random_state=0)
score = cross_val_score(
    model, X, y, cv=5, scoring="neg_mean_absolute_error"
)
# The scorer returns negated errors (so that higher is better); flip the sign
# of the fold average to report a conventional positive MAE.
score = -1 * score.mean()

print(f"MAE Score with Ratio Features: {score:.4}")

MAE Score with Ratio Features: 7.948
