In [13]:
import kagglehub
import pandas as pd
import os

from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score

# Download latest version of dataset
path = kagglehub.dataset_download("sinamhd9/concrete-comprehensive-strength")
path

'/Users/unrealre/.cache/kagglehub/datasets/sinamhd9/concrete-comprehensive-strength/versions/1'

In [17]:
df = pd.read_excel(os.path.join(path, "Concrete_Data.xls"))
df.columns = ['Cement', 'BlastFurnaceSlag', 'FlyAsh', 'Water', 'Superplasticizer', 'CoarseAggregate', 'FineAggregate', 'Age', 'CompressiveStrength']
df.head()

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,CompressiveStrength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [29]:
# Baseline
X = df.copy()
y = X.pop("CompressiveStrength")

# Train and score baseline model
default_params = {
    'iterations': 1000, 
    'learning_rate': 0.05, 
    'depth': 6, 
    'loss_function': 'RMSE', 
    'verbose': 100, 
    'random_seed': 42,
    'early_stopping_rounds': 100
}
baseline = CatBoostRegressor(**default_params)
baseline_score = cross_val_score(
    baseline, X, y, cv=5, scoring="neg_mean_absolute_error"
)
baseline_score = -1 * baseline_score.mean()

print(f"MAE Baseline Score: {baseline_score:.4}")

0:	learn: 14.5636419	total: 1.75ms	remaining: 1.75s
100:	learn: 4.3759228	total: 51ms	remaining: 454ms
200:	learn: 3.2873769	total: 95.7ms	remaining: 380ms
300:	learn: 2.6374007	total: 138ms	remaining: 322ms
400:	learn: 2.2644064	total: 185ms	remaining: 276ms
500:	learn: 1.9810257	total: 231ms	remaining: 230ms
600:	learn: 1.7688467	total: 278ms	remaining: 184ms
700:	learn: 1.6156353	total: 326ms	remaining: 139ms
800:	learn: 1.4789665	total: 376ms	remaining: 93.5ms
900:	learn: 1.3636782	total: 462ms	remaining: 50.7ms
999:	learn: 1.2523566	total: 578ms	remaining: 0us
0:	learn: 16.4388673	total: 1.58ms	remaining: 1.58s
100:	learn: 4.3111397	total: 104ms	remaining: 923ms
200:	learn: 3.2471000	total: 172ms	remaining: 683ms
300:	learn: 2.7129383	total: 216ms	remaining: 501ms
400:	learn: 2.3593921	total: 263ms	remaining: 392ms
500:	learn: 2.1199737	total: 307ms	remaining: 305ms
600:	learn: 1.9699554	total: 367ms	remaining: 244ms
700:	learn: 1.8455061	total: 413ms	remaining: 176ms
800:	learn: 

In [30]:
# MAE Baseline Score: 7.354

In [31]:
X = df.copy()
y = X.pop("CompressiveStrength")

# Create synthetic features
X["FCRatio"] = X["FineAggregate"] / X["CoarseAggregate"]
X["AggCmtRatio"] = (X["CoarseAggregate"] + X["FineAggregate"]) / X["Cement"]
X["WtrCmtRatio"] = X["Water"] / X["Cement"]

# Train and score model on dataset with additional ratio features
model = CatBoostRegressor(**default_params)
score = cross_val_score(
    model, X, y, cv=5, scoring="neg_mean_absolute_error"
)
score = -1 * score.mean()

print(f"MAE Score with Ratio Features: {score:.4}")

0:	learn: 14.5624675	total: 7.44ms	remaining: 7.44s
100:	learn: 4.2870447	total: 91.3ms	remaining: 813ms
200:	learn: 3.2261886	total: 148ms	remaining: 587ms
300:	learn: 2.6229422	total: 209ms	remaining: 486ms
400:	learn: 2.2515039	total: 417ms	remaining: 622ms
500:	learn: 1.9539804	total: 551ms	remaining: 549ms
600:	learn: 1.7605073	total: 701ms	remaining: 466ms
700:	learn: 1.6028141	total: 863ms	remaining: 368ms
800:	learn: 1.4564329	total: 913ms	remaining: 227ms
900:	learn: 1.3246570	total: 964ms	remaining: 106ms
999:	learn: 1.2315651	total: 1.02s	remaining: 0us
0:	learn: 16.4195315	total: 1.05ms	remaining: 1.05s
100:	learn: 4.2117441	total: 52.8ms	remaining: 470ms
200:	learn: 3.1421891	total: 111ms	remaining: 442ms
300:	learn: 2.5831413	total: 182ms	remaining: 422ms
400:	learn: 2.2373843	total: 238ms	remaining: 356ms
500:	learn: 2.0062516	total: 295ms	remaining: 294ms
600:	learn: 1.8380362	total: 471ms	remaining: 313ms
700:	learn: 1.7240608	total: 638ms	remaining: 272ms
800:	learn: 

In [32]:
# MAE Score with Ratio Features: 7.183