In [13]:
import kagglehub
import pandas as pd
import os

from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score

# Download latest version of dataset
path = kagglehub.dataset_download("sinamhd9/concrete-comprehensive-strength")
path

'/Users/unrealre/.cache/kagglehub/datasets/sinamhd9/concrete-comprehensive-strength/versions/1'

In [17]:
df = pd.read_excel(os.path.join(path, "Concrete_Data.xls"))
df.columns = ['Cement', 'BlastFurnaceSlag', 'FlyAsh', 'Water', 'Superplasticizer', 'CoarseAggregate', 'FineAggregate', 'Age', 'CompressiveStrength']
df.head()

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Age,CompressiveStrength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [18]:
# Baseline
X = df.copy()
y = X.pop("CompressiveStrength")

# Train and score baseline model
default_params = {
    'iterations': 1000, 
    'learning_rate': 0.05, 
    'depth': 6, 
    'loss_function': 'RMSE', 
    'verbose': 100, 
    'random_seed': 42,
    'early_stopping_rounds': 100
}
baseline = CatBoostRegressor(**default_params)
baseline_score = cross_val_score(
    baseline, X, y, cv=5, scoring="neg_mean_absolute_error"
)
baseline_score = -1 * baseline_score.mean()

print(f"MAE Baseline Score: {baseline_score:.4}")

0:	learn: 14.5636419	total: 56.9ms	remaining: 56.9s
100:	learn: 4.3759228	total: 101ms	remaining: 902ms
200:	learn: 3.2873769	total: 144ms	remaining: 572ms
300:	learn: 2.6374007	total: 191ms	remaining: 445ms
400:	learn: 2.2644064	total: 240ms	remaining: 359ms
500:	learn: 1.9810257	total: 291ms	remaining: 290ms
600:	learn: 1.7688467	total: 340ms	remaining: 226ms
700:	learn: 1.6156353	total: 383ms	remaining: 164ms
800:	learn: 1.4789665	total: 427ms	remaining: 106ms
900:	learn: 1.3636782	total: 487ms	remaining: 53.5ms
999:	learn: 1.2523566	total: 535ms	remaining: 0us
0:	learn: 16.4388673	total: 572us	remaining: 572ms
100:	learn: 4.3111397	total: 41.3ms	remaining: 367ms
200:	learn: 3.2471000	total: 83.5ms	remaining: 332ms
300:	learn: 2.7129383	total: 128ms	remaining: 296ms
400:	learn: 2.3593921	total: 174ms	remaining: 260ms
500:	learn: 2.1199737	total: 218ms	remaining: 217ms
600:	learn: 1.9699554	total: 259ms	remaining: 172ms
700:	learn: 1.8455061	total: 304ms	remaining: 130ms
800:	learn: 

In [19]:
# MAE Baseline Score: 7.354

In [22]:
X = df.copy()
y = X.pop("CompressiveStrength")

# Create synthetic features
X["FCRatio"] = X["FineAggregate"] / X["CoarseAggregate"]
X["AggCmtRatio"] = (X["CoarseAggregate"] + X["FineAggregate"]) / X["Cement"]
X["WtrCmtRatio"] = X["Water"] / X["Cement"]

# Train and score model on dataset with additional ratio features
model = CatBoostRegressor(**default_params)
score = cross_val_score(
    model, X, y, cv=5, scoring="neg_mean_absolute_error"
)
score = -1 * score.mean()

print(f"MAE Score with Ratio Features: {score:.4}")

0:	learn: 14.5624675	total: 1.28ms	remaining: 1.27s
100:	learn: 4.2870447	total: 53.6ms	remaining: 477ms
200:	learn: 3.2261886	total: 106ms	remaining: 423ms
300:	learn: 2.6229422	total: 159ms	remaining: 370ms
400:	learn: 2.2515039	total: 215ms	remaining: 321ms
500:	learn: 1.9539804	total: 276ms	remaining: 274ms
600:	learn: 1.7605073	total: 331ms	remaining: 219ms
700:	learn: 1.6028141	total: 383ms	remaining: 163ms
800:	learn: 1.4564329	total: 440ms	remaining: 109ms
900:	learn: 1.3246570	total: 494ms	remaining: 54.3ms
999:	learn: 1.2315651	total: 552ms	remaining: 0us
0:	learn: 16.4195315	total: 664us	remaining: 664ms
100:	learn: 4.2117441	total: 52.4ms	remaining: 466ms
200:	learn: 3.1421891	total: 107ms	remaining: 427ms
300:	learn: 2.5831413	total: 160ms	remaining: 371ms
400:	learn: 2.2373843	total: 210ms	remaining: 314ms
500:	learn: 2.0062516	total: 260ms	remaining: 259ms
600:	learn: 1.8380362	total: 312ms	remaining: 207ms
700:	learn: 1.7240608	total: 366ms	remaining: 156ms
800:	learn: 

In [21]:
# MAE Score with Ratio Features: 7.183