# Concrete Compressive Strength Prediction

## 1. Load the data

In [62]:
import pandas as pd

# Location of the raw dataset, relative to the notebook's working directory.
data_path = 'Concrete_Data.csv'

# Read the CSV into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer=data_path)
data.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [63]:
# Show the raw column names exactly as read from the CSV. Note that some of
# them contain doubled or trailing spaces, which matters for the rename below.
data.columns

Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

### Rename the columns

In [64]:
# Short snake_case aliases, keyed by the *whitespace-normalised* CSV headers.
# The raw headers contain doubled and trailing spaces (e.g. the Water, Coarse
# Aggregate and compressive-strength columns) that are easy to miss, and
# DataFrame.rename silently skips keys that do not match. Matching on the
# normalised form keeps the rename working for headers that differ only in
# spacing.
column_aliases = {
    'Cement (component 1)(kg in a m^3 mixture)': 'cement',
    'Blast Furnace Slag (component 2)(kg in a m^3 mixture)': 'blast_furnace_slag',
    'Fly Ash (component 3)(kg in a m^3 mixture)': 'fly_ash',
    'Water (component 4)(kg in a m^3 mixture)': 'water',
    'Superplasticizer (component 5)(kg in a m^3 mixture)': 'superplasticizer',
    'Coarse Aggregate (component 6)(kg in a m^3 mixture)': 'coarse_aggregate',
    'Fine Aggregate (component 7)(kg in a m^3 mixture)': 'fine_aggregate',
    'Age (day)': 'age',
    'Concrete compressive strength(MPa, megapascals)': 'compressive_strength',
}


def _short_name(column):
    """Return the short alias for a raw header, or the header unchanged.

    Runs of whitespace are collapsed and leading/trailing whitespace is
    stripped before the lookup. Names that are already short (for example
    when this cell is re-run) are not in the mapping and pass through as-is.
    """
    normalised = ' '.join(str(column).split())
    return column_aliases.get(normalised, column)


data = data.rename(columns=_short_name)

# Fail here, with a clear message, rather than later on an attribute lookup.
missing_columns = sorted(set(column_aliases.values()) - set(data.columns))
assert not missing_columns, f"Expected columns missing after rename: {missing_columns}"

data.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [65]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   cement                1030 non-null   float64
 1   blast_furnace_slag    1030 non-null   float64
 2   fly_ash               1030 non-null   float64
 3   water                 1030 non-null   float64
 4   superplasticizer      1030 non-null   float64
 5   coarse_aggregate      1030 non-null   float64
 6   fine_aggregate        1030 non-null   float64
 7   age                   1030 non-null   int64  
 8   compressive_strength  1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.6 KB


In [66]:
# List the column names again to confirm the rename took effect.
data.columns

Index(['cement', 'blast_furnace_slag', 'fly_ash', 'water', 'superplasticizer',
       'coarse_aggregate', 'fine_aggregate', 'age', 'compressive_strength'],
      dtype='object')

## 2. Engineer features

In [67]:
# Ratio features: cement content relative to each aggregate type.
data['cement_coarse'] = data['cement'] / data['coarse_aggregate']
data['cement_fine'] = data['cement'] / data['fine_aggregate']

### Ready the data

In [68]:
# Features: every column except the target.
X = data.drop(columns=['compressive_strength'])

# Target: the compressive strength column.
y = data['compressive_strength']

In [69]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 3. Preprocess the data

In [70]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer


# Columns to preprocess: every int64/float64 column of the feature frame.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Per-column numeric treatment: a Yeo-Johnson power transform (without its
# built-in standardisation) followed by an explicit StandardScaler step.
numeric_pipeline = Pipeline(
    steps=[
        ('transformer', PowerTransformer(method='yeo-johnson', standardize=False)),
        ('scaler', StandardScaler()),
    ]
)

# Full preprocessing step: apply the numeric pipeline to the numeric columns.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_pipeline, numerical_features)]
)

## 4. Train the model

In [71]:
# Random forest regressor with fixed hyperparameters and a fixed seed.
model = RandomForestRegressor(
    n_estimators=300,
    max_features="log2",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

# Chain preprocessing and the model so both are fitted together on the
# training split only.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

pipeline.fit(X_train, y_train)

### Find the best parameters (GridSearchCV)

In [72]:
# NOTE(review): this hyperparameter search is disabled. Every hyperparameter
# hard-coded on the RandomForestRegressor above is one of the candidate values
# in this grid, so those settings presumably came from a previous run of this
# search -- confirm before relying on them. Uncomment the cell to re-run it.

# from sklearn.model_selection import cross_val_score, GridSearchCV

# param_grid = {
#     'model__n_estimators': [100, 200, 300, 500, 1000],
#     'model__max_depth': [None, 10, 20, 30],
#     'model__min_samples_split': [2, 5, 10],
#     'model__min_samples_leaf': [1, 2, 4],
#     'model__max_features': ['sqrt', 'log2']
# }

# # Perform Grid Search
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='r2', error_score='raise')
# grid_search.fit(X_train, y_train)

# # Best parameters and model
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

# print("Best Parameters:", best_params)
# print("Best R² Score on Training Data:", grid_search.best_score_)

## 5. Evaluate the model

In [73]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated R² of the whole pipeline, using the training split only.
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
print("Cross-Validation Scores:", cv_scores)
print("Mean R² Score on Train Data:", cv_scores.mean())

# Score the fitted pipeline on the held-out test split.
test_score = pipeline.score(X_test, y_test)
print("R² Score on Test Data:", test_score)

Cross-Validation Scores: [0.87535028 0.89532931 0.91453496 0.91178319 0.90489102]
Mean R² Score on Train Data: 0.900377751680748
R² Score on Test Data: 0.8970943084972506


In [74]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

# Predict on the held-out test split.
y_preds = pipeline.predict(X_test)

# Error metrics comparing the test targets with the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_preds)
mse = mean_squared_error(y_true=y_test, y_pred=y_preds)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_preds)

print(f'MAE : {mae}\nMSE : {mse}\nRMSE : {rmse}')

MAE : 3.679291793133023
MSE : 26.516850557474005
RMSE : 5.149451481223413


## 6. Export the model

In [75]:
# NOTE(review): export is disabled. When enabled, this writes the fitted
# pipeline (preprocessing + model) to a joblib/pickle file in the working
# directory. Such files should only ever be loaded from trusted sources.
# import joblib
# # Save the model
# joblib.dump(pipeline, "concrete_random_forest_model.pkl")