In [96]:
#Dependencies
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd
from scipy.io import arff

In [97]:
X_full = pd.read_csv('/workspaces/2025SE-Mike.N-HSC-AT2/data/concrete_engineered.csv')
y = X_full['strength']
X_full = X_full.drop(['strength'], axis = 1)

In [98]:
# Select categorical columns
categorical_cols = [cname for cname in X_full.columns if
                    X_full[cname].nunique() < 10 and 
                    X_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_full.columns if 
                X_full[cname].dtype in ['int64', 'float64']]

features = categorical_cols + numerical_cols
print(categorical_cols, numerical_cols)


[] ['cement', 'blast_furnace_slag', 'fly_ash', 'water', 'superplasticizer', 'coarse_aggregate', 'fine_aggregate', 'age', 'cement_to_water_ratio', 'superplasticizer_to_water_ratio', 'fine_aggregate_to_water_ratio', 'cement_to_fly_ash_ratio', 'superplasticizer_to_fly_ash_ratio', 'age_squared']


No categorical features, so every feature gets treated in the same manner.

In [99]:
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,cement_to_water_ratio,superplasticizer_to_water_ratio,fine_aggregate_to_water_ratio,cement_to_fly_ash_ratio,superplasticizer_to_fly_ash_ratio,age_squared
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,657.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,1.578275,0.037402,4.344763,1.074774,0.05926,6071.594175
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,0.648105,0.039131,0.824908,1.665946,0.062538,20195.954706
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,0.53125,0.0,2.605263,0.0,0.0,1.0
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,1.069502,0.0,3.88835,0.0,0.0,49.0
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,1.480718,0.034663,4.299479,0.0,0.057791,784.0
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,1.875,0.059592,4.791273,1.905525,0.083333,3136.0
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,3.746827,0.233672,7.840442,11.332379,0.457482,133225.0


In [100]:
#Check for missing values
X_full.isnull().sum()

cement                                 0
blast_furnace_slag                     0
fly_ash                                0
water                                  0
superplasticizer                       0
coarse_aggregate                       0
fine_aggregate                         0
age                                    0
cement_to_water_ratio                  0
superplasticizer_to_water_ratio        0
fine_aggregate_to_water_ratio          0
cement_to_fly_ash_ratio                0
superplasticizer_to_fly_ash_ratio    373
age_squared                            0
dtype: int64

No missing values.

In [101]:
# Check for duplicates
X_full.duplicated().sum()

34

In [102]:
# Remove duplicates
X_full = X_full.drop_duplicates()
X_full.duplicated().sum()

0

In [103]:
# Preprocessing for numerical data
# RobustScaler is used to scale the data, as it is less prone to outliers
scaler = RobustScaler()
# Imputer is used to fill in missing values. We don't treat 0s as missing values.
imputer = SimpleImputer(strategy='mean')
preprocessing = Pipeline(steps = [('imputer', imputer), ('scaler', scaler)])
X_full = preprocessing.fit_transform(X_full)
X_full = pd.DataFrame(X_full, columns = features)

In [104]:
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,cement_to_water_ratio,superplasticizer_to_water_ratio,fine_aggregate_to_water_ratio,cement_to_fly_ash_ratio,superplasticizer_to_fly_ash_ratio,age_squared
count,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0
mean,0.111817,0.36724,0.466334,-0.13165,-0.001606,0.061943,-0.074572,0.430364,0.129747,0.078112,0.056013,0.56602,-3.5669820000000004e-17,1.845082
std,0.701311,0.606037,0.543051,0.821784,0.592474,0.734861,0.838592,1.52264,0.833108,0.687152,0.977871,0.869841,2.454434,6.978773
min,-1.070695,-0.140351,0.0,-2.465305,-0.6,-1.575472,-1.935484,-0.642857,-1.202136,-0.558941,-1.983098,0.0,-3.018816,-0.266327
25%,-0.473915,-0.140351,0.0,-0.720894,-0.6,-0.339623,-0.542144,-0.333333,-0.519353,-0.558941,-0.469237,0.0,-0.701719,-0.2
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.526085,0.859649,1.0,0.279106,0.4,0.660377,0.457856,0.666667,0.480647,0.441059,0.530763,1.0,0.298281,0.8
max,1.879694,2.381754,1.691891,2.363146,2.62,1.669811,2.212279,8.02381,3.004077,3.579911,4.230695,5.947115,19.63861,45.047959


In [105]:
#Add the target column back into the dataset
X_full['strength'] = y
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,cement_to_water_ratio,superplasticizer_to_water_ratio,fine_aggregate_to_water_ratio,cement_to_fly_ash_ratio,superplasticizer_to_fly_ash_ratio,age_squared,strength
count,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0
mean,0.111817,0.36724,0.466334,-0.13165,-0.001606,0.061943,-0.074572,0.430364,0.129747,0.078112,0.056013,0.56602,-3.5669820000000004e-17,1.845082,35.721553
std,0.701311,0.606037,0.543051,0.821784,0.592474,0.734861,0.838592,1.52264,0.833108,0.687152,0.977871,0.869841,2.454434,6.978773,16.843789
min,-1.070695,-0.140351,0.0,-2.465305,-0.6,-1.575472,-1.935484,-0.642857,-1.202136,-0.558941,-1.983098,0.0,-3.018816,-0.266327,2.331808
25%,-0.473915,-0.140351,0.0,-0.720894,-0.6,-0.339623,-0.542144,-0.333333,-0.519353,-0.558941,-0.469237,0.0,-0.701719,-0.2,23.522163
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.955659
75%,0.526085,0.859649,1.0,0.279106,0.4,0.660377,0.457856,0.666667,0.480647,0.441059,0.530763,1.0,0.298281,0.8,46.230572
max,1.879694,2.381754,1.691891,2.363146,2.62,1.669811,2.212279,8.02381,3.004077,3.579911,4.230695,5.947115,19.63861,45.047959,82.599225


In [106]:
#Save the preprocessed data
X_full.to_csv('/workspaces/2025SE-Mike.N-HSC-AT2/data/concrete_preprocessed.csv', index = False)