In [15]:
#Dependencies
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerPathCollection
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from scipy.io import arff

In [16]:
X_full = pd.read_csv('/workspaces/2025SE-Mike.N-HSC-AT2/data/concrete_engineered.csv')
y = X_full['strength']
X_full = X_full.drop(['strength'], axis = 1)

Choosing features. Categorical variables with high cardinality are discarded.

In [17]:
# Select categorical columns
categorical_cols = [cname for cname in X_full.columns if
                    X_full[cname].nunique() < 10 and 
                    X_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_full.columns if 
                X_full[cname].dtype in ['int64', 'float64']]

features = categorical_cols + numerical_cols
print(categorical_cols, numerical_cols)


[] ['cement', 'blast_furnace_slag', 'fly_ash', 'water', 'superplasticizer', 'coarse_aggregate', 'fine_aggregate', 'age', 'cement_to_water_ratio', 'fine_aggregate_to_water_ratio', 'coarse_aggregate_to_water_ratio']


No categorical features, so every feature gets treated in the same manner.

In [18]:
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,cement_to_water_ratio,fine_aggregate_to_water_ratio,coarse_aggregate_to_water_ratio
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,1.578275,4.344763,5.443181
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,0.648105,0.824908,0.842966
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,0.53125,2.605263,3.453441
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,1.069502,3.88835,4.830208
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,1.480718,4.299479,5.451804
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,1.875,4.791273,5.974522
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,3.746827,7.840442,8.695688


Check for misisng values

In [19]:
#Check for missing values
X_full.isnull().sum()

cement                             0
blast_furnace_slag                 0
fly_ash                            0
water                              0
superplasticizer                   0
coarse_aggregate                   0
fine_aggregate                     0
age                                0
cement_to_water_ratio              0
fine_aggregate_to_water_ratio      0
coarse_aggregate_to_water_ratio    0
dtype: int64

Some missing values in the new ratios.

In [20]:
# Check for duplicates
X_full.duplicated().sum()

34

In [21]:
# Remove duplicates
X_full = X_full.drop_duplicates()
X_full.duplicated().sum()

0

In [22]:
X_full.shape

(996, 11)

In [23]:
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,cement_to_water_ratio,fine_aggregate_to_water_ratio,coarse_aggregate_to_water_ratio
count,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0
mean,277.549799,72.331677,55.153363,182.28499,5.983941,974.565914,772.833584,46.075301,1.549451,4.323233,5.432411
std,104.113051,86.360337,64.226657,21.317064,5.924743,77.895248,80.588694,63.950871,0.636897,0.823866,0.854491
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,0.53125,2.605263,3.453441
25%,190.595,0.0,0.0,167.0,0.0,932.0,727.9,14.0,1.053226,3.880705,4.803931
50%,260.95,20.0,0.0,185.7,6.0,968.0,780.0,28.0,1.450262,4.276042,5.422166
75%,339.05,142.5,118.27,192.94,10.0,1038.0,824.0,56.0,1.817708,4.723214,5.974522
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,3.746827,7.840442,8.695688


In [24]:
# Preprocessing for numerical data
# Imputing NANs as a result of feature engineering
# RobustScaler is used to scale the data, as it is less prone to outliers
X_standardized = (X_full - X_full.mean(axis=0)) / X_full.std(axis = 0)

#Outlier Detection
outlier_detector = LocalOutlierFactor(n_neighbors= 10)
outliers = outlier_detector.fit_predict(X_standardized)
X_standardized = X_standardized[outliers != -1]

#Dimensionality Reduction
pca = PCA(len(X_standardized.columns))
X_pca = pca.fit_transform(X_standardized)

#convert to dataframe
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_pca)
X_scaled = pd.DataFrame(X_scaled, columns = component_names)

In [25]:
X_scaled.shape

(965, 11)

In [26]:
X_scaled.describe()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11
count,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0
mean,0.530776,0.367179,0.544774,0.486292,0.428217,0.247378,0.568184,0.608166,0.337544,0.34234,0.502997
std,0.173553,0.224139,0.194942,0.160355,0.183817,0.149994,0.135685,0.111768,0.132295,0.114465,0.090122
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.411567,0.190724,0.409492,0.372793,0.288884,0.149067,0.503038,0.550165,0.261514,0.28342,0.45207
50%,0.545404,0.33689,0.563499,0.470085,0.428621,0.227379,0.579296,0.599718,0.325454,0.339283,0.497631
75%,0.635724,0.516884,0.679236,0.591013,0.558849,0.307246,0.638203,0.682678,0.393748,0.398243,0.54618
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
#Add the target column back into the dataset
X_scaled['strength'] = y
X_scaled.describe()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,strength
count,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0,965.0
mean,0.530776,0.367179,0.544774,0.486292,0.428217,0.247378,0.568184,0.608166,0.337544,0.34234,0.502997,35.905109
std,0.173553,0.224139,0.194942,0.160355,0.183817,0.149994,0.135685,0.111768,0.132295,0.114465,0.090122,16.947501
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.331808
25%,0.411567,0.190724,0.409492,0.372793,0.288884,0.149067,0.503038,0.550165,0.261514,0.28342,0.45207,23.523542
50%,0.545404,0.33689,0.563499,0.470085,0.428621,0.227379,0.579296,0.599718,0.325454,0.339283,0.497631,34.294536
75%,0.635724,0.516884,0.679236,0.591013,0.558849,0.307246,0.638203,0.682678,0.393748,0.398243,0.54618,46.68442
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,82.599225


In [28]:
#Save the preprocessed data
X_scaled.to_csv('/workspaces/2025SE-Mike.N-HSC-AT2/data/concrete_preprocessed.csv', index = False)