In [62]:
#Dependencies
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.inspection import permutation_importance
from xgboost import XGBRegressor
import xgboost as xgb
from optuna import create_study
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.samplers import RandomSampler
from optuna.integration import XGBoostPruningCallback
from optuna.distributions import IntDistribution, FloatDistribution
from optuna.pruners import MedianPruner
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.io import arff

In [63]:
arff_file = arff.loadarff('/workspaces/2025SE-Mike.N-HSC-AT2/data/cement.arff')
X_full = pd.DataFrame(arff_file[0])
y = X_full['strength']
X_full = X_full.drop(['strength'], axis = 1)

In [64]:
# Select categorical columns
categorical_cols = [cname for cname in X_full.columns if
                    X_full[cname].nunique() < 10 and 
                    X_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_full.columns if 
                X_full[cname].dtype in ['int64', 'float64']]

features = categorical_cols + numerical_cols
print(categorical_cols, numerical_cols)


[] ['cement', 'blast_furnace_slag', 'fly_ash', 'water', 'superplasticizer', 'coarse_aggregate', 'fine_aggregate', 'age']


No categorical features, so every feature gets treated in the same manner.

In [65]:
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0


In [66]:
#Check for missing values
X_full.isnull().sum()

cement                0
blast_furnace_slag    0
fly_ash               0
water                 0
superplasticizer      0
coarse_aggregate      0
fine_aggregate        0
age                   0
dtype: int64

No missing values.

In [67]:
# Check for duplicates
X_full.duplicated().sum()

34

In [68]:
# Remove duplicates
X_full = X_full.drop_duplicates()
X_full.duplicated().sum()

0

In [69]:
# Preprocessing for numerical data
# RobustScaler is used to scale the data, as it is less prone to outliers
scaler = RobustScaler()
# Imputer is used to fill in missing values. We don't treat 0s as missing values.
imputer = SimpleImputer(strategy='mean')
preprocessing = Pipeline(steps = [('imputer', imputer), ('scaler', scaler)])
X_full = preprocessing.fit_transform(X_full)
X_full = pd.DataFrame(X_full, columns = features)

In [70]:
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age
count,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0
mean,0.111817,0.36724,0.466334,-0.13165,-0.001606,0.061943,-0.074572,0.430364
std,0.701311,0.606037,0.543051,0.821784,0.592474,0.734861,0.838592,1.52264
min,-1.070695,-0.140351,0.0,-2.465305,-0.6,-1.575472,-1.935484,-0.642857
25%,-0.473915,-0.140351,0.0,-0.720894,-0.6,-0.339623,-0.542144,-0.333333
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.526085,0.859649,1.0,0.279106,0.4,0.660377,0.457856,0.666667
max,1.879694,2.381754,1.691891,2.363146,2.62,1.669811,2.212279,8.02381


In [71]:
#Add the target column back into the dataset
X_full['strength'] = y
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength
count,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0,996.0
mean,0.111817,0.36724,0.466334,-0.13165,-0.001606,0.061943,-0.074572,0.430364,35.721553
std,0.701311,0.606037,0.543051,0.821784,0.592474,0.734861,0.838592,1.52264,16.843789
min,-1.070695,-0.140351,0.0,-2.465305,-0.6,-1.575472,-1.935484,-0.642857,2.331808
25%,-0.473915,-0.140351,0.0,-0.720894,-0.6,-0.339623,-0.542144,-0.333333,23.522163
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.955659
75%,0.526085,0.859649,1.0,0.279106,0.4,0.660377,0.457856,0.666667,46.230572
max,1.879694,2.381754,1.691891,2.363146,2.62,1.669811,2.212279,8.02381,82.599225


In [72]:
#Save the preprocessed data
X_full.to_csv('/workspaces/2025SE-Mike.N-HSC-AT2/data/cement_preprocessed.csv', index = False)