In [1]:
#Dependencies
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerPathCollection
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

In [2]:
X_full = pd.read_csv('/workspaces/2025SE-Mike.N-HSC-AT2/data/concrete_engineered.csv')

In [3]:
X_full.shape

(1030, 12)

Choosing features. Categorical variables with high cardinality are discarded.

In [4]:
# Select categorical columns
categorical_cols = [cname for cname in X_full.columns if
                    X_full[cname].nunique() < 10 and 
                    X_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_full.columns if 
                X_full[cname].dtype in ['int64', 'float64']]

features = categorical_cols + numerical_cols
print(categorical_cols, numerical_cols)


[] ['cement', 'blast_furnace_slag', 'fly_ash', 'water', 'superplasticizer', 'coarse_aggregate', 'fine_aggregate', 'age', 'strength', 'cement_to_water_ratio', 'fine_aggregate_to_water_ratio', 'coarse_aggregate_to_water_ratio']


No categorical features, so every feature gets treated in the same manner.

In [5]:
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength,cement_to_water_ratio,fine_aggregate_to_water_ratio,coarse_aggregate_to_water_ratio
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,35.817836,1.578275,4.344763,5.443181
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,16.705679,0.648105,0.824908,0.842966
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808,0.53125,2.605263,3.453441
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.707115,1.069502,3.88835,4.830208
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,34.442774,1.480718,4.299479,5.451804
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,46.136287,1.875,4.791273,5.974522
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225,3.746827,7.840442,8.695688


Check for misisng values

In [6]:
#Check for missing values
X_full.isnull().sum()

cement                             0
blast_furnace_slag                 0
fly_ash                            0
water                              0
superplasticizer                   0
coarse_aggregate                   0
fine_aggregate                     0
age                                0
strength                           0
cement_to_water_ratio              0
fine_aggregate_to_water_ratio      0
coarse_aggregate_to_water_ratio    0
dtype: int64

Some missing values in the new ratios.

In [7]:
# Check for duplicates
X_full.duplicated().sum()

25

In [8]:
# Remove duplicates
X_full = X_full.drop_duplicates()
X_full.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1025    False
1026    False
1027    False
1028    False
1029    False
Length: 1005, dtype: bool

In [9]:
X_full.shape

(1005, 12)

In [10]:
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength,cement_to_water_ratio,fine_aggregate_to_water_ratio,coarse_aggregate_to_water_ratio
count,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0
mean,278.629055,72.043134,55.535075,182.074378,6.031647,974.376468,772.686617,45.856716,35.250273,1.557999,4.327273,5.437567
std,104.345003,86.170555,64.207448,21.34074,5.919559,77.579534,80.339851,63.734692,16.284808,0.640746,0.822079,0.852458
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808,0.53125,2.605263,3.453441
25%,190.68,0.0,0.0,166.61,0.0,932.0,724.3,7.0,23.523542,1.066633,3.880851,4.803931
50%,265.0,20.0,0.0,185.7,6.1,968.0,780.0,28.0,33.798114,1.450262,4.276464,5.441176
75%,349.0,142.5,118.27,192.94,10.0,1031.0,822.2,56.0,44.86834,1.826602,4.780488,5.974522
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225,3.746827,7.840442,8.695688


In [11]:
for col in X_full.columns:
    Q1 = X_full[col].quantile(0.25)
    Q3 = X_full[col].quantile(0.75)
    IQR = Q3 - Q1
    print(f'Outliers are a {col} above {Q3 + IQR * 1.5} or below {Q1 - IQR * 1.5}')
    arr = []
    for x in X_full[col].to_numpy():
        if x < (Q1 - IQR * 1.5) or x > (Q3 + IQR * 1.5):
            arr.append(x)
    print(f"Number of outliers in {col} is {len(arr)}")

Outliers are a cement above 586.48 or below -46.79999999999998
Number of outliers in cement is 0
Outliers are a blast_furnace_slag above 356.25 or below -213.75
Number of outliers in blast_furnace_slag is 2
Outliers are a fly_ash above 295.675 or below -177.405
Number of outliers in fly_ash is 0
Outliers are a water above 232.43499999999997 or below 127.11500000000004
Number of outliers in water is 15
Outliers are a superplasticizer above 25.0 or below -15.0
Number of outliers in superplasticizer is 10
Outliers are a coarse_aggregate above 1179.5 or below 783.5
Number of outliers in coarse_aggregate is 0
Outliers are a fine_aggregate above 969.0500000000002 or below 577.4499999999998
Number of outliers in fine_aggregate is 5
Outliers are a age above 129.5 or below -66.5
Number of outliers in age is 59
Outliers are a strength above 76.885537188 or below -8.493654843999998
Number of outliers in strength is 8
Outliers are a cement_to_water_ratio above 2.966555643157661 or below -0.0733206

In [12]:
outlier_detector = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
outliers = outlier_detector.fit_predict(X_full)
X_scores = outlier_detector.negative_outlier_factor_
X_full = X_full[outliers != -1]

In [13]:
outlier_detector.negative_outlier_factor_

array([-1.04390164, -1.00858709, -1.03851129, ..., -1.02230001,
       -0.99008257, -1.0170046 ])

In [14]:
outliers

array([1, 1, 1, ..., 1, 1, 1])

In [15]:
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength,cement_to_water_ratio,fine_aggregate_to_water_ratio,coarse_aggregate_to_water_ratio
count,904.0,904.0,904.0,904.0,904.0,904.0,904.0,904.0,904.0,904.0,904.0,904.0
mean,278.302168,74.597146,56.210265,181.520962,6.196211,972.005918,773.845863,40.085177,35.286029,1.559998,4.35071,5.440756
std,103.98642,86.020483,62.732791,21.451564,5.911781,75.926197,76.993446,53.127591,15.831514,0.637917,0.832566,0.843509
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808,0.53125,2.605263,3.453441
25%,190.68,0.0,0.0,164.9,0.0,932.0,734.3,7.0,23.876726,1.069502,3.880851,4.82788
50%,264.75,24.0,0.0,185.0,6.435,968.0,779.32,28.0,33.780187,1.451784,4.299479,5.444444
75%,342.0,142.8,118.27,192.0,10.0,1028.4,821.0,28.0,44.544976,1.826602,4.815804,5.974522
max,540.0,359.4,195.0,247.0,32.2,1125.0,992.6,365.0,82.599225,3.746827,7.840442,8.695688


In [16]:
y = X_full['strength'].copy()
X_full = X_full.drop(['strength'], axis = 1)
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X_full, y, test_size=0.25, random_state=42)
features.remove('strength')

In [17]:
# Preprocessing for numerical data
# Imputing NANs as a result of feature engineering
# RobustScaler is used to scale the data, as it is less prone to outliers
# Outlier Detection
scaler = MinMaxScaler()
X_train_valid = scaler.fit_transform(X_train_valid)
X_test = scaler.transform(X_test)
X_train_valid = pd.DataFrame(X_train_valid, columns = features)
X_test = pd.DataFrame(X_test, columns = features)

In [24]:
y_train_valid.shape

(678,)

In [25]:
y_test.shape

(226,)

In [18]:
X_train_valid.shape

(678, 11)

In [19]:
X_test.shape

(226, 11)

In [20]:
X_train_valid.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,cement_to_water_ratio,fine_aggregate_to_water_ratio,coarse_aggregate_to_water_ratio
count,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0,678.0
mean,0.397305,0.211383,0.292638,0.477592,0.191156,0.525993,0.449615,0.106973,0.316289,0.332858,0.379007
std,0.236028,0.23873,0.322895,0.173768,0.183447,0.233678,0.19198,0.150503,0.198607,0.159788,0.164933
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.201689,0.0,0.0,0.350259,0.0,0.404321,0.351982,0.01105,0.167389,0.247325,0.256772
50%,0.355479,0.072343,0.0,0.50499,0.197981,0.512346,0.464927,0.069061,0.277018,0.319144,0.379787
75%,0.546347,0.40345,0.606667,0.560878,0.310559,0.704938,0.559458,0.146409,0.402837,0.418279,0.485603
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
X_full = pd.concat([X_train_valid, X_test], ignore_index=True, sort=False)
y_full = pd.concat([y_train_valid, y_test], ignore_index=True, sort=False)

In [26]:
#Add the target column back into the dataset
X_full['strength'] = y_full

X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,cement_to_water_ratio,fine_aggregate_to_water_ratio,coarse_aggregate_to_water_ratio,strength
count,904.0,904.0,904.0,904.0,904.0,904.0,904.0,904.0,904.0,904.0,904.0,904.0
mean,0.402516,0.20756,0.288258,0.477213,0.192429,0.527796,0.451194,0.102445,0.319926,0.333407,0.379096,35.286029
std,0.237412,0.239345,0.321707,0.17127,0.183596,0.23434,0.19316,0.146761,0.198383,0.159033,0.160906,15.831514
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.005525,0.0,0.0,0.0,2.331808
25%,0.202466,0.0,0.0,0.344511,0.0,0.404321,0.351982,0.01105,0.167389,0.243657,0.262185,23.876726
50%,0.371575,0.066778,0.0,0.50499,0.199845,0.515432,0.464927,0.069061,0.286273,0.323621,0.3798,33.780187
75%,0.547945,0.397329,0.606513,0.560878,0.310559,0.701852,0.569493,0.069061,0.402837,0.422247,0.480916,44.544976
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,82.599225


In [27]:
#Save the preprocessed data
X_full.to_csv('/workspaces/2025SE-Mike.N-HSC-AT2/data/concrete_preprocessed.csv', index = False)