## Data Wrangling ##

In [25]:
#Dependencies
import pickle
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerPathCollection
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

In [26]:
# Load the dataset from concrete_engineered.csv in the project's data folder.
X_full = pd.read_csv(filepath_or_buffer='../data/concrete_engineered.csv')

In [27]:
# (rows, columns) of the frame as loaded from disk.
X_full.shape

(1030, 9)

Choosing features. Categorical variables with high cardinality are discarded.

In [28]:
# Select categorical columns: object-typed columns with fewer than 10 distinct
# values. Higher-cardinality categoricals are deliberately left out.
categorical_cols = [
    cname for cname in X_full.columns
    if X_full[cname].dtype == "object" and X_full[cname].nunique() < 10
]

# Select numerical columns. select_dtypes('number') matches every numeric
# dtype (int32, float32, nullable integers, ...), whereas comparing against the
# literal names 'int64'/'float64' would silently drop such columns.
numerical_cols = X_full.select_dtypes(include='number').columns.tolist()

# Every selected column, categorical first. Nothing is excluded by name here,
# so a numeric target column is part of this list as well.
features = categorical_cols + numerical_cols
print(categorical_cols, numerical_cols)


[] ['cement', 'blast_furnace_slag', 'fly_ash', 'water', 'superplasticizer', 'coarse_aggregate', 'fine_aggregate', 'age', 'strength']


No categorical features, so every feature gets treated in the same manner.

In [29]:
# Summary statistics (count, mean, std, quartiles, min/max) for each numeric column.
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,35.817836
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,16.705679
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.707115
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,34.442774
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,46.136287
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225


Check for missing values.

In [30]:
# Check for missing values: number of null entries in each column.
X_full.isnull().sum()

cement                0
blast_furnace_slag    0
fly_ash               0
water                 0
superplasticizer      0
coarse_aggregate      0
fine_aggregate        0
age                   0
strength              0
dtype: int64

An imputer is included for CI/CD, even though this dataset has no missing values. Ingredients in a concrete formula are ratios, so filling NaNs with 0 is the best assumption.

In [31]:
# Replace any NaN in the numerical columns with the constant 0.
# NOTE(review): numerical_cols is every numeric column; in this notebook that
# includes `age` and the `strength` target, not only ingredient amounts —
# confirm that a 0 fill is acceptable for those two as well.
imputer = SimpleImputer(strategy='constant', fill_value=0)
imputed_values = imputer.fit_transform(X_full[numerical_cols])
X_full[numerical_cols] = imputed_values

In [32]:
# Check for duplicates: count the rows that exactly repeat an earlier row.
X_full.duplicated().sum()

np.int64(25)

In [33]:
# Remove duplicates: keep the first occurrence of each fully identical row.
X_full = X_full.drop_duplicates(keep='first')
# Per-row duplicate flags after the drop.
X_full.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1025    False
1026    False
1027    False
1028    False
1029    False
Length: 1005, dtype: bool

In [34]:
# (rows, columns) after the duplicate rows were dropped.
X_full.shape

(1005, 9)

In [35]:
# Summary statistics again, now on the de-duplicated frame.
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength
count,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0
mean,278.629055,72.043134,55.535075,182.074378,6.031647,974.376468,772.686617,45.856716,35.250273
std,104.345003,86.170555,64.207448,21.34074,5.919559,77.579534,80.339851,63.734692,16.284808
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808
25%,190.68,0.0,0.0,166.61,0.0,932.0,724.3,7.0,23.523542
50%,265.0,20.0,0.0,185.7,6.1,968.0,780.0,28.0,33.798114
75%,349.0,142.5,118.27,192.94,10.0,1031.0,822.2,56.0,44.86834
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225


In [36]:
# Per-column outlier report: a value counts as an outlier when it lies more
# than 1.5 * IQR below the first quartile or above the third quartile.
for col in X_full.columns:
    column = X_full[col]
    Q1, Q3 = column.quantile(0.25), column.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - IQR * 1.5
    upper_fence = Q3 + IQR * 1.5
    print(f'Outliers are a {col} above {upper_fence} or below {lower_fence}')
    values = column.to_numpy()
    # Boolean mask over the whole column instead of appending hits one by one.
    outside_fences = (values < lower_fence) | (values > upper_fence)
    n_outliers = int(outside_fences.sum())
    print(f"Number of outliers in {col} is {n_outliers}")

Outliers are a cement above 586.48 or below -46.79999999999998
Number of outliers in cement is 0
Outliers are a blast_furnace_slag above 356.25 or below -213.75
Number of outliers in blast_furnace_slag is 2
Outliers are a fly_ash above 295.675 or below -177.405
Number of outliers in fly_ash is 0
Outliers are a water above 232.43499999999997 or below 127.11500000000004
Number of outliers in water is 15
Outliers are a superplasticizer above 25.0 or below -15.0
Number of outliers in superplasticizer is 10
Outliers are a coarse_aggregate above 1179.5 or below 783.5
Number of outliers in coarse_aggregate is 0
Outliers are a fine_aggregate above 969.0500000000002 or below 577.4499999999998
Number of outliers in fine_aggregate is 5
Outliers are a age above 129.5 or below -66.5
Number of outliers in age is 59
Outliers are a strength above 76.885537188 or below -8.493654843999998
Number of outliers in strength is 8


In [37]:
# Multivariate outlier removal with Local Outlier Factor (20 neighbours,
# contamination set to 0.005).
# NOTE(review): the detector is fit on every column of X_full as-is; at this
# point in the notebook the data is unscaled and still contains the `strength`
# target — confirm that both are intended.
outlier_detector = LocalOutlierFactor(n_neighbors=20, contamination=0.005)
outliers = outlier_detector.fit_predict(X_full)
# Scores are stored for inspection; nothing in this cell reads them afterwards.
X_scores = outlier_detector.negative_outlier_factor_
# Keep only the rows that were not labelled -1.
inlier_mask = outliers != -1
X_full = X_full[inlier_mask]

In [38]:
# Summary statistics after the LocalOutlierFactor filter.
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,278.636537,72.248699,55.334785,182.153203,6.018323,973.790641,773.013163,45.811812,35.219046
std,104.113454,86.181643,63.975239,21.354483,5.899988,77.370435,80.328452,63.77227,16.247082
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808
25%,190.68,0.0,0.0,166.61,0.0,932.0,728.45,7.0,23.523887
50%,265.0,20.0,0.0,185.7,6.1,968.0,780.0,28.0,33.762261
75%,345.5,142.5,118.27,192.94,10.0,1029.4,824.0,56.0,44.781121
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225


In [39]:
# Separate the `strength` target from the predictors, then hold out 25% of the
# rows as a test set (fixed seed so the split is reproducible).
# NOTE: run-once cell — it removes 'strength' from both X_full and `features`,
# so executing it again without re-running the cells above raises.
y = X_full['strength'].copy()
X_full = X_full.drop(columns=['strength'])
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X_full,
    y,
    test_size=0.25,
    random_state=42,
)
features.remove('strength')

In [40]:
# Preprocessing for numerical data:
# MinMaxScaler is fit on the training split only and then applied unchanged to
# the held-out split, so held-out values may land slightly outside the range
# learned from the training data.
# NOTE: re-running this cell on its own would refit the scaler on data that is
# already scaled and overwrite the saved pickle; re-run from the split cell.
scaler = MinMaxScaler()
X_train_valid = scaler.fit_transform(X_train_valid)
# Persist the fitted scaler. The context manager closes the file handle as soon
# as the dump finishes instead of leaving it open until garbage collection.
with open('../model/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
X_test = scaler.transform(X_test)
# Wrap the transformed values back into DataFrames labelled with the feature names.
X_train_valid = pd.DataFrame(X_train_valid, columns = features)
X_test = pd.DataFrame(X_test, columns = features)

In [41]:
# Number of target values in the train/validation split.
y_train_valid.shape

(749,)

In [42]:
# Number of target values in the test split.
y_test.shape

(250,)

In [43]:
# (rows, feature columns) of the scaled train/validation split.
X_train_valid.shape

(749, 8)

In [44]:
# (rows, feature columns) of the scaled test split.
X_test.shape

(250, 8)

In [45]:
# Summary statistics of the scaled training features.
X_train_valid.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age
count,749.0,749.0,749.0,749.0,749.0,749.0,749.0,749.0
mean,0.40398,0.203587,0.282423,0.483552,0.186455,0.504464,0.446173,0.118561
std,0.240175,0.242705,0.326439,0.16838,0.184337,0.224145,0.203051,0.160442
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.202466,0.0,0.0,0.358164,0.0,0.380814,0.322378,0.035714
50%,0.365297,0.055648,0.0,0.510579,0.186335,0.485465,0.466633,0.074176
75%,0.563927,0.402615,0.606513,0.568383,0.310559,0.663953,0.572504,0.151099
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
# Stack the train/validation rows on top of the test rows. ignore_index=True
# renumbers both results from 0, so features and target line up row for row.
X_full = pd.concat(objs=[X_train_valid, X_test], axis=0, ignore_index=True, sort=False)
y_full = pd.concat(objs=[y_train_valid, y_test], axis=0, ignore_index=True, sort=False)

In [47]:
# Add the target column back into the dataset (appended as the last column)
X_full = X_full.assign(strength=y_full)

# Summary statistics of the recombined frame
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,0.40328,0.201026,0.283768,0.482261,0.186904,0.502298,0.449105,0.123109,35.219046
std,0.237702,0.239793,0.328078,0.170495,0.183229,0.224914,0.201526,0.175199,16.247082
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.331808
25%,0.202466,0.0,0.0,0.358164,0.0,0.380814,0.337306,0.016484,23.523887
50%,0.372146,0.055648,0.0,0.510579,0.189441,0.485465,0.466633,0.074176,33.762261
75%,0.555936,0.396494,0.606513,0.568383,0.310559,0.663953,0.57702,0.151099,44.781121
max,1.0,1.0,1.026154,1.0,1.0,1.0,1.0,1.0,82.599225


In [48]:
# Save the preprocessed data; index=False keeps the row index out of the CSV
X_full.to_csv(path_or_buf='../data/concrete_preprocessed.csv', index=False)