In [1]:
#Dependencies
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import pickle
from scipy.io import arff

In [2]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Example feature engineering: log transformation
        X['cement_to_water_ratio'] = X['cement'] / X['water']
        X['coarse_aggregate_to_water_ratio'] = X['coarse_aggregate'] / X['water']
        X['fine_aggregate_to_water_ratio'] = X['fine_aggregate'] / X['water']
        # Impute inf and -inf with 0
        X.replace([np.inf, -np.inf], 0, inplace=True)
        return X

imputer = SimpleImputer(strategy='constant', fill_value=0)
scaler = MinMaxScaler()
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
model = pickle.load(open('../model/xgboost.pkl', 'rb'))

class OutlierDetector(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Fit the model and predict outliers
        outliers = self.model.fit_predict(X)
        # Remove outliers
        mask = outliers == 1
        return X[mask].reset_index(drop=True)

preprocessing = Pipeline([('outlier_detector', OutlierDetector(lof)),
                        ('imputer', imputer),
                        ('scaler', scaler)])

In [3]:
arff_data = arff.loadarff('../data/cement.arff')
X_full = pd.DataFrame(arff_data[0])
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,35.817836
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,16.705679
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.707115
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,34.442774
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,46.136287
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225


In [4]:
feature_engineer = FeatureEngineer()
X_full = feature_engineer.transform(X_full)
X_full.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength,cement_to_water_ratio,coarse_aggregate_to_water_ratio,fine_aggregate_to_water_ratio
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.165631,73.895485,54.187136,181.566359,6.203112,972.918592,773.578883,45.662136,35.817836,1.578275,5.443181,4.344763
std,104.507142,86.279104,63.996469,21.355567,5.973492,77.753818,80.175427,63.169912,16.705679,0.648105,0.842966,0.824908
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808,0.53125,3.453441,2.605263
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.707115,1.069502,4.830208,3.88835
50%,272.9,22.0,0.0,185.0,6.35,968.0,779.51,28.0,34.442774,1.480718,5.451804,4.299479
75%,350.0,142.95,118.27,192.0,10.16,1029.4,824.0,56.0,46.136287,1.875,5.974522,4.791273
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225,3.746827,8.695688,7.840442


In [5]:
features = ['cement', 'blast_furnance_slag', 'fly_ash', 'water', 'superplasticizer', 'coarse_aggregate', 'fine_aggregate', 'age', 'cement_to_water_ratio', 'coarse_aggregate_to_water_ratio', 'fine_aggregate_to_water_ratio']
y = X_full['strength'].copy()
X_full.drop(columns=['strength'], inplace=True)
X_full = preprocessing.fit_transform(X_full)
X_full = pd.DataFrame(X_full, columns=features)
X_full['strength'] = y

In [6]:
X_full.describe()

Unnamed: 0,cement,blast_furnance_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,cement_to_water_ratio,coarse_aggregate_to_water_ratio,fine_aggregate_to_water_ratio,strength
count,978.0,978.0,978.0,978.0,978.0,978.0,978.0,978.0,978.0,978.0,978.0,978.0
mean,0.407142,0.207548,0.272638,0.477832,0.190841,0.518877,0.452638,0.116292,0.323822,0.37808,0.333608,35.859596
std,0.236848,0.24065,0.316337,0.171718,0.183934,0.231132,0.19707,0.15919,0.199748,0.158911,0.159307,16.91238
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.331808
25%,0.206336,0.0,0.0,0.344511,0.0,0.398176,0.356623,0.016484,0.167389,0.264889,0.245258,23.523715
50%,0.389726,0.066778,0.0,0.50499,0.194099,0.507599,0.466257,0.074176,0.291639,0.381203,0.324911,34.346247
75%,0.56621,0.39712,0.591054,0.560878,0.313665,0.691185,0.572504,0.151099,0.409092,0.479889,0.418279,46.579275
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,82.599225


In [7]:
pre_training = Pipeline([('feature_engineer', FeatureEngineer()),
                        ('outlier_detector', OutlierDetector(lof)),
                        ('imputer', imputer),
                        ('scaler', scaler)])

In [10]:
pre_training.fit(X_full)
pickle.dump(pre_training, open('../model/pre_training.pkl', 'wb'))