# Steel plate faults multiclass classification

In [3]:
# load libraries
from e2eml.classification import classification_blueprints as cb
from e2eml.full_processing.postprocessing import save_to_production, load_for_production
from e2eml.test.classification_blueprints_test import steel_fault_multiclass_data
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import classification_report

TypeError: Cannot create a consistent method resolution
order (MRO) for bases ClassificationModels, PreprocessingBluePrint

# Feature engineering
Load & preprocess steel faults dataset.

In [None]:
# Load the steel faults data. test_df, test_target and test_categorical_cols are
# passed to the blueprint constructor further down; val_df / val_df_target are
# the holdout pair used at the end of the notebook to simulate unseen data.
test_df, test_target, val_df, val_df_target, test_categorical_cols = steel_fault_multiclass_data()

# Using e2eml - Run and save a pipeline
In this notebook we configure a custom pipeline. Thanks to the object-oriented design of e2eml, this is easy to set up.
Under the hood, the main and most commonly used blueprint pipeline looks like this:

def pipeline(self, df=None, preprocessing_type='full'):
    """Run the default LightGBM classification blueprint end to end.

    The same sequence of steps is used for training and for prediction; only
    the model fitting step is skipped when new data is supplied.

    Args:
        df: New data to predict on. ``None`` (the default) or an empty frame
            means "train on the data the instance already holds". A non-empty
            frame is stored as ``self.dataframe`` and training is skipped.
        preprocessing_type: ``'nlp'`` additionally runs ``pos_tagging_pca``;
            any other value (including the default) skips that step.

    Side effects:
        Sets ``self.prediction_mode = True`` once the run has finished.
    """
    logging.info('Start blueprint.')
    # ``None`` has no ``.empty`` attribute, so the AttributeError branch is
    # what turns the default call into a training run.
    try:
        if df.empty:
            skip_train = False
        else:
            self.dataframe = df
            skip_train = True
    except AttributeError:
        skip_train = False

    # Preprocessing and feature engineering; the order of steps matters.
    self.train_test_split(how=self.train_split_type)
    self.datetime_converter(datetime_handling='all', force_conversion=False)
    if preprocessing_type == 'nlp':
        self.pos_tagging_pca()
    self.rare_feature_processor(threshold=0.03, mask_as='miscellaneous')
    self.cardinality_remover(threshold=100)
    self.onehot_pca()
    self.category_encoding(algorithm='target')
    self.delete_high_null_cols(threshold=0.5)
    self.fill_nulls(how='static')
    self.data_binning(nb_bins=10)
    # self.skewness_removal()  # disabled in this blueprint
    self.outlier_care(method='isolation', how='append')
    self.remove_collinearity(threshold=0.8)
    self.clustering_as_a_feature(algorithm='dbscan', eps=0.3, n_jobs=-1, min_samples=10)
    for nb_cluster in range(2, 10):
        self.clustering_as_a_feature(algorithm='kmeans', nb_clusters=nb_cluster)
    if self.low_memory_mode:
        self.reduce_memory_footprint()
    self.automated_feature_selection(metric='logloss')
    self.sort_columns_alphabetically()

    # Modelling: fit only when no new data was passed in, then always predict.
    if not skip_train:
        self.lgbm_train(tune_mode=self.tune_mode)
    self.lgbm_predict(feat_importance=True)
    self.classification_eval('lgbm')
    self.prediction_mode = True
    logging.info('Finished blueprint.')

From here we can make custom choices by:
- skipping steps
- changing parameters
- or even extending the pipeline with our own steps

We follow these steps:
- instantiate class
- define and run pipeline
- save and load pipeline
- predict on new data

In [None]:
# Create the blueprint instance from the training frame and its metadata
steel_faults_ml = cb.ClassificationBluePrint(
    datasource=test_df,
    target_variable=test_target,
    categorical_columns=test_categorical_cols,
    preferred_training_mode='auto',
    tune_mode='accurate',
)

In [None]:
"""
Define custom pipeline...
- Please note, that there are logical and technical dependencies. Not everything is possible.
"""
def custom_pipeline(df, steel_faults_ml):
    """Run a customised variant of the LightGBM blueprint on ``steel_faults_ml``.

    Args:
        df: New data to predict on. ``None`` or an empty frame runs the
            pipeline in training mode; a non-empty frame is stored on the
            instance and is only predicted on.
        steel_faults_ml: The blueprint instance whose steps are executed.

    Side effects:
        Prints intermediate data for inspection and leaves
        ``steel_faults_ml.prediction_mode`` set to ``True`` when done.
    """
    bp = steel_faults_ml

    # ``None`` has no ``.empty`` attribute, so the AttributeError branch also
    # covers the plain training call.
    try:
        new_data_supplied = not df.empty
        if new_data_supplied:
            bp.dataframe = df
    except AttributeError:
        new_data_supplied = False
    bp.prediction_mode = new_data_supplied

    bp.train_test_split(how=bp.train_split_type)
    try:
        train_labels = bp.df_dict["Y_train"]
        print(train_labels)
    except AttributeError:
        # not available in prediction mode
        pass

    bp.datetime_converter(datetime_handling='all', force_conversion=False)
    bp.pos_tagging_pca()
    # rare feature processing was dropped from this pipeline
    bp.cardinality_remover(threshold=200)  # raised
    bp.onehot_pca()
    bp.category_encoding(algorithm='target')

    # Custom pipelines let you inject your own data manipulation or inspect the
    # data between steps. Train and test data live in the df_dict attribute.
    try:
        null_counts = bp.df_dict["X_train"].isna().sum()
        print(null_counts)
    except AttributeError:
        # in prediction mode the data is found on ``dataframe`` instead
        print(bp.dataframe.isna().sum())

    bp.delete_high_null_cols(threshold=0.4)  # lowered to 40%
    bp.fill_nulls(how='iterative_imputation')  # iterative filling instead of how='static'
    bp.data_binning(nb_bins=5)  # different number of bins
    bp.outlier_care(method='isolation', how='append')
    bp.remove_collinearity(threshold=0.8)
    bp.clustering_as_a_feature(algorithm='dbscan', eps=0.3, n_jobs=-1, min_samples=10)
    for cluster_count in range(2, 20):
        # switched from kmeans
        bp.clustering_as_a_feature(algorithm='GLMM', nb_clusters=cluster_count)
    bp.automated_feature_selection(metric='logloss')  # author's note: needs to be xgboost compatible
    bp.sort_columns_alphabetically()

    # Fit only when training; prediction and evaluation always run.
    if not bp.prediction_mode:
        bp.lgbm_train(tune_mode=bp.tune_mode)
    bp.lgbm_predict(feat_importance=True)
    bp.classification_eval('lgbm')
    bp.prediction_mode = True  # mandatory


In [None]:
# Run the custom blueprint without new data, i.e. as a training run
custom_pipeline(df=None, steel_faults_ml=steel_faults_ml)

In [None]:
# Save the trained pipeline instance under the name 'steel_faults_instance'.
# An AttributeError is reported as a message instead of stopping the notebook;
# in that case nothing has been saved by this cell.
try:
    save_to_production(steel_faults_ml, file_name='steel_faults_instance')
except AttributeError:
    print("""Unfortunately this does not work when e2eml has to label encode the target labels automatically.
    For saving a pipeline please provide encoded target labels. As of now this is just a fallback solution.
    This might be solved in future releases.""")

# Predict on new data
In the beginning we kept a holdout dataset. We use this to simulate prediction on completely new data.

In [None]:
# Load the stored pipeline back, using the same file name as in the save cell.
# NOTE(review): this presumably requires the save step above to have succeeded.
steel_faults_ml_loaded = load_for_production(file_name='steel_faults_instance')

In [None]:
# Label encode the holdout targets with the loaded pipeline's encoder.
# NOTE: this overwrites val_df_target, so re-running the cell would pass the
# already transformed labels to the encoder again.
val_df_target = steel_faults_ml_loaded.label_encoder_decoder(val_df_target, mode='transform')

# Predict on new data: a non-empty frame makes custom_pipeline skip training
custom_pipeline(val_df, steel_faults_ml_loaded)

# Access the predicted labels stored under the 'lgbm' key
val_y_hat = steel_faults_ml_loaded.predicted_classes['lgbm']

In [None]:
# Assess prediction quality on holdout data: print the per-class report first,
# then the Matthews correlation coefficient.
print(classification_report(val_df_target, val_y_hat))
try:
    matthews = matthews_corrcoef(val_df_target, val_y_hat)
except Exception:
    # Best-effort fallback: report the failure and continue with a score of 0
    print("Matthew failed.")
    matthews = 0
print(matthews)