# Handling Outliers
In this notebook we try out different approaches for handling outliers

In [1]:
# imports 
import pandas as pd
import numpy as np
from src.pipelines import build_pipeline
from src.pipelines import pipeline_utils
from src.pipelines import pipeline_cleaning
from collections import Counter
from sklearn.model_selection import GridSearchCV

# disable warnings globally
import warnings
warnings.filterwarnings("ignore")

# define the test steps for this notebook
def add_test_steps(custom_pipeline: build_pipeline.CustomPipeline):    
    # additional feature selection by removing certain columns
    pipeline_utils.add_remove_feature_transformer(custom_pipeline, ['age'])
    
    # discretize numerical features
    pipeline_utils.add_kbinsdiscretizer(custom_pipeline, number_of_bins=2)

    # add encoder and scaler
    pipeline_utils.add_binary_encoder_and_minmaxscaler(custom_pipeline)

    # add estimator
    pipeline_utils.apply_lgbm_classifier(custom_pipeline)

First we take a look at the mcc score without any outlier handling:

In [2]:

# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True, 
    force_cleaning=True, 
    skip_storing_cleaning=True, 
    skip_evaluation=False, 
    use_validation_set=True,
    use_cross_validation=False)
add_test_steps(pipe)

# Lets look at the mcc score without outlier removal
# this is a dummy handler that doesn't change the outliers
pipeline_utils.add_outlier_handling(
        custom_pipeline=pipe,
        outlier_handling_func=pipeline_cleaning.OutlierHandler().handle_outliers
    )

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
print('Class distribution after resampling:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# the length from both outputs should be the same


loading data
Class distribution before cleaning: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.7106 [std=0.]
    validation_f1-score: 0.6361 [std=0.]
    validation_mcc: 0.4541 [std=0.]
Class distribution after resampling: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 234540


Next we try out different outlier removal options:

In [3]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True, 
    force_cleaning=True, 
    skip_storing_cleaning=True, 
    skip_evaluation=False, 
    use_validation_set=True,
    use_cross_validation=False)
add_test_steps(pipe)

outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0, zscore_threshold=999)
pipeline_utils.add_outlier_handling(
        custom_pipeline=pipe,
        outlier_handling_func=outlier_remover.handle_outliers
    )
# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
print('Class distribution after resampling:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# the length from both outputs should be the same


loading data
Class distribution before cleaning: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.7106 [std=0.]
    validation_f1-score: 0.6361 [std=0.]
    validation_mcc: 0.4541 [std=0.]
Class distribution after resampling: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 234540


In the following we try out different values for the categorical data with `cat_threshold`

In [4]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True, 
    force_cleaning=True, 
    skip_storing_cleaning=True, 
    skip_evaluation=False, 
    use_validation_set=True,
    use_cross_validation=False)
add_test_steps(pipe)

outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0.05, zscore_threshold=999)
pipeline_utils.add_outlier_handling(
        custom_pipeline=pipe,
        outlier_handling_func=outlier_remover.handle_outliers
    )

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
print('Class distribution after resampling:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# the length from both outputs should be the same


loading data
Class distribution before cleaning: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.681 [std=0.]
    validation_f1-score: 0.5606 [std=0.]
    validation_mcc: 0.39 [std=0.]
Class distribution after resampling: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 234540


Decreasing the categorical threshold resultes in a better mcc score

In [5]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True, 
    force_cleaning=True, 
    skip_storing_cleaning=True, 
    skip_evaluation=False, 
    use_validation_set=True,
    use_cross_validation=False)
add_test_steps(pipe)

outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0.1, zscore_threshold=999)
pipeline_utils.add_outlier_handling(
        custom_pipeline=pipe,
        outlier_handling_func=outlier_remover.handle_outliers
    )

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
print('Class distribution after resampling:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# the length from both outputs should be the same


loading data
Class distribution before cleaning: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.6601 [std=0.]
    validation_f1-score: 0.5472 [std=0.]
    validation_mcc: 0.3576 [std=0.]
Class distribution after resampling: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 234540


In [6]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True, 
    force_cleaning=True, 
    skip_storing_cleaning=True, 
    skip_evaluation=False, 
    use_validation_set=True,
    use_cross_validation=False)
add_test_steps(pipe)

outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0.001, zscore_threshold=2)
pipeline_utils.add_outlier_handling(
        custom_pipeline=pipe,
        outlier_handling_func=outlier_remover.handle_outliers
    )

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
print('Class distribution after resampling:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# the length from both outputs should be the same


loading data
Class distribution before cleaning: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.7129 [std=0.]
    validation_f1-score: 0.6362 [std=0.]
    validation_mcc: 0.4584 [std=0.]
Class distribution after resampling: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 234540


Not applying any outlier removal on the categorial values seems to supply the best score. We set the `cat_threshold` at 0.
In the next step we try out outlier removal of numerical values by adjusting the `zscore_value`

In [7]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True, 
    force_cleaning=True, 
    skip_storing_cleaning=True, 
    skip_evaluation=False, 
    use_validation_set=True,
    use_cross_validation=False)
add_test_steps(pipe)

outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0, zscore_threshold=0)
pipeline_utils.add_outlier_handling(
        custom_pipeline=pipe,
        outlier_handling_func=outlier_remover.handle_outliers
    )

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
print('Class distribution after resampling:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# the length from both outputs should be the same


loading data
Class distribution before cleaning: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.7038 [std=0.]
    validation_f1-score: 0.5852 [std=0.]
    validation_mcc: 0.4322 [std=0.]
Class distribution after resampling: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 234540


Lowering the `zscore threshold` to 0 removes all numerical features. The mcc score is worse in this case.

In [15]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True, 
    force_cleaning=True, 
    skip_storing_cleaning=True, 
    skip_evaluation=False, 
    use_validation_set=True,
    use_cross_validation=False)
add_test_steps(pipe)

outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0, zscore_threshold=4)
pipeline_utils.add_outlier_handling(
        custom_pipeline=pipe,
        outlier_handling_func=outlier_remover.handle_outliers
    )

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
print('Class distribution after resampling:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# the length from both outputs should be the same

loading data
Class distribution before cleaning: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    validation_accuracy: 0.7179 [std=0.]
    validation_f1-score: 0.6461 [std=0.]
    validation_mcc: 0.4692 [std=0.]
Class distribution after resampling: Counter({2: 133433, 3: 78496, 1: 22611})
Length overall: 234540


Increasing the `zscore threshold` keeps more numerical values. In this case the mcc score is better then before.

With the outlier thresholds from above we score a very good mcc score of 0.469 on the test data