# Handling Outliers
In this notebook we try out different approaches for handling outliers.

In [1]:
# imports 
import pandas as pd
from src.pipelines import build_pipeline
from src.pipelines import pipeline_utils
from src.pipelines import pipeline_cleaning
from collections import Counter

# disable warnings globally
import warnings
warnings.filterwarnings("ignore")

# define the test steps for this notebook
def add_test_steps(custom_pipeline: build_pipeline.CustomPipeline):
    """Register the pipeline steps shared by every experiment in this notebook.

    The steps are added to ``custom_pipeline`` through the ``pipeline_utils``
    helpers, in this order:

    1. a feature-removal transformer for the ``age`` column,
    2. a k-bins discretizer with 2 bins,
    3. a binary encoder together with a min-max scaler,
    4. a KNN classifier (called with the value 9).

    NOTE(review): each experiment cell calls ``apply_lgbm_classifier`` on the
    pipeline right after this function -- confirm which estimator ends up
    being used.

    Args:
        custom_pipeline: the pipeline object the steps are attached to.
    """
    dropped_columns = ['age']
    bin_count = 2
    knn_setting = 9  # presumably the number of neighbours -- TODO confirm

    # feature selection first, then discretization of the numerical features
    pipeline_utils.add_remove_feature_transformer(custom_pipeline, dropped_columns)
    pipeline_utils.add_kbinsdiscretizer(custom_pipeline, number_of_bins=bin_count)

    # encoding / scaling, followed by the estimator
    pipeline_utils.add_binary_encoder_and_minmaxscaler(custom_pipeline)
    pipeline_utils.apply_knn_classifier(custom_pipeline, knn_setting)

First we take a look at the MCC score without any outlier handling:

In [2]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_storing_cleaning=True,
    skip_evaluation=False,
)
add_test_steps(pipe)
pipeline_utils.apply_lgbm_classifier(pipe)

# Baseline: look at the MCC score without outlier removal
pipe.apply_outlier_handler(pipeline_cleaning.OutlierHandler())  # this is a dummy handler that doesn't change the outliers

# Check the class distribution before outlier handling
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier handling
# (label fixed: this notebook cleans outliers, it does not resample)
print('Class distribution after cleaning:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# the dummy handler removes nothing, so the length from both outputs should be the same


loading data
Class distribution before cleaning: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    fit_time: 7.112922239303589
    score_time: 0.6036509513854981
    test_accuracy: 0.6443797217210917
    test_f1-score: 0.5262957731065031
    test_mcc: 0.31777805062903136
Class distribution after resampling: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601


Next we try out different outlier removal options:

In [4]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_storing_cleaning=True,
    skip_evaluation=False,
)
add_test_steps(pipe)
pipeline_utils.apply_lgbm_classifier(pipe)

# first attempt at real outlier removal
outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0.2, zscore_threshold=2)
pipe.apply_outlier_handler(outlier_remover)

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
# (label fixed: this notebook cleans outliers, it does not resample)
print('Class distribution after cleaning:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# outlier removal drops rows, so this length is expected to be smaller than the one printed before the run


loading data
Class distribution before cleaning: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    fit_time: 2.0616127014160157
    score_time: 0.2197401523590088
    test_accuracy: 0.7309687270196824
    test_f1-score: 0.6165766727285675
    test_mcc: 0.47585972410983696
Class distribution after resampling: Counter({2: 28866, 3: 24710, 1: 1373})
Length overall: 54949


In the following we try out different `cat_threshold` values for the categorical data.

In [8]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_storing_cleaning=True,
    skip_evaluation=False,
)
add_test_steps(pipe)
pipeline_utils.apply_lgbm_classifier(pipe)

# lower categorical threshold, z-score threshold unchanged
outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0.05, zscore_threshold=2)
pipe.apply_outlier_handler(outlier_remover)

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
# (label fixed: this notebook cleans outliers, it does not resample)
print('Class distribution after cleaning:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# outlier removal drops rows, so this length is expected to be smaller than the one printed before the run


loading data
Class distribution before cleaning: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    fit_time: 6.231108474731445
    score_time: 0.5136741161346435
    test_accuracy: 0.7184283403251899
    test_f1-score: 0.6079346288226055
    test_mcc: 0.45632142390423047
Class distribution after resampling: Counter({2: 102586, 3: 67946, 1: 11823})
Length overall: 182355


Decreasing the categorical threshold results in a worse MCC score.

In [10]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_storing_cleaning=True,
    skip_evaluation=False,
)
add_test_steps(pipe)
pipeline_utils.apply_lgbm_classifier(pipe)

# higher categorical threshold, z-score threshold unchanged
outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0.3, zscore_threshold=2)
pipe.apply_outlier_handler(outlier_remover)

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
# (label fixed: this notebook cleans outliers, it does not resample)
print('Class distribution after cleaning:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# outlier removal drops rows, so this length is expected to be smaller than the one printed before the run


loading data
Class distribution before cleaning: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    fit_time: 1.0502899646759034
    score_time: 0.10308704376220704
    test_accuracy: 0.7279120251107697
    test_f1-score: 0.6237066857126063
    test_mcc: 0.47028671487216034
Class distribution after resampling: Counter({2: 12540, 3: 10276, 1: 647})
Length overall: 23463


In [17]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_storing_cleaning=True,
    skip_evaluation=False,
)
add_test_steps(pipe)
pipeline_utils.apply_lgbm_classifier(pipe)

# categorical threshold between the two values tried before
outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0.26, zscore_threshold=2)
pipe.apply_outlier_handler(outlier_remover)

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
# (label fixed: this notebook cleans outliers, it does not resample)
print('Class distribution after cleaning:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# outlier removal drops rows, so this length is expected to be smaller than the one printed before the run


loading data
Class distribution before cleaning: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    fit_time: 1.7402104377746581
    score_time: 0.1529468536376953
    test_accuracy: 0.7430417357615406
    test_f1-score: 0.6189715534362922
    test_mcc: 0.49763506127692103
Class distribution after resampling: Counter({2: 21530, 3: 17520, 1: 1081})
Length overall: 40131


With a `cat_threshold` of 0.26 the MCC score increases noticeably (to about 0.50).  
In the next step we try out outlier removal of numerical values by adjusting the `zscore_threshold`.

In [2]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_storing_cleaning=True,
    skip_evaluation=False,
)
add_test_steps(pipe)
pipeline_utils.apply_lgbm_classifier(pipe)

# keep the categorical threshold, lower the z-score threshold to its minimum
outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0.26, zscore_threshold=0)
pipe.apply_outlier_handler(outlier_remover)

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
# (label fixed: this notebook cleans outliers, it does not resample)
print('Class distribution after cleaning:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# outlier removal drops rows, so this length is expected to be smaller than the one printed before the run


loading data
Class distribution before cleaning: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    fit_time: 0.9579393863677979
    score_time: 0.11315603256225586
    test_accuracy: 0.7374778943148274
    test_f1-score: 0.6359166321950342
    test_mcc: 0.48379876708728353
Class distribution after resampling: Counter({2: 12755, 3: 9224, 1: 800})
Length overall: 22779


Lowering the `zscore_threshold` to 0 removes all numerical features. The MCC score is worse in this case.

In [4]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_storing_cleaning=True,
    skip_evaluation=False,
)
add_test_steps(pipe)
pipeline_utils.apply_lgbm_classifier(pipe)

# keep the categorical threshold, raise the z-score threshold
outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0.26, zscore_threshold=4)
pipe.apply_outlier_handler(outlier_remover)

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
# (label fixed: this notebook cleans outliers, it does not resample)
print('Class distribution after cleaning:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# outlier removal drops rows, so this length is expected to be smaller than the one printed before the run

loading data
Class distribution before cleaning: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    fit_time: 1.4093745231628418
    score_time: 0.13780512809753417
    test_accuracy: 0.7404315841004674
    test_f1-score: 0.6133692957091037
    test_mcc: 0.4925470548443788
Class distribution after resampling: Counter({2: 21580, 3: 17573, 1: 1083})
Length overall: 40236


Increasing the `zscore_threshold` keeps more numerical values. In this case the MCC score is also worse than before.

In [5]:
# build and prepare pipeline
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_storing_cleaning=True,
    skip_evaluation=False,
)
add_test_steps(pipe)
pipeline_utils.apply_lgbm_classifier(pipe)

# keep the categorical threshold, use a z-score threshold slightly above 2
outlier_remover = pipeline_cleaning.OutlierRemover(cat_threshold=0.26, zscore_threshold=2.3)
pipe.apply_outlier_handler(outlier_remover)

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
# (label fixed: this notebook cleans outliers, it does not resample)
print('Class distribution after cleaning:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# outlier removal drops rows, so this length is expected to be smaller than the one printed before the run

loading data
preparing data
running pipeline
evaluating pipeline
    fit_time: 1.3670090198516847
    score_time: 0.12748055458068847
    test_accuracy: 0.7439723293503182
    test_f1-score: 0.6330425745590418
    test_mcc: 0.5017422856729208
Class distribution after resampling: Counter({2: 18048, 3: 14953, 1: 925})
Length overall: 33926


With the outlier thresholds from above (`cat_threshold=0.26`, `zscore_threshold=2.3`) we achieve a very good MCC score of about 0.50.