# Handling Outliers
In this notebook, we try out different approaches for handling outliers.

In [5]:
# imports 
import pandas as pd
from src.pipelines import build_pipeline
from src.pipelines import pipeline_utils
from src.pipelines import pipeline_cleaning
from collections import Counter

# disable warnings globally
import warnings
warnings.filterwarnings("ignore")

First, we take a look at the MCC (Matthews correlation coefficient) score without any outlier handling:

In [4]:
# Baseline run: build the pipeline, add the steps from `add_best_steps`, attach
# the LGBM classifier, and evaluate it with a handler that leaves outliers untouched.
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_storing_cleaning=True,
    skip_evaluation=False,
)
pipeline_utils.add_best_steps(pipe)
pipeline_utils.apply_lgbm_classifier(pipe)

# Let's look at the mcc score without outlier removal:
# the plain OutlierHandler is a dummy handler that doesn't change the outliers
pipe.apply_outlier_handler(pipeline_cleaning.OutlierHandler())

# Check the class distribution before outlier handling
# NOTE(review): the distribution is read from `y_train` while the length is read
# from `y_train_raw` -- presumably both hold the same rows at this point; confirm.
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier handling
# (label fixed: it previously said "after resampling", a leftover from another notebook)
print('Class distribution after outlier handling:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# with the dummy handler, the length from both outputs should be the same


loading data
Class distribution before cleaning: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601
preparing data
running pipeline
evaluating pipeline
    fit_time: 8.715714120864869
    score_time: 0.7221852302551269
    test_accuracy: 0.7104385646964736
    test_f1-score: 0.6376468513487623
    test_mcc: 0.45373242728065505
Class distribution after resampling: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601


Next, we try out different outlier removal options:

In [None]:
# Build a fresh pipeline for the outlier-removal experiment: add the steps from
# `add_best_steps` and attach the LGBM classifier.
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_storing_cleaning=True,
    skip_evaluation=False,
)
pipeline_utils.add_best_steps(pipe)
pipeline_utils.apply_lgbm_classifier(pipe)

# NOTE(review): this cell is meant to try an actual outlier removal option, but it
# still applies the dummy OutlierHandler (which doesn't change the outliers).
# TODO: swap in the handler under test before running this cell.
pipe.apply_outlier_handler(pipeline_cleaning.OutlierHandler())

# Check the class distribution before outlier removal
print('Class distribution before cleaning:', Counter(pipe.y_train['damage_grade']))
print('Length overall:', len(pipe.y_train_raw['damage_grade']))

# run the pipeline
pipe.run()

# Check the class distribution after outlier removal
# (label fixed: it previously said "after resampling", a leftover from another notebook)
print('Class distribution after outlier handling:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))
# with the dummy handler the two lengths should be the same; a handler that
# actually drops rows would presumably make the second length smaller
