# Dealing with Imbalanced Multi-Class Data by Sampling
In the following, we try out different sampling techniques to balance the classes of the target variable (`damage_grade`).

In [1]:
# imports 
import pandas as pd
from src.pipelines import build_pipeline
from src.pipelines import pipeline_utils
from src.pipelines import pipeline_cleaning
from collections import Counter

# Suppress all warnings from this point on so that library warnings do not
# clutter the cell outputs below.
# NOTE(review): with no category/module argument this filter hides every
# warning, including deprecation notices; consider narrowing it once the
# noisy source is known.
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Build and prepare the pipeline: constructed with force_cleaning=True and with
# prediction storing and evaluation switched off (per the constructor flags),
# then extended with the project's "best steps" and an outlier remover.
pipe = build_pipeline.CustomPipeline(skip_storing_prediction=True, force_cleaning=True, skip_evaluation=True)
pipeline_utils.add_best_steps(pipe)
# Random over-/undersampling is currently disabled; uncomment the next line to try it.
#pipeline_utils.add_randomsampling(pipe, oversampling_strategy='auto', undersampling_strategy='auto')
pipe.apply_outlier_handler(pipeline_cleaning.OutlierRemover(cat_threshold=0.05, zscore_value=3))

# Class distribution of the raw training target, before the pipeline is run
target_raw = pipe.y_train_raw['damage_grade']
print('Class distribution before cleaning:', Counter(target_raw))
print('Length overall:', len(target_raw))

# run the pipeline
pipe.run()

# Class distribution of the training target after the run. The sampling call
# above is commented out, so the label deliberately does not claim that any
# resampling took place.
print('Class distribution after pipeline run:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))


loading data
Class distribution before cleaning: Counter({2: 148259, 3: 87218, 1: 25124})
Length overall: 260601
preparing data
storing cleaned data
running pipeline
Class distribution after resampling: Counter({2: 102586, 3: 67946, 1: 11823})
Length overall: 182355


In [7]:
# Second pipeline for the LightGBM classifier: it is constructed with
# force_cleaning=False and with evaluation and feature evaluation switched on
# (per the constructor flags).
#
# NOTE(review): the recorded run of this cell failed with
# "ValueError: Names provided are not unique" - the reported step list repeats
# 'discretizer', 'encoder_and_scaler', 'feature_remover', 'oversampling' and
# 'undersampling' several times before 'estimator'. Presumably steps registered
# on earlier pipeline instances leak into this new one; confirm in
# build_pipeline.CustomPipeline and re-run from a fresh kernel.
lgbm_pipeline = build_pipeline.CustomPipeline(
    # cleaning / storing
    force_cleaning=False,
    skip_storing_cleaning=True,
    skip_storing_prediction=True,
    # evaluation
    skip_evaluation=False,
    skip_feature_evaluation=False,
    skip_error_evaluation=True,
    print_evaluation=True,
)
# The "best steps" are left disabled for this pipeline:
#pipeline_utils.add_best_steps(custom_pipeline=lgbm_pipeline)
pipeline_utils.apply_lgbm_classifier(lgbm_pipeline)
lgbm_pipeline.run()

loading data
preparing data
running pipeline


ValueError: Names provided are not unique: ['discretizer', 'encoder_and_scaler', 'feature_remover', 'oversampling', 'undersampling', 'feature_remover', 'discretizer', 'encoder_and_scaler', 'oversampling', 'undersampling', 'feature_remover', 'discretizer', 'encoder_and_scaler', 'feature_remover', 'discretizer', 'encoder_and_scaler', 'estimator']