# Dealing with Imbalanced Multi-Class Data by Sampling
In this notebook, we try out different sampling techniques to balance the classes of the target variable.

In [7]:
# imports 
import pandas as pd
from src.pipelines import build_pipeline
from src.pipelines import pipeline_utils
from src.pipelines import pipeline_cleaning
from collections import Counter
from src.features import sampling_strategies

# Disable warnings globally: the "ignore" filter suppresses every warning
# for the rest of the kernel session, which keeps the cell outputs below compact.
# NOTE(review): this also hides any warnings raised by the pipeline helpers —
# consider narrowing the filter to specific categories if those should stay visible.
import warnings
warnings.filterwarnings("ignore")

# define the test steps for this notebook
def add_test_steps(custom_pipeline: build_pipeline.CustomPipeline):
    """Apply the steps shared by every run in this notebook to ``custom_pipeline``.

    The pipeline is handed to four ``pipeline_utils`` helpers, always in the
    same order: column removal, binning, encoding/scaling and finally the
    KNN classifier.

    Args:
        custom_pipeline: the pipeline object passed to each helper.

    Returns:
        None. The function only calls the helpers and returns nothing.
    """
    columns_to_drop = ['age']
    bin_count = 2
    # passed positionally to apply_knn_classifier; presumably the number of
    # neighbours — TODO confirm against pipeline_utils
    knn_setting = 9

    # feature selection: remove the listed columns
    pipeline_utils.add_remove_feature_transformer(custom_pipeline, columns_to_drop)

    # discretize numerical features
    pipeline_utils.add_kbinsdiscretizer(custom_pipeline, number_of_bins=bin_count)

    # encoder and scaler
    pipeline_utils.add_binary_encoder_and_minmaxscaler(custom_pipeline)

    # estimator
    pipeline_utils.apply_knn_classifier(custom_pipeline, knn_setting)

Let's check the distribution of the target variable before any sampling:

In [8]:
# Baseline run: build the pipeline WITHOUT a sampler so that the untouched
# class balance of the training target can be inspected.
# Evaluation is skipped (skip_evaluation=True); only y_train is of interest here.
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_evaluation=True,
)
add_test_steps(pipe)

# run the pipeline
pipe.run()

# Check the class distribution before resampling.
# The label previously read "after resampling", although no sampler is applied
# in this cell; it now matches what is actually being shown.
print('Class distribution before resampling:', Counter(pipe.y_train))
print('Length overall:', len(pipe.y_train))

loading data
preparing data
storing cleaned data
running pipeline
Class distribution after resampling: Counter({2: 18048, 3: 14953, 1: 925})
Length overall: 33926


The class counts of the target variable are: 2: 18048, 3: 14953, 1: 925  
As we can see, classes 2 and 3 are of similar size, but class 1 is a small minority (925 of 33926 samples, about 2.7%).

Let's apply some sampling techniques.

In [9]:
# Set up a pipeline that is evaluated this time (skip_evaluation=False).
# use_kfold_shuffle=True presumably enables shuffling in the k-fold split —
# TODO confirm in build_pipeline.
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_evaluation=False,
    use_kfold_shuffle=True,
)
add_test_steps(pipe)

# Random sampler with both strategies left on 'auto'; in the recorded run this
# leaves all three classes at 18048 samples.
randomResampler = sampling_strategies.RandomSampler(
    oversampling_strategy='auto',
    undersampling_strategy='auto',
)
pipe.apply_sampler(randomResampler)

# execute the pipeline with the sampler attached
pipe.run()

# Report the class distribution after resampling
resampled_counts = Counter(pipe.y_train)
print('Class distribution after resampling:', resampled_counts)
print('Length overall:', len(pipe.y_train))

loading data
preparing data
storing cleaned data
running pipeline
evaluating pipeline
    fit_time: 1.2757976055145264
    score_time: 8.387667322158814
    test_accuracy: 0.7924608728433545
    test_f1-score: 0.7869465967933185
    test_mcc: 0.690829606718539
Class distribution after resampling: Counter({1: 18048, 2: 18048, 3: 18048})
Length overall: 54144


In [10]:
# Set up a pipeline without evaluation; only the resulting class balance is inspected.
pipe = build_pipeline.CustomPipeline(
    skip_storing_prediction=True,
    force_cleaning=True,
    skip_evaluation=True,
)
add_test_steps(pipe)

# Targeted resampling: class 1 is requested at 1500 samples and class 2 at 15000.
# Class 3 appears in neither strategy; in the recorded run it keeps its 14953 samples.
randomResampler = sampling_strategies.RandomSampler(
    oversampling_strategy={1: 1500},
    undersampling_strategy={2: 15000},
)
pipe.apply_sampler(randomResampler)

# execute the pipeline with the sampler attached
pipe.run()

# Report the class distribution after resampling
resampled_counts = Counter(pipe.y_train)
print('Class distribution after resampling:', resampled_counts)
print('Length overall:', len(pipe.y_train))

loading data
preparing data
storing cleaned data
running pipeline
Class distribution after resampling: Counter({2: 15000, 3: 14953, 1: 1500})
Length overall: 31453
