# In [1]:
# %%writefile Imbalanced.py

import numpy as np
import pandas as pd

class Imbalanced:
    """Detect DataFrame columns that are dominated by a single value."""

    @staticmethod
    def handle(df, imbalance_threshold=0.85):
        """Return the imbalanced columns of *df* and a readable report.

        A column is considered imbalanced when its most frequent non-null
        value occurs in strictly more than ``imbalance_threshold`` of all
        rows. The share is computed against ``len(df)``, so rows with
        missing values count towards the denominator but never towards
        the dominant value (``value_counts`` drops NaN by default).

        Args:
            df: pandas DataFrame to inspect.
            imbalance_threshold: fraction of rows (0..1) the most frequent
                value must exceed for the column to be reported.

        Returns:
            A tuple ``(imbalanced_fields, info_message)`` where
            ``imbalanced_fields`` maps ``str(column name)`` to
            ``[percentage as str, dominant value as str]`` and
            ``info_message`` holds one line per reported column
            (empty string when nothing is imbalanced).
        """
        imbalanced_fields = {}
        message_lines = []
        row_count = len(df)

        for feature in df.columns:
            # value_counts() sorts by frequency (descending), so the first
            # entry is the dominant category and its count.
            counts = df[feature].value_counts()

            # A column without any non-null value (or an empty frame) has
            # no dominant category; skip it instead of failing on max().
            if counts.empty:
                continue

            max_category = counts.index[0]
            max_count = int(counts.iloc[0])

            if max_count > row_count * imbalance_threshold:
                feature_name = str(feature)
                percentage = str(max_count / row_count * 100)
                category_name = str(max_category)

                # { feature : [percentage of imbalance, dominant value] }
                imbalanced_fields[feature_name] = [percentage, category_name]
                message_lines.append(
                    "'" + feature_name + "' \tfeature is imbalanced with \t"
                    + percentage
                    + "% \tof its entries containing the value: \t"
                    + category_name + ".\n"
                )

        return imbalanced_fields, "".join(message_lines)
