In [1]:
# If this code is running on IBM Cloud Pak, uncomment the following line
# %%writefile ExcludeFields.py

from timeit import default_timer as timer
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from loguru import logger
import warnings
warnings.filterwarnings('ignore')

class ExcludeFields:
    """Preprocessing step that drops uninformative columns from a DataFrame.

    ``handle`` reads ``self.cols_group``, a mapping that must provide the
    keys ``"cols_num"`` and ``"cols_categ"`` (lists of column labels). This
    class does not create that attribute itself; it has to be assigned on
    the instance before ``handle`` is called.

    After ``handle`` returns, ``self.discarded_columns`` lists every dropped
    column exactly once, in the order the columns were examined.
    """

    # A column is dropped when more than this fraction of its rows is missing.
    MISSING_THRESHOLD = 0.5
    # A categorical column is dropped when its number of distinct values is
    # at least this fraction of the row count.
    DISCARDING_THRESHOLD = 0.9
    # A categorical column is reported as imbalanced when one single value
    # fills more than this fraction of the rows.
    IMBALANCE_THRESHOLD = 0.85

    def handle(self, df):
        """Drop uninformative columns from ``df`` in place and return it.

        Columns are excluded when:
            - more than 50% of their values are missing,
            - they hold a single distinct value (for numerical columns:
              at most one distinct value),
            - they are categorical and the number of distinct values is at
              least 90% of the number of rows.

        A categorical column that survives these rules but has one value
        covering more than 85% of the rows is reported on stdout, and the
        user is asked on stdin whether it should be removed as well.

        Args:
            df: pandas DataFrame containing every column listed in
                ``self.cols_group["cols_num"]`` and
                ``self.cols_group["cols_categ"]``.

        Returns:
            The same ``df`` object, with the discarded columns removed.

        Side effects:
            Sets ``self.discarded_columns`` and mutates ``df``.
        """
        logger.info('Started excluding fields...')
        start = timer()

        self.discarded_columns = []

        print()

        cols_num = self.cols_group["cols_num"]
        cols_categ = self.cols_group["cols_categ"]

        logger.info('Numerical columns: {} ', cols_num)
        logger.info('Categorical columns: {} ', cols_categ)

        n_rows = len(df)
        max_missing = self.MISSING_THRESHOLD * n_rows

        for column in cols_categ:
            # value_counts() ignores missing values and sorts by descending
            # frequency, so its length is the number of distinct values and
            # its first entry is the most frequent category.
            counts = df[column].value_counts()
            n_missing = df[column].isnull().sum()
            n_unique = counts.shape[0]

            if (
                n_missing > max_missing
                or n_unique == 1
                or n_unique >= n_rows * self.DISCARDING_THRESHOLD
            ):
                self._discard(column)
                continue

            # Reaching this point implies the column has at least one
            # non-missing value, so ``counts`` is not empty.
            max_count = int(counts.iloc[0])
            if max_count > n_rows * self.IMBALANCE_THRESHOLD:
                max_category = counts.index[0]
                print(
                    f"'{column}' feature is imbalanced with "
                    f"{max_count / n_rows * 100}% of its entries containing "
                    f"the value: {max_category}. Do you wish to remove this "
                    "feature from data preprocessing?"
                )
                remove = self._user_wants_removal()
                print()
                if remove:
                    self._discard(column)
                    logger.debug('Field exclusion of imbalanced feature "{}" succeded by user request ', column)

        for column in cols_num:
            n_missing = df[column].isnull().sum()
            n_unique = df[column].value_counts().shape[0]
            if n_missing > max_missing or n_unique <= 1:
                self._discard(column)

        # Dropping columns leaves the row index untouched, so no index reset
        # is needed afterwards.
        df.drop(self.discarded_columns, axis=1, inplace=True)

        # Surviving columns, kept in their original order so the log output
        # is deterministic.
        # NOTE(review): self.cols_group itself is left unmodified, exactly as
        # before; confirm whether later steps expect it to be refreshed.
        discarded = set(self.discarded_columns)
        cols_num = [c for c in cols_num if c not in discarded]
        cols_categ = [c for c in cols_categ if c not in discarded]

        logger.info('Numerical columns after field exclusion: {} ', cols_num)
        logger.info('Categorical columns after field exclusion: {} ', cols_categ)

        end = timer()
        logger.info('Completed excluding fields in {} seconds', round(end - start, 6))
        return df

    def _discard(self, column):
        """Record ``column`` as discarded unless it is already recorded."""
        if column not in self.discarded_columns:
            self.discarded_columns.append(column)

    @staticmethod
    def _user_wants_removal():
        """Ask on stdin whether to remove a feature; return True only for 1.

        An answer that is not an integer keeps the feature (the
        non-destructive choice) and logs a warning instead of raising.
        """
        answer = input("Enter '1' to remove this feature, '0' to keep it for further analysis : ")
        try:
            return int(answer) == 1
        except ValueError:
            logger.warning('Unrecognised answer "{}"; keeping the feature ', answer)
            return False

Writing ExcludeFields.py
