In [1]:
# %%writefile ExcludeFields.py

from timeit import default_timer as timer
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from loguru import logger
import warnings
warnings.filterwarnings('ignore')

class ExcludeFields:
    """Drop uninformative columns from a DataFrame.

    ``handle`` reads ``self.cols_group``, a mapping that must provide the keys
    ``"cols_num"`` and ``"cols_categ"``.  Nothing in this class assigns that
    attribute, so it has to be set on the instance before ``handle`` is called.
    """

    # A column is discarded when more than this share of its rows is missing.
    _MISSING_SHARE_LIMIT = 0.5
    # A categorical column is discarded when its number of distinct values
    # reaches (>=) this share of the row count.
    _UNIQUE_SHARE_LIMIT = 0.9

    def handle(self, df, cols_imbalanced):
        """Remove user-requested and uninformative columns from ``df``.

        Besides the columns listed in ``cols_imbalanced``, a column is
        discarded when:

        - more than 50% of its values are missing,
        - it holds at most one distinct non-missing value, or
        - it is categorical and its number of distinct values is at least
          90% of the number of rows.

        Args:
            df: DataFrame to clean.  It is modified in place.
            cols_imbalanced: Column labels the caller wants removed.  The
                sequence is copied, never modified; ``None`` means "none".

        Returns:
            The same ``df`` object, with the discarded columns dropped and
            the index reset to a default range.

        Side effects:
            Sets ``self.discarded_columns`` to the list of dropped labels
            (user-requested ones first, then categorical, then numerical).
            ``self.cols_group`` is only read; the filtered column lists
            computed at the end are logged, not stored.
        """
        logger.info('Features to be excluded by user request: {} ', cols_imbalanced)
        logger.info('Started excluding further features...')
        start = timer()

        # Work on a copy: appending to the caller's own list would leak the
        # automatically discarded columns back into the caller's state.
        self.discarded_columns = (
            list(cols_imbalanced) if cols_imbalanced is not None else []
        )

        cols_num = self.cols_group["cols_num"]
        cols_categ = self.cols_group["cols_categ"]

        logger.info('Numerical columns: {} ', cols_num)
        logger.info('Categorical columns: {} ', cols_categ)

        n_rows = len(df)
        already_discarded = set(self.discarded_columns)

        # Categorical columns are checked first so that the order of
        # ``self.discarded_columns`` is: requested, categorical, numerical.
        # Only categorical columns get the high-cardinality check.
        for columns, check_cardinality in ((cols_categ, True), (cols_num, False)):
            for col in columns:
                if col in already_discarded:
                    continue
                if self._is_uninformative(df[col], n_rows, check_cardinality):
                    self.discarded_columns.append(col)
                    already_discarded.add(col)

        # Exclude the discarded columns from the dataframe (in place).
        df.drop(self.discarded_columns, axis=1, inplace=True)
        # reset_index returns a new frame unless inplace=True is given.
        df.reset_index(drop=True, inplace=True)

        # Remaining columns per group; used for logging only.
        cols_num = [col for col in cols_num if col not in already_discarded]
        cols_categ = [col for col in cols_categ if col not in already_discarded]

        logger.info('Numerical columns after feature exclusion: {} ', cols_num)
        logger.info('Categorical columns after feature exclusion: {} ', cols_categ)

        end = timer()
        logger.info('Completed excluding features in {} seconds', round(end-start, 6))
        return df

    def _is_uninformative(self, column, n_rows, check_cardinality):
        """Return True when ``column`` (a Series of ``n_rows`` rows) should be dropped."""
        if column.isnull().sum() > self._MISSING_SHARE_LIMIT * n_rows:
            return True

        # nunique() ignores missing values, as value_counts() does by default.
        n_unique = column.nunique()
        if n_unique <= 1:
            return True

        return bool(
            check_cardinality and n_unique >= n_rows * self._UNIQUE_SHARE_LIMIT
        )