In [1]:
# If this code is running on IBM Cloud Pak, uncomment the following line
# %%writefile MissingValues.py

from timeit import default_timer as timer
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from loguru import logger
import warnings
warnings.filterwarnings('ignore')

class MissingValues:
    """Fill missing values in a DataFrame column by column with a KNN imputer.

    The methods read ``self.cols_group["cols_num"]`` (the names of the columns
    to treat as numeric) and write ``self.count_missing`` and
    ``self.missing_categ``. ``cols_group`` is not set anywhere in this class,
    so it must already exist on ``self`` before ``handle`` is called.
    """

    def handle(self, df, _n_neighbors=3):
        """Impute all missing values in ``df`` and return the resulting frame.

        Rows in which every value is missing are dropped and the index is
        renumbered before imputation. When ``df`` has no missing values it is
        returned unchanged.

        Args:
            df: pandas DataFrame to clean.
            _n_neighbors: forwarded to ``KNNImputer(n_neighbors=...)``.

        Returns:
            The DataFrame with missing values imputed where imputation
            succeeded; columns whose imputation failed are left untouched.
        """
        logger.info('Started handling of missing values...')
        start = timer()
        self.count_missing = df.isna().sum().sum()

        if self.count_missing != 0:
            logger.info('Found a total of {} missing value(s)', self.count_missing)
            # Drop rows where every value is missing, then renumber the rows.
            # reset_index returns a new frame, so its result must be kept;
            # otherwise the gaps left in the index by dropna would remain.
            df = df.dropna(how='all').reset_index(drop=True)

            logger.info('Started handling missing values... ')
            self.missing_categ = 'knn'
            imputer = KNNImputer(n_neighbors=_n_neighbors)
            # Called through the class rather than through ``self`` so this
            # keeps working if ``handle`` is invoked with a ``self`` that is
            # not a MissingValues instance -- presumably intentional; verify
            # against callers before changing.
            df = MissingValues._impute(self, df, imputer)

        else:
            logger.debug('{} missing values found', self.count_missing)
        end = timer()
        logger.info('Completed handling of missing values in {} seconds', round(end-start, 6))
        return df

    @staticmethod
    def _knn_fill(column, imputer):
        """Return ``column`` with its gaps filled by ``imputer``.

        The result carries ``column``'s own index, so assigning it back into
        the frame cannot shift values onto other rows. A length mismatch
        (e.g. the imputer returned no column at all) raises ``ValueError``.

        NOTE(review): the imputer only ever sees this single column, so there
        are no other features to measure neighbour distance on -- confirm this
        is the intended use of KNNImputer.
        """
        filled = imputer.fit_transform(np.array(column).reshape(-1, 1))
        return pd.Series(filled.ravel(), index=column.index, name=column.name)

    def _impute(self, df, imputer):
        """Impute each column of ``df`` that contains missing values.

        Columns listed in ``self.cols_group["cols_num"]`` are imputed directly
        and cast to ``Int64`` when every observed value was a whole number.
        All other columns are encoded as integer codes, imputed, rounded to
        the nearest code and decoded back to their original values.

        ``df`` is modified in place and also returned. A column is only
        overwritten once its imputation has fully succeeded; on failure a
        warning is logged and the column keeps its original content.

        Args:
            df: pandas DataFrame to impute.
            imputer: object exposing ``fit_transform`` (a ``KNNImputer`` when
                called from ``handle``).

        Returns:
            The same DataFrame object, with imputed columns.
        """
        cols_num = self.cols_group["cols_num"]

        for feature in df.columns:
            # Nothing to do for columns without gaps.
            if not df[feature].isna().to_numpy().any():
                continue

            if feature in cols_num:
                try:
                    column = df[feature]
                    filled = MissingValues._knn_fill(column, imputer)
                    counter = column.isna().sum().sum() - filled.isna().sum().sum()

                    # Decided on the original data: if every observed value
                    # was a whole number, round the imputed column back to
                    # integers (-9999 is just a whole-number placeholder for
                    # the gaps so they do not affect the check).
                    if (column.fillna(-9999) % 1 == 0).all():
                        filled = filled.round().astype('Int64')

                    df[feature] = filled
                    if counter != 0:
                        logger.debug('KNN imputation of {} value(s) succeeded for feature "{}"', counter, feature)
                except Exception:
                    logger.warning('KNN imputation failed for feature "{}"', feature)

            else:
                try:
                    column = df[feature]
                    # Encode each distinct observed value as an integer code.
                    codes = {value: code for code, value in enumerate(column.dropna().unique())}
                    encoded = column.map(codes)

                    filled = MissingValues._knn_fill(encoded, imputer)
                    # NaN != anything, so every originally missing entry counts.
                    counter = sum(1 for new, old in zip(filled, encoded) if new != old)

                    # Round to the nearest code, then decode back to the
                    # original values. The frame is only touched here, after
                    # everything above succeeded, so a failure never leaves
                    # the column in its encoded form.
                    decode = {code: value for value, code in codes.items()}
                    df[feature] = filled.round().astype('Int64').map(decode)
                    if counter != 0:
                        logger.debug('{} imputation of {} value(s) succeeded for feature "{}"', self.missing_categ.upper(), counter, feature)
                except Exception:
                    logger.warning('{} imputation failed for feature "{}"', str(self.missing_categ).upper(), feature)
        return df

Writing MissingValues.py
