In [1]:
# If this notebook runs on IBM Cloud Pak, uncomment the next line to write this cell to Rescale.py
# %%writefile Rescale.py

import os
import sys
from timeit import default_timer as timer

import numpy as np
import pandas as pd
from loguru import logger
from scipy.stats import kstest
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


In [2]:
class Rescale:
    """Rescale numerical DataFrame columns with min-max or z-score scaling.

    For every numerical column a Kolmogorov-Smirnov test against the standard
    normal distribution decides which scaler is applied. The scaled values are
    written to a NEW column (suffix ``_mms`` or ``_ss``); the source column is
    left untouched.
    """

    def handle(self, df):
        """Rescale the numerical columns listed in ``self.cols_group["cols_num"]``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding the columns to rescale. It is modified in place
            (new columns are added) and also returned.

        Returns
        -------
        pandas.DataFrame
            ``df`` with a ``<col>_mms`` or ``<col>_ss`` column added for every
            column that met the scaling criteria.

        Notes
        -----
        ``self`` must expose ``cols_group``, a mapping whose ``"cols_num"``
        entry lists the column names to process. This class never sets that
        attribute, so it is presumably supplied by the calling pipeline
        object -- TODO confirm against the caller.
        """
        logger.info('Started rescaling numerical fields...')
        start = timer()

        # Kolmogorov-Smirnov test.
        # Null hypothesis: the sample is distributed according to the standard normal.
        # Confidence level is 95%.
        alpha = 0.05
        # A non-normal column is min-max scaled only if its range exceeds this value.
        min_range = 5
        # A normal-looking column is z-scored only if its variance exceeds this value.
        min_variance = 1

        cols_num = self.cols_group["cols_num"]
        for feature in cols_num:
            values = df[feature].astype(np.float64)

            # Missing values would turn the p-value into NaN, and the
            # `p_value < alpha` check would then silently classify the column
            # as normal. Run the test on the observed values only.
            observed = values.dropna()
            if observed.empty:
                # Nothing to test or scale in an all-missing column.
                continue

            _, p_value = kstest(observed, 'norm')
            if p_value < alpha:
                # Null hypothesis rejected: data are not normally distributed,
                # so scale into [0, 1] with MinMaxScaler.
                if (values.max() - values.min()) > min_range:
                    # Called through the class (not `self.`) so this keeps
                    # working when `self` is not a Rescale instance.
                    df = Rescale._minmaxscaler(self, df, feature)
            else:
                # Null hypothesis cannot be rejected: use the z-score.
                if values.var() > min_variance:
                    df = Rescale._standardscaler(self, df, feature)

        end = timer()
        logger.info('Completed rescaling numerical fields in {} seconds', round(end-start, 6))
        return df

    def _minmaxscaler(self, df, feature):
        """Add ``<feature>_mms`` holding ``feature`` scaled with MinMaxScaler.

        ``df`` is modified in place and returned.
        """
        mms = MinMaxScaler()
        df[feature + '_mms'] = mms.fit_transform(df[[feature]])

        return df

    def _standardscaler(self, df, feature):
        """Add ``<feature>_ss`` holding ``feature`` scaled with StandardScaler (z-score).

        ``df`` is modified in place and returned.
        """
        ss = StandardScaler()
        df[feature + '_ss'] = ss.fit_transform(df[[feature]])

        return df