In [1]:
# If this code is running on IBM Cloud Pak, uncomment the following line
# %%writefile Encoding.py

import os
import sys
from timeit import default_timer as timer
import pandas as pd
import numpy as np
from loguru import logger

class Encoding:

    def handle(self, df):
        # function for encoding of categorical features in the data
            # select non numeric features
            cols_categ = self.cols_group["cols_categ"]
            target_cols = cols_categ
            # check if all columns should be encoded
            logger.info('Started encoding categorical features... Method: one-hot encoding')
            start = timer()
            for feature in target_cols:
                try:
                    # skip encoding of datetime features
                    pd.to_datetime(df[feature])
                    logger.debug('Skipped encoding for DATETIME feature "{}"', feature)
                except:
                    try:
                        # ONEHOT encode if not more than 10 unique values to encode
                        if df[feature].nunique() <=10:
                            df = Encoding._to_onehot(self, df, feature)
                            logger.debug('Encoding to ONEHOT succeeded for feature "{}"', feature)
                        # LABEL encode if not more than 20 unique values to encode
                        elif df[feature].nunique() <=20:
                            df = Encoding._to_label(self, df, feature)
                            logger.debug('Encoding to LABEL succeeded for feature "{}"', feature)
                        # skip encoding if more than 20 unique values to encode
                        else:
                            logger.debug('Encoding skipped for feature "{}"', feature)   
                    except:
                        logger.warning('Encoding failed for feature "{}"', feature)    
            end = timer()
            logger.info('Completed encoding of categorical features in {} seconds', round(end-start, 6))
            return df

    def _to_onehot(self, df, feature, limit=10):  
        # function that encodes categorical features to OneHot encodings    
        one_hot = pd.get_dummies(df[feature], prefix=feature)
        if one_hot.shape[1] > limit:
            logger.warning('ONEHOT encoding for feature "{}" creates {} new features. Consider LABEL encoding instead.', feature, one_hot.shape[1])
        # join the encoded df
        df = df.join(one_hot)
        return df

    def _to_label(self, df, feature):
        # function that encodes categorical features to label encodings 
        le = preprocessing.LabelEncoder()

        df[feature + '_lab'] = le.fit_transform(df[feature].values)
        mapping = dict(zip(le.classes_, range(len(le.classes_))))
        
        for key in mapping:
            try:
                if isnan(key):               
                    replace = {mapping[key] : key }
                    df[feature].replace(replace, inplace=True)
            except:
                pass
        return df  

Writing Encoding.py
