In [1]:
# %%writefile AutoDataPrep.py
    
import os
import sys
from timeit import default_timer as timer
import pandas as pd
from loguru import logger

# if code is running on IBM Cloud Pak, uncomment this part and run
# from EDA import *
# from Outliers import *
# from MissingValues import *
# from Duplicates import *
# from ExcludeFields import *
# from Rescale import *
# from Encoding import *
# from FieldAssignment import *

# if code is running on IBM Cloud Pak, comment this part and run
%run EDA.ipynb
%run Outliers.ipynb
%run MissingValues.ipynb
%run Duplicates.ipynb
%run ExcludeFields.ipynb
%run Rescale.ipynb
%run Encoding.ipynb
%run FieldAssignment.ipynb

class AutoDataPrep:
    '''
    Run an automated data-preprocessing pipeline over a Pandas dataframe.

    Constructing an instance runs the whole pipeline immediately; the
    result is stored on the ``output`` attribute. The stage classes
    (FieldAssignment, EDA, ExcludeFields, MissingValues, Outliers,
    Duplicates, Rescale, Encoding) are not defined here and must already
    be in scope when the class is instantiated. Each stage is invoked as
    ``Stage.handle(self, df)``, i.e. this instance is passed as the
    stage's ``self``.
    '''

    def __init__(self, input_data, cols_imbalanced):
        '''
        input_data (dataframe)..........Pandas dataframe
        cols_imbalanced.................forwarded unchanged to ExcludeFields.handle
                                        (presumably the imbalanced columns to
                                        consider for exclusion -- TODO confirm
                                        against ExcludeFields)
        OUTPUT (dataframe)..............a cleaned Pandas dataframe, accessible through the 'output' attribute

        AutoDataPrep class handles different stages of data preprocessing:

        + Field assignment
        + Excluding fields
        + Missing values
        + Outliers
        + Duplicate Values
        + Rescale Fields
        + Feature Encoding

        Attributes set here:
            cols_group ......... second value returned by FieldAssignment.handle
            outlier_param ...... fixed at 1.5
            output ............. dataframe returned by _clean_data

        Raises:
            ValueError ......... if input_data is not a Pandas dataframe
        '''

        start = timer()
        self._initialize_logger()

        # Validate before the first use of input_data, so that a wrong
        # argument fails with the intended ValueError instead of whatever
        # error .copy() or a pipeline stage happens to raise.
        self._validate_params(input_data)

        # The pipeline works on a copy; FieldAssignment still receives the
        # caller's original object.
        output_data = input_data.copy()

        # Field Assignment (the first returned value is not used here)
        _, self.cols_group = FieldAssignment.handle(self, input_data)

        self.outlier_param = 1.5

        # start the auto data preprocessing
        self.output = self._clean_data(output_data, input_data, cols_imbalanced)

        end = timer()
        logger.info('AutoDataPrep process completed in {} seconds', round(end-start, 6))

        print('AutoDataPrep process completed in', round(end-start, 6), 'seconds')
        print('Logfile saved to:', os.path.join(os.getcwd(), 'auto_data_prep.log'))

    def _initialize_logger(self):
        '''
        Reset the logger and attach the 'auto_data_prep.log' file sink.

        Previously registered handlers are removed first; the file is
        opened with mode 'w'.
        '''
        logger.remove()
        logger.add('auto_data_prep.log', mode='w', format='{time:DD-MM-YYYY HH:mm:ss.SS} - {level} - {message}')

    def _validate_params(self, df):
        '''
        Check that ``df`` is a Pandas dataframe.

        Subclasses of pandas.DataFrame are accepted.

        Raises:
            ValueError ......... if df is not a Pandas dataframe
        '''
        logger.info('Started validation of input parameters...')

        # isinstance rather than an exact type comparison, so DataFrame
        # subclasses pass validation too.
        if not isinstance(df, pd.DataFrame):
            raise ValueError('Invalid value for "df" parameter.')

        logger.info('Completed validation of input parameters')

    def _clean_data(self, df, input_data, cols_imbalanced):
        '''
        Run the preprocessing stages on ``df`` and return the result.

        df (dataframe)..................dataframe to process
        input_data......................not used by this method; kept so the
                                        signature stays unchanged
        cols_imbalanced.................forwarded to ExcludeFields.handle
        OUTPUT (dataframe)..............value returned by the last stage (Encoding)
        '''
        df = df.reset_index(drop=True)

        feature = df.columns[0]  # first column is given as an example. user input will be taken..
        EDA.explore(self, df, feature)  # return value, if any, is ignored

        df = ExcludeFields.handle(self, df, cols_imbalanced)
        df = MissingValues.handle(self, df)
        df = Outliers.handle(self, df)
        # MissingValues runs a second time, after the Outliers stage.
        df = MissingValues.handle(self, df)
        df = Duplicates.handle(self, df)
        df = Rescale.handle(self, df)
        df = Encoding.handle(self, df)

        return df