In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

class prepare_data:
    """Helpers for cleaning a DataFrame and producing scaled data splits.

    The scalers passed to the constructor are expected to follow the
    scikit-learn transformer protocol (``fit_transform`` and ``transform``).
    """

    def __init__(self, x_scaler, y_scaler):
        """Store the scalers and start with an empty list of bad columns.

        Args:
            x_scaler: scaler applied to the feature splits in ``split_norm``.
            y_scaler: scaler applied to the target splits in ``split_norm``.
        """
        # Columns found to contain missing values; filled by find_NAN_cols
        # and consulted by drop_NAN_cols.
        self.bad_cols = []
        self.x_scaler = x_scaler
        self.y_scaler = y_scaler

    def find_NAN_cols(self, df):
        """Record every column of ``df`` that contains missing data.

        Column names are appended to ``self.bad_cols``. Names accumulate
        across calls, but a name already recorded is not added twice.
        """
        has_missing = df.isnull().any()
        for col in df.columns[has_missing]:
            if col not in self.bad_cols:
                self.bad_cols.append(col)

    def drop_NAN_cols(self, df):
        """Return ``df`` without the columns listed in ``self.bad_cols``.

        Recorded names that are absent from ``df`` are ignored. The input
        frame is left untouched.
        """
        bad = set(self.bad_cols)
        to_drop = [col for col in df.columns if col in bad]
        # A single drop avoids copying the frame once per removed column.
        return df.drop(columns=to_drop)

    def str2cat(self, df):
        """One-hot encode every non-numeric column of ``df``.

        Non-numeric columns are cast to ``category`` and then expanded with
        ``pandas.get_dummies``. The encoded frame is returned; the input
        frame is not modified.
        """
        non_numeric = df.select_dtypes(exclude=[np.number]).columns
        # astype with a mapping returns a new frame, so the caller's
        # DataFrame keeps its original dtypes.
        df = df.astype({col: 'category' for col in non_numeric})

        cat_cols = df.select_dtypes(['category']).columns
        return pd.get_dummies(df, columns=cat_cols)

    def split_norm(self, x, y, test_size = .2, val_size = .2):
        """Split data into train, test and validation sets and scale them.

        The scalers are fitted on the training split only and then applied
        to the test and validation splits, so all three share one scale and
        no statistics from held-out data leak into the transformation. After
        the call ``self.x_scaler`` / ``self.y_scaler`` hold the training fit.

        Args:
            x: feature data.
            y: target data.
            test_size: share of the data held out for testing.
            val_size: share of the remaining (non-test) data held out for
                validation.

        Returns:
            ``(x_train, x_test, x_val, y_train, y_test, y_val)``, each a
            DataFrame built from the scaler output.
        """
        # Split data into train and test
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size = test_size, random_state = 101)

        # Split off for validation
        x_train, x_val, y_train, y_val = train_test_split(
            x_train, y_train, test_size = val_size, random_state = 101)

        # Cast targets to DataFrames before they are handed to the scaler
        y_train = pd.DataFrame(y_train)
        y_test = pd.DataFrame(y_test)
        y_val = pd.DataFrame(y_val)

        # Fit on the training split only; reuse that fit for the others
        x_train = pd.DataFrame(self.x_scaler.fit_transform(x_train))
        x_test = pd.DataFrame(self.x_scaler.transform(x_test))
        x_val = pd.DataFrame(self.x_scaler.transform(x_val))

        y_train = pd.DataFrame(self.y_scaler.fit_transform(y_train))
        y_test = pd.DataFrame(self.y_scaler.transform(y_test))
        y_val = pd.DataFrame(self.y_scaler.transform(y_val))

        # Return our datasets
        return x_train, x_test, x_val, y_train, y_test, y_val

    def pre_process(self, df):
        """Clean ``df``: drop columns with missing data, then one-hot encode.

        Runs ``find_NAN_cols``, ``drop_NAN_cols`` and ``str2cat`` in that
        order and returns the resulting frame.
        """
        self.find_NAN_cols(df)
        new_df = self.drop_NAN_cols(df)
        new_df = self.str2cat(new_df)
        return new_df