# Data Preprocessing Functions
> Updated 10/2/2019

In [3]:
def percent_NA(data):
    """
    Returns a pandas dataframe denoting the total number of NA values and the percentage of NA values in each column.
    The column names are noted on the index.

    Only columns with at least one missing value are reported, ordered from the most to the fewest missing values.

    Parameters
    ----------
    data: dataframe

    Returns
    -------
    dataframe
        Columns 'Number of NA' (count of null values) and 'Percent NA' (that count as a
        percentage of the number of rows, rounded to two decimals).
    """
    # pandas series denoting features and the sum of their null values
    null_sum = data.isnull().sum()

    # Keep only features that actually have missing data. Filtering on the count
    # (instead of "any cell != 0") keeps a zero-row dataframe from reporting every
    # column: its percentage would be 0/0 = NaN, and NaN != 0 evaluates to True.
    null_sum = null_sum[null_sum > 0]

    percent = ((null_sum / len(data.index)) * 100).round(2)

    df_NA = pd.DataFrame({'Number of NA': null_sum, 'Percent NA': percent})

    # A single stable sort keeps tied columns in their original column order.
    return df_NA.sort_values('Number of NA', ascending=False, kind='mergesort')

In [2]:
def drop_vars(data, threshold=40):
    """
    Drops features whose percentage of missing values is greater than a threshold.

    Parameters
    ----------
    data : dataframe
    threshold : number, default 40
        Percentage (0-100) of missing values above which a feature is dropped.
        The comparison is strict, so a feature sitting exactly at the threshold is kept.

    Returns
    -------
    dataframe
        A new dataframe without the dropped features; `data` itself is not modified.
    """
    # percentage of null values per feature
    percent = data.isnull().sum() / len(data.index) * 100

    # names of the features whose missing percentage exceeds the threshold
    too_sparse = percent[percent > threshold].index

    return data.drop(too_sparse, axis=1)

In [1]:
def group_xy(col1, col2):
    """
    Groups values xn, yn from two columns into the form: [[x1, y1], [x2, y2]...[xn, yn]]. A form suitable for K-means.

    Values are paired by position, not by index label. One pair is produced per element of col1;
    any extra trailing elements of col2 are ignored.

    Parameters
    ----------
    col1: xn column (pandas Series or any other sized iterable)
    col2: yn column (pandas Series or any other sized iterable)

    Returns
    -------
    list of [x, y] lists

    Raises
    ------
    IndexError
        If col2 has fewer elements than col1.
    """
    # Positional indexing past the end of col2 used to raise IndexError; keep that
    # contract explicit, because zip() would otherwise silently truncate.
    if len(col2) < len(col1):
        raise IndexError("col2 has fewer elements than col1")

    return [[x, y] for x, y in zip(col1, col2)]

In [1]:
from sklearn.preprocessing import LabelEncoder

def encode_categoricals(data, cols):
    """
    Encodes categorical variables with no NA values. Takes in a dataframe and the columns to encode.
    Not one hot encoding: each listed column is replaced by the integer labels produced by
    sklearn's LabelEncoder.

    The dataframe is modified in place and also returned.

    Parameters
    ----------
    data : dataframe
    cols: columns/variables to encode (a list of column names, or a single column name)
    """
    # Accept a single column name as well as a list of names.
    if isinstance(cols, str):
        cols = [cols]

    # Instantiate LabelEncoder
    le = LabelEncoder()

    # fit_transform re-fits the encoder on every call, so each column gets its own
    # independent mapping; the fitted mappings are not retained.
    for col in cols:
        data[col] = le.fit_transform(data[col])

    return data

In [None]:
def standardize(data):
    """
    Standardizes the data column-wise: subtracts each column's mean and scales it to unit variance.

    Columns with zero standard deviation (constant columns) are mapped to 0 instead of NaN/inf.

    Code provided by: "https://stackoverflow.com/questions/31152967/normalise-2d-numpy-array-zero-mean-unit-variance"

    Parameters
    ----------
    data : data
        Any object exposing `.mean(axis=0)` and `.std(axis=0)`, e.g. a numpy array or a
        pandas dataframe. The standard deviation is whatever `data.std` returns, so its
        degrees-of-freedom convention follows the input type.
    """
    center = data.mean(axis=0)
    scale = data.std(axis=0)

    # Avoid dividing by zero for constant columns: adding the boolean mask turns a
    # standard deviation of exactly 0 into 1 and leaves every other value untouched.
    scale = scale + (scale == 0)

    standardized = (data - center) / scale
    return standardized

def destandardize(standardized_data, data):
    """
    Maps standardized values back onto the scale of the original data.

    Parameters
    ----------
    standardized_data: standardized data
    data : original data, whose column-wise standard deviation and mean are used for the rescaling
    """
    # column-wise statistics of the original data
    scale = data.std(axis=0)
    center = data.mean(axis=0)

    # undo the standardization: multiply by the spread, then shift by the mean
    return standardized_data * scale + center

In [None]:
def process_dates(data, col_name):
    """
    Converts a column of date strings to datetime variables.

    Only the first whitespace-separated token of each string is kept before conversion
    (e.g. '2019-10-02 13:45:00' -> '2019-10-02'). Missing or blank entries become NaT.

    The dataframe is modified in place and also returned.

    Parameters
    ----------
    data : data
    col_name: string name of column with dates
    """
    # Vectorized string handling instead of a per-label lookup: this also works when
    # the index contains duplicate labels and does not raise on missing values.
    first_token = data[col_name].str.split().str[0]

    data[col_name] = pd.to_datetime(first_token)
    return data