# HMDA Cleaner

The purpose of this program is to clean HMDA data for further data needs.

This program is designed to be used for future HMDA data publications.

## Import HMDA Data, Packages, and Check Dataset.

In [None]:
#This code does not need to be changed.
#Imports packages to be used by program
import pandas as pd
import numpy as np

In [None]:
#This code does not need to be changed.
#Display preference for reviewing the data:
#removes the column limit so that every column of a dataframe is shown.
pd.set_option('display.max_columns', None)

In [None]:
#Location of the HMDA sample CSV; this relative path is what gets passed to pd.read_csv.
HMDA_dataset_file_location = r'2019 HMDA Sample.csv'

In [None]:
#Reads the sample CSV into a dataframe and previews its first five rows.
HMDA_dataset = pd.read_csv(filepath_or_buffer=HMDA_dataset_file_location)
HMDA_dataset.head(n=5)

In [None]:
#HMDA code -> label lookups. A code that is absent from a lookup decodes to NaN.
#Race codes 6 and 7 and ethnicity codes 3 and 4 ('Not Reported') are left out on
#purpose so that those rows are dropped together with the blank ones.
_RACE_LABELS = {
    **dict.fromkeys((2, 21, 22, 23, 24, 25, 26, 27), 'Asian'),
    3: 'Black',
    5: 'White',
    **dict.fromkeys((1, 4, 41, 42, 43, 44), 'Other'),
}
_ETHNICITY_LABELS = {
    **dict.fromkeys((1, 11, 12, 13, 14), 'Hispanic'),
    2: 'Not Hispanic',
}
_SEX_LABELS = {1: 'Male', 2: 'Female'}
_ACTION_LABELS = {1: 'Approved', 2: 'Approved', 3: 'Denied'}
#4 is the USDA Rural Housing Service or Farm Service Agency.
_LOAN_TYPE_LABELS = {1: 'Conventional', 2: 'FHA', 3: 'VA', 4: 'RHS or FSA'}

#Final header for each raw or dummy column.
_COLUMN_RENAMES = {
    'Approval Indicator_Approved': 'Approved',
    'Approval Indicator_Denied': 'Denied',
    'income': 'Income',
    'applicant_race-1_Asian': 'Asian',
    'applicant_race-1_Black': 'Black',
    'applicant_race-1_Other': 'Other',
    'applicant_race-1_White': 'White',
    'applicant_ethnicity-1_Hispanic': 'Hispanic',
    'applicant_ethnicity-1_Not Hispanic': 'Not Hispanic',
    'applicant_sex_Female': 'Female',
    'applicant_sex_Male': 'Male',
    'loan_to_value_ratio': 'LTV',
    'loan_amount': 'Loan Amount',
    'denial_reason-1': 'Denial Reason',
    'state_code': 'State',
    'county_code': 'County Code',
    'census_tract': 'Census Tract',
}

#Columns copied from the data as they are; every other output column is a 0/1 indicator.
_VALUE_COLUMNS = ('State', 'County Code', 'Census Tract', 'Income', 'Log Income',
                  'Loan Amount', 'Log Loan Amount', 'LTV')

#Columns of the output CSV, in order.
#NOTE(review): there is no 'DTI_37' or 'DTI_49' entry, so rows with those ratios
#get 0 in every DTI column - confirm whether that is intended.
_OUTPUT_COLUMNS = [
    'State', 'County Code', 'Census Tract', 'Approved', 'Denied',
    'White', 'Black', 'Asian', 'Other', 'Hispanic', 'Not Hispanic',
    'Male', 'Female', 'Income', 'Log Income',
    'Loan Amount', 'Log Loan Amount', 'LTV', 'Loan Type_Conventional',
    'Loan Type_FHA', 'Loan Type_VA', 'Loan Type_RHS or FSA', 'DTI_20%-<30%',
    'DTI_30%-<36%', 'DTI_36', 'DTI_38', 'DTI_39', 'DTI_40', 'DTI_41',
    'DTI_42', 'DTI_43', 'DTI_44', 'DTI_45', 'DTI_46', 'DTI_47',
    'DTI_48', 'DTI_50%-60%', 'DTI_<20%', 'DTI_>60%',
]


def _to_number(column):
    '''Return column as numbers; anything that is not a number becomes NaN.'''
    return pd.to_numeric(column, errors='coerce')


def _decode(column, labels):
    '''Translate the numeric codes in column to labels; unlisted codes become NaN.'''
    return _to_number(column).map(labels)


def _dti_label(value):
    '''Return a debt-to-income value as text, writing whole numbers without a decimal point.'''
    #A purely numeric column is read as floats; 36.0 has to land in 'DTI_36'.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def HMDA_cleaner(HMDA_dataset_file_location, Name_of_Output_CSV):
    '''
    Clean a raw HMDA CSV and write the cleaned data to a new CSV.
    Program to call for future HMDA data.
    This code does not need to be changed unless column headers change
    (e.g. 'state_code' changes to 'state').

    A row is kept only when all of the following hold:
      - applicant race and ethnicity are reported,
      - income is a number greater than 0,
      - the action taken is an approval (1 or 2) or a denial (3),
      - the loan to value ratio is a number,
      - the debt to income ratio is present and not 'Exempt',
      - the loan amount is a number greater than 0,
      - the loan purpose is home purchase (1).

    Race, ethnicity, sex, approval, loan type and debt to income ratio are
    written as 0/1 indicator columns. An indicator whose category does not
    occur in the data is written as a column of zeros.

    Parameters:
        HMDA_dataset_file_location: location of the raw HMDA CSV.
        Name_of_Output_CSV: location the cleaned CSV is written to.

    Returns:
        The first five rows of the cleaned dataframe.

    Raises:
        KeyError: if an expected column header is missing from the input.
    '''
    #load in dataset.
    data = pd.read_csv(HMDA_dataset_file_location)

    #Decode the categorical codes and convert the numeric fields.
    data['applicant_race-1'] = _decode(data['applicant_race-1'], _RACE_LABELS)
    data['applicant_ethnicity-1'] = _decode(data['applicant_ethnicity-1'], _ETHNICITY_LABELS)
    data['applicant_sex'] = _decode(data['applicant_sex'], _SEX_LABELS)
    data['Approval Indicator'] = _decode(data['action_taken'], _ACTION_LABELS)
    data['Loan Type'] = _decode(data['loan_type'], _LOAN_TYPE_LABELS)
    data['income'] = _to_number(data['income'])
    data['loan_amount'] = _to_number(data['loan_amount'])
    data['loan_to_value_ratio'] = _to_number(data['loan_to_value_ratio'])

    #Build every row filter on the same dataframe so that the masks stay aligned.
    #NaN compares False, so '> 0' also removes the rows with no value.
    dti = data['debt_to_income_ratio']
    exempt = dti.astype(str).str.contains('Exempt', regex=False, na=False)
    keep = (
        data['applicant_race-1'].notna()
        & data['applicant_ethnicity-1'].notna()
        & (data['income'] > 0)
        & data['Approval Indicator'].notna()
        & data['loan_to_value_ratio'].notna()
        & dti.notna()
        & ~exempt
        & (data['loan_amount'] > 0)
        #This study is only interested in Home Purchase loans.
        & (_to_number(data['loan_purpose']) == 1)
    )
    #Copy so the new columns are added to an independent dataframe, not to a slice.
    cleaned = data.loc[keep].copy()
    cleaned['DTI'] = cleaned['debt_to_income_ratio'].map(_dti_label)
    cleaned['Log Income'] = np.log(cleaned['income'])
    cleaned['Log Loan Amount'] = np.log(cleaned['loan_amount'])

    #Get dummy variables. The dtype is fixed so that the CSV holds 0/1 on every
    #pandas version (from pandas 2.0 the default dummy dtype is bool).
    #Missing labels (for example a sex code other than 1 or 2) get 0 in every dummy.
    encoded = pd.get_dummies(
        cleaned,
        columns=['applicant_race-1', 'applicant_ethnicity-1', 'Approval Indicator',
                 'applicant_sex', 'DTI', 'Loan Type'],
        dtype=np.uint8,
    )
    renamed = encoded.rename(columns=_COLUMN_RENAMES)

    #A category that never occurs produces no dummy column; add it as all zeros
    #instead of failing when the columns are pulled.
    absent = [name for name in _OUTPUT_COLUMNS
              if name not in _VALUE_COLUMNS and name not in renamed.columns]
    renamed = renamed.assign(**{name: np.uint8(0) for name in absent})

    #Pull columns
    Final_df = renamed[_OUTPUT_COLUMNS]

    #Save the Dataframe to a csv
    Final_df.to_csv(Name_of_Output_CSV, index=False)

    #Outputs
    print('Data written to CSV sucessfully.')
    print(Final_df.shape[0],'Rows,',Final_df.shape[1],'Columns.')
    return Final_df.head()

In [None]:
#Run HMDA_cleaner.
#Give the raw file location as r'file_location', followed by the name of the CSV to write.
HMDA_cleaner(
    HMDA_dataset_file_location=r'2019 HMDA Sample.csv',
    Name_of_Output_CSV='2019 HMDA Clean Sample.csv',
)

### The CSV is now ready to be used for further descriptive statistics and the LPM.