# Regress Mortgage Approvals on Race, Sex, and Other Variables

The purpose of this program is to regress a mortgage approval variable against race, ethnicity, gender, and other control variables found in HMDA data. This script will be designed for a distributive environment using the model below.

$P(Approval = 1 | Race/Sex, \chi_ji, \alpha_i) = \beta_0 + \lambda_ji * Race/Sex + \beta_ji * \chi_ji + \alpha_i + \mu $

Where $\lambda_ji$ are the variables of interest, $\beta_ji$ are the coefficients on the control variables, 
alpha_i are the fixed effects, and $\chi_j$ are the control variables.

Variables of Interest
- White
- Black
- Asian
- Hispanic
- Other
- Male 
- Female

Control Variables
- Income (log)
- Loan to Value ratio
- Debt to Income ratio
- Loan Amount (log)
- Pre-Approval indicators

Variables ommited in model to prevent perfect collinearity.
- Race - White
- Sex - Male

Filters
- Loan Purpose
- Occupancy Type

Clustered Standard errors
- by Lender
- by State
- by County
- by Census Tract

In [9]:
import pandas as pd
pd.options.display.max_columns = None

import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.models import Sequential

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
import sys
print("Python Version", sys.version)
tf.test.is_built_with_cuda()
print("TF Version", tf.version.VERSION)

Num GPUs Available:  1
Python Version 3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
TF Version 2.6.0


- FE: https://www.youtube.com/watch?v=FCm3_Id6RKM
- Custom Model: https://www.youtube.com/watch?v=S6tLSI8bjGs

### Load in Dataset

In [10]:
# Load in HMDA Data
HMDA = r'HMDA Clean IL SAMPLE.csv'
HMDA = pd.read_csv(HMDA)
#HMDA

### Further Cleaning

In [11]:
#Clean df
HMDA_clean_1 = HMDA.copy()
HMDA_clean_1 = HMDA_clean_1.dropna()
HMDA_clean_1['Census_Tract'] = HMDA_clean_1['Census_Tract'].apply(str)

#Filter Occupancy type to Principoal residence. Omits secondary residence purposes and investment purposes.
# "Occupancy_Type" = 1, Second Residence" = 2, "Investment Property" = 3.
HMDA_clean_2_1 = HMDA_clean_1[HMDA_clean_1["Occupancy_Type"] == 1]

#Sets County_Code and Census_Tract as strings.
HMDA_clean_2 = HMDA_clean_2_1.copy()
HMDA_clean_2['County_Code'] = HMDA_clean_2['County_Code'].astype(str)
HMDA_clean_2['Census_Tract'] = HMDA_clean_2['Census_Tract'].astype(str)
HMDA_clean_2

#Sets dummy variables for different columns
HMDA_clean_3 = pd.get_dummies(HMDA_clean_2, columns = ['Year','Race', 'Sex', 'DTI_Ratio', 'Loan_Type'])
HMDA_clean_3.columns

Index(['index', 'Lender_LEI', 'State', 'County_Code', 'Census_Tract',
       'Approved', 'Denied', 'Income', 'Log_Income', 'Loan_Amount',
       'Log_Loan_Amount', 'LTV', 'Preapproval', 'Occupancy_Type', 'Year_2019',
       'Year_2020', 'Year_2021', 'Race_0_White', 'Race_Asian', 'Race_Black',
       'Race_Latinx', 'Race_Other', 'Sex_0_Male', 'Sex_Female',
       'DTI_Ratio_0%-20%', 'DTI_Ratio_20%-<30%', 'DTI_Ratio_30%-<36%',
       'DTI_Ratio_36', 'DTI_Ratio_37', 'DTI_Ratio_38', 'DTI_Ratio_39',
       'DTI_Ratio_40', 'DTI_Ratio_41', 'DTI_Ratio_42', 'DTI_Ratio_43',
       'DTI_Ratio_44', 'DTI_Ratio_45', 'DTI_Ratio_46', 'DTI_Ratio_47',
       'DTI_Ratio_48', 'DTI_Ratio_49', 'DTI_Ratio_50%-60%', 'DTI_Ratio_>60%',
       'Loan_Type_Conventional', 'Loan_Type_FHA', 'Loan_Type_RHS or FSA',
       'Loan_Type_VA'],
      dtype='object')

In [12]:
#Remove Columns to prevent Multicollinearity, remove index, and remove outcome variables. 
HMDA_clean_4 = HMDA_clean_3.drop(['Race_0_White', 'Sex_0_Male', 'DTI_Ratio_0%-20%','index'], axis = 1)
df_clean = HMDA_clean_4.copy()

### Create Fixed Effects Set

### Model 1 Training Set

In [13]:
#Remove y variable and columns not to be used
train_set_1 = df_clean.drop(['Denied', 'Occupancy_Type', 'Lender_LEI', 'County_Code',
                            'Census_Tract', 'Income', 'Loan_Amount', 'Preapproval','State'], axis = 1)
train_1 = np.array(train_set_1)
x_1 = train_set_1.drop(['Approved'], axis = 1)
y_1 = train_set_1['Approved']

train_set_1.shape

(184853, 34)

# Model 1

In [14]:
[len(x_1.keys())]

[33]

In [105]:
def build_sgd_model(training_set):
    model = keras.Sequential([ 
        #layers.InputLayer(input_shape = (1,1)),
        layers.Dense(1, input_shape = [len(x_1.keys())], activation = 'relu', name = 'output')
        ])
        
    
    optimizer = tf.keras.optimizers.SGD(learning_rate = .01, momentum = .3)
    
    model.compile(loss = 'mse',
                  optimizer = optimizer,
                  metrics = ['mae', 'mse'])
    return model
    
model_1 = build_sgd_model(train_set_1)

In [106]:
model_1.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
output (Dense)               (None, 1)                 34        
Total params: 34
Trainable params: 34
Non-trainable params: 0
_________________________________________________________________


### Train The Model

In [107]:
class PrintDot(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if epoch % 100 == 0: print('')
        print('.', end = '')

Epochs = 5

history = model_1.fit(
    x_1,
    y_1,
    epochs = Epochs,
    verbose = 0,
    callbacks = [PrintDot()],
    validation_split = 0,
    use_multiprocessing = True
    )


.....

In [108]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist

Unnamed: 0,loss,mae,mse,epoch
0,0.918718,0.917466,0.918718,0
1,0.917113,0.917113,0.917113,1
2,0.917113,0.917113,0.917113,2
3,0.917113,0.917113,0.917113,3
4,0.917113,0.917113,0.917113,4


In [64]:
model_1.save('test model.h5')

In [57]:
model_1.evaluate(x_1, y_1, callbacks = [PrintDot()],use_multiprocessing = True)



[0.0760369673371315, 0.15233708918094635, 0.0760369673371315]

In [109]:
cheese = model_1.get_layer(index = 0)
cheese.get_weights()

[array([[-0.6672151 ],
        [-1.1603408 ],
        [-7.5232253 ],
        [ 0.15732604],
        [ 0.22950746],
        [-0.3470996 ],
        [-0.3057156 ],
        [-0.29832223],
        [ 0.37868693],
        [ 0.19945171],
        [ 0.288951  ],
        [-0.01032618],
        [ 0.38781   ],
        [ 0.29603264],
        [-0.39604366],
        [-0.31818324],
        [-0.22955036],
        [ 0.31359056],
        [-0.2198296 ],
        [-0.213558  ],
        [-0.22459546],
        [ 0.17849594],
        [-0.03501233],
        [-0.1084208 ],
        [-0.42035535],
        [ 0.37922338],
        [-0.10671175],
        [-0.08700678],
        [-0.12854314],
        [-0.41781574],
        [-0.14560299],
        [-0.09741985],
        [ 0.24159299]], dtype=float32),
 array([-0.08060312], dtype=float32)]

In [19]:
def plot_history(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    
    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel('Mean Abs Error')
    plt.plot(hist['epoch'], hist['mse']
    

SyntaxError: unexpected EOF while parsing (466573885.py, line 9)

In [47]:
class HMDARegress(keras.Sequential):
    
    def __init__(self):
        super().__init__()
        self.dense = tf.keras.layers.Dense(1, activation = 'relu', name = 'output')
        
    def call(self, inpus, training = False):
        x = self.dense(inputs)
        return x
        
        
model = HMDARegress()
model.call

<bound method HMDARegress.call of <__main__.HMDARegress object at 0x000002B1422C1580>>

In [None]:
def linreg(x, y, Epochs):
    model = tf.keras.experimental.LinearModel()        
    
    optimizer = tf.keras.optimizers.SGD(learning_rate = .01, momentum = .3)
    
    model.compile(loss = 'mse',
                  optimizer = optimizer,
                  metrics = ['mae', 'mse'])
    
    model.fit(x, y, epochs = Epochs)
    
    return model
    
model_1 = linreg(x_1, y_1, 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [71]:
model_1.save('test1.tf')

INFO:tensorflow:Assets written to: test1.tf\assets


In [91]:
cheese = model_1.get_layer(index = 0)
cheese.get_weights()

[array([[nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan]], dtype=float32)]