# <center>Bad Bank Behavior<br>Analyzing Bank Mortgages during the 2007 Housing Bubble</center>  

<center>Michael Siebel</center>
<center>August 2020</center>

<br>
    
## Table of Contents
- [Goals](#Goals)<br>
- [Load Functions](#Load-Functions)<br>
- [Set Up Functions](#Set-Up-Functions)<br>
- [Implement Data Cleanings](#Implement-Data-Cleanings)<br>
- [Analysis Functions](#Analysis-Functions)<br>
- [Imbalanced Prediction](#Imbalanced-Prediction)<br>
- [Downsampling Prediction](#Downsampling-Prediction)<br>
- [Upsampling Prediction](#Upsampling-Prediction)<br>
- [Conclusion](#Conclusion)<br>

# Goals  
<br>

 

***

# Load Functions

In [1]:
# Load functions
%run Functions.ipynb
pd.set_option("display.max_columns", 200)
pd.set_option('display.max_rows', 200)

# Load data
file_to_open = open('..\Data\Pickle\df.pkl', 'rb') 
df  = pickle.load(file_to_open) 
file_to_open.close()

# Drop mergeID column
df = df.drop(labels='Loan ID', axis=1)

# Convert Inf values to NA
df = df.replace([np.inf, -np.inf], np.nan)

Using TensorFlow backend.


In [2]:
## Bank and Classifier Lists
# Banks that are modeled individually; later loops iterate in this order.
banks = [
    'Bank of America', 'Wells Fargo Bank', 'CitiMortgage',
    'JPMorgan Chase', 'GMAC Mortgage', 'SunTrust Mortgage',
    'AmTrust Bank', 'PNC Bank', 'Flagstar Bank',
]

# Same banks followed by the pooled 'All Banks' entry.
banks_plus = [*banks, 'All Banks']

# Labels of the classifiers taking part in the vote.
clfs_str = ['Keras NN', 'RFC', 'RUS Boost', 'RFC PCA']

## Hide every CUDA device from this process so the GPU is not used. This can be changed.
os.environ.update(CUDA_VISIBLE_DEVICES='-1')

***

# Modeling

In [3]:
# Drop the "Other" bank category so only individually named banks remain.
# NOTE(review): the earlier comment also mentioned "SMALL LOAN BANKS", but no
# such filter is applied here -- presumably handled upstream; confirm.
df = df[df['Bank'] != 'Other']
# Show the remaining number of loans per bank
df['Bank'].value_counts()

Bank of America      650087
CitiMortgage         260698
Wells Fargo Bank     214039
JPMorgan Chase       202997
GMAC Mortgage        178160
SunTrust Mortgage    141398
PNC Bank             100351
AmTrust Bank          79360
Flagstar Bank         66637
Name: Bank, dtype: int64

In [4]:
# Verify Bank Counts (repeats the per-bank tally displayed by the previous cell)
df['Bank'].value_counts()

Bank of America      650087
CitiMortgage         260698
Wells Fargo Bank     214039
JPMorgan Chase       202997
GMAC Mortgage        178160
SunTrust Mortgage    141398
PNC Bank             100351
AmTrust Bank          79360
Flagstar Bank         66637
Name: Bank, dtype: int64

In [5]:
# Columns excluded from modeling
dropvars = [
    'Original Date', 'File Year', 'File Quarter', 'Month', 'Region', 'FIPS',
    'Reported Period', 'Zip Code', 'Mortgage Insurance Type', 'Property State',
    'Original Loan Term', 'First Payment', 'Original Loan-to-Value (LTV)',
    'Property Type', 'Number of Units',
]
df = df.drop(labels=dropvars, axis=1)

# Additionally discard every column whose name starts with Asset, Liab, Eqtot or Dep
df = df.filter(regex=r'^(?!Asset|Liab|Eqtot|Dep).*$')

# Keep complete cases only
df = df.dropna()

# Target and feature matrix ('Zero Balance Code' is removed together with the target)
y_all = df['Foreclosed']
X_all = df.drop(labels=['Foreclosed', 'Zero Balance Code'], axis=1)

# First split, stratified on the target.
# NOTE(review): test_size=0.7 sends 70% of the rows to the held-out part and keeps
# 30% for training, whereas the earlier comment read "Split Train (70%)". Left
# unchanged because the pickled artifacts loaded later are presumably aligned
# with this exact split -- confirm which proportion is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all,
    test_size=0.7,
    stratify=y_all,
    random_state=2019,
)

# Second split: halve the held-out part into validation and test, again stratified
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test,
    test_size=0.5,
    stratify=y_test,
    random_state=2019,
)

# Remember each row's bank (re-indexed from 0) before one hot encoding each split
Bnk_train = X_train['Bank'].reset_index(drop=True)
X_train = onehotencoding(X_train)

Bnk_val = X_val['Bank'].reset_index(drop=True)
X_val = onehotencoding(X_val)

Bnk_test = X_test['Bank'].reset_index(drop=True)
X_test = onehotencoding(X_test)

print('Shape:', X_train.shape)

Shape: (480512, 71)


In [6]:
# Update Macroeconomic variables: pca_fred is called with n_components=4 and
# returns replacement frames for all three splits.
# NOTE(review): the earlier note said "will not use test set", yet X_test is
# passed in and reassigned here -- presumably the PCA is fitted without the
# test split; confirm inside pca_fred.
X_train, X_val, X_test = pca_fred(X_train, X_val, X_test, n_components=4)

# Check columns
X_train.columns

Index(['Reported Period', 'Original Interest Rate',
       'Original Mortgage Amount_x', 'Original Loan Term', 'First Payment',
       'Original Loan-to-Value (LTV)',
       'Original Combined Loan-to-Value (CLTV)', 'Single Borrower',
       'Original Debt to Income Ratio', 'Loan Purpose', 'Number of Units',
       'Mortgage Insurance %', 'Harmonized Credit Score',
       'Original Mortgage Amount_y', 'Loan Change (1 Year)',
       'Loan Change (5 Years)', 'FIPS', 'Median Household Income', 'numemp',
       'Asset (5 Yr)', 'Asset (1 Yr)', 'Lnlsnet (5 Yr)', 'Lnlsnet (1 Yr)',
       'Liab (5 Yr)', 'Liab (1 Yr)', 'Dep (5 Yr)', 'Dep (1 Yr)',
       'Eqtot (5 Yr)', 'Eqtot (1 Yr)', 'Origination Channel_B',
       'Origination Channel_C', 'Origination Channel_R', 'Bank_AmTrust Bank',
       'Bank_Bank of America', 'Bank_CitiMortgage', 'Bank_Flagstar Bank',
       'Bank_GMAC Mortgage', 'Bank_JPMorgan Chase', 'Bank_PNC Bank',
       'Bank_SunTrust Mortgage', 'Bank_Wells Fargo Bank',
       'Fir

In [7]:
# List of banks (same nine names, in the same order, as the list defined near the top)
banks = [
    'Bank of America', 'Wells Fargo Bank', 'CitiMortgage',
    'JPMorgan Chase', 'GMAC Mortgage', 'SunTrust Mortgage',
    'AmTrust Bank', 'PNC Bank', 'Flagstar Bank',
]

# Build the per-bank feature/target subsets for each split
Banks_X, Banks_y = Bank_Subsets(banks, df_X=X_train, df_y=y_train)
Banks_X_val, Banks_y_val = Bank_Subsets(banks, df_X=X_val, df_y=y_val)
Banks_X_test, Banks_y_test = Bank_Subsets(banks, df_X=X_test, df_y=y_test)

# With the subsets built, drop every column whose name starts with "Bank"
# from the pooled frames.
X_train, X_val, X_test = (
    split.filter(regex=r'^(?!Bank).*$')
    for split in (X_train, X_val, X_test)
)

### 1) Run bagging models (individual banks / all banks)
### 2) Determine best thresholds on test data
### 3) Middle-layer vote classifier on all banks
### 4) Vote classifier on individual banks and majority of all banks

In [8]:
# Load previously saved voting artifacts.
# Paths use forward slashes so they resolve on Windows and POSIX alike; the
# backslash form relied on invalid string escape sequences.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.

# Indexed later as vote_models[bank][classifier]
with open('../Data/Pickle/models.pkl', 'rb') as file_to_open:
    vote_models = pickle.load(file_to_open)

# Indexed later as vote_thresholds[bank][classifier]
with open('../Data/Pickle/model_thresholds.pkl', 'rb') as file_to_open:
    vote_thresholds = pickle.load(file_to_open)

with open('../Data/Pickle/predictions.pkl', 'rb') as file_to_open:
    vote_pred = pickle.load(file_to_open)

# Indexed later as votes[bank], one table of votes per bank
with open('../Data/Pickle/df_votes.pkl', 'rb') as file_to_open:
    votes = pickle.load(file_to_open)

# Final vote

In [9]:
# Middle layer
# The pooled ("All Banks") vote is 1 only when every classifier in clfs_str
# voted 1. It does not depend on the bank, so it is computed once up front.
all_bnks_pred = (
    votes['All Banks'].iloc[:, :len(clfs_str)].sum(axis=1) / len(clfs_str)
).map(lambda x: 1 if x == 1.0 else 0)

# Per-bank pieces are gathered in lists and concatenated once at the end; this
# avoids seeding with a dtype-less empty pd.Series() (deprecated) and growing
# it with concat inside the loop.
vote_parts = []
actual_parts = []

for bank_str in banks:
    print(bank_str)
    # Rows of the pooled vote that belong to this bank, re-indexed from 0
    votes[bank_str].loc[:, 'All Banks'] = (
        all_bnks_pred.loc[Bnk_test == bank_str].reset_index(drop=True)
    )
    total_vote = votes[bank_str].loc[:, 'All Banks']
    print('Predicted Foreclosures', np.mean(total_vote).round(2))
    print('Actual Foreclosures', np.mean(Banks_y_test[bank_str]).round(2))
    print('F1 Score after Voting', f1_score(Banks_y_test[bank_str], total_vote).round(2))
    print('Recall after Voting', recall_score(Banks_y_test[bank_str], total_vote).round(2))
    print('Precision after Voting', precision_score(Banks_y_test[bank_str], total_vote).round(2))
    print('')

    # Combine banks
    vote_parts.append(total_vote)
    actual_parts.append(Banks_y_test[bank_str])

combined_votes = pd.concat(vote_parts, axis=0)
combined_actuals = pd.concat(actual_parts, axis=0)

print('Combined Predictions')
print('Predicted Foreclosures', np.mean(combined_votes).round(2))
print('Actual Foreclosures', np.mean(combined_actuals).round(2))
print('F1 Score after Voting', f1_score(combined_actuals, combined_votes).round(2))
print('Recall after Voting', recall_score(combined_actuals, combined_votes).round(2))
print('Precision after Voting', precision_score(combined_actuals, combined_votes).round(2))

Bank of America
Predicted Foreclosures 0.11
Actual Foreclosures 0.12
F1 Score after Voting 0.38
Recall after Voting 0.37
Precision after Voting 0.4

Wells Fargo Bank
Predicted Foreclosures 0.06
Actual Foreclosures 0.08
F1 Score after Voting 0.31
Recall after Voting 0.27
Precision after Voting 0.35

CitiMortgage
Predicted Foreclosures 0.05
Actual Foreclosures 0.08
F1 Score after Voting 0.29
Recall after Voting 0.25
Precision after Voting 0.36

JPMorgan Chase
Predicted Foreclosures 0.06
Actual Foreclosures 0.08
F1 Score after Voting 0.33
Recall after Voting 0.29
Precision after Voting 0.38

GMAC Mortgage
Predicted Foreclosures 0.06
Actual Foreclosures 0.1
F1 Score after Voting 0.29
Recall after Voting 0.24
Precision after Voting 0.37

SunTrust Mortgage
Predicted Foreclosures 0.07
Actual Foreclosures 0.1
F1 Score after Voting 0.32
Recall after Voting 0.27
Precision after Voting 0.39

AmTrust Bank
Predicted Foreclosures 0.06
Actual Foreclosures 0.1
F1 Score after Voting 0.25
Recall after V

In [10]:
# Final vote
# Per-bank pieces are gathered in lists and concatenated once at the end; this
# avoids seeding with a dtype-less empty pd.Series() (deprecated) and growing
# it with concat inside the loop.
vote_parts = []
actual_parts = []

for bank_str in banks:
    print(bank_str)
    # Share of positive votes across the first len(clfs_str)+1 columns; the
    # 'Majority' vote is 1 when that share is strictly above 0.67
    # (with four classifiers plus one more column: at least 4 of 5).
    # NOTE(review): assumes those leading columns are the classifier votes
    # followed by 'All Banks' -- confirm the column order of votes[bank_str].
    vote_share = (
        votes[bank_str].iloc[:, :(len(clfs_str) + 1)].sum(axis=1)
        / (len(clfs_str) + 1)
    )
    votes[bank_str].loc[:, 'Majority'] = vote_share.map(lambda x: 1 if x > 0.67 else 0)
    total_vote = votes[bank_str].loc[:, 'Majority']
    print('Predicted Foreclosures', np.mean(total_vote).round(2))
    print('Actual Foreclosures', np.mean(Banks_y_test[bank_str]).round(2))
    print('F1 Score after Voting', f1_score(Banks_y_test[bank_str], total_vote).round(2))
    print('Recall after Voting', recall_score(Banks_y_test[bank_str], total_vote).round(2))
    print('Precision after Voting', precision_score(Banks_y_test[bank_str], total_vote).round(2))
    print('')

    # Combine banks
    vote_parts.append(total_vote)
    actual_parts.append(Banks_y_test[bank_str])

combined_votes = pd.concat(vote_parts, axis=0)
combined_actuals = pd.concat(actual_parts, axis=0)

print('Combined Predictions')
print('Predicted Foreclosures', np.mean(combined_votes).round(2))
print('Actual Foreclosures', np.mean(combined_actuals).round(2))
print('F1 Score after Voting', f1_score(combined_actuals, combined_votes).round(2))
print('Recall after Voting', recall_score(combined_actuals, combined_votes).round(2))
print('Precision after Voting', precision_score(combined_actuals, combined_votes).round(2))
print('Accuracy after Voting', accuracy_score(combined_actuals, combined_votes).round(2))
print('')
print('Confusion Matrix')
print(confusion_matrix(combined_actuals, combined_votes))

Bank of America
Predicted Foreclosures 0.12
Actual Foreclosures 0.12
F1 Score after Voting 0.41
Recall after Voting 0.42
Precision after Voting 0.4

Wells Fargo Bank
Predicted Foreclosures 0.07
Actual Foreclosures 0.08
F1 Score after Voting 0.33
Recall after Voting 0.33
Precision after Voting 0.34

CitiMortgage
Predicted Foreclosures 0.07
Actual Foreclosures 0.08
F1 Score after Voting 0.31
Recall after Voting 0.3
Precision after Voting 0.34

JPMorgan Chase
Predicted Foreclosures 0.07
Actual Foreclosures 0.08
F1 Score after Voting 0.35
Recall after Voting 0.33
Precision after Voting 0.38

GMAC Mortgage
Predicted Foreclosures 0.1
Actual Foreclosures 0.1
F1 Score after Voting 0.34
Recall after Voting 0.34
Precision after Voting 0.34

SunTrust Mortgage
Predicted Foreclosures 0.09
Actual Foreclosures 0.1
F1 Score after Voting 0.37
Recall after Voting 0.36
Precision after Voting 0.39

AmTrust Bank
Predicted Foreclosures 0.09
Actual Foreclosures 0.1
F1 Score after Voting 0.31
Recall after Vot

# Final Scores
<br>

#### Unanimous vote in middle layer
#### 4/5ths vote in final layer
- Combined Predictions
- Predicted Foreclosures 0.1
- Actual Foreclosures 0.1
- F1 Score after Voting 0.37
- Recall after Voting 0.37
- Precision after Voting 0.37
- Accuracy after Voting 0.88

# Predictions

In [None]:
# Predicted Probabilities
def proba_func(X, bank_str, bank_dict, clfs_str = clfs_str,
               vote_models = vote_models, vote_thresholds = vote_thresholds):
    """Run the three voting layers for one bank and return its final 0/1 vote.

    Parameters
    ----------
    X : mapping
        Indexed as ``X[bank_str]`` to obtain the feature rows passed to every
        model's ``predict_proba``.
    bank_str : str
        Bank whose rows are scored; also the key of the bank-specific models.
    bank_dict : dict
        Mutated in place: ``bank_dict[bank_str]`` and ``bank_dict['All Banks']``
        are reset to fresh DataFrames and filled with one 0/1 column per
        classifier.
    clfs_str : list of str
        Classifier labels; ``'Keras NN'`` is treated specially (see below).
    vote_models, vote_thresholds : mapping
        Indexed as ``[bank][classifier]`` for the fitted model and for the
        probability cut-off at or above which a row is classified as 1.

    Returns
    -------
    pandas.Series
        The ``'Majority'`` column for ``bank_str``: 1 when the share of positive
        votes among the first ``len(clfs_str) + 1`` vote columns is strictly
        greater than 0.67, else 0.

    Notes
    -----
    NOTE(review): the function reads the module-level ``Bnk_test`` regardless
    of ``X``, so it presumably only gives meaningful results when ``X`` holds
    the test-split subsets. Also confirm that the frame returned by
    ``votes_clf_func`` for 'All Banks' is indexed compatibly with ``Bnk_test``,
    since the pooled predictions here are computed on ``X[bank_str]`` only.
    """
    # Dictionaries
    bank_dict[bank_str] = pd.DataFrame()
    bank_dict['All Banks'] = pd.DataFrame()

    # Bottom Layer
    for clf in clfs_str:
        # 'Keras NN' predictions are read from column 0 and requested in batches
        # of 1000; every other classifier's positive-class probability is read
        # from column 1.
        if clf == 'Keras NN':
            col, predict_kwargs = 0, {'batch_size': 1000}
        else:
            col, predict_kwargs = 1, {}

        # Score the same rows with the bank-specific model first, then with
        # the pooled 'All Banks' model.
        for model_key in (bank_str, 'All Banks'):
            ### Predicted Probability
            proba = pd.DataFrame(
                vote_models[model_key][clf].predict_proba(X[bank_str], **predict_kwargs)
            ).iloc[:, col]
            ### Classification
            threshold = vote_thresholds[model_key][clf]
            bank_dict[model_key][clf] = proba.map(lambda x: 1 if x >= threshold else 0)

    ## Vote
    votes = votes_clf_func(bank_dict, bnk_list = [bank_str, 'All Banks'],
                           clfs_str = clfs_str, X = Bnk_test)

    # Middle Layer: the pooled vote is 1 only when every classifier voted 1
    all_bnks_pred = (
        votes['All Banks'].iloc[:, :len(clfs_str)].sum(axis=1) / len(clfs_str)
    ).map(lambda x: 1 if x == 1.0 else 0)
    votes[bank_str].loc[:, 'All Banks'] = (
        all_bnks_pred.loc[Bnk_test == bank_str].reset_index(drop=True)
    )

    # Final vote: share of positive votes must be strictly above 0.67
    vote_share = (
        votes[bank_str].iloc[:, :(len(clfs_str) + 1)].sum(axis=1)
        / (len(clfs_str) + 1)
    )
    votes[bank_str].loc[:, 'Majority'] = vote_share.map(lambda x: 1 if x > 0.67 else 0)

    return votes[bank_str].loc[:, 'Majority']

In [None]:
# Score every bank's test subset with proba_func and report the metrics
bank_dict = {}
test_pp = {}

# Per-bank pieces are gathered in lists and concatenated once at the end; this
# avoids seeding with a dtype-less empty pd.Series() (deprecated) and growing
# it with concat inside the loop.
vote_parts = []
actual_parts = []

for bank_str in banks:
    print(bank_str)

    # Final 0/1 vote for this bank (the previous placeholder DataFrame was
    # overwritten immediately, so it is no longer created)
    test_pp[bank_str] = proba_func(Banks_X_test, bank_str, bank_dict, clfs_str,
                                   vote_models, vote_thresholds)
    print('Predicted Foreclosures', np.mean(test_pp[bank_str]).round(2))
    print('Actual Foreclosures', np.mean(Banks_y_test[bank_str]).round(2))
    print('F1 Score after Voting', f1_score(Banks_y_test[bank_str], test_pp[bank_str]).round(2))
    print('Recall after Voting', recall_score(Banks_y_test[bank_str], test_pp[bank_str]).round(2))
    print('Precision after Voting', precision_score(Banks_y_test[bank_str], test_pp[bank_str]).round(2))
    print('')

    # Combine banks
    vote_parts.append(test_pp[bank_str])
    actual_parts.append(Banks_y_test[bank_str])

combined_votes = pd.concat(vote_parts, axis=0)
combined_actuals = pd.concat(actual_parts, axis=0)

print('Combined Predictions')
print('Predicted Foreclosures', np.mean(combined_votes).round(2))
print('Actual Foreclosures', np.mean(combined_actuals).round(2))
print('F1 Score after Voting', f1_score(combined_actuals, combined_votes).round(2))
print('Recall after Voting', recall_score(combined_actuals, combined_votes).round(2))
print('Precision after Voting', precision_score(combined_actuals, combined_votes).round(2))
print('Accuracy after Voting', accuracy_score(combined_actuals, combined_votes).round(2))
print('')
print('Confusion Matrix')
print(confusion_matrix(combined_actuals, combined_votes))

In [None]:
# Predictions on full test data: persist the module-level `votes` object.
# The path uses forward slashes so it resolves on Windows and POSIX alike (the
# backslash form relied on invalid string escape sequences); the `with` block
# closes the file even if pickling fails.
with open("../Data/Pickle/pred_votes_test.pkl", "wb") as file_to_store:
    pickle.dump(votes, file_to_store)

In [None]:
# Empty frame (no rows) carrying X_train's column labels; displayed below.
# NOTE(review): presumably a template for hand-entered rows to score -- confirm.
pp_1 = pd.DataFrame(columns=X_train.columns)
pp_1

In [None]:
# Reference listing of column labels kept as a bare string literal; this cell
# performs no computation.
'''
Index(['Reported Period', 'Original Interest Rate',
       'Original Mortgage Amount_x', 'Original Loan Term', 'First Payment',
       'Original Loan-to-Value (LTV)',
       'Original Combined Loan-to-Value (CLTV)', 'Single Borrower',
       'Original Debt to Income Ratio', 'Loan Purpose', 'Number of Units',
       'Mortgage Insurance %', 'Harmonized Credit Score',
       'Original Mortgage Amount_y', 'Loan Change (1 Year)',
       'Loan Change (5 Years)', 'FIPS', 'Median Household Income', 'numemp',
       'Asset (5 Yr)', 'Asset (1 Yr)', 'Lnlsnet (5 Yr)', 'Lnlsnet (1 Yr)',
       'Liab (5 Yr)', 'Liab (1 Yr)', 'Dep (5 Yr)', 'Dep (1 Yr)',
       'Eqtot (5 Yr)', 'Eqtot (1 Yr)', 'Origination Channel_B',
       'Origination Channel_C', 'Origination Channel_R', 'Bank_AmTrust Bank',
       'Bank_Bank of America', 'Bank_CitiMortgage', 'Bank_Flagstar Bank',
       'Bank_GMAC Mortgage', 'Bank_JPMorgan Chase', 'Bank_PNC Bank',
       'Bank_SunTrust Mortgage', 'Bank_Wells Fargo Bank',
       'First Time Home Buyer_N', 'First Time Home Buyer_Y',
       'Property Type_CO', 'Property Type_CP', 'Property Type_MH',
       'Property Type_PU', 'Property Type_SF', 'Occupancy Type_I',
       'Occupancy Type_P', 'Occupancy Type_S',
       'Relocation Mortgage Indicator_N', 'Relocation Mortgage Indicator_Y',
       'Macroeconomy PCA 1', 'Macroeconomy PCA 2', 'Macroeconomy PCA 3',
       'Macroeconomy PCA 4']
'''