# <center>Bad Bank Behavior<br>Analyzing Bank Mortgage during the 2007 Housing Bubble</center>  

<center>Michael Siebel</center>
<center>August 2020</center>
<br>

## <center>Functions Script</center>

# Purpose  
<br>

> Load necessary packages and custom functions to be used in project

***

# Load Packages

In [1]:
# Core Packages
import pandas as pd
import numpy as np
import random
import glob
import pickle
import zipfile
# Convert Time Features
from datetime import datetime as dt
# Data Visualizations
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
jtplot.style(theme='chesterish', grid=False)
# Imputing Data
from sklearn.impute import KNNImputer
# Splitting Data
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Machine Learning Packages
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, VotingClassifier, BaggingClassifier 
from imblearn.ensemble import BalancedBaggingClassifier, RUSBoostClassifier, EasyEnsembleClassifier
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SVMSMOTE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Save Runtime
import time
# Model Selection and Hyperparameter Tuning
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
# Output Statistics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

***

# Set Up Functions  
<br>

> I set up all data wrangling and data analysis steps as a series of functions, which will enable me to reuse them on data from subsequent years (future projects) and with various analysis techniques (this project).

Create empty data frames and arrays for function parameters called in this script

In [2]:
# Placeholder objects: several functions below use these names as keyword
# defaults (e.g. `merge_df = df`, `df_X = X_train`), so they must exist
# before those functions are defined
bank_str = ''
y_train = np.array([])
y_test = np.array([])
df, df_acq, Banks, X_train = (pd.DataFrame() for _ in range(4))

### Load Data
#### Fannie Mae data comes in two forms:  
#### 1) acquisition data and 2) performance data  

> 1) The acquisition data includes one observation for each loan with each feature representing knowledge Fannie Mae has when acquiring the loan (e.g., balance, primary lender, credit score, etc.).  

> 2) The performance data includes observations for each month each loan is held and information on the payment of the loan.  

I use the acquisition data as predictors for a dichotomous categorization of whether the homeowner defaulted on their loan, a target variable I create using the performance data and merging onto the acquisition data.  

In [3]:
# Load the data
def load_data(acq, per=None):
    """Read the Fannie Mae acquisition file and, optionally, the performance file.

    Both files are read as pipe-delimited text without a header row and are
    given the column names listed below.

    Parameters
    ----------
    acq : path or file-like object accepted by ``pd.read_csv`` (acquisition data)
    per : path or file-like object for the performance data, or None to skip it

    Returns
    -------
    (df_acq, df_per) : df_per is None when ``per`` is None
    """
    acquisition_names = [
        'Loan ID','Origination Channel','Bank','Original Interest Rate',
        'Original Mortgage Amount','Original Loan Term','Original Date','First Payment',
        'Original Loan-to-Value (LTV)','Original Combined Loan-to-Value (CLTV)',
        'Number of Borrowers','Original Debt to Income Ratio','Credit Score',
        'First Time Home Buyer','Loan Purpose','Property Type','Number of Units',
        'Occupancy Type','Property State','Zip Code','Mortgage Insurance %',
        'Product Type','Co-Borrower Credit Score','Mortgage Insurance Type',
        'Relocation Mortgage Indicator',
    ]
    performance_names = [
        'Loan ID','MonthRep','Servicer','Curr Interest Rate','CAUPB','Loan Age',
        'Months To Maturity','Add Months To Maturity','Maturity Date','MSA','CLDS',
        'Mod Flag','Zero Bal Code','Zero Bal Date','Last Install Date','Foreclosure Date',
        'Disposition Date','FCCCost','PPRC','Asset Rec Cost','MHRC','ATFHP',
        'Net Sale Proceeds','Credit Enh Proceeds','RPMWP','OFP','NIBUPB','PFUPB','RMWPF',
        'FPWA','Servicing Indicator',
    ]

    # Load acquisition data and name its columns
    df_acq = pd.read_csv(acq, sep='|', header=None)
    df_acq.columns = acquisition_names

    # Performance data is optional
    if per is None:
        return df_acq, None

    df_per = pd.read_csv(per, sep='|', header=None)
    df_per.columns = performance_names
    return df_acq, df_per

### Create Target Variable
<br>

> Performance data is much larger as it is transaction based, while the acquisition data has the loan owner as its unit of analysis.  

> I retain only the most recent performance transaction relating to foreclosure, then drop all other variables except  Loan ID (the primary key) and merge performance data onto acquisition data.  

> I recode performance data into dichotomous categorization of whether loan was foreclosed upon.

In [4]:
# Merge foreclosures from performance data to acquistion data
def merge_df(df_acq, df_per):
    """Attach a binary 'Foreclosed' target to the acquisition data.

    Only the last performance row of each loan is consulted: the loan is
    flagged 1 when that row carries a 'Foreclosure Date' and 0 otherwise.
    Loans are matched on 'Loan ID' with an inner join, and the
    'Foreclosure Date' column itself is not kept.
    """
    # Last performance record per loan, reduced to key + foreclosure date
    last_record = (
        df_per[['Loan ID', 'Foreclosure Date']]
        .drop_duplicates(subset='Loan ID', keep='last')
    )
    merged = pd.merge(df_acq, last_record, on='Loan ID', how='inner')

    # 1 when a foreclosure date is present, 0 when it is missing
    merged['Foreclosed'] = merged['Foreclosure Date'].notnull().astype(int)

    return merged.drop(columns='Foreclosure Date')

***

# ETL of External Data
<br>

> Federal Reserve Economic Data (FRED) includes macro economic data that is related to the housing market.  This data is merged on the date variable (mm/yyyy); it includes monthly data and quarterly data, the latter used carryforward hard coding (each quarter represented the beginning of the quarter) to cover each month.  Some FRED sets included four Census region subsets; these were merged on the date variable (mm/yyyy) and the property state variable, the latter was mapped to the four Census regions of Northeast, Midwest, South, and West.  Values were converted to quarterly and yearly deltas (e.g., the change in housing vacancies from 2006 Q4 to 2007 Q1 or 2006 Q1 to 2007 Q1).

> Federal Deposit Insurance Corporation (FDIC) data includes information on FDIC-backed banks, such as their number of employees, assets, debts, etc.  I used regular expressions to map FDIC data to the Bank variable; this included summing various instances of the same bank (from a different branch or functional area).  This data is merged on the bank variable and the date variable (mm/yyyy); it includes quarterly data, which used carrybackward hard coding (each quarter represented the end of the quarter) to cover each month.  Values were converted to quarterly and yearly deltas (e.g., the change in Bank of America liabilities from 2006 Q4 to 2007 Q1 or 2006 Q1 to 2007 Q1).

Merge FRED on Monthly Data

Carryforward hard coding if data is quarterly

In [5]:
def merge_fred_on_month(df_mnth, merge_df = df, varname = '', quarter=False, pct_change=1):
    """Merge one FRED series onto loan data by origination month.

    Parameters
    ----------
    df_mnth : DataFrame with a 'DATE' column formatted 'mm/yyyy' and the series
        values in the first remaining column. The caller's frame is not modified.
    merge_df : DataFrame with an 'Original Date' column ('mm/yyyy') to merge onto.
    varname : name given to the series column in the result.
    quarter : if True, each quarter-start row (months 01, 04, 07, 10) is carried
        forward to the two following months before merging.
    pct_change : number of periods used for the percent change of the series.

    Returns
    -------
    DataFrame : inner merge of ``merge_df`` and the transformed series, sorted
        by year and month.
    """
    # Work on a copy so the helper columns below do not leak into the caller's frame
    df_mnth = df_mnth.copy()

    # Split date var
    date_str = df_mnth['DATE'].apply(str)
    df_mnth['Month'] = date_str.apply(lambda x: x.split('/')[0].strip())
    df_mnth['Year'] = date_str.apply(lambda x: x.split('/')[1].strip())
    df_mnth = df_mnth.drop(labels='DATE', axis=1)

    # Retrieve name of main column
    var = df_mnth.columns[0]
    # Period change, stored as float
    df_mnth[var] = df_mnth[var].pct_change(pct_change).astype(float)

    # If quarterly data, carry first month of quarter forward.
    # Built with pd.concat: DataFrame.append no longer exists in pandas >= 2.0.
    if quarter:
        carry_forward = {'01': ('02', '03'), '04': ('05', '06'),
                         '07': ('08', '09'), '10': ('11', '12')}
        frames = [df_mnth]
        for first_month, later_months in carry_forward.items():
            quarter_starts = df_mnth[df_mnth['Month'] == first_month]
            if quarter_starts.empty:
                continue
            for later_month in later_months:
                frames.append(quarter_starts.assign(Month=later_month))
        df_mnth = pd.concat(frames)

    # Create merge var
    df_mnth['Original Date'] = (df_mnth['Month'].map(str) + '/' + df_mnth['Year']).apply(str)
    df_mnth = df_mnth.rename(columns={var: varname})

    # Merge, order chronologically, then drop the helper columns
    merge_df = pd.merge(merge_df, df_mnth, on='Original Date', how='inner')
    merge_df['Sort'] = (merge_df['Year'].map(str) + merge_df['Month']).apply(str)
    merge_df = merge_df.sort_values(by=['Sort'])
    merge_df = merge_df.drop(labels=['Year', 'Month', 'Sort'], axis=1)

    return merge_df

State to Region Conversion

In [6]:
def to_region(df_new, var, state_drop=False):
    """Add a 'Region' column mapping state abbreviations to Census regions.

    Values of ``df_new[var]`` that are not in the crosswalk are copied to
    'Region' unchanged. The column is added to ``df_new`` itself; when
    ``state_drop`` is True the returned frame omits the ``var`` column.
    """
    # Region-State crosswalk
    crosswalk = (
        ('Northeast', ['ME','VT','NH','MA','NY','RI','CT','PA','NJ']),
        ('South', ['DE','MD','DC','WV','VA','KY','NC','TN','SC','GA','FL','AL','MS','AR','LA','OK','TX']),
        ('Midwest', ['ND','SD','NE','KS','MN','IA','MO','WI','IL','MI','IN','OH']),
        ('West', ['WA','OR','ID','MT','WY','CA','NV','UT','AZ','CO','NM','AK','HI']),
    )

    # Replace States with Census regions, one region at a time
    region = df_new[var]
    for region_name, states in crosswalk:
        region = region.replace(states, region_name)
    df_new['Region'] = region

    # Drop State var on request
    return df_new.drop(labels=var, axis=1) if state_drop else df_new

Merge regional data

In [7]:
def region_merge(NE, SO, MW, WE, varname, df_orig = df, quarter=False, pct_change=1):
    """Merge region-specific FRED series onto loans by region and month.

    Each of NE, SO, MW and WE is passed to ``merge_fred_on_month`` together
    with the loans of ``df_orig`` whose 'Region' is Northeast, South, Midwest
    or West respectively. The four results are stacked and joined back onto
    ``df_orig`` by 'Loan ID' (inner join), adding one column named ``varname``.
    """
    regional_series = (('Northeast', NE), ('South', SO), ('Midwest', MW), ('West', WE))

    merged_regions = []
    for region_name, fred_series in regional_series:
        # Loans in this region, reduced to the keys needed for the merge
        loans = df_orig[df_orig['Region'] == region_name]
        loans = loans[['Loan ID', 'Region', 'Original Date']]
        loans = merge_fred_on_month(df_mnth = fred_series, merge_df = loans,
                                    varname=varname, quarter=quarter, pct_change=pct_change)
        # Make sure the fourth column carries the requested name
        loans = loans.rename(columns={loans.columns[3]: varname})
        merged_regions.append(loans)

    # Stack
    df_region = pd.concat(merged_regions)[['Loan ID', varname]]

    # Merge
    return pd.merge(df_orig, df_region, on='Loan ID', how='inner')

FRED Data merge wrapper (full US) 

In [8]:
def fred_merge(fred_df, df_orig = df, quarter=True, varname = ''):
    """Add quarterly and yearly percent changes of a national FRED series.

    Parameters
    ----------
    fred_df : DataFrame with a 'DATE' column parseable by ``pd.to_datetime``
        and the series values in its first other column. Not modified.
    df_orig : loan data with an 'Original Date' column ('mm/yyyy').
    quarter : True if the series is quarterly, False if it is monthly.
    varname : base name for the new columns; defaults to the name of the
        series column in ``fred_df``.

    Returns
    -------
    DataFrame : ``df_orig`` merged with '<varname> (Qtr)' and '<varname> (Yr)'.
    """
    # Periods that make up one quarter / one year in the series' own frequency
    if quarter:
        pct_qtr, pct_year = 1, 4
    else:
        # Monthly data: a quarter is 3 periods (was 4), a year is 12
        pct_qtr, pct_year = 3, 12

    # Copy so the caller's DATE column is not overwritten
    fred_df = fred_df.copy()

    # Define variable name, if not set: use the series column's own name
    # (str(fred_df) would yield the printed table, not a usable name)
    if varname=='':
        varname = str([col for col in fred_df.columns if col != 'DATE'][0])

    # convert datetime
    fred_df['DATE'] = pd.to_datetime(fred_df['DATE']).dt.strftime('%m/%Y').apply(str)

    # merge FRED data and convert to percent change
    df_new = merge_fred_on_month(fred_df, df_orig, varname, quarter=quarter, pct_change=pct_qtr)
    df_new = df_new.rename(columns={varname: str(varname + ' (Qtr)')})
    df_new = merge_fred_on_month(fred_df, df_new, varname, quarter=quarter, pct_change=pct_year)
    df_new = df_new.rename(columns={varname: str(varname + ' (Yr)')})

    return df_new

FRED merge wrapper (region) 

In [9]:
def fred_merge_region(NE, SO, MW, WE, df_orig = df, varname = '', quarter=True):
    """Add quarterly and yearly percent changes of a regional FRED series.

    Parameters
    ----------
    NE, SO, MW, WE : DataFrames for the Northeast, South, Midwest and West,
        each with a 'DATE' column parseable by ``pd.to_datetime`` and the
        series values in its first other column. Not modified.
    df_orig : loan data with 'Loan ID', 'Region' and 'Original Date' columns.
    varname : base name for the new columns; defaults to the name of the
        series column in ``NE``.
    quarter : True if the series are quarterly, False if they are monthly.

    Returns
    -------
    DataFrame : ``df_orig`` merged with '<varname> (Qtr)' and '<varname> (Yr)'.
    """
    # Periods that make up one quarter / one year in the series' own frequency
    if quarter:
        pct_qtr, pct_year = 1, 4
    else:
        # Monthly data: a quarter is 3 periods (was 4), a year is 12
        pct_qtr, pct_year = 3, 12

    # Copy so the callers' DATE columns are not overwritten
    NE, SO, MW, WE = NE.copy(), SO.copy(), MW.copy(), WE.copy()

    # Define variable name, if not set (previously referenced the undefined
    # name `fred_df`, which raised NameError)
    if varname=='':
        varname = str([col for col in NE.columns if col != 'DATE'][0])

    # convert datetime
    for fred_series in (NE, SO, MW, WE):
        fred_series['DATE'] = pd.to_datetime(fred_series['DATE']).dt.strftime('%m/%Y').apply(str)

    # merge FRED data and convert to percent change
    df_new = region_merge(NE=NE, SO=SO, MW=MW, WE=WE, df_orig = df_orig,
                          varname=varname, quarter=quarter, pct_change=pct_qtr)
    df_new = df_new.rename(columns={varname: str(varname + ' (Qtr)')})
    df_new = region_merge(NE=NE, SO=SO, MW=MW, WE=WE, df_orig = df_new,
                          varname=varname, quarter=quarter, pct_change=pct_year)
    df_new = df_new.rename(columns={varname: str(varname + ' (Yr)')})

    return df_new

Convert FDIC Data to Monthly Data

Carrybackwards hard coding to convert quarterly data to monthly

In [10]:
def fdic_on_month(df_mnth):
    """Expand quarterly FDIC rows into monthly rows keyed by 'Original Date'.

    Each row whose report date ('repdte') falls in a quarter-end month
    (03, 06, 09, 12) is copied to the two preceding months, so every month of
    the quarter carries the quarter-end values.

    Parameters
    ----------
    df_mnth : DataFrame with a 'repdte' column parseable by ``pd.to_datetime``.
        The caller's frame is not modified.

    Returns
    -------
    DataFrame : the input columns except 'repdte', plus 'Original Date'
        ('mm/yyyy'), sorted by year and month.
    """
    # Work on a copy so the caller's frame keeps its original 'repdte'
    df_mnth = df_mnth.copy()

    # convert datetime
    df_mnth['repdte'] = pd.to_datetime(df_mnth['repdte']).dt.strftime('%m/%Y').apply(str)

    # Split date var
    df_mnth['Month'] = df_mnth['repdte'].apply(lambda x: x.split('/')[0].strip())
    df_mnth['Year'] = df_mnth['repdte'].apply(lambda x: x.split('/')[1].strip())

    # Carry last month of quarter backward.
    # Built with pd.concat: DataFrame.append no longer exists in pandas >= 2.0.
    carry_backward = {'03': ('01', '02'), '06': ('04', '05'),
                      '09': ('07', '08'), '12': ('10', '11')}
    frames = [df_mnth]
    for quarter_end, earlier_months in carry_backward.items():
        quarter_ends = df_mnth[df_mnth['Month'] == quarter_end]
        if quarter_ends.empty:
            continue
        for earlier_month in earlier_months:
            frames.append(quarter_ends.assign(Month=earlier_month))
    df_mnth = pd.concat(frames)

    # Create merge var, order chronologically, then drop the helper columns
    df_mnth['Original Date'] = (df_mnth['Month'].map(str) + '/' + df_mnth['Year']).apply(str)
    df_mnth['Sort'] = (df_mnth['Year'].map(str) + df_mnth['Month']).apply(str)
    df_mnth = df_mnth.sort_values(by=['Sort'])
    df_mnth = df_mnth.drop(labels=['Month', 'Year', 'Sort', 'repdte'], axis=1)

    return df_mnth

***

# Data Transformations

Removal of features with really high missingness or no data variation, and then mean/mode hard coding on features with low missingness

In [11]:
def missing_treat(df):
    """Drop features with heavy missingness and impute the remaining gaps.

    Columns with more than 10% missing values are removed. In the remaining
    columns, float64/int64 features are filled with the column mean and
    object features with the column's most frequent value.

    Returns a new DataFrame; the input frame is not modified.
    """
    # Percentage of missing values per feature
    pct_missing = df.isnull().sum(axis=0) / df.shape[0] * 100

    # remove features with more than 10% missing
    high_missing = pct_missing.index[pct_missing > 10]
    df = df.drop(labels=high_missing, axis=1)

    # impute on the mean for low missing features that are continuous
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        df[col] = df[col].fillna(df[col].mean())

    # impute on the mode for low missing features that are categorical.
    # mode() returns a Series; passing it to fillna directly aligns on index
    # labels and leaves most gaps unfilled, so take its first value instead.
    for col in df.select_dtypes(include=['object']).columns:
        modes = df[col].mode()
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    return df

Impute using KNN

In [12]:
def KNN_imputations(df, X_cols, n_neighbors=2):
    """Impute missing values with a uniformly weighted KNNImputer.

    The imputer is fit on ``df`` and its output is returned as a new
    DataFrame whose columns are labelled with ``X_cols``.
    """
    imputer = KNNImputer(n_neighbors=n_neighbors, weights="uniform")
    imputed_values = imputer.fit_transform(df)

    return pd.DataFrame(imputed_values, columns=X_cols)

Changing date features to numeric, if one decides to use time as an ordinal feature

In [13]:
def change_date(df, var_str):
    """Replace 'mm/yyyy' strings in ``df[var_str]`` with proleptic ordinals.

    The column is overwritten in ``df``, which is also returned.
    """
    def _to_ordinal(month_year):
        # Parsed dates fall on the first day of the month
        return dt.strptime(month_year, '%m/%Y').toordinal()

    # Convert to ordinal
    df[var_str] = df[var_str].apply(_to_ordinal)

    return df

One Hot Encoding

Converts categorical variables to dummy variables

In [14]:
def onehotencoding(df):
    """One-hot encode the object columns that have no missing values.

    Columns containing any missing value are set aside untouched and
    re-attached after the encoded columns. The result has a fresh 0..n-1
    index and the column order: non-object columns, dummy columns, then the
    columns with missing values.
    """
    # Columns with missing values bypass the encoding
    has_missing = df.columns[df.isnull().any()]
    incomplete = df[has_missing]
    complete = df.drop(incomplete.columns, axis=1)

    # Dummy-encode categorical features; keep the remaining ones as they are
    categorical = complete.select_dtypes(include=['object'])
    dummies = pd.get_dummies(categorical)
    non_categorical = complete.drop(categorical.columns, axis=1)

    combined = pd.concat([non_categorical, dummies, incomplete], axis=1)
    return combined.reset_index(drop=True)

Feature Selection

Run permutation importance and score based on ROC-AUC

In [15]:
def relative_importance(X_train, y_train, bank_str, sample='bal', max_features=0.5):
    """Permutation importance (ROC AUC) of features for one bank's model.

    Parameters
    ----------
    X_train : DataFrame of predictors containing a 'Bank_<bank_str>' column.
    y_train : target values aligned with ``X_train``.
    bank_str : bank name; every other 'Bank_*' column is dropped.
    sample : 'bal' for a BalancedRandomForestClassifier, 'wgt' for a
        class-weighted RandomForestClassifier, anything else for an
        unweighted RandomForestClassifier.
    max_features : forwarded to the classifier.

    Returns
    -------
    Series : mean permutation importance, indexed by feature name.
    """
    # Transform X
    ## define datasets: keep only the dummy of the bank in question
    y = y_train
    X = X_train
    readd = X.loc[:, str('Bank_' + bank_str)]
    X = X.filter(regex=r'^(?!Bank_).*$')
    X.loc[:, str('Bank_' + bank_str)] = readd

    ## Add interaction terms
    X = Bank_Interactions(X, bank_str = bank_str)

    ## Standardize Vars
    X_cols = X.columns
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    # Permutation importance for feature evaluation
    if sample=='bal':
        clf = BalancedRandomForestClassifier(n_estimators=50, random_state=2020, max_features=max_features,
                                             replacement=False, n_jobs=-1)
    elif sample=='wgt':
        clf = RandomForestClassifier(n_estimators=50, random_state=2020, max_features=max_features,
                                     class_weight={1: 0.1, 0: 0.9}, n_jobs=-1)
    else:
        clf = RandomForestClassifier(n_estimators=50, random_state=2020, max_features=max_features,
                                     n_jobs=-1)

    clf = clf.fit(X, y)
    # 'roc_auc' is the scorer name scikit-learn accepts; 'roc_auc_score' is
    # the metric function's name and is rejected as a scoring string
    result = permutation_importance(clf, X, y, n_repeats=10, scoring='roc_auc',
                                    random_state=2020)
    importances = pd.Series(result.importances_mean, index=X_cols)

    return(importances)

***

# Descriptive Statistics

Foreclosure Descriptive Statistics

In [16]:
def Foreclosure_Data(date1 = "", date2 = "", subset = "", df = df):
    """Summarize loan characteristics by foreclosure status.

    Parameters
    ----------
    date1, date2 : when both are given, loans whose 'Original Date' falls in
        any month between the two dates are kept; when only one is given,
        loans whose 'Original Date' equals it exactly are kept.
    subset : string evaluated with ``eval`` and used as a row selector on
        ``df``; only applied when both dates are empty.
    df : loan-level data.

    Returns
    -------
    DataFrame : one row per foreclosure status, indexed by 'Foreclosed'.
    """
    # Subset by Date
    if (date1 == "" and date2 == ""):
        df_sub = df
    elif (date1 != "" and date2 != ""):
        # Expand the range to daily dates, then reduce to unique month/year strings
        date = pd.date_range(date1, date2)
        month_yr = np.array([])
        for i in range(len(date)): 
            month_yr = np.append( month_yr, str( str(date.month[i]) + '/' + str(date.year[i]) ) )
        month_yr = np.unique(month_yr)
        # Left-pad to 7 characters so '1/2007' becomes '01/2007'
        month_yr = np.char.zfill(month_yr, 7)
        df_sub = df[df['Original Date'].isin(month_yr)]
    elif (date2 == ""):
        df_sub = df[df['Original Date']==date1]
    elif (date1 == ""):
        df_sub = df[df['Original Date']==date2]
    
    # Subset by other variable
    # NOTE(review): eval executes arbitrary code in this function's scope (the
    # expression can reference the local names here, e.g. `df`); only pass
    # trusted strings
    if (date1 == "" and date2 == "" and subset != ""):
        df_sub = df[eval(subset)]
        
    # Foreclosures represented
    # NOTE(review): two hard-coded labels; presumably both classes must be
    # present in df_sub for the columns below to line up - confirm
    Foreclosed = ['Not Forclosed', 'Forclosed']
    Target = df_sub.groupby(['Foreclosed']).size().reset_index(name='Total')
    # Original Mortgage Amount
    ORM = df_sub.groupby(['Foreclosed']).agg({'Original Mortgage Amount': 'mean'}).round(2)
    # Credit Score
    CS = df_sub.groupby(['Foreclosed']).agg({'Harmonized Credit Score': 'mean'}).astype(int)
    # Original Debt to Income Ratio
    DTI = df_sub.groupby(['Foreclosed']).agg({'Original Debt to Income Ratio': 'mean'}).round(1)
    # First Time Home Buyer (count of loans flagged 'Y' per foreclosure status)
    FT = df_sub[df_sub['First Time Home Buyer']=='Y'].groupby(['Foreclosed']).size().reset_index(name='Total')
    # Refinance (count of loans whose 'Loan Purpose' is not 0)
    LP = df_sub[df_sub['Loan Purpose']!=0].groupby(['Foreclosed']).size().reset_index(name='Total')
    # Original Interest Rate
    IR = df_sub.groupby(['Foreclosed']).agg({'Original Interest Rate': 'mean'}).round(2)
    # Original Loan Term
    LT = df_sub.groupby(['Foreclosed']).agg({'Original Loan Term': 'mean'}).astype(int)
    # Original Combined Loan-to-Value (CLTV)
    CLTV = df_sub.groupby(['Foreclosed']).agg({'Original Combined Loan-to-Value (CLTV)': 'mean'}).round(1)
    # Single Borrower Ratio
    SBR = df_sub.groupby(['Foreclosed']).agg({'Single Borrower': 'mean'}).round(2)
    # Mortgage Insurance Type
    MIT = df_sub.groupby(['Foreclosed']).agg({'Mortgage Insurance Type': 'mean'}).round(2)
    # Mortgage Insurance %
    MIP = df_sub.groupby(['Foreclosed']).agg({'Mortgage Insurance %': 'mean'}).round(2)    
    # Median Household Income
    MHI = df_sub.groupby(['Foreclosed']).agg({'Median Household Income': 'mean'}).round(2)
    
    # Create Dataset
    # NOTE(review): FT and LP are counted on filtered rows and re-indexed from 0,
    # so their ratios to Target['Total'] pair rows by position; a status with no
    # matching loans would shift the pairing - verify
    df_new = pd.DataFrame({ 'Foreclosed': Foreclosed, 
                            'Foreclosed (%)': ((Target['Total'] / df_sub.shape[0]) * 100).round(1),
                            'Foreclosed (N)': df_sub.groupby(['Foreclosed']).size(),
                            'Mortgage Amount ($)': ORM['Original Mortgage Amount'].tolist(),
                            'Credit Score': CS['Harmonized Credit Score'].tolist(),
                            'Debt to Income Ratio': DTI['Original Debt to Income Ratio'].tolist(),
                            'First Time Home Buyer (%)': ((FT['Total'] / Target['Total']) * 100).round(1).tolist(),
                            'Refinanced': ((LP['Total'] / Target['Total']) * 100).round(1).tolist(),
                            'Interest Rate': IR['Original Interest Rate'].tolist(),
                            'Loan Term': LT['Original Loan Term'].tolist(),
                            'Combined Loan-to-Value (CLTV)': CLTV['Original Combined Loan-to-Value (CLTV)'].tolist(),
                            'Single Borrower Ratio': SBR['Single Borrower'].tolist(),
                            'Mortgage Insurance Ratio': MIT['Mortgage Insurance Type'].tolist(),
                            'Mortgage Insurance %': MIP['Mortgage Insurance %'].tolist(),
                            'Estimated Household Income ($)': MHI['Median Household Income'].tolist()
                         })
    
    df_new = df_new.set_index('Foreclosed')
    return df_new

Bank Descriptive Statistics

In [17]:
def Bank_Data(date1 = "", date2 = "", subset = "", df = df):
    """Summarize loan characteristics by bank.

    Parameters
    ----------
    date1, date2 : when both are given, loans whose 'Original Date' falls in
        any month between the two dates are kept; when only one is given,
        loans whose 'Original Date' equals it exactly are kept.
    subset : string evaluated with ``eval`` and used as a row selector on
        ``df``; only applied when both dates are empty.
    df : loan-level data.

    Returns
    -------
    DataFrame : one row per bank, indexed by 'Bank'.
    """
    # Subset by Date
    if (date1 == "" and date2 == ""):
        df_sub = df
    elif (date1 != "" and date2 != ""):
        # Expand the range to daily dates, then reduce to unique month/year strings
        date = pd.date_range(date1, date2)
        month_yr = np.array([])
        for i in range(len(date)): 
            month_yr = np.append( month_yr, str( str(date.month[i]) + '/' + str(date.year[i]) ) )
        month_yr = np.unique(month_yr)
        # Left-pad to 7 characters so '1/2007' becomes '01/2007'
        month_yr = np.char.zfill(month_yr, 7)
        df_sub = df[df['Original Date'].isin(month_yr)]
    elif (date2 == ""):
        df_sub = df[df['Original Date']==date1]
    elif (date1 == ""):
        df_sub = df[df['Original Date']==date2]
    
    # Subset by other variable
    # NOTE(review): eval executes arbitrary code in this function's scope (the
    # expression can reference the local names here, e.g. `df`); only pass
    # trusted strings
    if (date1 == "" and date2 == "" and subset != ""):
        df_sub = df[eval(subset)]
        
    # Banks represented
    Banks = df_sub.groupby(['Bank']).size().reset_index(name='Total')
    # Foreclosures (share of foreclosed loans per bank)
    Target = df_sub.groupby(['Bank']).agg({'Foreclosed': 'mean'})
    # Original Mortgage Amount
    ORM = df_sub.groupby(['Bank']).agg({'Original Mortgage Amount': 'mean'}).round(2)
    # Credit Score
    CS = df_sub.groupby(['Bank']).agg({'Harmonized Credit Score': 'mean'}).astype(int)
    # Original Debt to Income Ratio
    DTI = df_sub.groupby(['Bank']).agg({'Original Debt to Income Ratio': 'mean'}).round(1)
    # First Time Home Buyer (count of loans flagged 'Y' per bank)
    FT = df_sub[df_sub['First Time Home Buyer']=='Y'].groupby(['Bank']).size().reset_index(name='Total')
    # Refinance (count of loans whose 'Loan Purpose' is not 0)
    LP = df_sub[df_sub['Loan Purpose']!=0].groupby(['Bank']).size().reset_index(name='Total')
    # Original Interest Rate
    IR = df_sub.groupby(['Bank']).agg({'Original Interest Rate': 'mean'}).round(2)
    # Original Loan Term
    LT = df_sub.groupby(['Bank']).agg({'Original Loan Term': 'mean'}).astype(int)
    # Original Combined Loan-to-Value (CLTV)
    CLTV = df_sub.groupby(['Bank']).agg({'Original Combined Loan-to-Value (CLTV)': 'mean'}).round(1)
    # Single Borrower Ratio
    SBR = df_sub.groupby(['Bank']).agg({'Single Borrower': 'mean'}).round(2)
    # Mortgage Insurance Type
    MIT = df_sub.groupby(['Bank']).agg({'Mortgage Insurance Type': 'mean'}).round(2)
    # Mortgage Insurance %
    MIP = df_sub.groupby(['Bank']).agg({'Mortgage Insurance %': 'mean'}).round(2)     
    # Median Household Income
    MHI = df_sub.groupby(['Bank']).agg({'Median Household Income': 'mean'}).round(2)
    
    # Create Dataset
    # NOTE(review): FT and LP are counted on filtered rows and re-indexed from 0,
    # so their ratios to Banks['Total'] pair rows by position; a bank with no
    # first-time buyers (or no loans with 'Loan Purpose' != 0) would shift the
    # pairing and attribute counts to the wrong bank - verify
    df_new = pd.DataFrame({ 'Bank': Banks['Bank'], 
                            'Bank (%)': ((Banks['Total'] / df_sub.shape[0]) * 100).round(1),
                            'Bank (N)': Banks['Total'],
                            'Foreclosed (%)': ((Target['Foreclosed'] * 100).round(1)).tolist(), 
                            'Mortgage Amount ($)': ORM['Original Mortgage Amount'].tolist(),
                            'Credit Score': CS['Harmonized Credit Score'].tolist(),
                            'Debt to Income Ratio': DTI['Original Debt to Income Ratio'].tolist(),
                            'First Time Home Buyer (%)': ((FT['Total'] / Banks['Total']) * 100).round(1).tolist(),
                            'Refinance': ((LP['Total'] / Banks['Total']) * 100).round(1).tolist(),
                            'Interest Rate': IR['Original Interest Rate'].tolist(),
                            'Loan Term': LT['Original Loan Term'].tolist(),
                            'Combined Loan-to-Value (CLTV)': CLTV['Original Combined Loan-to-Value (CLTV)'].tolist(),
                            'Single Borrower Ratio': SBR['Single Borrower'].tolist(),
                            'Mortgage Insurance Ratio': MIT['Mortgage Insurance Type'].tolist(),
                            'Mortgage Insurance %': MIP['Mortgage Insurance %'].tolist(),                           
                            'Median Household Income ($)': MHI['Median Household Income'].tolist()
                        })
    
    df_new = df_new.set_index("Bank")
    return df_new

Isolate banks based on maximum, minimum, or other meaningful values

In [18]:
def search_Banks(col, df = Banks, func = max, subset = True):
    """Return the rows of ``df`` where ``col`` equals ``func(df[col])``.

    Prints the column and function name first. With ``subset`` True only
    ``col`` is returned; otherwise all columns are returned.
    """
    print(col, func.__name__, "value")

    shown = col if subset else df.columns
    is_match = df[col] == func(df[col])

    return pd.DataFrame(df[shown][is_match])

***

# Change Data Frames based on Bank in Question

Create single-Bank only subsets

In [19]:
'''
# List of banks for reference
Banks = ['AMTRUST BANK', 'BANK OF AMERICA, N.A.', 'CITIMORTGAGE, INC.', 
         'FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB', 
         'FIRST TENNESSEE BANK NATIONAL ASSOCIATION', 'FLAGSTAR CAPITAL MARKETS CORPORATION', 
         'GMAC MORTGAGE', 'JPMORGAN CHASE BANK, NATIONAL ASSOCIATION', 'OTHER', 
         'PNC BANK, N.A.', 'SUNTRUST MORTGAGE INC.', 'CHASE HOME FINANCE', 'SMALL LOAN BANKS']
'''

# Function to subset banking datasets
def Bank_Subsets(bank_strs, df_X = X_train, df_y = y_train):
    """Split predictors and target into per-bank subsets.

    Parameters
    ----------
    bank_strs : iterable of bank names; ``df_X`` must contain a
        'Bank_<name>' indicator column for each.
    df_X : DataFrame of predictors.
    df_y : target values aligned row-by-row with ``df_X``.

    Returns
    -------
    (X, y) : dictionaries keyed by bank name. X holds the bank's rows without
        any 'Bank_*' column; y holds the matching target values.
    """
    # Initiate Bank dictionaries
    X = {}
    y = {}

    # Bank Subset
    for bank_str in bank_strs:
        is_bank = np.array(df_X[str('Bank_' + bank_str)]==1)
        # Select ROWS of the bank (the mask was previously applied to the
        # column axis), then drop the bank indicator columns
        X[bank_str] = df_X.loc[is_bank, :] \
            .filter(regex=r'^(?!Bank_).*$')
        y[bank_str] = df_y[is_bank]

    return X, y

Bank Interactions

Create interaction terms for loan-based continuous features to understand unique aspects of bank in question on these terms

In [20]:
def Bank_Interactions(df_X = X_train, bank_str = bank_str):
    """Add bank-by-feature interaction columns to ``df_X``.

    For each loan-based continuous feature, a '<feature> [Int]' column is
    written into ``df_X`` holding the feature multiplied by the
    'Bank_<bank_str>' indicator. ``df_X`` is modified and returned.
    """
    # Loan-based continuous features used as interaction terms
    loan_features = (
        'Original Mortgage Amount',
        'Original Interest Rate',
        'Original Combined Loan-to-Value (CLTV)',
        'Original Debt to Income Ratio',
        'Mortgage Insurance %',
        'Median Household Income',
        'Loan Change (1 Year)',
        'Loan Change (5 Years)',
    )
    bank_flag = 'Bank_' + bank_str

    # Multiply each feature by the bank's binary indicator
    for feature in loan_features:
        df_X.loc[:, feature + ' [Int]'] = df_X[feature] * df_X.loc[:, bank_flag]

    return df_X

***

# Data Visualizations of Model Performance

Function to plot target variable (and predictions)

Visualize the percentage and frequency of target variable

In [21]:
def target_values(df_depvar, data=False, prediction=False):
    """Plot and print the class distribution of a target (or prediction) Series.

    Draws a horizontal bar chart of class percentages, then prints the class
    frequencies and the rounded integer percentages. ``data`` and
    ``prediction`` only select the chart title.
    """
    # Class counts and their share of the non-missing values, in whole percent
    target_frequency = df_depvar.value_counts()
    target_percentage = round((target_frequency / df_depvar.count()) * 100).astype(int)

    # Chart title depends on what is being shown
    if data:
        title = 'Bar Chart of Target Variable'
    elif prediction:
        title = 'Bar Chart of Predictions'
    else:
        title = 'Percent of Mortage Defaults'

    # graphing target variable
    jtplot.style(ticks=True, grid=False)
    plt.figure(figsize=(14,4))
    bars = target_percentage.plot.barh(stacked=True, color='#ca2c92')
    bars.invert_yaxis()
    plt.suptitle(title, fontsize=18)
    plt.ylabel('Foreclosed')
    plt.xlabel('Percentage')
    plt.xlim([0,100])
    plt.show()

    # display frequency and percentage of foreclosures
    print('Frequency of Foreclosures\n', target_frequency, '\n', sep='')
    print('Percentage of Foreclosures\n', target_percentage, '\n', sep='')

Visualize scores at various classification thresholds

In [22]:
def threshold(target_prob, y_test = y_test):
    """Score predictions at a grid of classification thresholds.

    For each cutoff, probabilities in ``target_prob`` are turned into 0/1
    predictions (1 when the probability is at least the cutoff) and scored
    against ``y_test`` with accuracy, precision, F1 and ROC AUC. The scores
    are plotted, a classification report is printed for the cutoff with the
    highest ROC AUC, and that cutoff and its score are returned as a dict.
    """
    # Candidate cutoffs
    cutoffs = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
               0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

    acc, prec, f1, auc = [], [], [], []
    best_auc = {'Threshold': 0.5, 'Best ROC AUC Score': 0.0}
    best_acc = {'Threshold': 0.5, 'Best Accuracy Score': 0.0}
    best_prec = {'Threshold': 0.5, 'Best Precision Score': 0.0}
    best_f1 = {'Threshold': 0.5, 'Best F1 Score': 0.0}

    for cutoff in cutoffs:
        y_pred = target_prob.map(lambda prob: 1 if prob >= cutoff else 0)

        # Scores at this cutoff
        acc.append(accuracy_score(y_test, y_pred))
        prec.append(precision_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        auc.append(roc_auc_score(y_test, y_pred))

        # Keep the first cutoff that reaches each best score
        if acc[-1] > best_acc['Best Accuracy Score']:
            best_acc = {'Threshold': cutoff, 'Best Accuracy Score': acc[-1]}
        if prec[-1] > best_prec['Best Precision Score']:
            best_prec = {'Threshold': cutoff, 'Best Precision Score': prec[-1]}
        if f1[-1] > best_f1['Best F1 Score']:
            best_f1 = {'Threshold': cutoff, 'Best F1 Score': f1[-1]}
        if auc[-1] > best_auc['Best ROC AUC Score']:
            best_auc = {'Threshold': cutoff, 'Best ROC AUC Score': auc[-1]}

    # Plot every score against the cutoff
    df_plot = pd.DataFrame({'Threshold': cutoffs, 'Accuracy': acc,
                            'Precision': prec, 'F1': f1, 'ROC AUC': auc})
    plt.figure(figsize=(12,4))
    plt.plot(df_plot['Threshold'], df_plot.iloc[:,1:5])
    plt.title('Scores at Various Thresholds')
    plt.legend(['Accuracy', 'Precision', 'F1', 'ROC AUC'])
    print(plt.show())

    # Report at the cutoff with the best ROC AUC
    y_pred = target_prob.map(lambda prob: 1 if prob >= best_auc['Threshold'] else 0)
    print(classification_report(y_test, y_pred))

    return( best_auc )

Plot Target Classes

Visualizes if there are any obvious classification boundaries

In [23]:
def plot_2d_space(X, y, label='Classes'):
    """Scatter the first two columns of ``X``, styled by class in ``y``.

    Only as many classes as there are colour/marker pairs (two) are drawn.
    ``label`` is used as the plot title.
    """
    palette = ['#1F77B4', '#FF7F0E']
    shapes = ['o', 's']

    for cls, color, shape in zip(np.unique(y), palette, shapes):
        members = y == cls
        plt.scatter(X[members, 0], X[members, 1], c=color, label=cls, marker=shape)

    plt.title(label)
    plt.legend(loc='upper right')
    plt.show()

***