# <center>Bad Bank Behavior<br>Analyzing Bank Mortgages During the 2007 Housing Bubble</center>  

<center>Michael Siebel</center>
<center>August 2020</center>
<br>

## <center>Model Selection Script</center>

# Purpose  
<br>

> Runs a relative importance analysis, called permutation importance, to determine the most important features for each of the algorithms that will be modelled subsequently.  Features with low importance may be candidates for removal during model selection.

> The permutation importance of a feature is calculated as follows. First, a baseline metric, defined by scoring, is evaluated on a (potentially different) dataset defined by X. Next, a feature column from the validation set is permuted and the metric is evaluated again. The permutation importance is defined as the difference between the baseline metric and the metric obtained after permuting the feature column.

***

# Load Functions

In [1]:
%run Functions.ipynb
pd.set_option("display.max_columns", 200)
pd.set_option('display.max_rows', 200)

file_to_open = open('..\Data\df.pkl', 'rb') 
df  = pickle.load(file_to_open) 
file_to_open.close()

# Drop mergeID column
df = df.drop(labels='Loan ID', axis=1)

# Convert Inf values to NA
df = df.replace([np.inf, -np.inf], np.nan)

***

# Set Up Data

In [2]:
# Report the dimensions and the column labels of the loaded frame
frame_shape = df.shape
frame_columns = df.columns
print('Shape:\n', frame_shape)
print('\nColumns:\n', frame_columns)

Shape:
 (3789014, 56)

Columns:
 Index(['Origination Channel', 'Bank', 'Original Interest Rate',
       'Original Mortgage Amount', 'Original Loan Term', 'Original Date',
       'Original Combined Loan-to-Value (CLTV)', 'Single Borrower',
       'Original Debt to Income Ratio', 'First Time Home Buyer',
       'Loan Purpose', 'Property Type', 'Occupancy Type', 'Property State',
       'Zip Code', 'Mortgage Insurance %', 'Mortgage Insurance Type',
       'File Year', 'File Quarter', 'Foreclosed', 'Month', 'Year',
       'Harmonized Credit Score', 'Loan Change (1 Year)',
       'Loan Change (5 Years)', 'Median Household Income', 'Region',
       'Household Financial Obligations (Qtr)',
       'Household Financial Obligations (Yr)',
       'Consumer Debt Service Payment (Qtr)',
       'Consumer Debt Service Payment (Yr)', 'National Home Price Index (Qtr)',
       'National Home Price Index (Yr)',
       'Mortgage Debt Service Payments (Qtr)',
       'Mortgage Debt Service Payments (Yr)', '

In [3]:
# Number of loans recorded for each value of the 'Bank' column
df.loc[:, 'Bank'].value_counts()

BANK OF AMERICA, N.A.                        975374
OTHER                                        838987
CITIMORTGAGE, INC.                           401156
SMALL LOAN BANKS                             367310
GMAC MORTGAGE                                262019
JPMORGAN CHASE BANK, NATIONAL ASSOCIATION    257610
SUNTRUST MORTGAGE INC.                       195171
FLAGSTAR CAPITAL MARKETS CORPORATION         125532
AMTRUST BANK                                 119659
FIRST TENNESSEE BANK NATIONAL ASSOCIATION     97673
PNC BANK, N.A.                                94593
CHASE HOME FINANCE                            43964
FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB       9966
Name: Bank, dtype: int64

In [4]:
# Exclude the pooled "OTHER" and "SMALL LOAN BANKS" categories so that only
# individually named banks remain
pooled_categories = ['OTHER', 'SMALL LOAN BANKS']
df = df[~df['Bank'].isin(pooled_categories)]
df['Bank'].value_counts()

BANK OF AMERICA, N.A.                        975374
CITIMORTGAGE, INC.                           401156
GMAC MORTGAGE                                262019
JPMORGAN CHASE BANK, NATIONAL ASSOCIATION    257610
SUNTRUST MORTGAGE INC.                       195171
FLAGSTAR CAPITAL MARKETS CORPORATION         125532
AMTRUST BANK                                 119659
FIRST TENNESSEE BANK NATIONAL ASSOCIATION     97673
PNC BANK, N.A.                                94593
CHASE HOME FINANCE                            43964
FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB       9966
Name: Bank, dtype: int64

In [5]:
# Object-dtype columns, i.e. the candidates for one-hot encoding
df_cat = df.select_dtypes(include='object')
df_cat.tail(5)

Unnamed: 0,Origination Channel,Bank,Original Date,First Time Home Buyer,Property Type,Occupancy Type,Property State,File Year,File Quarter,Month,Year,Region
1456323,C,"BANK OF AMERICA, N.A.",12/2005,N,PU,P,CO,2005,Q4,12,2005,West
1456324,R,"BANK OF AMERICA, N.A.",12/2005,N,SF,P,KY,2005,Q4,12,2005,South
1456326,C,"BANK OF AMERICA, N.A.",12/2005,N,SF,P,ME,2005,Q4,12,2005,Northeast
1456327,C,"BANK OF AMERICA, N.A.",12/2005,N,SF,P,AR,2005,Q4,12,2005,South
1456328,C,FLAGSTAR CAPITAL MARKETS CORPORATION,12/2005,N,SF,P,CA,2005,Q4,12,2005,West


In [6]:
# Variables to drop
dropvars = ['Original Date', 'File Year', 'File Quarter', 'Month', 'Year', 'Region',
            'Zip Code', 'Mortgage Insurance Type']  # 'Property State', 

# All data: remove the unused columns, then keep complete cases only
df = df.drop(labels=dropvars, axis=1)
df = df.dropna()
All_y = df['Foreclosed']
All_X = df.drop(labels='Foreclosed', axis=1)

# Stratified 50/50 train/test split (half of the rows are used for training
# to keep runtimes down; the other half is the held-out test set)
X_train, X_test, y_train, y_test = train_test_split(All_X, All_y, test_size = 0.5,
                                                    stratify = All_y, random_state=2019)

# One hot encoding on each split
X_train = onehotencoding(X_train)
X_train = X_train.drop(labels=['First Time Home Buyer_U'], axis=1)
X_test = onehotencoding(X_test)
X_test = X_test.drop(labels=['First Time Home Buyer_U'], axis=1)

# The two splits are encoded independently, so a category that is absent from
# one of them would produce a different set (or order) of dummy columns.
# Force the test frame onto the training layout: missing dummies become 0 and
# columns unknown to the training data are discarded.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Save columns
X_cols = X_train.columns

print('Shape:', X_train.shape)

Shape: (1180043, 116)


***

# Missingness

In [7]:
# Percentage of missing values per training column, rounded to two decimals
n_train_rows = X_train.shape[0]
pct_missing = X_train.isna().sum().div(n_train_rows).mul(100)
print(pct_missing.round(2))

Original Interest Rate                            0.0
Original Mortgage Amount                          0.0
Original Loan Term                                0.0
Original Combined Loan-to-Value (CLTV)            0.0
Single Borrower                                   0.0
Original Debt to Income Ratio                     0.0
Loan Purpose                                      0.0
Mortgage Insurance %                              0.0
Harmonized Credit Score                           0.0
Loan Change (1 Year)                              0.0
Loan Change (5 Years)                             0.0
Median Household Income                           0.0
Household Financial Obligations (Qtr)             0.0
Household Financial Obligations (Yr)              0.0
Consumer Debt Service Payment (Qtr)               0.0
Consumer Debt Service Payment (Yr)                0.0
National Home Price Index (Qtr)                   0.0
National Home Price Index (Yr)                    0.0
Mortgage Debt Service Paymen

In [8]:
# Disabled step: the KNN imputation below is wrapped in a string literal so it
# is not executed; the cell only echoes the text of the literal as its output.
'''
# impute using KNN
imp_cols = ['numemp', 'asset (5 Yr)', 'asset (1 Yr)', 'lnlsnet (5 Yr)', 
            'lnlsnet (1 Yr)', 'liab (5 Yr)', 'liab (1 Yr)', 'dep (5 Yr)', 
            'dep (1 Yr)', 'eqtot (5 Yr)', 'eqtot (1 Yr)', 
            'Original Combined Loan-to-Value (CLTV)', 'Original Debt to Income Ratio',
            'Harmonized Credit Score']
X_train = KNN_imputations(df_X=X_train, df_y=y_train, X_cols=imp_cols, n_neighbors=3)

# Missing
X_train[imp_cols].isna().sum()
'''

"\n# impute using KNN\nimp_cols = ['numemp', 'asset (5 Yr)', 'asset (1 Yr)', 'lnlsnet (5 Yr)', \n            'lnlsnet (1 Yr)', 'liab (5 Yr)', 'liab (1 Yr)', 'dep (5 Yr)', \n            'dep (1 Yr)', 'eqtot (5 Yr)', 'eqtot (1 Yr)', \n            'Original Combined Loan-to-Value (CLTV)', 'Original Debt to Income Ratio',\n            'Harmonized Credit Score']\nX_train = KNN_imputations(df_X=X_train, df_y=y_train, X_cols=imp_cols, n_neighbors=3)\n\n# Missing\nX_train[imp_cols].isna().sum()\n"

In [9]:
def pca_fred(X_train=X_train, X_test=X_test, n_components=4):
    """Replace the FRED macroeconomic columns with their principal components.

    The PCA is fitted on the training rows only and then applied to both
    splits, so no information from the test rows enters the projection.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Feature frames that both contain every FRED column listed below.
        The defaults are the notebook-level frames as they existed when this
        cell was executed (default values are bound at definition time).
    n_components : int, default 4
        Passed straight to ``PCA``. One 'Macroeconomy PCA <k>' column is
        created per component actually returned by the transform.

    Returns
    -------
    (DataFrame, DataFrame)
        Copies of ``X_train`` and ``X_test`` with the FRED columns removed and
        the component columns appended at the end. The inputs are not modified.
    """
    # FRED Subset
    columns = ['Household Financial Obligations (Qtr)', 'Household Financial Obligations (Yr)',
               'Consumer Debt Service Payment (Qtr)', 'Consumer Debt Service Payment (Yr)',
               'National Home Price Index (Qtr)', 'National Home Price Index (Yr)',
               'Mortgage Debt Service Payments (Qtr)', 'Mortgage Debt Service Payments (Yr)',
               'Monthly Supply of Houses (Qtr)', 'Monthly Supply of Houses (Yr)',
               'Vacant Housing Units for Sale (Qtr)', 'Vacant Housing Units for Sale (Yr)',
               'Homeownership Rate (Qtr)', 'Homeownership Rate (Yr)', 'Vacant Housing Units for Rent (Qtr)',
               'Vacant Housing Units for Rent (Yr)', 'Rental Vacancy Rate (Qtr)', 'Rental Vacancy Rate (Yr)']
    fred_train = X_train[columns]
    fred_test = X_test[columns]

    # Fit PCA on the training rows, project both splits
    dimredu = PCA(n_components=n_components, random_state=2020).fit(fred_train)
    train_scores = dimredu.transform(fred_train)
    test_scores = dimredu.transform(fred_test)

    # Name the columns from the number of components actually produced, so
    # the function also works when n_components is not 4.
    pca_names = ['Macroeconomy PCA {}'.format(k + 1) for k in range(train_scores.shape[1])]

    # Substitute PCA columns. The scores are attached positionally (plain
    # arrays, row i of the scores belongs to row i of the frame). Wrapping
    # them in a new DataFrame and concatenating, as before, aligned on index
    # labels and therefore scrambled/added rows whenever the inputs did not
    # carry a fresh 0..n-1 index (e.g. after a shuffled train/test split).
    X_train = X_train.drop(labels=columns, axis=1).assign(
        **{name: train_scores[:, k] for k, name in enumerate(pca_names)})
    X_test = X_test.drop(labels=columns, axis=1).assign(
        **{name: test_scores[:, k] for k, name in enumerate(pca_names)})

    return X_train, X_test

In [10]:
from sklearn.decomposition import PCA

# Swap the macroeconomic variables of both splits for their PCA components
X_train, X_test = pca_fred(X_train=X_train, X_test=X_test, n_components=4)

# Keep the resulting column labels and show them
X_cols = X_train.columns
X_cols

Index(['Original Interest Rate', 'Original Mortgage Amount',
       'Original Loan Term', 'Original Combined Loan-to-Value (CLTV)',
       'Single Borrower', 'Original Debt to Income Ratio', 'Loan Purpose',
       'Mortgage Insurance %', 'Harmonized Credit Score',
       'Loan Change (1 Year)',
       ...
       'Property State_VA', 'Property State_VT', 'Property State_WA',
       'Property State_WI', 'Property State_WV', 'Property State_WY',
       'Macroeconomy PCA 1', 'Macroeconomy PCA 2', 'Macroeconomy PCA 3',
       'Macroeconomy PCA 4'],
      dtype='object', length=102)

***

# Check for Multicollinearity

In [11]:
# Remove multicollinear columns: every column whose name starts with
# 'asset', 'liab', 'eqtot' or 'dep' is dropped from both splits
# (one negative look-ahead replaces the four chained filters).
multicollinear_filter = r'^(?!asset|liab|eqtot|dep).*$'
X_train = X_train.filter(regex=multicollinear_filter)
X_test = X_test.filter(regex=multicollinear_filter)

# List of banks
Banks = ['AMTRUST BANK', 'BANK OF AMERICA, N.A.', 'CITIMORTGAGE, INC.',
         'FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB',
         'FIRST TENNESSEE BANK NATIONAL ASSOCIATION', 'FLAGSTAR CAPITAL MARKETS CORPORATION',
         'GMAC MORTGAGE', 'JPMORGAN CHASE BANK, NATIONAL ASSOCIATION',
         'PNC BANK, N.A.', 'SUNTRUST MORTGAGE INC.', 'CHASE HOME FINANCE']

# Create dictionary of subsetted bank data (must run while the bank columns
# are still present, i.e. before they are filtered out below)
Banks_X, Banks_y = Bank_Subsets(Banks, df_X = X_train, df_y = y_train)
Banks_X_test, Banks_y_test = Bank_Subsets(Banks, df_X = X_test, df_y = y_test)

# Remove Banks from full data. The same pattern is applied to both splits:
# previously the test split used '^(?!Bank)', which would also discard any
# column merely starting with 'Bank' and leave train and test with different
# feature sets.
bank_dummy_filter = r'^(?!Bank_).*$'
X_train = X_train.filter(regex=bank_dummy_filter)
X_test = X_test.filter(regex=bank_dummy_filter)

# Number of final columns
print('Number of final columns:', len(X_train.columns))

Number of final columns: 83


***

# Vote by Committee

In [None]:
rfc = BalancedRandomForestClassifier(random_state=2020, n_estimators=200,
                                     replacement=False, n_jobs=-1)
rus = RUSBoostClassifier(n_estimators=50, learning_rate=0.1, random_state=2020)

# Full Data
y_full = y_train
X_full = X_train
X_test_full = X_test

# # Subset
# subset = subset_full
# X_full = X_full.loc[:, subset]
# X_test_full = X_test_full.loc[:, subset]

# Standardize Vars: the scaler is fitted on the training rows only and then
# applied to both splits (X_full / X_test_full become arrays from here on)
scaler = StandardScaler().fit(X_full)
X_full = scaler.transform(X_full)
X_test_full = scaler.transform(X_test_full)

# Model Bal
param_grid = {
    'max_depth': list(range(3, 40)),
    'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
    'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
    'max_features': [0.5, 'sqrt', 'log2']
}

CV_ful = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
CV_ful.fit(X_full, y_full)

# Snapshot the search AFTER fitting. The snapshot used to be taken before
# fit(), so anything later restored from `fulrfc_models` was an unfitted
# search object that cannot predict.
fulrfc_models = pickle.dumps(CV_ful)

print(CV_ful.best_params_)
print("F1 Best Score:", CV_ful.best_score_)

# Predicted probability of the positive class (column 1) on the test split
fulrfc_proba = pd.DataFrame(CV_ful.predict_proba(X_test_full)).loc[:,1]
print(threshold(fulrfc_proba, y_test))

In [None]:
def model_banks(Banks_X, Banks_y, Banks_X_test, bank_str, subset=None, clf=None):
    """Grid-search a classifier on one bank's loans and return its test votes.

    Parameters
    ----------
    Banks_X, Banks_y : mapping
        Training features / labels, looked up with ``bank_str``.
    Banks_X_test : mapping
        Test features, looked up with ``bank_str``.
    bank_str : str
        Key of the bank to model.
    subset : optional
        Accepted for call compatibility; the column-subsetting step is
        currently disabled, so the value is not used.
    clf : estimator, optional
        Estimator handed to ``GridSearchCV``. When omitted, the notebook-level
        ``rfc`` is used (which is what the function always used before).

    Returns
    -------
    pandas.Series
        0/1 predictions for the bank's test rows, obtained by cutting the
        predicted positive-class probability at ``threshold(...)['Threshold']``.

    Notes
    -----
    The test labels are read from the notebook-level ``Banks_y_test``.
    NOTE(review): the cut-off is chosen by ``threshold`` using those test
    labels, so the resulting votes are tuned on the test data - confirm that
    this is intended.
    """
    # Load Bank Data
    y = Banks_y[bank_str]
    X = Banks_X[bank_str]
    X_test_bnk = Banks_X_test[bank_str]
    y_test_bnk = Banks_y_test[bank_str]

    # Subset
    # subset = subset[bank_str]
    # X = X.loc[:, subset]
    # X_test_bnk = X_test_bnk.loc[:, subset]

    # Standardize Vars (scaler fitted on this bank's training rows only)
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    X_test_bnk = scaler.transform(X_test_bnk)

    # Model Bal
    param_grid = {
        'max_depth': list(range(3, 40)),
        'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
        'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
        'max_features': [0.5, 'sqrt', 'log2']
    }

    # Model: honour the estimator passed in by the caller
    estimator = rfc if clf is None else clf
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

    # Fit and score on THIS bank's data. The previous version fitted on the
    # notebook-level X_full / y_full and predicted on X_test_full, so the
    # "bank" model was just a second copy of the full-data model and its
    # predictions did not line up with the bank's test labels.
    search.fit(X, y)
    print(search.best_params_)
    print("F1 Best Score:", search.best_score_)

    # Probability of the positive class (column 1) for the bank's test rows
    proba = pd.DataFrame(search.predict_proba(X_test_bnk)).loc[:,1]
    best_f1 = threshold(proba, y_test_bnk)
    cutoff = best_f1['Threshold']

    return proba.map(lambda x: 1 if x >= cutoff else 0)
    
clfs = []
votes = pd.DataFrame()
for bank_str in ['AMTRUST BANK']:
    print(bank_str)
    # `subset` is passed explicitly: it is a required parameter of
    # model_banks and omitting it raised a TypeError.
    votes['rfc'] = model_banks(Banks_X, Banks_y, Banks_X_test, bank_str, subset=None, clf=rfc)
    # votes.loc[:,'wgt'] = model_banks(Banks_X, Banks_y, Banks_X_test, bank_str, subset_wgt, wgt)
    # votes.loc[:,'rus'] = model_banks(Banks_X, Banks_y, Banks_X_test, bank_str, subset_rus, rus)

    # Full Data
    # subset = subset_full
    # X_test_bnk = Banks_X_test[bank_str].loc[:, subset]
    # Scale the bank's test rows the way the full-data model's inputs were
    # scaled: with a scaler fitted on the raw training frame. (X_full has
    # already been standardized at this point, so fitting on it again would
    # leave the bank rows effectively unscaled; X_test_bnk was also never
    # defined at notebook level.)
    # NOTE(review): assumes Banks_X_test[bank_str] has the same columns as
    # X_train - confirm against Bank_Subsets.
    full_scaler = StandardScaler().fit(X_train)
    X_test_bnk = full_scaler.transform(Banks_X_test[bank_str])

    # Rfc: use the search object fitted on the full data directly. Restoring
    # it from `fulrfc_models` here could replace it with a snapshot taken
    # before fitting, which cannot predict.
    fulrfc_bank_proba = pd.DataFrame(CV_ful.predict_proba(X_test_bnk)).iloc[:,1]
    votes['fulrfc'] = fulrfc_bank_proba.map(lambda x: 1 if x >= 0.5 else 0)

In [None]:
# Share of positive votes across the first two vote columns; a share of at
# least one half is recorded as a positive committee vote
positive_share = votes.iloc[:, 0:2].sum(axis=1) / 2
votes.loc[:, 'maj'] = positive_share.map(lambda share: 1 if share >= 0.5 else 0)

# Attach the observed outcomes for the bank and inspect the result
votes.loc[:, 'true'] = np.array(Banks_y_test['AMTRUST BANK'])
print(np.mean(votes.loc[:, 'maj']))
display(votes.head(25))

In [None]:
# ROC AUC of each active voter against the AMTRUST BANK test labels, printed
# in the order rfc, fulrfc, maj
# (the 'wgt', 'rus', 'fulwgt' and 'fulrus' voters are currently disabled)
amtrust_truth = Banks_y_test['AMTRUST BANK']
for voter in ('rfc', 'fulrfc', 'maj'):
    print(roc_auc_score(amtrust_truth, votes.loc[:, voter]))

In [None]:
# F1 score of each active voter against the AMTRUST BANK test labels, printed
# in the order rfc, fulrfc, maj
# (the 'wgt', 'rus', 'fulwgt' and 'fulrus' voters are currently disabled)
amtrust_truth = Banks_y_test['AMTRUST BANK']
for voter in ('rfc', 'fulrfc', 'maj'):
    print(f1_score(amtrust_truth, votes.loc[:, voter]))