# <center>Bad Bank Behavior<br>Analyzing Bank Mortgages during the 2007 Housing Bubble</center>  

<center>Michael Siebel</center>
<center>August 2020</center>
<br>

## <center>Model Selection Script</center>

# Purpose  
<br>

> Runs a relative importance analysis, called permutation importance, to determine the most important features for each of the algorithms that will be modelled subsequently.  Features with low importance may be candidates for removal during model selection.

> The permutation importance of a feature is calculated as follows. First, a baseline metric, defined by `scoring`, is evaluated on a (potentially different) dataset defined by `X`. Next, a feature column from the validation set is permuted and the metric is evaluated again. The permutation importance is defined to be the difference between the baseline metric and the metric from permuting the feature column.

***

# Load Functions

In [21]:
%run Functions.ipynb
pd.set_option("display.max_columns", 999)
pd.set_option('display.max_rows', 999)

file_to_open = open('..\Data\df.pickle', 'rb') 
df  = pickle.load(file_to_open) 
file_to_open.close()

# Drop mergeID column
df = df.drop(labels='Loan ID', axis=1)

NameError: name 'pandas' is not defined

***

# Set Up Data

In [None]:
# Report the dimensions and the column labels of the loaded data
for heading, value in (('Shape:\n', df.shape), ('\nColumns:\n', df.columns)):
    print(heading, value)

In [None]:
# Frequency of each distinct value in the 'Bank' column
bank_counts = df['Bank'].value_counts()
bank_counts

In [None]:
# Object-dtype columns: the variables that will need one-hot encoding
df_cat = df.select_dtypes(include='object')
df_cat

In [None]:
# Columns excluded from the feature matrix ('Property State' is currently kept)
dropvars = [
    'Original Date', 'File Year', 'File Quarter', 'Month', 'Region',
    'Zip Code', 'Mortgage Insurance Type',
]

# Separate the target column ('Foreclosed') from the remaining predictors
features = df.drop(columns=dropvars)
All_y = features['Foreclosed']
All_X = features.drop(columns='Foreclosed')

# Two stratified splits with the same seed:
#   1) set aside the 1 - test_size share (X_ignore / y_ignore) and keep the rest,
#   2) halve the kept share into train and test sets.
split_seed = 2019
X_ignore, X_keep, y_ignore, y_keep = train_test_split(
    All_X, All_y, test_size=0.2, stratify=All_y, random_state=split_seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_keep, y_keep, test_size=0.5, stratify=y_keep, random_state=split_seed)

# One-hot encode the kept data with the project helper.
# NOTE(review): train and test are encoded independently, so their columns may
# differ if a category is absent from one split -- confirm in onehotencoding.
X_train, X_test = onehotencoding(X_train), onehotencoding(X_test)
X_cols = X_train.columns

print(X_cols)

***

# Data Imputations

In [None]:
# Dimensions of the training features
print(X_train.shape)

# Percentage of missing values per column, rounded to two decimals
n_rows = X_train.shape[0]
missing_pct = X_train.isna().sum() / n_rows * 100
print(missing_pct.round(2))

In [None]:
# Fill in missing values with the project's KNN imputation helper
X_train = KNN_imputations(X_train, X_cols)

# Count the values still missing in the selected columns after imputation
imputation_check_cols = [
    'Household Financial Obligations (Qtr)', 'Household Financial Obligations (Yr)',
    'Consumer Debt Service Payment (Qtr)', 'Consumer Debt Service Payment (Yr)',
    'National Home Price Index (Qtr)', 'National Home Price Index (Yr)',
    'Mortgage Debt Service Payments (Qtr)', 'Mortgage Debt Service Payments (Yr)',
    'Monthly Supply of Houses (Qtr)', 'Monthly Supply of Houses (Yr)',
    'Vacant Housing Units for Sale (Qtr)', 'Vacant Housing Units for Sale (Yr)',
    'Homeownership Rate (Qtr)', 'Homeownership Rate (Yr)',
    'Vacant Housing Units for Rent (Qtr)', 'Vacant Housing Units for Rent (Yr)',
    'Rental Vacancy Rate (Qtr)', 'Rental Vacancy Rate (Yr)',
    'numemp',
    'asset (Qtr)', 'asset (Yr)',
    'lnlsnet (Qtr)', 'lnlsnet (Yr)',
    'liab (Qtr)', 'liab (Yr)',
    'dep (Qtr)', 'dep (Yr)',
    'eqtot (Qtr)', 'eqtot (Yr)',
]
X_train[imputation_check_cols].isna().sum()

In [None]:
# Percentage of missing values per column, rounded to two decimals
missing_counts = X_train.isna().sum()
(missing_counts / X_train.shape[0] * 100).round(2)

***

# Relative Importance Analysis

In [None]:
# Relative importance for balanced classes
# Maps each distinct value of df['Bank'] to the result of relative_importance
# called with sample='bal'.
rel_imp_bal = {}
for bank_str in np.unique(df['Bank']):
    print(bank_str)
    rel_imp_bal[bank_str] = relative_importance(X_train, y_train, bank_str, sample='bal')
    print(rel_imp_bal[bank_str])

# Save final dictionary
# Raw string: in a normal literal the "\r" of "\rel_imp..." is a carriage
# return, which corrupts the file name. The context manager closes the file
# even if pickling fails.
with open(r"..\Data\rel_imp_bal.pickle", "wb") as file_to_store:
    pickle.dump(rel_imp_bal, file_to_store)

In [None]:
# Relative importance for weighted classes
# Maps each distinct value of df['Bank'] to the result of relative_importance
# called with sample='wgt'.
rel_imp_wgt = {}
for bank_str in np.unique(df['Bank']):
    print(bank_str)
    # Store in rel_imp_wgt (not rel_imp_bal): writing to the balanced dict
    # overwrote its results and left the weighted dict empty when pickled.
    rel_imp_wgt[bank_str] = relative_importance(X_train, y_train, bank_str, sample='wgt')
    print(rel_imp_wgt[bank_str])

# Save final dictionary
# Raw string: in a normal literal the "\r" of "\rel_imp..." is a carriage
# return, which corrupts the file name. The context manager closes the file
# even if pickling fails.
with open(r"..\Data\rel_imp_wgt.pickle", "wb") as file_to_store:
    pickle.dump(rel_imp_wgt, file_to_store)

In [None]:
# Relative importance for unbalanced classes
# Maps each distinct value of df['Bank'] to the result of relative_importance
# called with sample='none'.
rel_imp_unbal = {}
for bank_str in np.unique(df['Bank']):
    print(bank_str)
    rel_imp_unbal[bank_str] = relative_importance(X_train, y_train, bank_str, sample='none')
    print(rel_imp_unbal[bank_str])

# Save final dictionary
# Raw string: in a normal literal the "\r" of "\rel_imp..." is a carriage
# return, which corrupts the file name. The context manager closes the file
# even if pickling fails.
with open(r"..\Data\rel_imp_unbal.pickle", "wb") as file_to_store:
    pickle.dump(rel_imp_unbal, file_to_store)

***