In [6]:
# Core Packages
import pandas as pd

# Two small frames standing in for the acquisition and performance tables.
acq = pd.DataFrame({'Index': [1, 2]})
per = pd.DataFrame({'Index': [1, 2]})

# File stems; the year sits at characters 12-15 and the quarter at 16-17 of
# the acquisition stem.
x = ['Acquisition_2007Q1', 'Acquisition_2008Q2']
y = ['Perf_2007Q1', 'Perf_2008Q2']

# Each pass overwrites the scalar columns written by the previous pass, so the
# frames end up tagged with the year/quarter of the last stem in `x`.
for i, stem in enumerate(x):
    year_tag = stem[12:16]
    quarter_tag = stem[16:18]
    for frame in (acq, per):
        frame['Year'] = year_tag
        frame['Quarter'] = quarter_tag

acq

Unnamed: 0,Index,Year,Quarter
0,1,2008,Q2
1,2,2008,Q2


In [None]:
"""
Grabs the entire Federal Deposit Insurance Corporation (FDIC) Statistics on
Depository Institutions (SDI) data set.

Note that this is a large data set! There are roughly 85 zip files each of
which is between 40 and 84 MB.

"""
import pandas as pd
import requests

base_url = 'https://www7.fdic.gov/sdi/Resource/AllReps/All_Reports_'

# use pandas to construct a list of quarterly dates
present = '20071231'
datetimes = pd.date_range('20070331', end=present, freq='Q')
# DatetimeIndex.strftime renders the YYYYMMDD stamps directly; the previous
# DatetimeIndex.format(formatter=...) call is deprecated in recent pandas.
dates = datetimes.strftime('%Y%m%d').tolist()

for date in dates:
    print(date)
    # ...construct the url...
    tmp_url = base_url + date + '.zip'

    # ...make the connection and grab the zipped files...
    # The timeout stops a stalled connection from hanging the notebook forever.
    tmp_buffer = requests.get(tmp_url, timeout=60)
    # Fail loudly on 4xx/5xx: otherwise the HTML error page would be written
    # to disk under a .zip name and only blow up later when it is opened.
    tmp_buffer.raise_for_status()

    # ...save them to disk...
    with open('All_Reports_' + date + '.zip', 'wb') as tmp_zip_file:
        tmp_zip_file.write(tmp_buffer.content)

    print('Done with files for ' + date + '!')


In [None]:
"""
Grabs the entire Federal Deposit Insurance Corporation (FDIC) institutions data
set which catalogues the history of mergers and acquisitions for all FDIC
regulated institutions and turns it into a Pandas DataFrame and pickles the
object for future use.

"""
import zipfile

import pandas as pd
import requests

# Path of the institutions CSV, read from local disk.
filename = 'institutions2/INSTITUTIONS2.CSV'

# The download-and-unzip route is currently disabled; it is kept here for
# reference only:
#   base_url = 'https://www7.fdic.gov/IDASP/'
#   tmp_buffer = requests.get(base_url + filename)
#   with open(filename, 'wb') as tmp_zip_file:
#       tmp_zip_file.write(tmp_buffer.content)
#   tmp_buffer = zipfile.ZipFile(filename)
#   tmp_file = tmp_buffer.namelist()[1]
#   tmp_dataframe = pd.read_csv(tmp_buffer.open(tmp_file), usecols=used_cols)

# Only these two columns are read from the CSV.
used_cols = ['CERT', 'CHANGEC1']
# Placeholder; not passed to read_csv below.
dtypes = dict()

# convert to pandas DataFrame
tmp_dataframe = pd.read_csv(filename, usecols=used_cols)


In [None]:
"""
This script imports the subset of the FDIC SDI data used in the analysis,
converts the data to a Pandas data frame and writes the object to disk.

There are on the order of 50 corrupted observations in the various zip files.
Not clear why there are 90 entries in those rows instead of 89

"""
from datetime import datetime
import glob
import zipfile

import pandas as pd

# use pandas to construct a list of quarterly dates
present = '20071231'
datetimes = pd.date_range('20070331', end=present, freq='Q')

# get a list of zip files over which to iterate; sorted so that the slice
# taken below does not depend on the order the file system lists them in
zip_files = sorted(glob.glob('*.zip'))

# only want to return a subset of cols (save on memory usage!)
used_columns = ['cert', 'repdte', 'asset', 'lnlsnet', 'liab', 'dep', 'eqtot',
                'numemp',
                ]
# NOTE(review): this mapping is not passed to read_csv below; `datetime` is
# not a dtype read_csv accepts, so it would need reworking before being used.
used_dtypes = {'cert': int, 'repdte': datetime, 'asset': float,
               'lnlsnet': float, 'liab': float, 'eqtot': float, 'dep': float,
               'numemp': float}

# create a container for the individual dataframes
dataframes = []

for zip_file in zip_files[0:4]:

    # the context managers close the archive (and the member stream) as soon
    # as the frame has been parsed instead of leaking one handle per quarter
    with zipfile.ZipFile(zip_file) as tmp_buffer:

        # want to work with the assets and liabilities file
        tmp_file = tmp_buffer.namelist()[5]

        with tmp_buffer.open(tmp_file) as tmp_member:
            tmp_dataframe = pd.read_csv(tmp_member,
                                        index_col=['cert', 'repdte'],
                                        # skips the mangled obs; replaces the
                                        # removed error_bad_lines=False flag
                                        on_bad_lines='skip',
                                        usecols=used_columns,
                                        parse_dates=True,
                                        )

    dataframes.append(tmp_dataframe)

    print('Done with ' + zip_file + '!')

# concatenate the quarterly dataframes into a single data frame
combined_dataframe = pd.concat(dataframes)

# convert units from thousands to billions of USD
combined_dataframe[['asset', 'lnlsnet', 'liab', 'dep', 'eqtot']] /= 1e6

# convert units from number of people to thousands of people
combined_dataframe['numemp'] /= 1e3

# pandas removed Panel (and DataFrame.to_panel); the long-format frame indexed
# by (cert, repdte) carries the same information, so keep that representation,
# sorted so that per-bank / per-quarter lookups are efficient
combined_panel = combined_dataframe.sort_index()

# pickle the object for later use!
combined_panel.to_pickle('FDIC_SDI_panel_nominal.pkl')


In [None]:
# Work on a copy of the combined SDI frame (indexed by cert, repdte) so that
# the nominal data in `combined_dataframe` is left untouched by the
# normalisation below.
FDIC_SDI_panel = combined_dataframe.copy()

# compute the by quarter totals for each measure
# (rows: report date, columns: size measures)
totals = FDIC_SDI_panel.groupby(level='repdte').sum()

# compute the base quarter totals for each measure (one value per measure)
# NOTE(review): assumes the `repdte` index level holds datetimes (read_csv was
# called with parse_dates=True); a KeyError here means it was left as text.
base_qtr = '2007-03-31'
totals_base_qtr = totals.loc[pd.Timestamp(base_qtr)]

def janicki_prescott_norm(item):
    r"""
    In order to make sure results are comparable across quarters, I follow
    Janicki and Prescott (2006) and deflate and re-scale each measure of bank
    size by dividing by banking sector totals relative to some base quarter.
    Specifically, let :math:`S_{i,t}^{raw}` denote the raw size of bank :math:`i`
    in quarter :math:`t` based on one of the six size measures detailed above.
    The normalized size of bank :math:`i` relative to the base quarter is
    defined as follows:

    .. math::

        S_{i,t}^{norm} = \frac{S_{i,t}^{raw}}{\sum_{j}S_{j,t}^{raw}}\sum_{j}S_{j,base}^{raw}

    where :math:`\sum_{j}S_{j,t}^{raw}` is the banking sector total of some size
    measure in quarter :math:`t` (i.e., total banking sector assets in quarter
    :math:`t`), and :math:`\sum_{j}S_{j,base}^{raw}` is the banking sector total
    of the same size measure in the base quarter.

    Parameters
    ----------
    item : str
        Name of the size-measure column of ``FDIC_SDI_panel`` to normalise.

    Returns
    -------
    pandas.Series
        The normalised measure, indexed like ``FDIC_SDI_panel``.

    """
    # Each bank's share of the sector total in its own quarter: the division
    # is broadcast across banks by matching on the `repdte` index level.
    sector_share = FDIC_SDI_panel[item].div(totals[item], level='repdte')
    return sector_share * totals_base_qtr[item]

# apply the Janicki and Prescott (2006) normalized size measure
# (`totals` was computed before any column is overwritten, so every column is
# normalised against the raw sector totals)
for item in FDIC_SDI_panel.columns:
    FDIC_SDI_panel[item] = janicki_prescott_norm(item)

# pickle the object for later use!
FDIC_SDI_panel.to_pickle('FDIC_SDI_normed_panel.pkl')


In [None]:
# Income Groups (quartiles of median household income)
income = df['Median Household Income']
income_stats = income.describe()

# Start from an all-missing object column so string labels can be written into
# it without a dtype clash; rows with missing income stay NaN.
df['Income Category'] = np.nan
df['Income Category'] = df['Income Category'].astype(object)

# (lower bound inclusive, upper bound exclusive, label); None means unbounded.
quartile_bands = [
    (None, income_stats['25%'], 'Lowest Income'),
    (income_stats['25%'], income_stats['50%'], 'Low-Middle Income'),
    (income_stats['50%'], income_stats['75%'], 'High-Middle Income'),
    (income_stats['75%'], None, 'Highest Income'),
]

for lower, upper, label in quartile_bands:
    in_band = income.notna()
    if lower is not None:
        in_band &= income >= lower
    if upper is not None:
        in_band &= income < upper
    # df.loc[mask, column] writes into df itself. The previous chained form
    # df[column][mask] = ... may write to a temporary copy and is a silent
    # no-op under pandas copy-on-write.
    df.loc[in_band, 'Income Category'] = label

df.head()

In [None]:
# Income Groups (quintiles of median household income)
income = df['Median Household Income']

# Compute the four cut points once instead of once per comparison.
q20, q40, q60, q80 = income.quantile([0.2, 0.4, 0.6, 0.8])

# Start from an all-missing object column so string labels can be written into
# it without a dtype clash; rows with missing income stay NaN.
df['Income Category'] = np.nan
df['Income Category'] = df['Income Category'].astype(object)

# (lower bound inclusive, upper bound exclusive, label); None means unbounded.
quintile_bands = [
    (None, q20, 'Lowest Income'),
    (q20, q40, 'Low-Middle Income'),
    (q40, q60, 'Middle Income'),
    (q60, q80, 'High-Middle Income'),
    (q80, None, 'Highest Income'),
]

for lower, upper, label in quintile_bands:
    in_band = income.notna()
    if lower is not None:
        in_band &= income >= lower
    if upper is not None:
        in_band &= income < upper
    # df.loc[mask, column] writes into df itself. The previous chained form
    # df[column][mask] = ... may write to a temporary copy and is a silent
    # no-op under pandas copy-on-write.
    df.loc[in_band, 'Income Category'] = label

df.head()

In [None]:
# Bank Variables
# Grouping keys: one bank in one origination period, optionally per zip code.
bank_period_keys = ['Bank', 'Original Date']
bank_period_zip_keys = ['Bank', 'Original Date', 'Zip Code']

# Loans per bank / period / zip code.
NoBL = (
    df.groupby(bank_period_zip_keys)
    .size()
    .reset_index(name='Number of Bank Loans')
)

# Loans per bank / period across all zip codes.
TBL = (
    df.groupby(bank_period_keys)
    .size()
    .reset_index(name='Total Bank Loans')
)

# Percentage of a bank's loans in the period that fall in each zip code.
PBL = NoBL.merge(TBL, on=bank_period_keys, how="left")
PBL['Proportion of Bank Loans'] = (
    PBL['Number of Bank Loans'].div(PBL['Total Bank Loans']).mul(100)
)

# Attach the three bank-level variables to every loan row.
df2 = df.merge(PBL, on=bank_period_zip_keys, how="left")

df2.head()

In [None]:
# List of banks (values of the 'Bank' column used to subset the data)
Banks = [
    'AMTRUST BANK',
    'BANK OF AMERICA, N.A.',
    'CITIMORTGAGE, INC.',
    'FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB',
    'FIRST TENNESSEE BANK NATIONAL ASSOCIATION',
    'FLAGSTAR CAPITAL MARKETS CORPORATION',
    'GMAC MORTGAGE, LLC',
    'JPMORGAN CHASE BANK, NATIONAL ASSOCIATION',
    'OTHER',
    'PNC BANK, N.A.',
    'SUNTRUST MORTGAGE INC.',
    'CHASE HOME FINANCE',
    'SMALL LOAN BANKS',
]

# Function to subset banking datasets
def Bank_Subsets(bank_strs, df_X=None, df_y=None):
    """Split a feature frame and its target into one subset per bank.

    Parameters
    ----------
    bank_strs : iterable of str
        Values of the 'Bank' column to build subsets for.
    df_X : pandas.DataFrame, optional
        Feature frame containing a 'Bank' column. Defaults to the
        module-level ``X_train`` as it exists when the function is called.
    df_y : pandas.Series, optional
        Target indexed like ``df_X``. Defaults to the module-level
        ``y_train`` as it exists when the function is called.

    Returns
    -------
    (dict, dict)
        ``X[bank]`` is ``onehotencoding`` applied to that bank's rows of
        ``df_X`` with the 'Bank' column dropped; ``y[bank]`` holds the
        matching rows of ``df_y``.
    """
    # Resolve the defaults at call time. Binding X_train / y_train in the
    # signature froze whatever objects existed when the cell was run (and
    # raised NameError if the split had not been made yet).
    if df_X is None:
        df_X = X_train
    if df_y is None:
        df_y = y_train

    # Initiate Bank dictionaries
    X = {}
    y = {}

    # Bank Subset
    for bank_str in bank_strs:
        bank_rows = df_X['Bank'] == bank_str
        X[bank_str] = onehotencoding(df_X[bank_rows].drop(labels='Bank', axis=1))
        # Slice the target that was passed in, with the same mask as the
        # features (previously the global y_train / X_train were always used).
        y[bank_str] = df_y[bank_rows]

    return X, y

# Build the per-bank feature / target dictionaries for every bank in `Banks`
Banks_X, Banks_y = Bank_Subsets(bank_strs=Banks)

In [None]:
Banks = ['AMTRUST BANK', 'BANK OF AMERICA, N.A.', 'CITIMORTGAGE, INC.',
         'FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB',
         'FIRST TENNESSEE BANK NATIONAL ASSOCIATION', 'FLAGSTAR CAPITAL MARKETS CORPORATION',
         'GMAC MORTGAGE, LLC', 'JPMORGAN CHASE BANK, NATIONAL ASSOCIATION', 'OTHER',
         'PNC BANK, N.A.', 'SUNTRUST MORTGAGE INC.', 'CHASE HOME FINANCE', 'SMALL LOAN BANKS']

# Variables to drop
dropvars = ['Year', 'Quarter', 'Original Date', 'Credit Score',
            'Zip Code', 'Mortgage Insurance Type']  # 'Property State',

# All Data ('Bank' is still among the columns handed to onehotencoding here)
All_X = onehotencoding( df.drop(labels=dropvars, axis=1) )
All_y = All_X['Foreclosed']
All_X = All_X.drop(labels='Foreclosed', axis=1)

# Within a single bank's subset the 'Bank' column is constant, so drop it too.
dropvars.append('Bank')


def _bank_dataset(bank_name):
    """Return ``(X, y)`` for the rows of ``df`` whose 'Bank' equals ``bank_name``.

    The bank's rows have the columns listed in ``dropvars`` removed and are
    passed through ``onehotencoding``; the 'Foreclosed' column of the result
    becomes ``y`` and the remaining columns become ``X``.
    """
    encoded = onehotencoding(df[df['Bank'] == bank_name].drop(labels=dropvars, axis=1))
    return encoded.drop(labels='Foreclosed', axis=1), encoded['Foreclosed']


# Bank Datasets
# Every bank goes through the same helper. The hand-copied Amtrust stanza used
# to drop only 'Bank' (keeping the other dropvars as features) and assigned
# the whole Amtrust frame, not its 'Foreclosed' column, to Amtrust_y.
Amtrust_X, Amtrust_y = _bank_dataset('AMTRUST BANK')
BoA_X, BoA_y = _bank_dataset('BANK OF AMERICA, N.A.')
Citi_X, Citi_y = _bank_dataset('CITIMORTGAGE, INC.')
IndyMac_X, IndyMac_y = _bank_dataset('FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB')
Tenn_X, Tenn_y = _bank_dataset('FIRST TENNESSEE BANK NATIONAL ASSOCIATION')
FlagStar_X, FlagStar_y = _bank_dataset('FLAGSTAR CAPITAL MARKETS CORPORATION')
GMac_X, GMac_y = _bank_dataset('GMAC MORTGAGE, LLC')
JPMorgan_X, JPMorgan_y = _bank_dataset('JPMORGAN CHASE BANK, NATIONAL ASSOCIATION')
Misc_X, Misc_y = _bank_dataset('OTHER')
PNC_X, PNC_y = _bank_dataset('PNC BANK, N.A.')
SunTrust_X, SunTrust_y = _bank_dataset('SUNTRUST MORTGAGE INC.')
Chase_X, Chase_y = _bank_dataset('CHASE HOME FINANCE')
Small_X, Small_y = _bank_dataset('SMALL LOAN BANKS')

In [None]:
# Hold out 30% of the full dataset for testing; the split is stratified on
# the target and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    All_X,
    All_y,
    test_size=0.3,
    stratify=All_y,
    random_state=0,
)

In [None]:
def threshold(target_prob, y_true=None):
    """Sweep classification cut-offs and report the best one per metric.

    For each cut-off between 0.30 and 0.90 (step 0.05) the probabilities are
    turned into 0/1 predictions and scored with ``accuracy_score``,
    ``precision_score`` and ``f1_score``. A combined "average" score,
    ``3 * (acc * prec * f1) / (acc + prec + f1)``, is also tracked. The best
    cut-off for each of the four scores is printed and all four curves are
    plotted against the cut-off.

    Parameters
    ----------
    target_prob : pandas.Series
        Predicted probability of the positive class, one value per test row.
    y_true : array-like, optional
        True labels in the same order as ``target_prob``. Defaults to the
        module-level ``y_test`` as it exists when the function is called.

    Returns
    -------
    list
        Whatever ``plt.plot`` returns for the four metric curves.
    """
    if y_true is None:
        y_true = y_test

    # Candidate cut-offs (named in the plural so the function's own name is
    # not shadowed inside its body)
    thresholds = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]

    acc = []
    prec = []
    f1 = []
    avg = []  # was appended to without ever being created
    best_avg = {'Iteration': -1, 'Threshold': 0.5, 'Best Avg Score': 0.0}
    best_acc = {'Iteration': -1, 'Threshold': 0.5, 'Best Accuracy Score': 0.0}
    best_prec = {'Iteration': -1, 'Threshold': 0.5, 'Best Precision Score': 0.0}
    best_f1 = {'Iteration': -1, 'Threshold': 0.5, 'Best F1 Score': 0.0}
    for i, cutoff in enumerate(thresholds):
        y_pred = target_prob.map(lambda x: 1 if x >= cutoff else 0)

        # The built-in round() works whether the metric comes back as a
        # Python float or a NumPy scalar; `.round(2)` exists only on the latter.
        # Accuracy
        acc.append(round(accuracy_score(y_true, y_pred), 2))
        # Precision
        prec.append(round(precision_score(y_true, y_pred), 2))
        # F1
        f1.append(round(f1_score(y_true, y_pred), 2))
        # Avg (defined as 0 when all three scores are 0, avoiding 0/0)
        metric_sum = acc[i] + prec[i] + f1[i]
        if metric_sum:
            avg.append(3 * ((acc[i] * prec[i] * f1[i]) / metric_sum))
        else:
            avg.append(0.0)

        # Strict "<" keeps the lowest cut-off when several tie for best.
        # Save best accuracy
        if best_acc['Best Accuracy Score'] < acc[i]:
            best_acc = {'Iteration': i, 'Threshold': cutoff, 'Best Accuracy Score': acc[i]}
        # Save best precision
        if best_prec['Best Precision Score'] < prec[i]:
            best_prec = {'Iteration': i, 'Threshold': cutoff, 'Best Precision Score': prec[i]}
        # Save best f1
        if best_f1['Best F1 Score'] < f1[i]:
            best_f1 = {'Iteration': i, 'Threshold': cutoff, 'Best F1 Score': f1[i]}
        # Save best avg
        if best_avg['Best Avg Score'] < avg[i]:
            best_avg = {'Iteration': i, 'Threshold': cutoff, 'Best Avg Score': avg[i]}

    print(best_acc)
    print(best_prec)
    print(best_f1)
    print(best_avg)

    metric_columns = ['Accuracy', 'Precision', 'F1 Score', 'Average Score']
    df_plot = pd.DataFrame({'Threshold': thresholds, 'Accuracy': acc, 'Precision': prec,
                            'F1 Score': f1, 'Average Score': avg})

    # Plot one line per metric, labelled so the curves can be told apart
    lines = plt.plot(df_plot['Threshold'], df_plot[metric_columns])
    plt.legend(lines, metric_columns)
    return lines

In [None]:
def bal_bagging(X, y, smote=False, random_state=2020, n_estimators=500, max_features=0.75,
                replacement=False, sampling_strategy='auto', pca=False, n_components=25):
    """Fit four classifiers on ``(X, y)`` and compare them over cut-offs.

    The training data is optionally oversampled with SVMSMOTE, then either
    reduced with PCA (``pca=True``) or standardised (``pca=False``). The same
    fitted transform is applied to the module-level ``X_test``. A balanced
    random forest, a class-weighted random forest, a gradient-boosting model
    and a k-nearest-neighbours model are fitted, and ``threshold`` is run on
    each model's predicted positive-class probability for ``X_test``.

    Parameters
    ----------
    X, y : array-like
        Training features and training target.
    smote : bool
        Oversample the training data with SVMSMOTE before fitting.
    random_state : int
        Seed shared by the sampler, the PCA and the tree ensembles.
    n_estimators, max_features : int, float
        Passed to the tree ensembles (``max_features`` to the forests only).
    replacement, sampling_strategy :
        Passed to ``BalancedRandomForestClassifier``.
    pca : bool
        Use PCA with ``n_components`` components instead of standardising.
    n_components : int
        Number of principal components kept when ``pca`` is true.

    Returns
    -------
    dict
        Maps the model key ('bal', 'wgt', 'gbm', 'knn') to a DataFrame with
        the columns "Did not Foreclose" and "Foreclosed" holding that model's
        predicted probabilities for ``X_test``.
    """
    # Local names distinct from the globals: assigning to `X_test` inside the
    # function would make it a local and hide the module-level test features.
    X_fit, y_fit = X, y
    X_eval = X_test

    # SMOTE
    if smote:
        sm = SVMSMOTE(random_state=random_state, out_step=0.75, n_jobs=-1)
        X_fit, y_fit = sm.fit_resample(X_fit, y_fit)

    # define models
    models = {
        'bal': BalancedRandomForestClassifier(n_estimators=n_estimators, random_state=random_state,
                                              max_features=max_features, replacement=replacement,
                                              sampling_strategy=sampling_strategy, n_jobs=-1),
        'wgt': RandomForestClassifier(n_estimators=n_estimators, random_state=random_state,
                                      max_features=max_features,
                                      class_weight={0: 0.1, 1: 0.9}, n_jobs=-1),
        # GradientBoostingClassifier has no n_jobs parameter
        'gbm': GradientBoostingClassifier(n_estimators=n_estimators, random_state=random_state),
        'knn': KNeighborsClassifier(n_neighbors=10, n_jobs=-1),
    }

    if pca:
        # `pca` is the boolean flag, so the estimator class has to be
        # referenced under its own name.
        from sklearn.decomposition import PCA
        transformer = PCA(n_components=n_components, random_state=random_state).fit(X_fit)
    else:
        transformer = StandardScaler().fit(X_fit)
    # The transform is learned on the training data only and reused for the
    # evaluation data.
    X_fit = transformer.transform(X_fit)
    X_eval = transformer.transform(X_eval)

    # Model fit: every model is fitted before predict_proba is called on it
    probabilities = {}
    for model_key, model in models.items():
        model.fit(X_fit, y_fit)
        probabilities[model_key] = pd.DataFrame(model.predict_proba(X_eval),
                                                columns=["Did not Foreclose", "Foreclosed"])

        # Compare results via thresholds
        print(model_key)
        threshold(probabilities[model_key]['Foreclosed'])

    return probabilities
    
    


In [15]:
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from imblearn.ensemble import EasyEnsembleClassifier # doctest: +NORMALIZE_WHITESPACE

# Synthetic two-class problem in which class 0 makes up about 1% of the rows.
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    class_sep=2,
    weights=[0.01, 0.99],
    flip_y=0,
    random_state=10,
)
print('Original dataset shape %s' % Counter(y))

# Default split proportions, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the ensemble and report its confusion matrix on the held-out rows.
model = EasyEnsembleClassifier(n_estimators=10, random_state=42, replacement=False)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))

Original dataset shape Counter({1: 990, 0: 10})
[[  2   0]
 [  5 243]]


In [43]:
# Output Statistics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# Table of predictions versus actuals
y_proba = pd.DataFrame(model.predict_proba(X_test), columns=["Did not Foreclose", "Foreclosed"])

# Candidate cut-offs. Named `thresholds` (plural) so this list does not
# replace the `threshold` function defined earlier in the notebook.
thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]

acc = []
prec = []
f1 = []
best_f1 = {'Iteration': -1, 'Threshold': 0.5, 'Best F1 Score': 0.0}
best_prec = {'Iteration': -1, 'Threshold': 0.5, 'Best Precision Score': 0.0}
for i, cutoff in enumerate(thresholds):
    y_pred = y_proba["Foreclosed"].map(lambda x: 1 if x >= cutoff else 0)

    # Overall Scores
    # The built-in round() works whether the metric comes back as a Python
    # float or a NumPy scalar; `.round(2)` exists only on the latter.
    print('iteration', i, 'threshold', cutoff)
    print('Accuracy:', end=' ')
    acc.append(round(accuracy_score(y_test, y_pred), 2))
    print(acc[i])
    print('Precision:', end=' ')
    prec.append(round(precision_score(y_test, y_pred), 2))
    print(prec[i])
    print('F1:', end=' ')
    f1.append(round(f1_score(y_test, y_pred), 2))
    print(f1[i])
    print('')

    # Strict "<" keeps the lowest cut-off when several tie for best.
    # Save best f1
    if best_f1['Best F1 Score'] < f1[i]:
        best_f1 = {'Iteration': i, 'Threshold': cutoff, 'Best F1 Score': f1[i]}

    # Save best precision
    if best_prec['Best Precision Score'] < prec[i]:
        best_prec = {'Iteration': i, 'Threshold': cutoff, 'Best Precision Score': prec[i]}

iteration 0
Accuracy: 0.98
Precision: 1.0
F1: 0.99

iteration 1
Accuracy: 0.98
Precision: 1.0
F1: 0.99

iteration 2
Accuracy: 0.97
Precision: 1.0
F1: 0.98

iteration 3
Accuracy: 0.91
Precision: 1.0
F1: 0.95

iteration 4
Accuracy: 0.89
Precision: 1.0
F1: 0.94



In [44]:
# Use best threshold
y_pred = y_proba["Foreclosed"].map(lambda x: 1 if x >= best_f1['Threshold'] else 0)

# Table of predictions versus actuals
target_values(y_pred, prediction=True)

# Confusion Matrix
print("\nConfusion matrix:")
PredictTable = pd.crosstab(y_test, np.array(y_pred))
# Force a full 2x2 table: if every row is predicted as one class (or only one
# class is present in y_test) crosstab returns a single column/row and the
# relabelling below would fail with a length mismatch.
PredictTable = PredictTable.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
PredictTable.columns = ['Predicted False', 'Predicted True']
PredictTable.index = ['Actual False', 'Actual True']
print(PredictTable)
# F1 Score
print("\nFinal F1 Score:")
print(best_f1['Best F1 Score'])
# Precision Table: each column is scaled to 100%, so the 'Predicted True'
# column shows how the positive predictions split between the actual classes
print('\nFinal Precision Percentages:')
PrecisionTable = ( (PredictTable/(PredictTable.sum(0)))*100 ).round(1)
print(PrecisionTable.iloc[:,1])
# Recall Table: each row is scaled to 100%; recall of the positive class is
# read from the 'Actual True' row (the first row, printed previously, is the
# breakdown for the actual negatives)
print('\nFinal Recall Percentages:')
RecallTable = ( (PredictTable.div(PredictTable.sum(axis=1), axis=0))*100 ).round(1)
print(RecallTable.loc['Actual True'])

{'Iteration': 4, 'Best F1 Score': 0.94}

In [97]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
# Iris features and labels as plain arrays.
X, y = load_iris(return_X_y=True)

# Base learners: a small random forest and a linear SVC on standardised
# inputs (registered under the key 'svr').
forest_member = RandomForestClassifier(n_estimators=10, random_state=42)
scaled_svc_member = make_pipeline(StandardScaler(), LinearSVC(random_state=42))
estimators = [('rf', forest_member), ('svr', scaled_svc_member)]

# A logistic regression combines the base learners' outputs.
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

from sklearn.model_selection import train_test_split

# Stratified split with default proportions, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# fit() returns the fitted estimator, so `t` refers to the fitted stack.
t = clf.fit(X_train, y_train)


In [99]:
# Score the fitted stacking classifier on the held-out split created above
# (for scikit-learn classifiers, `score` reports mean accuracy).
t.score(X_test, y_test)

0.9473684210526315