In [25]:
# Load libraries
import requests
from pandas import read_csv
from pandas import read_html
from pandas import read_json
from pandas import read_excel
from pandas import read_stata
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# Download online dataset to folder
source_url = "https://www.fdic.gov/bank/individual/failed/banklist.html"
target_file = "DownloadedDataset.html"

response = requests.get(source_url)
response.raise_for_status() # Verify OK
with open(target_file,"wb") as f:
    f.write(response.content)
print(target_file, "download ready.")

In [23]:
# Read online dataset directly into DataFrame
url = "https://www.fdic.gov/bank/individual/failed/banklist.html"
dataset = read_html(url)
print(dataset)

searchquery = 'Resolute Bank'
print("Searching for...", searchquery)
searchresult = pd.read_html(url,match='Resolute Bank')
print(searchresult)

[                             Bank Name           City  ST   CERT  \
0                 The First State Bank  Barboursville  WV  14361   
1                   Ericson State Bank        Ericson  NE  18265   
2     City National Bank of New Jersey         Newark  NJ  21111   
3                        Resolute Bank         Maumee  OH  58317   
4                Louisa Community Bank         Louisa  KY  58112   
..                                 ...            ...  ..    ...   
556                 Superior Bank, FSB       Hinsdale  IL  32646   
557                Malta National Bank          Malta  OH   6629   
558    First Alliance Bank & Trust Co.     Manchester  NH  34264   
559  National State Bank of Metropolis     Metropolis  IL   3815   
560                   Bank of Honolulu       Honolulu  HI  21029   

                   Acquiring Institution       Closing Date  
0                         MVB Bank, Inc.      April 3, 2020  
1             Farmers and Merchants Bank  February 14, 202

NameError: name 'pd' is not defined

In [None]:
# Describe & visualize data
# box and whisker plots
dataset.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
pyplot.show()
# histograms
dataset.hist()
pyplot.show()
# scatter plot matrix
scatter_matrix(dataset)
pyplot.show()

In [None]:
# Split into training & validation
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.20, random_state=1)


In [None]:
# Make predictions on validation dataset - SVC
modelSVC = SVC(gamma='auto')
modelSVC.fit(X_train, Y_train)
predictions = modelSVC.predict(X_validation)

# Evaluate predictions
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

In [None]:
# Make predictions on validation dataset - Logistic
modelCART = DecisionTreeClassifier()
modelCART.fit(X_train, Y_train)
predictions = modelCART.predict(X_validation)

# Evaluate predictions
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

In [None]:
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))


In [None]:
# Evaluate each model in turn
results = []
names = []
for name, model in models:
	kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
	cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
	results.append(cv_results)
	names.append(name)
	print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))