In [109]:
## Pandas
import pandas as pd
## Numpy
import numpy as np
## MatplotLib
import matplotlib.pyplot as plt
## Seaborn
import seaborn as sns

## Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline

## Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

## Classification Metrics
from sklearn.metrics import (roc_auc_score, ConfusionMatrixDisplay, 
                             PrecisionRecallDisplay, RocCurveDisplay, 
                             f1_score, accuracy_score, classification_report)

## Set global scikit-learn configuration 
from sklearn import set_config
## Display estimators as a diagram
set_config(display='diagram') # accepted values: 'text' or 'diagram'
# new libraries
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [89]:
# Load the branch-level deposits table and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/content/database.csv')
df.head(n=5)

Unnamed: 0,Institution Name,Main Office,Branch Name,Branch Number,Established Date,Acquired Date,Street Address,City,County,State,Zipcode,Latitude,Longitude,2010 Deposits,2011 Deposits,2012 Deposits,2013 Deposits,2014 Deposits,2015 Deposits,2016 Deposits
0,JPMorgan Chase Bank,1,JPMorgan Chase Bank Main Office,0,01/01/1824,,1111 Polaris Parkway,Columbus,Delaware,OH,43240,40.14453,-82.99115,633131000.0,743268000.0,832455000.0,916543000.0,1032549000.0,1069425000.0,1155185000
1,JPMorgan Chase Bank,0,Vernon Hills Scarsdale Branch,2,03/20/1961,,676 White Plains Road,Scarsdale,Westchester,NY,10583,40.97008,-73.8067,293229.0,310791.0,325742.0,327930.0,327792.0,341475.0,381558
2,JPMorgan Chase Bank,0,Great Neck Northern Boulevard Branch,3,09/09/1963,,410 Northern Boulevard,Great Neck,Nassau,NY,11021,40.77944,-73.7224,191011.0,206933.0,216439.0,237983.0,234183.0,262455.0,278940
3,JPMorgan Chase Bank,0,North Hartsdale Branch,4,02/19/1966,,353 North Central Avenue,Hartsdale,Westchester,NY,10530,41.02654,-73.79168,87110.0,88367.0,93163.0,109659.0,111985.0,116772.0,140233
4,JPMorgan Chase Bank,0,Lawrence Rockaway Branch,5,01/16/1965,,335 Rockaway Turnpike,Lawrence,Nassau,NY,11559,40.62715,-73.73675,172608.0,172749.0,189413.0,198445.0,205198.0,223200.0,235594


In [90]:
# Work on an independent copy so the freshly loaded frame stays untouched.
eda_ml = df.copy(deep=True)

In [91]:
# Count missing values in each column.
eda_ml.isnull().sum(axis=0)

Institution Name       0
Main Office            0
Branch Name            0
Branch Number          0
Established Date       0
Acquired Date       1615
Street Address         0
City                   0
County                 0
State                  0
Zipcode                0
Latitude              66
Longitude             66
2010 Deposits        740
2011 Deposits        578
2012 Deposits        329
2013 Deposits        175
2014 Deposits         56
2015 Deposits         19
2016 Deposits          0
dtype: int64

In [92]:
# Count rows that exactly repeat an earlier row.
eda_ml.duplicated(keep='first').sum()

0

In [93]:
# Print column names, non-null counts and dtypes.
eda_ml.info(verbose=None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5413 entries, 0 to 5412
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Institution Name  5413 non-null   object 
 1   Main Office       5413 non-null   int64  
 2   Branch Name       5413 non-null   object 
 3   Branch Number     5413 non-null   int64  
 4   Established Date  5413 non-null   object 
 5   Acquired Date     3798 non-null   object 
 6   Street Address    5413 non-null   object 
 7   City              5413 non-null   object 
 8   County            5413 non-null   object 
 9   State             5413 non-null   object 
 10  Zipcode           5413 non-null   int64  
 11  Latitude          5347 non-null   float64
 12  Longitude         5347 non-null   float64
 13  2010 Deposits     4673 non-null   float64
 14  2011 Deposits     4835 non-null   float64
 15  2012 Deposits     5084 non-null   float64
 16  2013 Deposits     5238 non-null   float64


In [94]:
# Summary statistics for every column, numeric and non-numeric alike.
eda_ml.describe(percentiles=None, include='all')

Unnamed: 0,Institution Name,Main Office,Branch Name,Branch Number,Established Date,Acquired Date,Street Address,City,County,State,Zipcode,Latitude,Longitude,2010 Deposits,2011 Deposits,2012 Deposits,2013 Deposits,2014 Deposits,2015 Deposits,2016 Deposits
count,5413,5413.0,5413,5413.0,5413,3798,5413,5413,5413,5413,5413.0,5347.0,5347.0,4673.0,4835.0,5084.0,5238.0,5357.0,5394.0,5413.0
unique,1,,5317,,3237,18,5375,1880,376,26,,,,,,,,,,
top,JPMorgan Chase Bank,,Riverside Branch,,01/01/1919,11/13/2004,66 Main Street,New York City,Los Angeles,CA,,,,,,,,,,
freq,5413,,4,,138,1602,3,154,298,1003,,,,,,,,,,
mean,,0.000185,,4303.761685,,,,,,,58800.016072,37.248922,-95.206261,266865.7,303754.4,323461.0,346234.3,381985.3,393516.9,426818.8
std,,0.013592,,2146.930304,,,,,,,30863.015686,5.430381,17.542081,9568755.0,11108290.0,12113390.0,13123200.0,14613180.0,15073850.0,16329410.0
min,,0.0,,0.0,,,,,,,2110.0,0.0,-124.42137,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,0.0,,2548.0,,,,,,,33334.0,33.277225,-115.16374,22526.0,23157.5,24651.25,26892.5,29355.0,33941.0,39513.0
50%,,0.0,,4274.0,,,,,,,60654.0,38.93952,-88.44241,43072.0,44426.0,47173.5,50415.0,54503.0,60133.5,68157.0
75%,,0.0,,6172.0,,,,,,,89102.0,41.040635,-81.418135,85027.0,86147.0,90008.5,95068.75,102330.0,111567.8,124318.0


# Validation Split Data

In [95]:
# Columns that make up the feature matrix, in the order they should appear in X.
feature_columns = [
    'Institution Name', 'Main Office', 'City', 'Branch Number',
    'Established Date', 'Acquired Date', 'Branch Name', 'Street Address',
    'County', 'State', 'Zipcode', 'Latitude', 'Longitude',
    '2010 Deposits', '2011 Deposits', '2012 Deposits', '2013 Deposits',
    '2014 Deposits', '2015 Deposits', '2016 Deposits',
]
X = eda_ml[feature_columns]


In [96]:
# Split features and target into train and test partitions with a fixed random_state.
# NOTE(review): `y` is not assigned in any cell visible above this one. Confirm where
# the target is defined (and whether it should be excluded from X); as written this
# cell raises NameError on Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


# Column Selectors

In [97]:
# Selectors that pick columns by dtype: numeric columns and object (string) columns.
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

# Column Transformer

In [98]:
# Imputers that fill missing values before encoding / scaling.
# Categorical columns: replace NaN with the most frequent value of the column.
freq_imputer = SimpleImputer(strategy='most_frequent')
# Numeric columns: replace NaN with the column mean. The strategy must be one of
# 'mean', 'median', 'most_frequent' or 'constant'; 'mean_imputer' is not a valid
# strategy name and is rejected when the imputer is fitted.
mean_imputer = SimpleImputer(strategy='mean')

In [99]:
# One-hot encoder for categorical features, configured with handle_unknown='ignore'.
ohe = OneHotEncoder(handle_unknown='ignore')
# Standard scaler for numeric features.
scaler = StandardScaler()

# Preprocessing

In [100]:
# Pair each transformer with the selector for the columns it should handle:
# the scaler with numeric columns, the one-hot encoder with categorical columns.
num_tuple = (scaler, num_selector)
cat_tuple = (ohe, cat_selector)

In [101]:
# Build a column transformer from the (transformer, selector) pairs, numeric pair first.
transformer_specs = [num_tuple, cat_tuple]
col_transformer = make_column_transformer(*transformer_specs)

In [102]:
# Fit the column transformer on the training split.
col_transformer.fit(X=X_train)

# Pipelines

In [103]:
# Numeric pipeline: mean_imputer first, then the scaler.
numeric_steps = (mean_imputer, scaler)
numeric_pipe = make_pipeline(*numeric_steps)
numeric_pipe

In [104]:
# Categorical pipeline: freq_imputer first, then the one-hot encoder.
categorical_steps = (freq_imputer, ohe)
categorical_pipe = make_pipeline(*categorical_steps)
categorical_pipe

# Column Transformer

In [105]:
# Pair each pipeline with the selector for the columns it should process.
categorical_tuple = (categorical_pipe, cat_selector)
number_tuple = (numeric_pipe, num_selector)

# Assemble the column transformer (numeric pair first) and display it.
preprocessor = make_column_transformer(number_tuple, categorical_tuple)
preprocessor

In [106]:
# Transform both splits with the column transformer that was fitted on X_train.
# NOTE(review): this uses `col_transformer` (scaler + one-hot encoder only), not the
# `preprocessor` defined earlier, which wraps the imputers but is never fitted in the
# visible cells. Confirm which one is intended; missing values are presumably not
# imputed on this path.
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)

In [107]:
 # Preview the first rows of the feature matrix.
 X.head(n=5)

Unnamed: 0,Institution Name,Main Office,City,Branch Number,Established Date,Acquired Date,Branch Name,Street Address,County,State,Zipcode,Latitude,Longitude,2010 Deposits,2011 Deposits,2012 Deposits,2013 Deposits,2014 Deposits,2015 Deposits,2016 Deposits
0,JPMorgan Chase Bank,1,Columbus,0,01/01/1824,,JPMorgan Chase Bank Main Office,1111 Polaris Parkway,Delaware,OH,43240,40.14453,-82.99115,633131000.0,743268000.0,832455000.0,916543000.0,1032549000.0,1069425000.0,1155185000
1,JPMorgan Chase Bank,0,Scarsdale,2,03/20/1961,,Vernon Hills Scarsdale Branch,676 White Plains Road,Westchester,NY,10583,40.97008,-73.8067,293229.0,310791.0,325742.0,327930.0,327792.0,341475.0,381558
2,JPMorgan Chase Bank,0,Great Neck,3,09/09/1963,,Great Neck Northern Boulevard Branch,410 Northern Boulevard,Nassau,NY,11021,40.77944,-73.7224,191011.0,206933.0,216439.0,237983.0,234183.0,262455.0,278940
3,JPMorgan Chase Bank,0,Hartsdale,4,02/19/1966,,North Hartsdale Branch,353 North Central Avenue,Westchester,NY,10530,41.02654,-73.79168,87110.0,88367.0,93163.0,109659.0,111985.0,116772.0,140233
4,JPMorgan Chase Bank,0,Lawrence,5,01/16/1965,,Lawrence Rockaway Branch,335 Rockaway Turnpike,Nassau,NY,11559,40.62715,-73.73675,172608.0,172749.0,189413.0,198445.0,205198.0,223200.0,235594
