In [7]:
# Import Required Libraries

import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from geopy.extra.rate_limiter import RateLimiter
from sklearn.neighbors import BallTree
from tqdm.notebook import tqdm
import time
import os


In [2]:
# Load Enriched DataFrame

# Read the enriched bank-location table, discard rows that are missing either
# coordinate, and renumber the remaining rows from 0 so positional and label
# indexing agree.
df = (
    pd.read_csv("../data/enriched/enriched_us_bank_locations.csv")
    .dropna(subset=["latitude", "longitude"])
    .reset_index(drop=True)
)
df.head()


Unnamed: 0,rank,total_deposits,bank_name,latitude,longitude
0,1,2601221000000.0,JPMorgan Chase Bank,37.29175,-122.031897
1,2,2071624000000.0,Bank of America,51.516215,-0.100094
2,3,1419560000000.0,Wells Fargo Bank,40.036492,-75.129305
3,4,1361654000000.0,Citibank,0.421434,9.429063
4,5,523102100000.0,U.S. Bank,47.816222,-110.669211


In [3]:
# Feature: Distance to Nearest Competitor Bank

# The haversine metric works on (latitude, longitude) pairs expressed in radians.
coords = np.deg2rad(df[["latitude", "longitude"]].values)
tree = BallTree(coords, metric="haversine")

# Ask for k=2 because every point is also indexed in the tree: column 0 is the
# zero-distance match, column 1 is the closest other row.
distances, indices = tree.query(coords, k=2)

# Haversine distances are angles in radians; multiply by the Earth radius (km)
# to turn them into kilometers.
df["nearest_competitor_distance_km"] = 6371 * distances[:, 1]


In [4]:
# Feature: Number of Competitors within 10 km Radius

radius_km = 10
# The tree was built with the haversine metric on radian coordinates, so the
# search radius must be an angle: kilometers divided by the Earth radius (km).
radius_rad = radius_km / 6371

# count_only=True makes BallTree return the number of points inside the radius
# for each query point, instead of materializing one index array per point and
# measuring its length in a Python loop.
neighbor_counts = tree.query_radius(coords, r=radius_rad, count_only=True)

# Every query point is itself in the tree, so subtract 1 to exclude self.
df["branch_density_10km"] = neighbor_counts - 1


In [8]:
# Feature: Get City and State

geolocator = Nominatim(user_agent="geo_bank_reverse")
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)  # one request/sec


def _extract_city_state(location):
    """Pull ``(city, state)`` out of a reverse-geocoding result.

    Parameters
    ----------
    location : object or None
        Result of ``reverse(...)``. Expected to expose a ``raw`` dict with an
        optional ``"address"`` mapping; ``None`` means no result was returned.

    Returns
    -------
    tuple
        ``(city, state)``. ``city`` is taken from the ``"city"`` key, falling
        back to ``"town"`` and then ``"village"``; either element is ``None``
        when the corresponding key is absent or ``location`` is ``None``.
    """
    if location is None:
        return None, None
    address = location.raw.get("address", {})
    city = address.get("city", address.get("town", address.get("village", None)))
    return city, address.get("state", None)


states = []
cities = []
geocode_cache = {}   # (lat, lon) -> (city, state) for lookups that returned a result
failed_lookups = []  # (lat, lon, error repr) for lookups that raised

for lat, lon in tqdm(zip(df["latitude"], df["longitude"]), total=len(df)):
    key = (lat, lon)
    if key in geocode_cache:
        # Same coordinate already resolved: reuse it instead of spending
        # another rate-limited request on an identical query.
        city, state = geocode_cache[key]
    else:
        try:
            # NOTE(review): with RateLimiter's default settings, geocoder
            # errors are presumably swallowed and surface here as None -- confirm.
            location = reverse(key, language='en')
            city, state = _extract_city_state(location)
            if location is not None:
                # Only cache real results so a missing/failed lookup can be
                # retried if the same coordinate appears again.
                geocode_cache[key] = (city, state)
        except Exception as e:
            # Best-effort: keep going, but record the failure instead of
            # discarding it silently.
            failed_lookups.append((lat, lon, repr(e)))
            city, state = None, None

    states.append(state)
    cities.append(city)

df["state"] = states
df["city"] = cities

if failed_lookups:
    print(f"Reverse geocoding raised for {len(failed_lookups)} of {len(df)} rows; see failed_lookups")


  0%|          | 0/2850 [00:00<?, ?it/s]

In [9]:
# Save Updated DataFrame

# Derive the target directory from the output path so the two cannot drift
# apart, create it if needed, then write the frame without its index column.
output_path = "../data/final/geo_features_us_bank_final.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
