In [8]:
# 📦 Install Dependencies (Run once in your notebook or terminal)
!pip install selenium webdriver-manager beautifulsoup4 pandas




In [9]:
# 📚 Imports
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.firefox import GeckoDriverManager


In [10]:
# 🚀 Step 1: Launch Firefox through Selenium without a visible window
options = Options()
options.add_argument("--headless")  # no browser window is opened

# The value returned by webdriver-manager's install() is handed to Service,
# which in turn is used to start the Firefox driver.
geckodriver_path = GeckoDriverManager().install()
driver = webdriver.Firefox(
    service=Service(geckodriver_path),
    options=options,
)


In [11]:
# 🌐 Step 2: Load the Bank Rankings Page
url = "https://www.usbanklocations.com/bank-rank/total-deposits.html"
driver.get(url)

# Wait explicitly for the ranking table instead of sleeping a fixed 5 seconds:
# the wait returns as soon as a <table> with more than one row exists, and
# raises selenium's TimeoutException after 30 s rather than silently handing a
# half-loaded page to the parsing step.
# NOTE(review): the "more than one row" condition assumes the data table is the
# only multi-row table on the page (the Step 4 output shows a 1-row table and a
# 4492-row table) - adjust the XPath if the page layout changes.
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH, "//table[count(.//tr) > 1]"))
)


In [12]:
# 🧠 Step 3: Parse the rendered page and locate its tables
# Take the HTML as currently held by the browser and parse it with
# BeautifulSoup's built-in "html.parser" backend.
page_html = driver.page_source
soup = BeautifulSoup(page_html, 'html.parser')

# Every <table> element in the document; Step 4 iterates over this list.
all_tables = soup.find_all("table")

# 🔍 Debug: report how many tables were found
print("=== DEBUG: Finding tables ===")
print(f"Found {len(all_tables)} table(s)")

=== DEBUG: Finding tables ===
Found 2 table(s)


In [13]:
# 💾 Step 4: Save each table to a separate CSV file
# Every <table> found in Step 3 is written to ../data/raw/table_<n>.csv. The
# cells of a table's first row become the column names; the remaining rows
# become the records.
os.makedirs("../data/raw", exist_ok=True)

for i, table in enumerate(all_tables):
    print(f"Processing Table {i+1}...")

    # Check table structure
    rows = table.find_all("tr")
    print(f"Table {i+1} has {len(rows)} rows")

    # Reset per table: a table without any <tr> must neither hit an undefined
    # `headers` nor inherit the headers of the previous table.
    headers = []
    data = []
    skipped = 0

    if rows:
        # The first row is treated as the header, whether it uses <th> or <td>.
        headers = [cell.text.strip() for cell in rows[0].find_all(['th', 'td'])]

        # Skip header and process data rows
        data_rows = rows[1:]
        print(f"Data rows to process: {len(data_rows)}")

        for row in data_rows:
            cols = row.find_all("td")
            if len(cols) < len(headers):
                # Fewer cells than columns: the row cannot be aligned, so it is
                # left out (as before) - but counted so the loss is visible.
                skipped += 1
                continue
            # Trim surplus cells to the header width; pandas raises ValueError
            # when a record is wider than the list of column names.
            data.append([col.text.strip() for col in cols[:len(headers)]])

    if skipped:
        print(f"Skipped {skipped} row(s) with fewer cells than the header")

    # Create DataFrame and save to CSV
    df = pd.DataFrame(data, columns=headers)
    csv_file_path = f"../data/raw/table_{i+1}.csv"
    df.to_csv(csv_file_path, index=False)
    print(f"✅ Table {i+1} saved to {csv_file_path}")

Processing Table 1...
Table 1 has 1 rows
Data rows to process: 0
✅ Table 1 saved to ../data/raw/table_1.csv
Processing Table 2...
Table 2 has 4492 rows
Data rows to process: 4491
✅ Table 2 saved to ../data/raw/table_2.csv


In [14]:
# ✅ Step 5: Close the Browser
# Ends the Selenium session that was created in Step 1 and stored in `driver`.
# NOTE(review): this cell is only reached if every cell above succeeded; after
# a failed run, execute it manually so the headless Firefox started in Step 1
# is not left running.
driver.quit()
