In [1]:
import bs4
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup 
from collections import defaultdict

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC

In [2]:
'''
The data to be crawled lives in a dynamic table that loads more rows every time the page is
scrolled to the bottom. This script keeps scrolling to the bottom of the page until all of the data is revealed.
'''

def scrollPage(URL, load_timeout=120, scroll_pause=3):
    """Open ``URL`` in Chrome, scroll its table until it stops growing, and return the HTML.

    Parameters
    ----------
    URL : str
        Address of the page that contains the dynamically loaded ``<table>``.
    load_timeout : int or float, optional
        Maximum number of seconds to wait for the table to appear (default 120).
    scroll_pause : int or float, optional
        Seconds to sleep after each scroll so new rows can load (default 3).

    Returns
    -------
    str
        ``driver.page_source`` taken once the table's ``scrollHeight`` no longer
        changes between two consecutive scrolls.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If no ``<table>`` element is present within ``load_timeout`` seconds.
    """
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    try:
        driver.get(URL)

        # Wait for the table to load. A timeout is allowed to propagate instead of
        # falling through to the scrolling code, which cannot work without a table.
        table = WebDriverWait(driver, load_timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "table"))
        )

        height_script = 'return document.querySelector("table").scrollHeight'
        last_height = driver.execute_script(height_script)

        while True:
            table.send_keys(Keys.END)        # Scroll to end of available page
            time.sleep(scroll_pause)         # Wait for new content to load

            new_height = driver.execute_script(height_script)
            if new_height == last_height:    # End of page
                break
            last_height = new_height

        return driver.page_source            # html
    finally:
        # Always shut the browser down, on success or failure, so repeated calls
        # do not leave Chrome / chromedriver processes behind.
        driver.quit()

In [3]:
'''
After the whole table has been loaded, this script parses the page and retrieves the data.
'''

def crawlHTML(htmlText):
    """Extract the table contained in ``htmlText`` into a DataFrame.

    Every ``<th>`` is treated as a column header; its position among all ``<th>``
    elements (starting at 0) is matched against the ``data-col`` attribute of the
    ``<td>`` cells to collect that column's values.

    Parameters
    ----------
    htmlText : str
        HTML of the page.

    Returns
    -------
    pandas.DataFrame
        One column per header title that has at least one matching cell, holding
        the cells' text in document order. Headers without cells are omitted.
    """
    soup = BeautifulSoup(htmlText, "html.parser")

    # Save the column number of each header title for later access to its values.
    titles = {}
    for index, header in enumerate(soup.find_all("th")):
        # The title normally sits in a <span>; fall back to the header's own text
        # so a header without a <span> no longer raises AttributeError.
        span = header.find("span")
        label = header if span is None else span
        titles[label.get_text(strip=True)] = index

    # Group the text of every cell by its data-col value in a single pass over
    # the document instead of re-scanning it once per header.
    cells_by_col = defaultdict(list)
    for cell in soup.find_all("td", attrs={"data-col": True}):
        cells_by_col[cell["data-col"]].append(cell.text)

    # Create a dictionary of {header title : list of column values}.
    columns = {}
    for title, index in titles.items():
        key = str(index)
        if key in cells_by_col:
            columns[title] = cells_by_col[key]

    return pd.DataFrame(columns)         # DataFrame

In [4]:
def saveDataFrameToCSV(df, path):
    """Write ``df`` to ``path`` in CSV format, leaving out the index column."""
    df.to_csv(path_or_buf=path, index=False)

In [5]:
def runProgram(URL, path_csv):
    """Scroll the page at ``URL``, crawl its table, and save the result as CSV.

    Parameters
    ----------
    URL : str
        Page passed to ``scrollPage``.
    path_csv : str
        Destination passed to ``saveDataFrameToCSV``.
    """
    htmlText = scrollPage(URL)
    df = crawlHTML(htmlText)

    # Remove spaces in titles. Plain string handling is used instead of the
    # ``.str`` accessor, which raises when the column index does not hold strings
    # (e.g. the default index of an empty frame when nothing was crawled).
    df.columns = [str(title).replace(' ', '') for title in df.columns]

    saveDataFrameToCSV(df, path_csv)

In [6]:
'''
Driver Code
'''

# NOTE: everything below is commented out, so running this cell only evaluates
# the string above and does not start a crawl. Uncomment the assignments and the
# runProgram call to run it.

# URL_History:    Page with all wildfire records since 2014, USA only
# URL_live:       Page with all ongoing wildfires, USA only, short table

# URL_History = "https://data-nifc.opendata.arcgis.com/datasets/wfigs-wildland-fire-locations-full-history/explore?showTable=true"
# URL_live = "https://data-nifc.opendata.arcgis.com/datasets/wfigs-current-wildland-fire-locations/explore?showTable=true"
# path_csv = "Wildland Fires.csv"

# runProgram(URL_live, path_csv)

# print("Task complete")

'\nDriver Code\n'