In [None]:
import bs4
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup 
from collections import defaultdict

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC

Scroll the page to load all data

In [None]:
'''
The data required for crawling is held in a dynamic table that loads more rows every time the page is
scrolled to the bottom. This script scrolls to the bottom of the page until all data is revealed.
If recent is True, the script scrolls only until the most recent fire in the saved csv file appears, then stops.
'''

def scrollPage(URL, recent = False):
    """Open ``URL`` in Chrome, scroll its table until no more rows load, and return the HTML.

    Parameters
    ----------
    URL : str
        Address of the page that contains the dynamically loaded ``<table>``.
    recent : bool, default False
        When True, the first ``UniqueFireIdentifier`` stored in
        "Full_Wildland_Fires.csv" is read and scrolling stops as soon as that
        identifier shows up in the page source.

    Returns
    -------
    str
        ``driver.page_source`` captured after scrolling has finished.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If no ``<table>`` element appears within 2 minutes.
    """
    table_height_js = 'return document.querySelector("table").scrollHeight'

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(URL)

        # Wait for table to load (max 2 min). A timeout is allowed to propagate
        # instead of carrying on against a page that has no table.
        element = WebDriverWait(driver, 120).until(
            EC.presence_of_element_located((By.TAG_NAME, "table"))
        )
        last_height = driver.execute_script(table_height_js)

        identifier = None
        if recent:
            df = pd.read_csv("Full_Wildland_Fires.csv")
            # str() so the substring test against page_source cannot fail on a
            # non-string value parsed from the csv.
            identifier = str(df["UniqueFireIdentifier"].iloc[0])

            # Click the column header twice, pausing after each click.
            # NOTE(review): presumably this is the FireDiscoveryDateTime header
            # and two clicks sort newest-first -- confirm. The "#ember91-title"
            # id looks auto-generated and may change between page versions.
            for _ in range(2):
                driver.find_element(By.CSS_SELECTOR, '#ember91-title').click()
                time.sleep(5)

        while True:
            element.send_keys(Keys.END)      # Scroll to end of available page
            time.sleep(5)                    # Wait for new content to load

            if identifier is not None and identifier in driver.page_source:
                break

            new_height = driver.execute_script(table_height_js)
            if new_height == last_height:    # End of page
                break
            last_height = new_height

        return driver.page_source            # html
    finally:
        # Always close the browser, on success or failure, so repeated runs do
        # not accumulate Chrome processes.
        driver.quit()

Crawl the page

In [None]:
'''
After the whole table has loaded, this script parses the page and retrieves the data.
'''

def crawlHTML(htmlText):
    """Parse the table in ``htmlText`` into a DataFrame keyed by header title.

    Each ``<th>`` is numbered by its position among all ``<th>`` elements, and
    every ``<td>`` whose ``data-col`` attribute equals that number is collected
    as a value of that column.

    Parameters
    ----------
    htmlText : str
        HTML source of the page containing the table.

    Returns
    -------
    pandas.DataFrame
        One column per header title that has at least one matching cell; the
        values are the cells' text in document order.
    """
    soup = BeautifulSoup(htmlText, "html.parser")

    # Save the column number of each header title for later access to its values
    titles = {}
    for position, header in enumerate(soup.find_all("th")):
        label_tag = header.find("span")
        if label_tag is None:
            # Header without a <span>: fall back to the <th> text rather than
            # failing with AttributeError.
            label_tag = header
        titles[label_tag.get_text(strip=True)] = position

    # Group the cell texts by their data-col value in a single pass over the
    # document instead of rescanning it once per header.
    cells_by_col = defaultdict(list)
    for cell in soup.find_all("td"):
        col = cell.get("data-col")
        if col is not None:
            cells_by_col[col].append(cell.text)

    # Create a dictionary of {header title : list of column values}
    columns = {}
    for title, position in titles.items():
        values = cells_by_col.get(str(position))
        if values:
            columns[title] = values

    return pd.DataFrame(columns)         # DataFrame

Save to csv

In [None]:
def saveDataFrameToCSV(df, path):
    """Write ``df`` to ``path`` as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=path, index=False)

Run Program

In [None]:
def runProgram(URL, path_csv, recent = False):
    """Scrape the table at ``URL`` and save it as a CSV file.

    Parameters
    ----------
    URL : str
        Page to load and scroll; passed to ``scrollPage``.
    path_csv : str
        Destination path of the CSV file.
    recent : bool, default False
        Passed through to ``scrollPage``.
    """
    htmlText = scrollPage(URL, recent)
    df = crawlHTML(htmlText)
    # Remove spaces in titles. rename() with a callable returns a new frame and
    # also works when no columns were scraped, where the `.columns.str`
    # accessor can raise on recent pandas versions.
    df = df.rename(columns=lambda title: title.replace(' ', ''))
    saveDataFrameToCSV(df, path_csv)

#### Driver Code

In [3]:
'''
source:         https://data-nifc.opendata.arcgis.com

URL_live:       Page with all ongoing wildfires, USA only, short table
URL_History:    Page with all wildfire records since 2014, USA only

recent:         True:  use with 'URL_History' to crawl new data that is not in the existing dataset
                False: use with 'URL_live' to crawl ongoing fires
                
Live dataset name:      'Live_Wildland_Fires.csv'
Existing dataset name:  'Full_Wildland_Fires.csv'
Recent dataset name:    'Recent_Wildland_Fires.csv'

-- Change 'path_csv' name and 'recent' value accordingly.
'''

# Table views on the NIFC open-data portal that the scraper can target.
URL_live = "https://data-nifc.opendata.arcgis.com/datasets/wfigs-current-wildland-fire-locations/explore?showTable=true"
URL_History = "https://data-nifc.opendata.arcgis.com/datasets/wfigs-wildland-fire-locations-full-history/explore?showTable=true"

# Run settings: the output file and the flag forwarded to runProgram.
recent = True
path_csv = "Recent_Wildland_Fires.csv"

runProgram(URL=URL_History, path_csv=path_csv, recent=recent)

print("Task complete")