In [1]:
#import dependencies

import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import json
from config import api_key

## Extract Data

In [3]:
# Google Geocoding endpoint; a "lat,lng" pair is appended directly after "latlng="
state_fetch_url = "https://maps.googleapis.com/maps/api/geocode/json?latlng="

# query-string suffix carrying the API key imported from the local config module
key_string = "&key=" + api_key

# RSS feed that lists the incidents to scrape
url = "https://inciweb.nwcg.gov/feeds/rss/incidents/"

In [4]:
# function used to unpack state information from lat/lon
def state_extract(obj):
    """Return the state name found in a list of geocoder address components.

    Parameters
    ----------
    obj : iterable of dict, or None
        Address components. A component is treated as the state when any of
        its list-valued fields contains "administrative_area_level_1"; the
        state name is that component's "long_name".

    Returns
    -------
    str
        The "long_name" of the first state-level component, or "" when no
        component is state-level (or ``obj`` is empty / None).
    """
    for component in obj or []:
        # Decide on the whole component at once so the answer does not depend
        # on the order in which the dict happens to store its keys.
        is_state_level = any(
            isinstance(value, list) and "administrative_area_level_1" in value
            for value in component.values()
        )
        if is_state_level:
            return component.get("long_name", "")

    # Nothing was tagged as a state: report "not found" instead of leaking the
    # name of an unrelated component (county, country, postal code, ...).
    return ""

In [5]:
# Parse the incident RSS feed into one row per <item> and look up each row's state.

# column order of the landing dataframe
FEED_COLUMNS = ["title", "published", "lat", "lon", "link_url", "description", "state"]
# placeholder stored when a value is missing or cannot be read
MISSING_VALUE = " "
# seconds to wait on any single HTTP request before giving up
REQUEST_TIMEOUT = 30


def _item_text(item, tag_name):
    """Return the space-joined string children of ``item``'s first ``tag_name`` tag.

    Returns the " " placeholder when the tag is absent or when it holds
    children that are not plain strings (which ``str.join`` rejects).
    """
    tag = item.find(tag_name)
    if tag is None:
        return MISSING_VALUE
    try:
        return " ".join(tag)
    except TypeError:
        return MISSING_VALUE


def _lookup_state(lat_text, lon_text):
    """Reverse-geocode a lat/lon pair to a state name using the geocoding URL.

    Best effort: returns the " " placeholder when a coordinate is missing,
    the request fails, or the response does not have the expected shape.
    """
    if not lat_text.strip() or not lon_text.strip():
        # no coordinates -> nothing to look up, skip the API call
        return MISSING_VALUE
    latlng = lat_text + "," + lon_text
    try:
        resp_data = requests.get(
            state_fetch_url + latlng + key_string, timeout=REQUEST_TIMEOUT
        ).json()
        return state_extract(resp_data.get('results')[0].get('address_components'))
    except (requests.RequestException, ValueError, LookupError, TypeError, AttributeError):
        # network error, non-JSON body, empty result list, or missing keys
        return MISSING_VALUE


# request xml code from url; fail loudly here rather than parse an error page
feed_response = requests.get(url, timeout=REQUEST_TIMEOUT)
feed_response.raise_for_status()
xml_data = feed_response.content

# parse the xml response
soup = bs(xml_data, "xml")

# Find all text in the data (kept for parity; no later cell shown here reads it)
texts = str(soup.findAll(text=True)).replace('\\n','')

# walk the sibling chain of "item" tags; find_next_sibling returns None at the
# end of the chain, which is what terminates the loop
records = []
child = soup.find("item")
while child is not None:
    lat_text = _item_text(child, 'geo:lat')
    lon_text = _item_text(child, 'geo:long')
    records.append({
        "title": _item_text(child, 'title'),
        "published": _item_text(child, 'published'),
        "lat": lat_text,
        "lon": lon_text,
        "link_url": _item_text(child, 'link'),
        "description": _item_text(child, 'description'),
        # use google geomapping to determine state from lat/lon
        "state": _lookup_state(lat_text, lon_text),
    })
    child = child.find_next_sibling('item')

# build the landing dataframe once, after the loop, so each item appears exactly
# one time; explicit columns keep the schema even when the feed has no items
starting_data = pd.DataFrame(records, columns=FEED_COLUMNS)

In [6]:
# drop duplicate rows, keeping the first occurrence and renumbering the index 0..n-1
# (ignore_index is a boolean flag; pandas versions that validate boolean keyword
# arguments reject the string "True")
unique_data = starting_data.drop_duplicates(keep="first", ignore_index=True)
unique_data.head(5)

Unnamed: 0,title,published,lat,lon,link_url,description,state
0,Klamath NF RX Burning 2020/2021 (Prescribed Fire),"Wed, 09 Dec 2020 16:05:09 -06:00",41.738611111111,-122.77888888889,http://inciweb.nwcg.gov/incident/7274/,"After a challenging fire season, the arrival o...",California
1,Shasta-Trinity RX Burning 2020/2021 (Prescribe...,"Wed, 09 Dec 2020 15:42:45 -06:00",41.371944444444,-121.97138888889,http://inciweb.nwcg.gov/incident/7268/,See the 'Announcements' and 'News' Tabs for t...,California
2,Bond Fire (Wildfire),"Tue, 08 Dec 2020 21:03:37 -06:00",33.777222222222,-117.63833333333,http://inciweb.nwcg.gov/incident/7275/,"The Bond Fire started December 2, 2020 at 10:1...",California
3,Cameron Peak Fire (Wildfire),"Tue, 08 Dec 2020 12:52:01 -06:00",40.608611111111,-105.87916666667,http://inciweb.nwcg.gov/incident/6964/,A Type 3 Incident Management Team assumed comm...,Colorado
4,Canyon Wildfire (Wildfire),"Mon, 07 Dec 2020 20:44:22 -06:00",35.881944444444,-118.45805555556,http://inciweb.nwcg.gov/incident/7276/,,California
5,North Complex (Wildfire),"Mon, 07 Dec 2020 17:50:15 -06:00",40.090833333333,-120.93111111111,http://inciweb.nwcg.gov/incident/6997/,"Incident Start Date: 8/17/2020 Size: 318,93...",California
6,Fall Prescribed Fire Projects 2020 (Prescribed...,"Mon, 07 Dec 2020 15:50:50 -06:00",44.417777777778,-118.95138888889,http://inciweb.nwcg.gov/incident/7216/,Please take a moment to look at the links sec...,Oregon
7,Rattlesnake Fire (Wildfire),"Sun, 06 Dec 2020 19:33:15 -06:00",36.419722222222,-118.44833333333,http://inciweb.nwcg.gov/incident/7131/,The Rattlesnake Fire was discovered on August ...,California
8,SQF Complex (Wildfire),"Sun, 06 Dec 2020 11:58:25 -06:00",36.255,-118.49666666667,http://inciweb.nwcg.gov/incident/7048/,Operations Map | Fire History Map | Land Ow...,California
9,Grizzly Creek Fire (Wildfire),"Fri, 04 Dec 2020 09:05:15 -06:00",39.566666666667,-107.27138888889,http://inciweb.nwcg.gov/incident/6942/,Dec. 4 Update. There has been little activity ...,Colorado


In [None]:
# go to the link url for each row and extract additional data (cause, size)

# instantiate landing lists (one entry per row of unique_data, in row order)
causes = []
sizes = []

# errors that mean "the expected cell is not in this table": position out of
# range (IndexError), missing row/column label (KeyError), or .item() called on
# a selection that is not exactly one value (ValueError)
_LOOKUP_ERRORS = (IndexError, KeyError, ValueError)

# loop through each row of the de-duplicated data; reading the link from
# unique_data keeps causes/sizes aligned with the frame they are attached to,
# and a dedicated name avoids overwriting the feed `url` defined earlier
for x, incident_url in enumerate(unique_data["link_url"]):
    # go to the page and grab all the tables
    try:
        tables = pd.read_html(incident_url)
    except (ValueError, OSError):
        # no tables on the page, or the page could not be fetched
        tables = []

    # the number of tables the page has will determine which tables we look in for data.
    # if there are more than one tables...
    if len(tables) > 1:
        try:
            # find the "cause" in the first table on the page (if it exists)
            cause = tables[0].iloc[2,1]
        except _LOOKUP_ERRORS:
            cause = "unknown"

        try:
            # find the "size" in the second table on the page (if it exists)
            size = tables[1].loc[(tables[1][0]) == "Size",1].item()
        except _LOOKUP_ERRORS:
            size = "unknown"
    # if there is only one table on the page...
    elif len(tables) == 1:
        # no size data is available; set it up front so a failed cause lookup
        # cannot leave the previous row's size in place
        size = "n/a"
        try:
            # find the "cause" in the first table on the page (if it exists)
            cause = tables[0].iloc[1,1]
        except _LOOKUP_ERRORS:
            cause = "unknown"
    # if the page yielded no tables at all...
    else:
        cause = "unknown"
        size = "unknown"

    # add cause and size to their lists
    causes.append(cause)
    sizes.append(size)

    # print progress
    print(f"{x+1} of {len(unique_data)}")

## Clean/Transform Data 

In [None]:
def _clean_cause(cause):
    """Normalise one scraped cause value.

    Non-string values (for example NaN from an empty table cell), causes that
    contain "Investigation", and the lowercase "unknown" placeholder all
    become "Unknown" so they fall into a single group; anything else is
    returned unchanged.
    """
    if not isinstance(cause, str):
        return "Unknown"
    if "Investigation" in cause or cause == "unknown":
        return "Unknown"
    return cause


def _clean_size(size):
    """Strip the " Acres" suffix and thousands separators from one size value.

    Missing values become "unknown"; other non-string values are converted
    with ``str`` before cleaning.
    """
    if not isinstance(size, str):
        if pd.isna(size):
            return "unknown"
        size = str(size)
    return size.replace(" Acres","").replace(",","")


# if cause has the word "Investigation" in it (or is missing), set cause to "Unknown"
causes = [_clean_cause(c) for c in causes]

# remove the word "Acres" and the comma separators from the size data
sizes = [_clean_size(s) for s in sizes]

In [None]:
# attach the scraped lists to the de-duplicated frame as new columns
# ("cause" first, then "acres")
for column_name, column_values in (("cause", causes), ("acres", sizes)):
    unique_data[column_name] = column_values

In [None]:
# reference table: per-column non-null row counts for each fire cause
cause_keys = ["cause"]
grouped_df = unique_data.groupby(by=cause_keys)
grouped_df.count()

In [None]:
# reference table: per-column non-null row counts for each state
state_keys = ["state"]
state_grouped_df = unique_data.groupby(by=state_keys)
state_grouped_df.count()

## Prepare to load

In [None]:
# expose the finished frame under the name "clean_data"
# (this binds a second name to the same object; it does not copy the data)
clean_data = unique_data
# preview the first five rows
clean_data.head(n=5)

In [None]:
# write the cleaned frame to a csv file in the working directory, for testing
output_path = "data.csv"
clean_data.to_csv(output_path)