In [1]:
#import dependencies

import pandas as pd
from bs4 import BeautifulSoup as bs
#from bs4.element import Comment
#from splinter import Browser
import requests
import json
from config import api_key
#import time
#import numpy as np


## Extract Data

In [2]:
#datasource url 
url = "https://inciweb.nwcg.gov/feeds/rss/incidents/"
state_fetch_url = "https://maps.googleapis.com/maps/api/geocode/json?latlng="
key_string = "&key=" + api_key

In [3]:
def state_extract(obj):
    stateval = ""
    
    for eachitem in obj:
        for k, v in eachitem.items():
            if k == "long_name":
                stateval = v
            elif isinstance(v, list):
                for item in v:
                    if item == "administrative_area_level_1":
                        return stateval

    return stateval

In [4]:
# instanciate landing dataframe
starting_data = pd.DataFrame()

# request xml code from url
xml_data = requests.get(url).content

#parse the xml response
soup = bs(xml_data, "xml")
     
# Find all text in the data
texts = str(soup.findAll(text=True)).replace('\\n','')
    
#Find the tag/child
child = soup.find("item")

# instatiate column lists
title = []
published = []
lat = []
lon = []
link = []
description = []
state=[]

#loop trough each "item" in the xml response and store the target data
while True:    
    try:
        title.append(" ".join(child.find('title')))
    except:
         title.append(" ")
            
    try:
        published.append(" ".join(child.find('published')))
    except:
        published.append(" ")
        
    try:
        lat.append(" ".join(child.find('geo:lat')))
    except:
        lat.append(" ")
            
    try:
        lon.append(" ".join(child.find('geo:long')))
    except:
        lon.append(" ")
        
    try:
        link.append(" ".join(child.find('link')))
    except:
        link.append(" ")
        
    try:
        description.append(" ".join(child.find('description')))
    except:
        description.append(" ")
    
    try:
        latlng = " ".join(child.find('geo:lat')) + ","+ " ".join(child.find('geo:long'))
        resp_data = requests.get(state_fetch_url + latlng + key_string).json()
        state.append(state_extract(resp_data.get('results')[0].get('address_components')))
    except:
        state.append(" ")
    
    try:   
        # Next sibling of child, here: 'item' 
        child = child.find_next_sibling('item')
    except:
        break
    
    #create dataframe
    data = pd.DataFrame({"title":title,
                                    "published":published,
                                    "lat":lat,
                                    "lon":lon,
                                    "link_url":link,
                                    "description": description,
                                    "state": state})
    starting_data = starting_data.append(data, ignore_index = True)

In [5]:
# drop duplicate rows
unique_data = starting_data.drop_duplicates(keep="first",ignore_index="True")
unique_data.head(15)

Unnamed: 0,title,published,lat,lon,link_url,description,state
0,Cameron Peak Fire (Wildfire),"Tue, 08 Dec 2020 12:52:01 -06:00",40.608611111111,-105.87916666667,http://inciweb.nwcg.gov/incident/6964/,A Type 3 Incident Management Team assumed comm...,
1,Klamath NF RX Burning 2020/2021 (Prescribed Fire),"Tue, 08 Dec 2020 12:30:57 -06:00",41.738611111111,-122.77888888889,http://inciweb.nwcg.gov/incident/7274/,"After a challenging fire season, the arrival o...",
2,Bond Fire (Wildfire),"Tue, 08 Dec 2020 09:18:05 -06:00",33.777222222222,-117.63833333333,http://inciweb.nwcg.gov/incident/7275/,"The Bond Fire started December 2, 2020 at 10:1...",
3,Canyon Wildfire (Wildfire),"Mon, 07 Dec 2020 20:44:22 -06:00",35.881944444444,-118.45805555556,http://inciweb.nwcg.gov/incident/7276/,,
4,North Complex (Wildfire),"Mon, 07 Dec 2020 17:50:15 -06:00",40.090833333333,-120.93111111111,http://inciweb.nwcg.gov/incident/6997/,"Incident Start Date: 8/17/2020 Size: 318,93...",
5,Fall Prescribed Fire Projects 2020 (Prescribed...,"Mon, 07 Dec 2020 15:50:50 -06:00",44.417777777778,-118.95138888889,http://inciweb.nwcg.gov/incident/7216/,Please take a moment to look at the links sec...,
6,Rattlesnake Fire (Wildfire),"Sun, 06 Dec 2020 19:33:15 -06:00",36.419722222222,-118.44833333333,http://inciweb.nwcg.gov/incident/7131/,The Rattlesnake Fire was discovered on August ...,
7,SQF Complex (Wildfire),"Sun, 06 Dec 2020 11:58:25 -06:00",36.255,-118.49666666667,http://inciweb.nwcg.gov/incident/7048/,Operations Map | Fire History Map | Land Ow...,
8,Grizzly Creek Fire (Wildfire),"Fri, 04 Dec 2020 09:05:15 -06:00",39.566666666667,-107.27138888889,http://inciweb.nwcg.gov/incident/6942/,Dec. 4 Update. There has been little activity ...,
9,Creek Fire (Wildfire),"Thu, 03 Dec 2020 18:06:37 -06:00",37.201111111111,-119.27166666667,http://inciweb.nwcg.gov/incident/7147/,,


In [6]:
# go the the link url for each rown and extract additional data (cause, size)

#instatiate landing lists
causes = []
sizes = []

#loop through each row of data
for x in range(len(unique_data)):
    #find the link in the row
    url = starting_data.loc[x,"link_url"]
    #go to the page and grap all the tables
    tables = pd.read_html(url)
    
    #the number of tables the page has will determine which tables we look in for data.
    #if there are more than one tables...
    if len(tables)>1:
        try:
            # find the "cause" in the first table on the page (if it exists)
            cause = tables[0].iloc[2,1]
        except:
            cause = "unknown"
    
        try:
            # find the "size" in the second table on the page (if it exists)
            size = tables[1].loc[(tables[1][0]) == "Size",1].item()            
        except:
            size = "unknown"
    #if there is only one table on the page...
    else:
        try:
            # find the "cause" in the first table on the page (if it exists)
            cause = tables[0].iloc[1,1]
            # no size data is available
            size = "n/a"
        except:
            cause = "unknown"
    
    #add cause and size to their lists
    causes.append(cause)
    sizes.append(size)
    
    #print progress
    print(f"{x+1} of {len(unique_data)}")


1 of 125
2 of 125
3 of 125
4 of 125
5 of 125
6 of 125
7 of 125
8 of 125
9 of 125
10 of 125
11 of 125
12 of 125
13 of 125
14 of 125
15 of 125
16 of 125
17 of 125
18 of 125
19 of 125
20 of 125
21 of 125
22 of 125
23 of 125
24 of 125
25 of 125
26 of 125
27 of 125
28 of 125
29 of 125
30 of 125
31 of 125
32 of 125
33 of 125
34 of 125
35 of 125
36 of 125
37 of 125
38 of 125
39 of 125
40 of 125
41 of 125
42 of 125
43 of 125
44 of 125
45 of 125
46 of 125
47 of 125
48 of 125
49 of 125
50 of 125
51 of 125
52 of 125
53 of 125
54 of 125
55 of 125
56 of 125
57 of 125
58 of 125
59 of 125
60 of 125
61 of 125
62 of 125
63 of 125
64 of 125
65 of 125
66 of 125
67 of 125
68 of 125
69 of 125
70 of 125
71 of 125
72 of 125
73 of 125
74 of 125
75 of 125
76 of 125
77 of 125
78 of 125
79 of 125
80 of 125
81 of 125
82 of 125
83 of 125
84 of 125
85 of 125
86 of 125
87 of 125
88 of 125
89 of 125
90 of 125
91 of 125
92 of 125
93 of 125
94 of 125
95 of 125
96 of 125
97 of 125
98 of 125
99 of 125
100 of 125
101 of 1

## Clean/Transform Data 

In [7]:
# if cause has the word "investigation" in it, set cause to "unknown"
for y in range(len(causes)):
    if "Investigation" in causes[y]:
        causes[y] = "Unknown"

# remove the word "Acres" from the size data
sizes = [s.replace(" Acres","") for s in sizes]
sizes = [s.replace(",","") for s in sizes]

In [8]:
# add causes and sizes to the dataframe
unique_data["cause"] = causes
unique_data["acres"] = sizes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_data["cause"] = causes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_data["acres"] = sizes


In [9]:
# see the counts of each fire cause for reference
grouped_df = unique_data.groupby(["cause"])
grouped_df.count()

Unnamed: 0_level_0,title,published,lat,lon,link_url,description,state,acres
cause,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Burned Area Emergency Response,1,1,1,1,1,1,1,1
Human,12,12,12,12,12,12,12,12
Lightning,29,29,29,29,29,29,29,29
Prescribed Fire,30,30,30,30,30,30,30,30
Unknown,53,53,53,53,53,53,53,53


In [10]:
state_grouped_df = unique_data.groupby(["state"])
state_grouped_df.count()

Unnamed: 0_level_0,title,published,lat,lon,link_url,description,cause,acres
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,125,125,125,125,125,125,125,125


In [11]:
# save the dataframe as "clean_data"

clean_data = unique_data
clean_data.head()

Unnamed: 0,title,published,lat,lon,link_url,description,state,cause,acres
0,Cameron Peak Fire (Wildfire),"Tue, 08 Dec 2020 12:52:01 -06:00",40.608611111111,-105.87916666667,http://inciweb.nwcg.gov/incident/6964/,A Type 3 Incident Management Team assumed comm...,,Unknown,208913.0
1,Klamath NF RX Burning 2020/2021 (Prescribed Fire),"Tue, 08 Dec 2020 12:30:57 -06:00",41.738611111111,-122.77888888889,http://inciweb.nwcg.gov/incident/7274/,"After a challenging fire season, the arrival o...",,Unknown,208913.0
2,Bond Fire (Wildfire),"Tue, 08 Dec 2020 09:18:05 -06:00",33.777222222222,-117.63833333333,http://inciweb.nwcg.gov/incident/7275/,"The Bond Fire started December 2, 2020 at 10:1...",,Prescribed Fire,
3,Canyon Wildfire (Wildfire),"Mon, 07 Dec 2020 20:44:22 -06:00",35.881944444444,-118.45805555556,http://inciweb.nwcg.gov/incident/7276/,,,Unknown,208913.0
4,North Complex (Wildfire),"Mon, 07 Dec 2020 17:50:15 -06:00",40.090833333333,-120.93111111111,http://inciweb.nwcg.gov/incident/6997/,"Incident Start Date: 8/17/2020 Size: 318,93...",,Prescribed Fire,


In [12]:
#store as csv for testing
clean_data.to_csv("data.csv")