In [1]:
#import dependencies

import pandas as pd
from bs4 import BeautifulSoup as bs
#from bs4.element import Comment
#from splinter import Browser
import requests
#import time
#import numpy as np


## Extract Data

In [2]:
#datasource url 
url = "https://inciweb.nwcg.gov/feeds/rss/incidents/"

In [3]:
# instanciate landing dataframe
starting_data = pd.DataFrame()

# request xml code from url
xml_data = requests.get(url).content

#parse the xml response
soup = bs(xml_data, "xml")
     
# Find all text in the data
texts = str(soup.findAll(text=True)).replace('\\n','')
    
#Find the tag/child
child = soup.find("item")

# instatiate column lists
title = []
published = []
lat = []
lon = []
link = []
description = []

#loop trough each "item" in the xml response and store the target data
while True:    
    try:
        title.append(" ".join(child.find('title')))
    except:
         title.append(" ")
            
    try:
        published.append(" ".join(child.find('published')))
    except:
        published.append(" ")
        
    try:
        lat.append(" ".join(child.find('geo:lat')))
    except:
        lat.append(" ")
            
    try:
        lon.append(" ".join(child.find('geo:long')))
    except:
        lon.append(" ")
        
    try:
        link.append(" ".join(child.find('link')))
    except:
        link.append(" ")
        
    try:
        description.append(" ".join(child.find('description')))
    except:
        description.append(" ")
        
    try:   
        # Next sibling of child, here: 'item' 
        child = child.find_next_sibling('item')
    except:
        break
    
    #create dataframe
    data = pd.DataFrame({"title":title,
                                    "published":published,
                                    "lat":lat,
                                    "lon":lon,
                                    "link_url":link,
                                    "description": description})
    starting_data = starting_data.append(data, ignore_index = True)

In [4]:
# drop duplicate rows
unique_data = starting_data.drop_duplicates(keep="first",ignore_index="True")
unique_data.head(15)

Unnamed: 0,title,published,lat,lon,link_url,description
0,Creek Fire (Wildfire),"Thu, 03 Dec 2020 18:06:37 -06:00",37.201111111111,-119.27166666667,http://inciweb.nwcg.gov/incident/7147/,
1,Riverside Fire (Wildfire),"Thu, 03 Dec 2020 17:48:24 -06:00",45.049166666667,-122.06222222222,http://inciweb.nwcg.gov/incident/7174/,"The Riverside Fire started September 8, 2020 a..."
2,Dolan Fire (Wildfire),"Thu, 03 Dec 2020 15:40:38 -06:00",36.123333333333,-121.60166666667,http://inciweb.nwcg.gov/incident/7018/,The Dolan Fire will be considered contained/c...
3,Klamath NF RX Burning 2020/2021 (Prescribed Fire),"Thu, 03 Dec 2020 15:09:37 -06:00",41.738611111111,-122.77888888889,http://inciweb.nwcg.gov/incident/7274/,"After a challenging fire season, the arrival o..."
4,Rattlesnake Fire (Wildfire),"Thu, 03 Dec 2020 13:31:38 -06:00",36.419722222222,-118.44833333333,http://inciweb.nwcg.gov/incident/7131/,The Rattlesnake Fire was discovered on August ...
5,Cameron Peak Fire (Wildfire),"Thu, 03 Dec 2020 13:00:37 -06:00",40.608611111111,-105.87916666667,http://inciweb.nwcg.gov/incident/6964/,The Southern Area Gold Type 2 Incident Managem...
6,2020-21 Royal Gorge FO Pile Burns (Prescribed ...,"Thu, 03 Dec 2020 10:39:27 -06:00",38.444722222222,-105.19638888889,http://inciweb.nwcg.gov/incident/7266/,"November 12, 2020Bureau of Land Management pla..."
7,Shasta-Trinity RX Burning 2020/2021 (Prescribe...,"Wed, 02 Dec 2020 09:33:20 -06:00",41.371944444444,-121.97138888889,http://inciweb.nwcg.gov/incident/7268/,See the 'Announcements' and 'News' Tabs for t...
8,Woodhead BAER (Burned Area Emergency Response),"Tue, 01 Dec 2020 17:51:59 -06:00",44.756111111111,-116.87666666667,http://inciweb.nwcg.gov/incident/7262/,"The Woodhead Fire burned almost 100,000 acre..."
9,Big Hollow Fire (Wildfire),"Tue, 01 Dec 2020 13:44:54 -06:00",45.926944444444,-121.98,http://inciweb.nwcg.gov/incident/7171/,The Big Hollow Fire is now considered to be 7...


In [5]:
# go the the link url for each rown and extract additional data (cause, size)

#instatiate landing lists
causes = []
sizes = []

#loop through each row of data
for x in range(len(unique_data)):
    #find the link in the row
    url = starting_data.loc[x,"link_url"]
    #go to the page and grap all the tables
    tables = pd.read_html(url)
    
    #the number of tables the page has will determine which tables we look in for data.
    #if there are more than one tables...
    if len(tables)>1:
        try:
            # find the "cause" in the first table on the page (if it exists)
            cause = tables[0].iloc[2,1]
        except:
            cause = "unknown"
    
        try:
            # find the "size" in the second table on the page (if it exists)
            size = tables[1].loc[(tables[1][0]) == "Size",1].item()            
        except:
            size = "unknown"
    #if there is only one table on the page...
    else:
        try:
            # find the "cause" in the first table on the page (if it exists)
            cause = tables[0].iloc[1,1]
            # no size data is available
            size = "n/a"
        except:
            cause = "unknown"
    
    #add cause and size to their lists
    causes.append(cause)
    sizes.append(size)
    
    #print progress
    print(f"{x+1} of {len(unique_data)}")


1 of 130
2 of 130
3 of 130
4 of 130
5 of 130
6 of 130
7 of 130
8 of 130
9 of 130
10 of 130
11 of 130
12 of 130
13 of 130
14 of 130
15 of 130
16 of 130
17 of 130
18 of 130
19 of 130
20 of 130
21 of 130
22 of 130
23 of 130
24 of 130
25 of 130
26 of 130
27 of 130
28 of 130
29 of 130
30 of 130
31 of 130
32 of 130
33 of 130
34 of 130
35 of 130
36 of 130
37 of 130
38 of 130
39 of 130
40 of 130
41 of 130
42 of 130
43 of 130
44 of 130
45 of 130
46 of 130
47 of 130
48 of 130
49 of 130
50 of 130
51 of 130
52 of 130
53 of 130
54 of 130
55 of 130
56 of 130
57 of 130
58 of 130
59 of 130
60 of 130
61 of 130
62 of 130
63 of 130
64 of 130
65 of 130
66 of 130
67 of 130
68 of 130
69 of 130
70 of 130
71 of 130
72 of 130
73 of 130
74 of 130
75 of 130
76 of 130
77 of 130
78 of 130
79 of 130
80 of 130
81 of 130
82 of 130
83 of 130
84 of 130
85 of 130
86 of 130
87 of 130
88 of 130
89 of 130
90 of 130
91 of 130
92 of 130
93 of 130
94 of 130
95 of 130
96 of 130
97 of 130
98 of 130
99 of 130
100 of 130
101 of 1

## Clean/Transform Data 

In [6]:
# if cause has the word "investigation" in it, set cause to "unknown"
for y in range(len(causes)):
    if "Investigation" in causes[y]:
        causes[y] = "Unknown"

# remove the word "Acres" from the size data
sizes = [s.replace(" Acres","") for s in sizes]
sizes = [s.replace(",","") for s in sizes]

In [7]:
# add causes and sizes to the dataframe
unique_data["cause"] = causes
unique_data["acres"] = sizes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_data["cause"] = causes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_data["acres"] = sizes


In [8]:
# see the counts of each fire cause for reference
grouped_df = unique_data.groupby(["cause"])
grouped_df.count()

Unnamed: 0_level_0,title,published,lat,lon,link_url,description,acres
cause,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Burned Area Emergency Response,13,13,13,13,13,13,13
Human,15,15,15,15,15,15,15
Lightning,14,14,14,14,14,14,14
Prescribed Fire,33,33,33,33,33,33,33
Unknown,55,55,55,55,55,55,55


In [9]:
# save the dataframe as "clean_data"

clean_data = unique_data
clean_data.head()

Unnamed: 0,title,published,lat,lon,link_url,description,cause,acres
0,Creek Fire (Wildfire),"Thu, 03 Dec 2020 18:06:37 -06:00",37.201111111111,-119.27166666667,http://inciweb.nwcg.gov/incident/7147/,,Unknown,379895
1,Riverside Fire (Wildfire),"Thu, 03 Dec 2020 17:48:24 -06:00",45.049166666667,-122.06222222222,http://inciweb.nwcg.gov/incident/7174/,"The Riverside Fire started September 8, 2020 a...",Unknown,379895
2,Dolan Fire (Wildfire),"Thu, 03 Dec 2020 15:40:38 -06:00",36.123333333333,-121.60166666667,http://inciweb.nwcg.gov/incident/7018/,The Dolan Fire will be considered contained/c...,Human,138054
3,Klamath NF RX Burning 2020/2021 (Prescribed Fire),"Thu, 03 Dec 2020 15:09:37 -06:00",41.738611111111,-122.77888888889,http://inciweb.nwcg.gov/incident/7274/,"After a challenging fire season, the arrival o...",Unknown,379895
4,Rattlesnake Fire (Wildfire),"Thu, 03 Dec 2020 13:31:38 -06:00",36.419722222222,-118.44833333333,http://inciweb.nwcg.gov/incident/7131/,The Rattlesnake Fire was discovered on August ...,Human,138054


In [10]:
#store as csv for testing
clean_data.to_csv("csv/data.csv")