In [1]:
#import dependencies

import pandas as pd
from bs4 import BeautifulSoup as bs
#from bs4.element import Comment
#from splinter import Browser
import requests
#import time
#import numpy as np


## Extract Data

In [2]:
#datasource url 
url = "https://inciweb.nwcg.gov/feeds/rss/incidents/"

In [3]:
# instanciate landing dataframe
starting_data = pd.DataFrame()

# request xml code from url
xml_data = requests.get(url).content

#parse the xml response
soup = bs(xml_data, "xml")
     
# Find all text in the data
texts = str(soup.findAll(text=True)).replace('\\n','')
    
#Find the tag/child
child = soup.find("item")

# instatiate column lists
title = []
published = []
lat = []
lon = []
link = []
description = []

#loop trough each "item" in the xml response and store the target data
while True:    
    try:
        title.append(" ".join(child.find('title')))
    except:
         title.append(" ")
            
    try:
        published.append(" ".join(child.find('published')))
    except:
        published.append(" ")
        
    try:
        lat.append(" ".join(child.find('geo:lat')))
    except:
        lat.append(" ")
            
    try:
        lon.append(" ".join(child.find('geo:long')))
    except:
        lon.append(" ")
        
    try:
        link.append(" ".join(child.find('link')))
    except:
        link.append(" ")
        
    try:
        description.append(" ".join(child.find('description')))
    except:
        description.append(" ")
        
    try:   
        # Next sibling of child, here: 'item' 
        child = child.find_next_sibling('item')
    except:
        break
    
    #create dataframe
    data = pd.DataFrame({"title":title,
                                    "published":published,
                                    "lat":lat,
                                    "lon":lon,
                                    "link":link,
                                    "description": description})
    starting_data = starting_data.append(data, ignore_index = True)

In [4]:
# drop duplicate rows
unique_data = starting_data.drop_duplicates(keep="first",ignore_index="True")
unique_data.head(15)

Unnamed: 0,title,published,lat,lon,link,description
0,Woodhead BAER (Burned Area Emergency Response),"Tue, 01 Dec 2020 17:51:59 -06:00",44.756111111111,-116.87666666667,http://inciweb.nwcg.gov/incident/7262/,"The Woodhead Fire burned almost 100,000 acre..."
1,2020-21 Royal Gorge FO Pile Burns (Prescribed ...,"Tue, 01 Dec 2020 17:28:48 -06:00",38.444722222222,-105.19638888889,http://inciweb.nwcg.gov/incident/7266/,"November 12, 2020Bureau of Land Management pla..."
2,Big Hollow Fire (Wildfire),"Tue, 01 Dec 2020 13:44:54 -06:00",45.926944444444,-121.98,http://inciweb.nwcg.gov/incident/7171/,The Big Hollow Fire is now considered to be 7...
3,Cameron Peak Fire (Wildfire),"Tue, 01 Dec 2020 13:17:30 -06:00",40.608611111111,-105.87916666667,http://inciweb.nwcg.gov/incident/6964/,The Southern Area Gold Type 2 Incident Managem...
4,Fort Wolters Prescribed Fire (Prescribed Fire),"Tue, 01 Dec 2020 11:24:02 -06:00",32.869166666667,-98.045,http://inciweb.nwcg.gov/incident/7272/,Texas A&M Forest Service is assisting military...
5,Shasta-Trinity RX Burning 2020/2021 (Prescribe...,"Tue, 01 Dec 2020 11:10:31 -06:00",41.371944444444,-121.97138888889,http://inciweb.nwcg.gov/incident/7268/,See the 'Announcements' and 'News' Tabs for t...
6,East Troublesome Post-Fire BAER (Burned Area E...,"Tue, 01 Dec 2020 10:45:30 -06:00",40.25,-105.9,http://inciweb.nwcg.gov/incident/7267/,THREE PHASES OF WILDFIRE RECOVERYThere are thr...
7,East Troublesome Fire (Wildfire),"Tue, 01 Dec 2020 10:43:57 -06:00",40.200555555556,-106.23416666667,http://inciweb.nwcg.gov/incident/7242/,The East Troublesome Fire was reported on the ...
8,Williams Fork Fire (Wildfire),"Tue, 01 Dec 2020 10:37:42 -06:00",39.851111111111,-106.06472222222,http://inciweb.nwcg.gov/incident/6971/,The Williams Fork Fire was first reported on A...
9,SQF Complex (Wildfire),"Tue, 01 Dec 2020 07:35:02 -06:00",36.255,-118.49666666667,http://inciweb.nwcg.gov/incident/7048/,Operations Map | Fire History Map | Land Ow...


In [5]:
# go the the link url for each rown and extract additional data (cause, size)

#instatiate landing lists
causes = []
sizes = []

#loop through each row of data
for x in range(len(unique_data)):
    #find the link in the row
    url = starting_data.loc[x,"link"]
    #go to the page and grap all the tables
    tables = pd.read_html(url)
    
    #the number of tables the page has will determine which tables we look in for data.
    #if there are more than one tables...
    if len(tables)>1:
        try:
            # find the "cause" in the first table on the page (if it exists)
            cause = tables[0].iloc[2,1]
        except:
            cause = "unknown"
    
        try:
            # find the "size" in the second table on the page (if it exists)
            size = tables[1].loc[(tables[1][0]) == "Size",1].item()            
        except:
            size = "unknown"
    #if there is only one table on the page...
    else:
        try:
            # find the "cause" in the first table on the page (if it exists)
            cause = tables[0].iloc[1,1]
            # no size data is available
            size = "n/a"
        except:
            cause = "unknown"
    
    #add cause and size to their lists
    causes.append(cause)
    sizes.append(size)
    
    #print progress
    print(f"{x+1} of {len(unique_data)}")


1 of 133
2 of 133
3 of 133
4 of 133
5 of 133
6 of 133
7 of 133
8 of 133
9 of 133
10 of 133
11 of 133
12 of 133
13 of 133
14 of 133
15 of 133
16 of 133
17 of 133
18 of 133
19 of 133
20 of 133
21 of 133
22 of 133
23 of 133
24 of 133
25 of 133
26 of 133
27 of 133
28 of 133
29 of 133
30 of 133
31 of 133
32 of 133
33 of 133
34 of 133
35 of 133
36 of 133
37 of 133
38 of 133
39 of 133
40 of 133
41 of 133
42 of 133
43 of 133
44 of 133
45 of 133
46 of 133
47 of 133
48 of 133
49 of 133
50 of 133
51 of 133
52 of 133
53 of 133
54 of 133
55 of 133
56 of 133
57 of 133
58 of 133
59 of 133
60 of 133
61 of 133
62 of 133
63 of 133
64 of 133
65 of 133
66 of 133
67 of 133
68 of 133
69 of 133
70 of 133
71 of 133
72 of 133
73 of 133
74 of 133
75 of 133
76 of 133
77 of 133
78 of 133
79 of 133
80 of 133
81 of 133
82 of 133
83 of 133
84 of 133
85 of 133
86 of 133
87 of 133
88 of 133
89 of 133
90 of 133
91 of 133
92 of 133
93 of 133
94 of 133
95 of 133
96 of 133
97 of 133
98 of 133
99 of 133
100 of 133
101 of 1

## Clean/Transform Data 

In [6]:
# if cause has the word "investigation" in it, set cause to "unknown"
for y in range(len(causes)):
    if "Investigation" in causes[y]:
        causes[y] = "Unknown"

# remove the word "Acres" from the size data
sizes = [s.replace(" Acres","") for s in sizes]

In [7]:
# add causes and sizes to the dataframe
unique_data["cause"] = causes
unique_data["size"] = sizes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_data["cause"] = causes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_data["size"] = sizes


In [8]:
# see the counts of each fire cause for reference
grouped_df = unique_data.groupby(["cause"])
grouped_df.count()

Unnamed: 0_level_0,title,published,lat,lon,link,description,size
cause,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Burned Area Emergency Response,26,26,26,26,26,26,26
Lightning,9,9,9,9,9,9,9
Prescribed Fire,31,31,31,31,31,31,31
Prescribed Fire To Reduce Wildfire Hazard,12,12,12,12,12,12,12
Unknown,55,55,55,55,55,55,55


In [9]:
# save the dataframe as "clean_data"

clean_data = unique_data
clean_data.head()

Unnamed: 0,title,published,lat,lon,link,description,cause,size
0,Woodhead BAER (Burned Area Emergency Response),"Tue, 01 Dec 2020 17:51:59 -06:00",44.756111111111,-116.87666666667,http://inciweb.nwcg.gov/incident/7262/,"The Woodhead Fire burned almost 100,000 acre...",Burned Area Emergency Response,
1,2020-21 Royal Gorge FO Pile Burns (Prescribed ...,"Tue, 01 Dec 2020 17:28:48 -06:00",38.444722222222,-105.19638888889,http://inciweb.nwcg.gov/incident/7266/,"November 12, 2020Bureau of Land Management pla...",Burned Area Emergency Response,
2,Big Hollow Fire (Wildfire),"Tue, 01 Dec 2020 13:44:54 -06:00",45.926944444444,-121.98,http://inciweb.nwcg.gov/incident/7171/,The Big Hollow Fire is now considered to be 7...,Prescribed Fire,
3,Cameron Peak Fire (Wildfire),"Tue, 01 Dec 2020 13:17:30 -06:00",40.608611111111,-105.87916666667,http://inciweb.nwcg.gov/incident/6964/,The Southern Area Gold Type 2 Incident Managem...,Burned Area Emergency Response,
4,Fort Wolters Prescribed Fire (Prescribed Fire),"Tue, 01 Dec 2020 11:24:02 -06:00",32.869166666667,-98.045,http://inciweb.nwcg.gov/incident/7272/,Texas A&M Forest Service is assisting military...,Prescribed Fire,


In [10]:
#store as csv for testing
clean_data.to_csv("csv/data.csv")