In [1]:
#import dependencies

import pandas as pd
from bs4 import BeautifulSoup as bs
#from bs4.element import Comment
#from splinter import Browser
import requests
#import time
#import numpy as np


## Extract Data

In [2]:
# data source: the InciWeb incidents RSS feed, which the next cell downloads and parses as XML
url = "https://inciweb.nwcg.gov/feeds/rss/incidents/"

In [3]:
# Download the incident feed and turn every <item> element into one dataframe row.

# seconds to wait for the feed before failing, so a stalled server cannot hang the kernel
REQUEST_TIMEOUT = 30

# dataframe column name -> tag name looked up inside each feed <item>
FIELD_TAGS = {
    "title": "title",
    "published": "published",
    "lat": "geo:lat",
    "lon": "geo:long",
    "link": "link",
    "description": "description",
}


def _item_text(item, tag_name, default=" "):
    """Return the text of the first `tag_name` element found under `item`.

    `default` (a single space, the placeholder this notebook uses for missing
    values) is returned when the element is absent.
    """
    tag = item.find(tag_name)
    if tag is None:
        return default
    return tag.get_text(" ")


# request xml code from url; fail loudly on an HTTP error instead of parsing an error page
response = requests.get(url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
xml_data = response.content

#parse the xml response
soup = bs(xml_data, "xml")

# Find all text in the data (not used by the extraction below)
texts = str(soup.find_all(string=True)).replace('\\n','')

# walk the chain of sibling <item> elements and collect one record per item;
# find_next_sibling returns None after the last item, which ends the loop
records = []
child = soup.find("item")
while child is not None:
    records.append({column: _item_text(child, tag_name)
                    for column, tag_name in FIELD_TAGS.items()})
    child = child.find_next_sibling("item")

# build the landing dataframe once; `columns` keeps the expected columns even
# when the feed contains no items
starting_data = pd.DataFrame(records, columns=list(FIELD_TAGS))

In [4]:
# drop duplicate rows, keeping the first occurrence and renumbering the index 0..n-1
# (ignore_index must be the boolean True: the string "True" only works by being
# truthy and is rejected by pandas versions that validate the argument type)
unique_data = starting_data.drop_duplicates(keep="first", ignore_index=True)
unique_data.head(15)

Unnamed: 0,title,published,lat,lon,link,description
0,Cameron Peak Fire (Wildfire),"Fri, 27 Nov 2020 12:20:33 -06:00",40.608611111111,-105.87916666667,http://inciweb.nwcg.gov/incident/6964/,The Southern Area Gold Type 2 Incident Managem...
1,Creek Fire (Wildfire),"Thu, 26 Nov 2020 14:00:27 -06:00",37.201111111111,-119.27166666667,http://inciweb.nwcg.gov/incident/7147/,
2,North Complex (Wildfire),"Wed, 25 Nov 2020 11:56:38 -06:00",40.090833333333,-120.93111111111,http://inciweb.nwcg.gov/incident/6997/,The incident command post (ICP) for North Com...
3,Fall Prescribed Fire Projects 2020 (Prescribed...,"Wed, 25 Nov 2020 11:34:53 -06:00",44.417777777778,-118.95138888889,http://inciweb.nwcg.gov/incident/7216/,Please take a moment to look at the links sec...
4,Mountain View Fire (Wildfire),"Wed, 25 Nov 2020 10:34:07 -06:00",38.515,-119.46527777778,http://inciweb.nwcg.gov/incident/7270/,"The Mountain View fire is 70% contained at 20,..."
5,Laura 2 Fire (Wildfire),"Tue, 24 Nov 2020 16:31:07 -06:00",40.334166666667,-120.115,http://inciweb.nwcg.gov/incident/7269/,"The fire is fully contained, and the Fort Sage..."
6,Grouse Fire (Wildfire),"Tue, 24 Nov 2020 12:49:19 -06:00",43.540555555556,-115.17277777778,http://inciweb.nwcg.gov/incident/7154/,"The Grouse Fire started Sept. 6, 2020, about ..."
7,Middle Fork Fire (Wildfire),"Mon, 23 Nov 2020 17:35:42 -06:00",40.627777777778,-106.79444444444,http://inciweb.nwcg.gov/incident/7153/,"Middle Fork Fact Sheet - 11/02/2020, 2:30 p.m...."
8,Rattlesnake Fire (Wildfire),"Mon, 23 Nov 2020 13:29:05 -06:00",36.419722222222,-118.44833333333,http://inciweb.nwcg.gov/incident/7131/,The Rattlesnake Fire was discovered on August ...
9,Shasta-Trinity RX Burning 2020/2021 (Prescribe...,"Mon, 23 Nov 2020 11:43:12 -06:00",41.371944444444,-121.97138888889,http://inciweb.nwcg.gov/incident/7268/,See the 'Announcements' and 'News' Tabs for t...


In [None]:
# go to the link url for each row and extract additional data (cause, size)

#instantiate landing lists
causes = []
sizes = []

# errors that mean "the expected cell is not in this table"
LOOKUP_ERRORS = (IndexError, KeyError, ValueError)

#loop through each row of the de-duplicated data; the links are read from the
#same frame whose rows are counted, so causes/sizes line up with unique_data
#(a separate name is used so the feed `url` variable is not overwritten)
for x, incident_url in enumerate(unique_data["link"]):
    # defaults used when the page or the expected cells are missing
    cause = "unknown"
    size = "unknown"

    #go to the page and grab all the tables
    try:
        tables = pd.read_html(incident_url)
    except ValueError:
        # read_html raises ValueError when the page contains no tables
        tables = []

    #the number of tables the page has will determine which tables we look in for data.
    #if there are more than one tables...
    if len(tables) > 1:
        try:
            # find the "cause" in the first table on the page (if it exists)
            cause = tables[0].iloc[2, 1]
        except LOOKUP_ERRORS:
            cause = "unknown"

        try:
            # find the "size" in the second table on the page (if it exists);
            # .item() requires exactly one matching "Size" row
            size = tables[1].loc[tables[1][0] == "Size", 1].item()
        except LOOKUP_ERRORS:
            size = "unknown"
    #if there is only one table on the page...
    elif len(tables) == 1:
        # no size data is available; set before the lookup so a failed lookup
        # cannot leave the previous row's size in place
        size = "n/a"
        try:
            # find the "cause" in the first table on the page (if it exists)
            cause = tables[0].iloc[1, 1]
        except LOOKUP_ERRORS:
            cause = "unknown"

    #add cause and size to their lists
    causes.append(cause)
    sizes.append(size)

    #print progress
    print(f"{x+1} of {len(unique_data)}")


1 of 143
2 of 143
3 of 143
4 of 143
5 of 143
6 of 143
7 of 143
8 of 143
9 of 143
10 of 143
11 of 143
12 of 143
13 of 143
14 of 143
15 of 143
16 of 143
17 of 143
18 of 143
19 of 143
20 of 143
21 of 143
22 of 143
23 of 143
24 of 143
25 of 143
26 of 143
27 of 143
28 of 143
29 of 143
30 of 143
31 of 143
32 of 143
33 of 143
34 of 143
35 of 143
36 of 143
37 of 143
38 of 143
39 of 143
40 of 143
41 of 143
42 of 143
43 of 143
44 of 143
45 of 143
46 of 143
47 of 143
48 of 143
49 of 143
50 of 143
51 of 143
52 of 143
53 of 143
54 of 143
55 of 143
56 of 143
57 of 143
58 of 143
59 of 143
60 of 143
61 of 143
62 of 143
63 of 143
64 of 143
65 of 143
66 of 143


## Clean/Transform Data 

In [None]:
# if cause has the word "Investigation" in it, set cause to "Unknown";
# the lowercase "unknown" placeholder is folded into the same label so that
# grouping by cause does not split one category in two.
# non-text values (e.g. an empty table cell) are left untouched
for y, cause in enumerate(causes):
    if isinstance(cause, str) and ("Investigation" in cause or cause.lower() == "unknown"):
        causes[y] = "Unknown"

# remove the word "Acres" from the size data; non-text values are passed through unchanged
sizes = [s.replace(" Acres","") if isinstance(s, str) else s for s in sizes]

In [None]:
# attach the scraped cause and size values to the dataframe as two columns
for column_name, column_values in (("cause", causes), ("size", sizes)):
    unique_data[column_name] = column_values

In [None]:
# for reference: tally the rows recorded under every distinct fire cause
grouped_df = unique_data.groupby(by=["cause"])
grouped_df.count()

In [None]:
# save the dataframe as "clean_data"
# (an independent copy, so later in-place edits to clean_data cannot silently
# alter unique_data and break a re-run of the cells above)

clean_data = unique_data.copy()
clean_data.head()