In [3]:
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# These are the modules needed for this operation

In [5]:
# Load the disaster declarations summary table; the path is relative to the working directory.
dis = pd.read_csv('./data/DisasterDeclarationsSummaries.csv') # Reading in disaster data from a csv



In [6]:
dis.tail()

Unnamed: 0,disasterNumber,ihProgramDeclared,iaProgramDeclared,paProgramDeclared,hmProgramDeclared,state,declarationDate,fyDeclared,disasterType,incidentType,title,incidentBeginDate,incidentEndDate,disasterCloseOutDate,declaredCountyArea,placeCode,hash,lastRefresh,id
51011,4473,1,0,0,0,PR,2020-01-16T14:48:00.000Z,2020,DR,Earthquake,EARTHQUAKES,2019-12-28T04:29:00.000Z,,,Sabana Grande (Municipio),99121.0,2fc70c181ea2812349c328633f2432e3,2020-02-05T23:40:50.716Z,5e3b528274cbd479fcb5c27b
51012,4473,1,0,0,0,PR,2020-01-16T14:48:00.000Z,2020,DR,Earthquake,EARTHQUAKES,2019-12-28T04:29:00.000Z,,,Juana Diaz (Municipio),99075.0,96d675c0cd6e7e68c97631c1aae76c04,2020-02-05T23:40:50.715Z,5e3b528274cbd479fcb5c278
51013,4473,1,0,0,0,PR,2020-01-16T14:48:00.000Z,2020,DR,Earthquake,EARTHQUAKES,2019-12-28T04:29:00.000Z,,,Mayaguez (Municipio),99097.0,04baba2a215b8af556e3b16dd5253413,2020-02-05T23:40:50.714Z,5e3b528274cbd479fcb5c276
51014,4473,1,0,0,0,PR,2020-01-16T14:48:00.000Z,2020,DR,Earthquake,EARTHQUAKES,2019-12-28T04:29:00.000Z,,,Hormigueros (Municipio),99067.0,1125cacf13706fc3c347b1f75ca79a35,2020-02-05T23:40:50.715Z,5e3b528274cbd479fcb5c277
51015,4473,1,0,0,0,PR,2020-01-16T14:48:00.000Z,2020,DR,Earthquake,EARTHQUAKES,2019-12-28T04:29:00.000Z,,,Arecibo (Municipio),99013.0,7e242aa5873d5adac5a3776a714cbd79,2020-02-05T23:40:50.717Z,5e3b528274cbd479fcb5c27c


In [7]:
set(dis.title) # Checking for unique titles on these events


{'HEAVY RAINS, ICE JAMS & FLOODING',
 'FLOODING ASSOCIATED WITH TROPICAL DEPRESSION FRAN',
 'SHEKELL FIRE',
 'MT - MISSOURI BREAKS COMPLEX FIRE 07/16/03',
 'CEDAR LANE FIRE',
 'LANDSLIDES',
 'WALDO CANYON FIRE',
 'STORMS & FLASH FLOODS',
 'MI - SEVERE WEATHER 1/2 /99',
 'ASH CREEK FIRE',
 'PEPPIN FIRE',
 'PUAKO FIRE',
 'GA - BLOUNTS PASTURE FIRE',
 'CA-EAGLE FIRE-05-04-2004',
 'TRINITY RIDGE FIRE',
 'SEVERE WEATHER CONDITIONS',
 'PEDERNALES BEND FIRE',
 'FAIR GROUNDS FIRE COMPLEX',
 'FL - ESCAMBIA FIRE COMPLEX - 05/16/01',
 'EXTREME  FIRE HAZARD',
 'RIM ROCK FIRE',
 'SEVERE STORMS, FLOODING, SNOW MELT AND ICE JAMS',
 'WELLNITZ FIRE',
 'HIGH WINDS, SEVERE  STORMS AND FLOODING',
 'HEAVY RAINS, FLOODING & MUDSLIDES',
 'TYPHOON LOLA',
 'HURRICANE DOLLY',
 'HURRICANE, TORRENTIAL RAIN & FLOODS',
 'FREEZING TEMPERATURES',
 'SEVERE STORM, FLOODING, AND TORNADOES',
 'WILDFIRES AND HIGH WINDS',
 'SUZIE FIRE',
 'DONALDSON FIRE',
 'SEVERE STORMS, SNOWMELT & FLOODING',
 'DROUGHT & IMPENDING FREEZE'

In [8]:
# Preview the rows where incidentType, declaredCountyArea and placeCode are all present.
# The result is only displayed, not assigned, so `dis` itself is left unchanged by this cell.
dis.loc[:,['incidentType','declaredCountyArea','placeCode']].dropna() # dropping data points we don't have enough information to scrape for

Unnamed: 0,incidentType,declaredCountyArea,placeCode
83,Flood,Clay (County),99021.0
163,Flood,Alpine (County),99003.0
176,Flood,Colusa (County),99011.0
181,Flood,Butte (County),99007.0
182,Flood,Del Norte (County),99015.0
...,...,...,...
51011,Earthquake,Sabana Grande (Municipio),99121.0
51012,Earthquake,Juana Diaz (Municipio),99075.0
51013,Earthquake,Mayaguez (Municipio),99097.0
51014,Earthquake,Hormigueros (Municipio),99067.0


In [9]:
# NOTE(review): this string is handed to soup.find_all('span', ...) as the second
# positional argument, which BeautifulSoup appears to match against the class
# attribute rather than the element id - confirm against the bs4 docs.
category_id = 'BNeawe s3v9rd AP7Wnd' # This is the id google uses for the text we want to scrape

In [10]:
# NOTE(review): like the label selector, this is passed to soup.find_all('span', ...)
# as the second positional argument, so it presumably filters on the class
# attribute rather than the element id - confirm against the bs4 docs.
val_id = 'BNeawe tAd8D AP7Wnd' # This is the id google uses for values we want

In [11]:
# This class scrapes the info you get back from a Google search and parses the HTML
# to give us back the values we need to build a dataset
class Analysis:
    """Look up a disaster on Google and pull out the facts shown for it.

    ``crawl`` fetches the search page for ``term`` and pairs the label spans
    (selected with the module-level ``category_id``) with the value spans
    (selected with ``val_id``).

    Attributes:
        term: search term given to the constructor.
        url: search URL built from ``term`` (query value percent-encoded).
        date, damage, location, category, death: scraped values, or None
            when the matching label was not found.
        specs: dict of every label/value pair found, or None if ``crawl``
            has not run or found at most one label span.
    """

    # Seconds to wait on the HTTP request before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    # (attribute, label) pairs applied in order. 'Direct fatalities' comes
    # after 'Total fatalities' so it wins when both labels are present.
    _LABEL_FIELDS = (
        ('date', 'Date'),
        ('damage', 'Damage'),
        ('location', 'Affected areas'),
        ('category', 'Category'),
        ('death', 'Total fatalities'),
        ('death', 'Direct fatalities'),
    )

    def __init__(self, term):
        """Store the search term and build the search URL for it.

        Args:
            term: text to search for; converted with ``str`` before encoding.
        """
        self.term = term
        self.date = None
        self.death = None
        self.damage = None
        self.location = None
        self.category= None
        self.specs = None
        # Percent-encode the term so characters such as '&' or '#' stay inside
        # the q= value instead of being read as URL syntax.
        self.url = f'https://www.google.com/search?q={quote_plus(str(self.term))}&source=lnms'

    def crawl(self):
        """Fetch the search page and fill in whichever fields it exposes.

        Attributes whose label is missing from the page keep their current
        value. Nothing is parsed unless more than one label span is found.

        Raises:
            requests.exceptions.RequestException: propagated from
                ``requests.get`` (including a timeout after
                ``REQUEST_TIMEOUT`` seconds).
        """
        res = requests.get(self.url, timeout=self.REQUEST_TIMEOUT) # fetching the html from google
        soup = BeautifulSoup(res.text, 'html.parser')
        labels = soup.find_all('span', category_id)
        if len(labels) > 1:
            values = soup.find_all('span', val_id)
            # zip stops at the shorter sequence, so a page with fewer value
            # spans than label spans yields a partial dict, not an IndexError.
            disaster_specs = {label.text: value.text for label, value in zip(labels, values)}
            # Each field is only assigned when its label is present, to cope
            # with inconsistencies between search result pages.
            for attribute, label in self._LABEL_FIELDS:
                if label in disaster_specs:
                    setattr(self, attribute, disaster_specs[label])
            self.specs = disaster_specs
# Start timestamp; among the cells visible here it is only read by a commented-out scraping loop.
t0 = time.time()


    


In [12]:
res = requests.get('https://www.google.com/search?q=Hurricane Sandy&source=lnms')

In [13]:
soup = BeautifulSoup(res.text, 'html.parser')

In [14]:
soup # This was all a test to figure out what my code is doing.

<!DOCTYPE doctype html>
<html lang="en"><head><meta charset="utf-8"/><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/><title>Hurricane Sandy - Google Search</title><script nonce="QEJ2M6J326YlLpYgbW+lZQ==">(function(){
document.documentElement.addEventListener("submit",function(b){var a;if(a=b.target){var c=a.getAttribute("data-submitfalse");a="1"==c||"q"==c&&!a.elements.q.value?!0:!1}else a=!1;a&&(b.preventDefault(),b.stopPropagation())},!0);document.documentElement.addEventListener("click",function(b){var a;a:{for(a=b.target;a&&a!=document.documentElement;a=a.parentElement)if("A"==a.tagName){a="1"==a.getAttribute("data-nohref");break a}a=!1}a&&b.preventDefault()},!0);}).call(this);(function(){
var a=window.performance;window.start=(new Date).getTime();a:{var b=window;if(a){var c=a.timing;if(c){var d=c.navigationStart,f=c.responseStart;if(f>d&&f<=window.start){window.start=f;b.wsrt=f-d;break a}}a.now&&(b.wsrt=Math.floor(a.now()))}}window.go

In [15]:
# Same value as the category_id assigned in an earlier cell; repeated here for the manual test that follows.
category_id = 'BNeawe s3v9rd AP7Wnd'

In [16]:
# Same value as the val_id assigned in an earlier cell; repeated here for the manual test that follows.
val_id = 'BNeawe tAd8D AP7Wnd'

In [17]:
# Text of every <span> selected by category_id (the labels)
a = [span.text for span in soup.find_all('span', category_id)]

In [18]:
# Text of every <span> selected by val_id (the values)
b = [span.text for span in soup.find_all('span', val_id)]

In [19]:
# Pair each label with its value. zip stops at the shorter list, so a page with
# fewer value spans than label spans gives a partial dict instead of an IndexError.
dict(zip(a, b))

{'Total fatalities': '285',
 'Highest wind speed': '115\xa0mph',
 'Lowest pressure': '940\xa0mb',
 'Date': 'October 22, 2012 – November 2, 2012',
 'Category': 'Category 3 Hurricane (SSHWS)',
 'Affected areas': 'Canada, Puerto Rico, Cuba, and more'}

In [20]:
harvey= Analysis('Hurricane Harvey')

In [21]:
harvey.crawl()

In [22]:
harvey.category

'Tropical Depression (NHC/CPHC) and Category 4 Hurricane (SSHWS)'

In [23]:
sandy = Analysis('Hurricane Sandy')

In [24]:
sandy.crawl()

In [25]:
sandy.category

'Category 3 Hurricane (SSHWS)'

In [26]:
# Add the columns the scraper is meant to fill in, each starting as the placeholder 0
for new_column in ('date', 'category', 'area_affected', 'damage'):
    dis[new_column] = 0

In [27]:
titles = np.array(dis.title)

In [28]:
titles.shape

(51016,)

In [29]:
import time


In [27]:
dis.loc[dis.title == 'TORNADO & HEAVY RAINFALL', 'date']

1    0
Name: date, dtype: int64

In [28]:
'I' in 'this'

False

In [29]:
# t0 = time.time()

# for title in titles: 
#     item = Analysis(title)
#     item.crawl()
#     dis.loc[dis.title == title, 'date'] = item.date
#     dis.loc[dis.title == title, 'category'] = item.category
#     dis.loc[dis.title == title, 'area_affected'] = item.location
#     dis.loc[dis.title == title, 'damage'] = item.damage
#     print(time.time()- t0)

In [30]:
#dis.to_csv('./data/disaster_dataset.csv')

In [31]:
hurricanes = list(dis.title) # Extracting the titles of each disaster

In [32]:
# Keep only the titles that mention "hurricane", ignoring case
hurricanes = [title for title in hurricanes if 'hurricane' in title.lower()]

In [33]:
len(hurricanes) # checking the size

10601

In [34]:
hurricanes = set(hurricanes) # removing duplicates by taking the set()

In [35]:
# Back to a list so entries can be addressed by position.
# NOTE(review): the resulting order follows set iteration, not the order in the CSV.
hurricanes = list(hurricanes) 


In [36]:
# my_list = []
# for title in hurricanes:
#     item = Analysis(title)
#     item.crawl()
#     my_dict = {}
#     my_dict['event'] = item.term
#     my_dict['date'] = item.date
#     my_dict['category'] = item.category
#     my_dict['area_affected'] = item.location
#     my_dict['damage'] = item.damage
#     my_dict['fatalities'] = item.death 
#     my_list.append(my_dict)
# result = pd.DataFrame(my_list)
# result.head()

In [37]:
dot = Analysis(hurricanes[0])

In [38]:
dot.crawl()

In [39]:
dot.date

In [40]:
for i in hurricanes[:10]:
    print(i)

HURRICANE
HURRICANE ISABEL
HURRICANE IRMA - SEMINOLE TRIBE OF FLORIDA
HURRICANE BRET
HURRICANE KATE
HURRICANE OMAR
HURRICANE TUSI
SEVERE STORMS AND FLOODING ASSOCIATED WITH HURRICANE IKE
HURRICANE IRMA
HURRICANE ANDREW


In [41]:
# Titles longer than 18 characters are collected for manual renaming
wrong = [name for name in hurricanes if len(name) > 18]

In [42]:
# Position in `hurricanes` of each title flagged in `wrong`
wrong_index = list(map(hurricanes.index, wrong))
wrong_index

[2,
 7,
 12,
 15,
 19,
 21,
 27,
 31,
 38,
 40,
 47,
 57,
 62,
 63,
 64,
 65,
 69,
 73,
 79,
 88,
 89,
 90,
 91,
 93,
 98,
 107]

In [55]:
len(wrong_index)

9

In [43]:
# NOTE(review): rebuilding the list from a new set is not guaranteed to keep the
# previous element order, so a wrong_index computed before this cell may no longer
# point at the same titles - confirm before relying on it.
hurricanes = list(set(hurricanes)) 

In [44]:
# fixing the names of the hurricanes
# NOTE(review): every assignment below targets a position taken from wrong_index,
# i.e. a position in a list that was built from a set of strings. That order is
# not guaranteed to be the same in another kernel session, so these renames may
# land on different titles when the notebook is re-run.
# The saved outputs already disagree with this cell: the printed wrong_index has
# 26 entries while only 22 are assigned here, and the printed list shows a
# 'HURRICANE IRMA - ...' title at position 2 (the first wrong_index value) although
# that slot is renamed 'HURRICANE FRAN'. Confirm each rename against the actual title.

hurricanes[wrong_index[0]] = 'HURRICANE FRAN'
hurricanes[wrong_index[1]] = 'HURRICANE BOB'
hurricanes[wrong_index[2]] = 'HURRICANE GEORGES'
hurricanes[wrong_index[3]] = 'HURRICANE IKE'
hurricanes[wrong_index[4]] = 'HURRICANE GUSTAV'
hurricanes[wrong_index[5]] = 'HURRICANE GEORGES'
hurricanes[wrong_index[6]] = 'HURRICANE FLOYD'
hurricanes[wrong_index[7]] = 'HURRICANE KATRINA'
hurricanes[wrong_index[8]] = 'HURRICANE IRENE'
hurricanes[wrong_index[9]] = 'HURRICANE FRAN'
hurricanes[wrong_index[10]] = 'HURRICANE GEORGES'
hurricanes[wrong_index[11]] = 'HURRICANE DAVID'
hurricanes[wrong_index[12]] = None  # None marks an entry to be removed by the pop cells that follow
hurricanes[wrong_index[13]] = 'HURRICANE SANDY '
hurricanes[wrong_index[14]] = 'HURRICANE JEANNE'
hurricanes[wrong_index[15]] = None  # second entry marked for removal
hurricanes[wrong_index[16]] = 'HURRICANE LENNY'
hurricanes[wrong_index[17]] = 'HURRICANE KATRINA'
hurricanes[wrong_index[18]] = 'HURRICANE FRAN'
hurricanes[wrong_index[19]] = 'HURRICANE BOB'
hurricanes[wrong_index[20]] = 'HURRICANE CHARLEY'
hurricanes[wrong_index[21]] =  'HURRICANE IRMA'

In [48]:
# Remove the first None placeholder left by the manual renaming; index() raises ValueError if there is none.
hurricanes.pop(hurricanes.index(None))

In [48]:
# Remove the next remaining None placeholder (the renaming cell assigns None twice).
hurricanes.pop(hurricanes.index(None))

In [51]:
# Second pass: collect the titles longer than 17 characters for manual review.
# This fails with TypeError if a None entry is still in the list (len(None)).
wrong = [e for e in hurricanes if len(e)>17 ]

In [57]:
wrong

['HURRICANE & FLOODS',
 'HURRICANE FREDERIC',
 'FLOODS & HURRICANE',
 'HURRICANE BOB & FLOODING',
 'HURRICANE GEORGES - 18 SEP 98',
 'HURRICANE & SEVERE STORM',
 'HURRICANE FLORENCE',
 'HURRICANE HORTENSE',
 'HURRICANE GEORGES (DIRECT FEDERAL ASSIST FIRST 72 HRS)']

In [53]:
# Position in `hurricanes` of each title flagged in the second pass
wrong_index = list(map(hurricanes.index, wrong))
wrong_index

[13, 43, 82, 89, 91, 96, 99, 100, 105]

In [58]:
# Second cleanup pass, keyed by the title itself rather than by list position.
# Positions in `hurricanes` come from iterating a set of strings, whose order is
# not guaranteed to repeat between kernel sessions, so index-based edits can hit
# the wrong entries (or raise IndexError when fewer titles are flagged).
# None marks generic titles that name no specific storm; they are removed later.
second_pass_fixes = {
    'HURRICANE & FLOODS': None,
    'HURRICANE FREDERIC': 'HURRICANE FREDERIC',
    'FLOODS & HURRICANE': None,
    'HURRICANE BOB & FLOODING': 'HURRICANE BOB',
    'HURRICANE GEORGES - 18 SEP 98': 'HURRICANE GEORGES',
    'HURRICANE & SEVERE STORM': None,
    'HURRICANE FLORENCE': 'HURRICANE FLORENCE',
    'HURRICANE HORTENSE': 'HURRICANE HORTENSE',
    'HURRICANE GEORGES (DIRECT FEDERAL ASSIST FIRST 72 HRS)': 'HURRICANE GEORGES',
}
# Slice assignment keeps the same list object, as the original in-place edits did.
hurricanes[:] = [second_pass_fixes.get(name, name) for name in hurricanes]


In [63]:
hurricanes = list(set(hurricanes))
hurricanes

['HURRICANE ISABEL',
 'HURRICANE BRET',
 'HURRICANE SANDY ',
 'HURRICANE KATE',
 'HURRICANE OMAR',
 'HURRICANE TUSI',
 'HURRICANE IRMA',
 'HURRICANE ANDREW',
 'HURRICANE HUGO',
 'HURRICANE OPHELIA',
 'HURRICANE ELENA',
 'HURRICANE FLOYD',
 'HURRICANE CARLA',
 'HURRICANE CINDY',
 'HURRICANE VAL',
 'HURRICANE HILDA',
 'HURRICANE KATRINA',
 'HURRICANE BERTHA',
 'HURRICANE DOLLY',
 'HURRICANE IVAN',
 'HURRICANE LANE',
 'HURRICANE RITA',
 'HURRICANE GUSTAV',
 'HURRICANE BELLE',
 'HURRICANE CAMILLE',
 'HURRICANE WILMA',
 'HURRICANE DORIAN',
 'HURRICANE INIKI',
 'HURRICANE DAVID',
 'HURRICANE GILBERT',
 'HURRICANE CHARLEY',
 'HURRICANE IKE',
 'HURRICANE FREDERIC',
 'HURRICANE GEORGES',
 'HURRICANE ISAAC',
 'HURRICANE OFA',
 'HURRICANE GLADYS',
 'HURRICANE CLEO',
 'HURRICANE ALICIA',
 'HURRICANE EDITH',
 'HURRICANE ALLEN',
 'HURRICANE IRENE',
 'HURRICANE NATE',
 'HURRICANE DIANA',
 'HURRICANE HERMINE',
 'HURRICANE DORA',
 'HURRICANE DONNA',
 'HURRICANE BARRY',
 'HURRICANE MARIA',
 'HURRICANE J

In [72]:
dis.disasterNumber

0           1
1           2
2           3
3           6
4           4
         ... 
51011    4473
51012    4473
51013    4473
51014    4473
51015    4473
Name: disasterNumber, Length: 51016, dtype: int64

In [60]:
# Remove the first None placeholder still in the list; index() raises ValueError if there is none.
hurricanes.pop(hurricanes.index(None ))

In [61]:
# Drop the generic title 'HURRICANES' (no storm name); pop returns the removed value, which the cell displays.
hurricanes.pop(hurricanes.index('HURRICANES'))

'HURRICANES'

In [62]:
# Drop the generic title 'HURRICANE' (no storm name); pop returns the removed value, which the cell displays.
hurricanes.pop(hurricanes.index('HURRICANE'))

'HURRICANE'

In [64]:
# Drop the generic title 'HURRICANE & FLOOD' (no storm name); pop returns the removed value, which the cell displays.
hurricanes.pop(hurricanes.index('HURRICANE & FLOOD'))

'HURRICANE & FLOOD'

In [65]:
# Scrape every cleaned hurricane title with the Analysis class and tabulate the results
my_list = []
for title in hurricanes:
    item = Analysis(title)
    item.crawl()
    # One record per event; the key order here fixes the column order of `result`
    my_dict = {
        'event': item.term,
        'date': item.date,
        'category': item.category,
        'area_affected': item.location,
        'damage': item.damage,
        'fatalities': item.death,
    }
    my_list.append(my_dict)
result = pd.DataFrame(my_list)
result.head()

Unnamed: 0,event,date,category,area_affected,damage,fatalities
0,HURRICANE ISABEL,"September 6, 2003 – September 20, 2003",Category 5 Hurricane (SSHWS),"Puerto Rico, Washington, D.C., The Bahamas, an...",$5.5 billion (2003 USD),51
1,HURRICANE BRET,"August 18, 1999 – August 25, 1999",Category 4 Hurricane (SSHWS),Mexico and Texas,$15 million (1999 USD),7
2,HURRICANE SANDY,"October 22, 2012 – November 2, 2012",Category 3 Hurricane (SSHWS),"Puerto Rico, Jamaica, Haiti, and more",,285
3,HURRICANE KATE,"November 15, 1985 – November 23, 1985",Category 3 Hurricane (SSHWS),"Cuba, Florida, The Bahamas, and more",$700 million (1985 USD),15
4,HURRICANE OMAR,"October 13, 2008 – October 21, 2008",Category 4 Hurricane (SSHWS),"Puerto Rico, Venezuela, Montserrat, and more",$80 million (2008 USD),1


In [66]:
result.shape

(78, 6)

In [None]:
set(dis.loc[dis['title'] == hurricanes[2],'disasterNumber'])

In [None]:
hurricanes[2]

In [67]:
# Save the scraped hurricane table; index=False leaves the DataFrame's row index out of the file.
result.to_csv('./data/updated_hurricane_clean.csv', index=False) 