In [1]:
### Assigned by ###
# Gal Israeli #####
# Yulia Kuderko ###
# Ram Michaeli ####
###################

import os
import re
import sys
import time
import warnings
from datetime import datetime as dt

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# ignore warnings
warnings.filterwarnings('ignore')

# 1st Crawling

In [2]:
# Entry point of the eruption search on volcano.si.edu
url = "https://volcano.si.edu/search_eruption.cfm"

# The result table only appears after the search form is submitted, which needs a
# real browser session (selenium); BeautifulSoup alone cannot press the button.
# The chromedriver binary is looked up next to the running Python executable.
# NOTE(review): the positional driver path and find_element_by_id are Selenium 3-style
# calls - confirm the installed Selenium version still provides them.
driver = webdriver.Chrome("{}/chromedriver".format(os.path.dirname(sys.executable)))
driver.get(url)
driver.find_element_by_id('submit_account').click()

# One accumulator list per output column; the crawl loop appends exactly one value
# to every list for every table row, so all lists stay the same length.

# columns read directly from the search-results table
volcanoName, eruptionCertainty, startDate, maxVEI = [], [], [], []

# columns read from the matching row of the volcano's eruption-history table
activityArea, evidenceMethod, stopDate = [], [], []

# columns read from the volcano's sub-info list
latitude, longitude, volcanoHeight = [], [], []

# columns read from the volcano's info list
country, primaryVolcanoType = [], []

# columns read from the population table
population5km, population10km, population30km, population100km = [], [], [], []

# maps "<volcano name> <volcano id>" to the list of start dates found in that
# volcano's eruption-history table
dateDict = {}

# accumulates every crawled page; written to a single csv when the crawl finishes
finalDF = pd.DataFrame()

# every crawled page is additionally saved as its own csv inside this folder
folderName = 'Volcanoes By Page'

# make sure the per-page output folder is there before the crawl starts
if not os.path.exists(folderName):
    os.makedirs(folderName)

print("Crawling Started at: {}".format(dt.now()))

# XPath of the pagination "next" link on the search-results page
nextButtonXPath = "//*[@id='content']/div/div[3]/ul/li[17]/a"


def _fetch_until_ok(link, retry_delay=5, timeout=30):
    """Request `link` repeatedly until the server answers with HTTP 200.

    Parameters:
        link: URL to download.
        retry_delay: seconds to wait between two attempts.
        timeout: seconds after which a single stalled request is abandoned.

    Returns:
        The `requests.Response` whose status code is 200.

    Connection errors and timeouts are reported and retried just like non-200
    answers, so one transient network failure does not abort the whole crawl.
    """
    while True:
        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException as error:
            print("Request failed (" + str(error) + "), retrying: " + link)
        else:
            if response.status_code == 200:
                return response
        time.sleep(retry_delay)


def _signed_coordinate(text, negative_hemisphere):
    """Return the numeric part of a coordinate string, negated for one hemisphere.

    Parameters:
        text: raw coordinate text; only its digits, dots and letters are used.
        negative_hemisphere: hemisphere letter ('S' or 'W') that makes the value negative.

    Returns:
        The digits and dots of `text` as a string, prefixed with '-' when the
        letters found in `text` equal `negative_hemisphere`.
    """
    hemisphere = re.sub(r"[^a-zA-Z]", "", text)
    magnitude = re.sub(r"[^0-9.]", "", text)
    if hemisphere == negative_hemisphere:
        return '-' + magnitude
    return magnitude


def _population_field(population_row, position):
    """Return one population value from the population table row, or "" if unavailable.

    Parameters:
        population_row: the last <tr> of the population table, or None when the
            volcano has no usable detail data.
        position: index of the wanted text node inside the row's 4th child.

    The row layout is inconsistent between volcanoes, so a missing row, a missing
    child or a non-text child all yield an empty string instead of an error.
    """
    try:
        return population_row.contents[3].contents[position].strip()
    except (AttributeError, IndexError, TypeError):
        return ""


# a loop that goes through the website's pages
for page in range(1, 16):
    startPageTime = dt.now()
    print("Now in page: " + str(page))
    try:
        # every page except the first is reached by pressing the "next" button
        if page != 1:
            driver.find_element_by_xpath(nextButtonXPath).click()
    except Exception:
        # the browser session is unusable: close it (best effort, it may already be
        # dead) so the old Chrome process is not leaked, then start a fresh one
        try:
            driver.quit()
        except Exception:
            pass
        time.sleep(2)
        driver = webdriver.Chrome(os.path.dirname(sys.executable) + "/chromedriver")
        driver.get(url)
        driver.find_element_by_id('submit_account').click()
        # walk forward again to the page that was being opened when the failure happened
        for _ in range(1, page):
            driver.find_element_by_xpath(nextButtonXPath).click()

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    myTable = soup.find(class_='TableSearchResults')
    if myTable is None:
        raise RuntimeError("Search results table not found on page " + str(page))

    # running row number used in the progress messages
    # NOTE(review): assumes every results page holds 750 rows - confirm against the site
    counter = (page - 1) * 750

    # going through the table's rows (while passing over the header row)
    for row in myTable("tr")[1:]:
        # start a new stopwatch at the beginning of every batch of 25 rows
        if counter % 25 == 0:
            startCrawlTime = dt.now()
        counter += 1
        cell = row("td")
        volcanoLabel = cell[0].text
        eruptionStart = cell[2].text
        if counter % 25 == 0:
            print("Now At: " + volcanoLabel + " number " + str(counter) + " and in page: " + str(page))

        # values taken directly from the search-results row
        volcanoName.append(volcanoLabel)
        eruptionCertainty.append(cell[1].text)
        startDate.append(eruptionStart)
        maxVEI.append(cell[3].text)

        # the volcano's detail page, opened on its "eruptive history" tab
        mainLink = "http://volcano.si.edu" + cell[0]("a")[0]["href"]
        link_eruptive_history = mainLink + "&vtab=Eruptions"
        newPage = _fetch_until_ok(link_eruptive_history)

        innerSoup = BeautifulSoup(newPage.content, 'html.parser')
        innerTable = innerSoup.find("table", {"class": "DivTable", "title": "Eruption history table for this volcano"})

        if volcanoLabel != 'Unknown Source':
            volcanoID = innerSoup.find(class_='volcano-subinfo-table')("ul")[0]("li")[5].text
        else:
            volcanoID = ''

        # the start dates of a volcano's history table are collected only once and
        # then reused for every later eruption row of the same volcano
        volcanoKey = volcanoLabel + " " + volcanoID
        if volcanoKey not in dateDict:
            if innerTable is not None:
                dateDict[volcanoKey] = [historyRow("td")[0].text for historyRow in innerTable("tr")[1:]]
            else:
                dateDict[volcanoKey] = []

        # find the history-table row whose start date contains the date shown in the
        # results table; None means the eruption is not listed in the history table
        matchedRow = None
        for position, listedStart in enumerate(dateDict[volcanoKey]):
            if eruptionStart in listedStart:
                # +1 skips the header row of the history table
                matchedRow = position + 1
                break

        # reset for every row so population values never leak from the previous volcano
        populationRow = None

        if innerTable is not None and matchedRow is not None:
            historyCells = innerTable("tr")[matchedRow]("td")
            activityArea.append(historyCells[5].text)
            evidenceMethod.append(historyCells[4].text)
            stopDate.append(historyCells[1].text)

            subInfoItems = innerSoup.find(class_='volcano-subinfo-table')("ul")[0]("li")
            # south and west coordinates are stored as negative numbers
            latitude.append(_signed_coordinate(subInfoItems[0].text, 'S'))
            longitude.append(_signed_coordinate(subInfoItems[1].text, 'W'))
            volcanoHeight.append(subInfoItems[3].contents[0].text)

            infoItems = innerSoup.find(class_='volcano-info-table')("ul")[0]("li")
            country.append(infoItems[0].text)
            primaryVolcanoType.append(infoItems[2].text)

            # the population figures are read from the last row of the population table
            populationTable = innerSoup.find("td", {"class": "DivTable35"})
            populationRows = populationTable("tr") if populationTable is not None else []
            if populationRows:
                populationRow = populationRows[-1]
        else:
            # keep every column the same length when no detail data is available
            activityArea.append("")
            evidenceMethod.append("")
            stopDate.append("")
            latitude.append("")
            longitude.append("")
            volcanoHeight.append("")
            country.append("")
            primaryVolcanoType.append("")

        population5km.append(_population_field(populationRow, 0))
        population10km.append(_population_field(populationRow, 2))
        population30km.append(_population_field(populationRow, 4))
        population100km.append(_population_field(populationRow, 6))

        if counter % 25 == 0:
            endCrawlTime = dt.now()
            difference = (endCrawlTime - startCrawlTime)
            total_seconds = difference.total_seconds()
            print("Finished after " + str(round(total_seconds, 2)) + " seconds\n")

    # creating a dictionary from all the lists
    data = {
        'volcanoName': volcanoName,
        'eruptionCertainty': eruptionCertainty,
        'startDate': startDate,
        'maxVEI': maxVEI,
        'activityArea': activityArea,
        'evidenceMethod': evidenceMethod,
        'stopDate': stopDate,
        'latitude': latitude,
        'longitude': longitude,
        'volcanoHeight': volcanoHeight,
        'country': country,
        'primaryVolcanoType': primaryVolcanoType,
        'population5km': population5km,
        'population10km': population10km,
        'population30km': population30km,
        'population100km': population100km
    }
    # create a data frame holding only the current page
    df = pd.DataFrame.from_dict(data)
    # add the current page to the final data frame
    # (pd.concat replaces DataFrame.append, which newer pandas versions no longer provide)
    finalDF = pd.concat([finalDF, df])

    # export a csv file to each of the pages (15 overall)
    df.to_csv(folderName + '/' + "Volcano(" + str(page) + ").csv", index=False, encoding='utf-8')

    # empty the shared lists so the next page starts from scratch
    for value in data.values():
        value.clear()

    # cleaning the dictionary
    data.clear()

    endPageTime = dt.now()
    difference_minutes = (endPageTime - startPageTime)
    total_minutes = (difference_minutes.total_seconds()) / 60
    print("Finished page " + str(page) + " after " + str(round(total_minutes, 2)) + " minutes\n")

# export a csv file to all the pages
finalDF.to_csv("Volcano_Final.csv", index=False, encoding='utf-8')

# the browser is no longer needed once every page has been crawled
driver.quit()

print("Crawling Ended at: " + str(dt.now()))

Crawling Started at: 2022-01-27 15:41:40.847632
Now in page: 1
Now At: Tengger Caldera number 25 and in page: 1
Finished after 130.05 seconds

Now At: Kavachi number 50 and in page: 1
Finished after 143.34 seconds

Now At: Colima number 75 and in page: 1
Finished after 130.57 seconds

Now At: Nishinoshima number 100 and in page: 1
Finished after 141.05 seconds

Now At: Dempo number 125 and in page: 1
Finished after 140.25 seconds

Now At: Ebulobo number 150 and in page: 1
Finished after 145.7 seconds

Now At: Bulusan number 175 and in page: 1
Finished after 200.11 seconds

Now At: Alaid number 200 and in page: 1
Finished after 130.7 seconds

Now At: San Cristobal number 225 and in page: 1
Finished after 119.91 seconds

Now At: Mayon number 250 and in page: 1
Finished after 139.48 seconds

Now At: Asosan number 275 and in page: 1
Finished after 162.8 seconds

Now At: Kikai number 300 and in page: 1
Finished after 118.25 seconds

Now At: Pagan number 325 and in page: 1
Finished after 140

# 2nd Crawling

In [3]:
# the main URL of the emission search
url = "https://volcano.si.edu/search_emission.cfm"

# one list per output column; every table row appends exactly one value to each
volcanoName = list()
startDate = list()
endDate = list()
SO2_Mass_kt = list()
SO2_Altitude = list()

# using selenium to press the submit button on the home page (can't be done using beautifulsoup)
driver = webdriver.Chrome(os.path.dirname(sys.executable) + "/chromedriver")
try:
    driver.get(url)
    driver.find_element_by_id('submit_account').click()

    print("Crawling Started at: " + str(dt.now()))

    myPage = driver.page_source
finally:
    # the browser is only needed to obtain the page source; always close it so no
    # Chrome process is left behind, even when loading the page fails
    driver.quit()

soup = BeautifulSoup(myPage, 'html.parser')
myTable = soup.find(class_='TableSearchResults')
if myTable is None:
    raise RuntimeError("Search results table not found at " + url)

# going through the table's rows (while passing over the header row)
counter = 0
for row in myTable("tr")[1:]:
    counter += 1
    cell = row("td")
    if counter % 10 == 0:
        print("Now At: " + cell[0].text + " number " + str(counter))
    # adding volcano name, start date, end date, SO2 Mass kt and SO2 Altitude to separate lists
    # (the cell at index 1 is not exported)
    volcanoName.append(cell[0].text)
    startDate.append(cell[2].text)
    endDate.append(cell[3].text)
    SO2_Mass_kt.append(cell[4].text)
    SO2_Altitude.append(cell[5].text)

# creating a dictionary from all the lists
data = {
    'volcanoName': volcanoName,
    'startDate': startDate,
    'endDate': endDate,
    'SO2_Mass_kt': SO2_Mass_kt,
    'SO2_Altitude': SO2_Altitude
}
# create a data frame from the dictionary
df = pd.DataFrame.from_dict(data)

df.to_csv("Volcano_Emission.csv", index=False, encoding='utf-8')

print("Crawling Ended at: " + str(dt.now()))

Crawling Started at: 2022-01-30 03:09:08.466278
Now At: Fournaise, Piton de la number 10
Now At: Soputan number 20
Now At: Pavlof number 30
Now At: Rabaul number 40
Now At: Veniaminof number 50
Now At: Etna number 60
Now At: Planchon-Peteroa number 70
Now At: Etna number 80
Now At: Fournaise, Piton de la number 90
Now At: Bezymianny number 100
Now At: Soufriere Hills number 110
Now At: Etna number 120
Now At: Soufriere Hills number 130
Now At: Fourpeaked number 140
Now At: Tungurahua number 150
Now At: Negra, Sierra number 160
Now At: Soputan number 170
Now At: Fournaise, Piton de la number 180
Now At: Galeras number 190
Now At: Fournaise, Piton de la number 200
Now At: Cleveland number 210
Now At: Miyakejima number 220
Now At: Soufriere Hills number 230
Now At: Rinjani number 240
Now At: Pinatubo number 250
Now At: Pacaya number 260
Now At: Etna number 270
Now At: Kilauea number 280
Now At: Ruiz, Nevado del number 290
Now At: Kilauea number 300
Now At: Mauna Loa number 310
Now At: Kil