In [1]:
# import necessary libraries/packages
import time
import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# used to obtain the latest version of Chrome without needing to manually download the driver and add it to the path
browser = Browser("chrome", service=ChromeService(ChromeDriverManager().install()))

In [3]:
# dictionary used in webpage url generation
states = [
    {'abbrev': 'MN', 'name': 'Minnesota'},
    {'abbrev': 'NY', 'name': 'New York'},
    {'abbrev': 'IL', 'name': 'Illinois'},
    {'abbrev': 'IN', 'name': 'Indiana'},
    {'abbrev': 'MI', 'name': 'Michgan'},
    {'abbrev': 'OH', 'name': 'Ohio'},
    {'abbrev': 'PA', 'name': 'Pennslyvania'},
    {'abbrev': 'WI', 'name': 'Wisconsin'}
]

In [4]:
# stem of page urls 
base_url = 'https://ecos.fws.gov/ecp/report/species-listings-by-state'

# list used to store scrape results (holds each row_record as shown below)
data = []

# iterate through all pages
for state in states:
    url = f"{base_url}?stateAbbrev={state['abbrev']}&stateName={state['name']}&statusCategory=Listed"

    # visit page in automated browser
    browser.visit(url)
    print("Page visited: ", url)

    # modify page so that All results show on the page without pageination
    browser.find_by_xpath(
        "//select[@name='species-listings-by-state-report_length']/option[text()='All']"
    ).click()

    # grab page html and make into soup object
    html_content = browser.html
    soup = BeautifulSoup(html_content, "html.parser")

   # base url for each animal link
    base_animal_url = "https://ecos.fws.gov"
    for table_row in soup.find_all("tr"):
        # dictionary that will hold info obtained from one table row
        row_record = {}

        # find link in each row and add to dictionary as key 'url'
        row_link = table_row.find("a", href=True)
        # filter out some of the other results not related to our goal
        if (row_link == None) or (row_link["href"][0:4] == "http"):
            # skip unwanted selection and move to next iteration in loop
            continue
        else:
            row_record["url"] = base_animal_url + row_link["href"]
        
        # find animal name in each row and add to dictionary as key 'name'
        name = table_row.find("td", class_="sorting_2").text
        row_record["name"] = name

        # find state name for each page that is scraped and add to dictionary as key 'state'
        row_record["state"] = state["name"]
        data.append(row_record)

Page visited:  https://ecos.fws.gov/ecp/report/species-listings-by-state?stateAbbrev=MN&stateName=Minnesota&statusCategory=Listed
Page visited:  https://ecos.fws.gov/ecp/report/species-listings-by-state?stateAbbrev=NY&stateName=New York&statusCategory=Listed
Page visited:  https://ecos.fws.gov/ecp/report/species-listings-by-state?stateAbbrev=IL&stateName=Illinois&statusCategory=Listed
Page visited:  https://ecos.fws.gov/ecp/report/species-listings-by-state?stateAbbrev=IN&stateName=Indiana&statusCategory=Listed
Page visited:  https://ecos.fws.gov/ecp/report/species-listings-by-state?stateAbbrev=MI&stateName=Michgan&statusCategory=Listed
Page visited:  https://ecos.fws.gov/ecp/report/species-listings-by-state?stateAbbrev=OH&stateName=Ohio&statusCategory=Listed
Page visited:  https://ecos.fws.gov/ecp/report/species-listings-by-state?stateAbbrev=PA&stateName=Pennslyvania&statusCategory=Listed
Page visited:  https://ecos.fws.gov/ecp/report/species-listings-by-state?stateAbbrev=WI&stateName=

In [5]:
# view results in pandas for easy viewing
df = pd.DataFrame(data)
df.head(50)

Unnamed: 0,url,name,state
0,https://ecos.fws.gov/ecp/species/6039,Piping Plover,Minnesota
1,https://ecos.fws.gov/ecp/species/1864,rufa red knot,Minnesota
2,https://ecos.fws.gov/ecp/species/758,Whooping crane,Minnesota
3,https://ecos.fws.gov/ecp/species/5428,Higgins eye (pearlymussel),Minnesota
4,https://ecos.fws.gov/ecp/species/6903,Sheepnose Mussel,Minnesota
5,https://ecos.fws.gov/ecp/species/4135,Snuffbox mussel,Minnesota
6,https://ecos.fws.gov/ecp/species/7867,Spectaclecase (mussel),Minnesota
7,https://ecos.fws.gov/ecp/species/4127,Winged Mapleleaf,Minnesota
8,https://ecos.fws.gov/ecp/species/4122,Topeka shiner,Minnesota
9,https://ecos.fws.gov/ecp/species/601,Eastern prairie fringed orchid,Minnesota


In [6]:
# check if you have all the results - inspect scraped page and dataframe to make sure everything was captured

# find total records scraped
print('total records: ', len(df))

# groupby each state and find total entries
df.groupby('state')['url'].count()

# view one state records and compare webpage 
df[df['state'] == 'NewYork']



total records:  211


Unnamed: 0,url,name,state


In [7]:
# using list of dictionaries above that has a dictionary for each animal (row)
for animal in data:
    browser.visit(animal["url"])
    # slow down scrape to server does see rapid hits coming from one IP
    time.sleep(1)

    # extract browser html
    html_content = browser.html

    # create soup object
    soup = BeautifulSoup(html_content, "html.parser")

    # attempt to search for images on each animal page but if none just record it as none
    try:
        image = soup.find("img", class_="imageSize")["src"]
    except:
        image = None

    # attempt to search for endangered animals on each animal page but if none just record it as none
    try:
        end_status = soup.find("span", class_="listingEnd").text
    except:
        end_status = None

    # attempt to search for threatened animals on each animal page but if none just record it as none
    try:
        threat_status = soup.find("span", class_="listingThreat").text
    except:
        threat_status = None

    # attempt to search for paragraphs of info on each animal page but if none just record it as none
    try:
        general_info = soup.find("div", {"id": "j-general-info"}).text
    except:
        general_info = None

    # store all info for this particular page into the dictionary originally accessed (where we got the link)
    animal["image_url"] = image
    animal["endangered"] = end_status
    animal["threatened"] = threat_status
    animal["description"] = general_info

# does the endangered/threatened content need collect from this page?

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=120.0.6099.199)
Stacktrace:
	GetHandleVerifier [0x00E06EE3+174339]
	(No symbol) [0x00D30A51]
	(No symbol) [0x00A46FF6]
	(No symbol) [0x00A2EFE7]
	(No symbol) [0x00A9B53B]
	(No symbol) [0x00AA9E7B]
	(No symbol) [0x00A96DA6]
	(No symbol) [0x00A71034]
	(No symbol) [0x00A71F8D]
	GetHandleVerifier [0x00EA4B1C+820540]
	sqlite3_dbdata_init [0x00F653EE+653550]
	sqlite3_dbdata_init [0x00F64E09+652041]
	sqlite3_dbdata_init [0x00F597CC+605388]
	sqlite3_dbdata_init [0x00F65D9B+656027]
	(No symbol) [0x00D3FE6C]
	(No symbol) [0x00D383B8]
	(No symbol) [0x00D384DD]
	(No symbol) [0x00D25818]
	BaseThreadInitThunk [0x7634FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77A57C6E+286]
	RtlGetAppContainerNamedObjectPath [0x77A57C3E+238]


In [None]:
# view complete dataframe
df = pd.DataFrame(data)
df.head(20)

Unnamed: 0,url,name,state,image_url,endangered,threatened,description
0,https://ecos.fws.gov/ecp/species/6039,Piping Plover,Minnesota,https://ecos.fws.gov/docs/species_images/doc37...,Endangered,Threatened,General InformationSize: 18 cm (7.25 in) in le...
1,https://ecos.fws.gov/ecp/species/1864,rufa red knot,Minnesota,https://ecos.fws.gov/docs/species_images/doc36...,,Threatened,General InformationLength: 25-28 cm. Adults in...
2,https://ecos.fws.gov/ecp/species/758,Whooping crane,Minnesota,https://ecos.fws.gov/docs/species_images/doc37...,Endangered,,General InformationThe whooping crane occurs o...
3,https://ecos.fws.gov/ecp/species/5428,Higgins eye (pearlymussel),Minnesota,https://ecos.fws.gov/docs/species_images/doc49...,Endangered,,General InformationThe Higgins eye is a freshw...
4,https://ecos.fws.gov/ecp/species/6903,Sheepnose Mussel,Minnesota,https://ecos.fws.gov/docs/species_images/doc49...,Endangered,,"General InformationShell surface: Many low, wi..."
5,https://ecos.fws.gov/ecp/species/4135,Snuffbox mussel,Minnesota,https://ecos.fws.gov/docs/species_images/doc70...,Endangered,,General InformationThe snuffbox is a small- to...
6,https://ecos.fws.gov/ecp/species/7867,Spectaclecase (mussel),Minnesota,,Endangered,,General InformationThe species historical rang...
7,https://ecos.fws.gov/ecp/species/4127,Winged Mapleleaf,Minnesota,https://ecos.fws.gov/docs/species_images/doc51...,Endangered,,General InformationThe species historical rang...
8,https://ecos.fws.gov/ecp/species/4122,Topeka shiner,Minnesota,https://ecos.fws.gov/docs/species_images/doc51...,Endangered,,General InformationThe species historical rang...
9,https://ecos.fws.gov/ecp/species/601,Eastern prairie fringed orchid,Minnesota,https://ecos.fws.gov/docs/species_images/doc50...,,Threatened,General InformationThis plant is 8 to 40 inche...


In [None]:
# inspect df
df.describe()

Unnamed: 0,url,name,state,image_url,endangered,threatened,description
count,211,211,211,160,130,94,211
unique,75,75,8,48,1,1,65
top,https://ecos.fws.gov/ecp/species/6039,Piping Plover,Illinois,https://ecos.fws.gov/docs/species_images/doc37...,Endangered,Threatened,General Information
freq,9,9,33,9,130,94,20


In [None]:
# view number of nan scraped values
df.isna().sum()

url              0
name             0
state            0
image_url       51
endangered      81
threatened     117
description      0
dtype: int64

In [None]:
# show missing image url and check to see if the image is missing or if it is a code issue
df[df['image_url'].isna()]

Unnamed: 0,url,name,state,image_url,endangered,threatened,description
6,https://ecos.fws.gov/ecp/species/7867,Spectaclecase (mussel),Minnesota,,Endangered,,General InformationThe species historical rang...
10,https://ecos.fws.gov/ecp/species/285,Leedy's roseroot,Minnesota,,,Threatened,General InformationLeedy's roseroot is a cliff...
27,https://ecos.fws.gov/ecp/species/9880,Longsolid,New York,,,Threatened,General Information
31,https://ecos.fws.gov/ecp/species/4232,American hart's-tongue fern,New York,,,Threatened,General InformationThe species historical rang...
32,https://ecos.fws.gov/ecp/species/5219,Houghton's goldenrod,New York,,,Threatened,General Information
33,https://ecos.fws.gov/ecp/species/285,Leedy's roseroot,New York,,,Threatened,General InformationLeedy's roseroot is a cliff...
36,https://ecos.fws.gov/ecp/species/8128,Sandplain gerardia,New York,,Endangered,,General InformationThe species historical rang...
38,https://ecos.fws.gov/ecp/species/1890,Small whorled pogonia,New York,,,Threatened,General InformationThe species historical rang...
39,https://ecos.fws.gov/ecp/species/8023,bog buck moth,New York,,Endangered,,General Information
55,https://ecos.fws.gov/ecp/species/9880,Longsolid,Illinois,,,Threatened,General Information


In [None]:
# check of the images not missing to see why there are so few unique image urls
no_na_df = df[~df["image_url"].isna()]
no_na_df[no_na_df["image_url"].duplicated(keep=False)].sort_values(by="image_url").head(50)

Unnamed: 0,url,name,state,image_url,endangered,threatened,description
12,https://ecos.fws.gov/ecp/species/4458,Prairie bush-clover,Minnesota,https://ecos.fws.gov/docs/species_images/doc10...,,Threatened,General InformationAlso known as slender-leave...
72,https://ecos.fws.gov/ecp/species/4458,Prairie bush-clover,Illinois,https://ecos.fws.gov/docs/species_images/doc10...,,Threatened,General InformationAlso known as slender-leave...
200,https://ecos.fws.gov/ecp/species/4458,Prairie bush-clover,Wisconsin,https://ecos.fws.gov/docs/species_images/doc10...,,Threatened,General InformationAlso known as slender-leave...
107,https://ecos.fws.gov/ecp/species/6329,Gray bat,Indiana,https://ecos.fws.gov/docs/species_images/doc14...,Endangered,,"General InformationLong, glossy fur, light bro..."
76,https://ecos.fws.gov/ecp/species/6329,Gray bat,Illinois,https://ecos.fws.gov/docs/species_images/doc14...,Endangered,,"General InformationLong, glossy fur, light bro..."
41,https://ecos.fws.gov/ecp/species/5949,Indiana bat,New York,https://ecos.fws.gov/docs/species_images/doc36...,Endangered,,General InformationThe Indiana bat is a medium...
134,https://ecos.fws.gov/ecp/species/5949,Indiana bat,Michgan,https://ecos.fws.gov/docs/species_images/doc36...,Endangered,,General InformationThe Indiana bat is a medium...
182,https://ecos.fws.gov/ecp/species/5949,Indiana bat,Pennslyvania,https://ecos.fws.gov/docs/species_images/doc36...,Endangered,,General InformationThe Indiana bat is a medium...
77,https://ecos.fws.gov/ecp/species/5949,Indiana bat,Illinois,https://ecos.fws.gov/docs/species_images/doc36...,Endangered,,General InformationThe Indiana bat is a medium...
108,https://ecos.fws.gov/ecp/species/5949,Indiana bat,Indiana,https://ecos.fws.gov/docs/species_images/doc36...,Endangered,,General InformationThe Indiana bat is a medium...


In [None]:
# move full dataset to csv file
df.to_csv('great_lakes_data.csv')

In [None]:
# view a deduped df
df.drop_duplicates(subset=['url'], inplace=True, keep='first')

In [None]:
df

Unnamed: 0,url,name,state,image_url,endangered,threatened,description
0,https://ecos.fws.gov/ecp/species/6039,Piping Plover,Minnesota,https://ecos.fws.gov/docs/species_images/doc37...,Endangered,Threatened,General InformationSize: 18 cm (7.25 in) in le...
1,https://ecos.fws.gov/ecp/species/1864,rufa red knot,Minnesota,https://ecos.fws.gov/docs/species_images/doc36...,,Threatened,General InformationLength: 25-28 cm. Adults in...
2,https://ecos.fws.gov/ecp/species/758,Whooping crane,Minnesota,https://ecos.fws.gov/docs/species_images/doc37...,Endangered,,General InformationThe whooping crane occurs o...
3,https://ecos.fws.gov/ecp/species/5428,Higgins eye (pearlymussel),Minnesota,https://ecos.fws.gov/docs/species_images/doc49...,Endangered,,General InformationThe Higgins eye is a freshw...
4,https://ecos.fws.gov/ecp/species/6903,Sheepnose Mussel,Minnesota,https://ecos.fws.gov/docs/species_images/doc49...,Endangered,,"General InformationShell surface: Many low, wi..."
...,...,...,...,...,...,...,...
157,https://ecos.fws.gov/ecp/species/1728,Virginia spiraea,Ohio,,,Threatened,General InformationThe Virginia spiraea is\nfo...
158,https://ecos.fws.gov/ecp/species/66,American burying beetle,Ohio,https://ecos.fws.gov/docs/species_images/doc51...,,Threatened,General InformationThe species historical rang...
177,https://ecos.fws.gov/ecp/species/1286,American chaffseed,Pennslyvania,,Endangered,,General Informationhttps://www.fws.gov/southea...
179,https://ecos.fws.gov/ecp/species/3739,Harperella,Pennslyvania,,Endangered,,General InformationThe species historical rang...
