In [None]:
# import necessary libraries/packages
import time
import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# ChromeDriverManager().install() fetches the Chrome driver for us, so it does not
# have to be downloaded manually and added to the path
driver_path = ChromeDriverManager().install()
chrome_service = ChromeService(driver_path)
browser = Browser("chrome", service=chrome_service)

In [None]:
# list of state records (one dictionary per state) used in webpage url generation.
# 'abbrev' and 'name' are both placed in the page url, and 'name' is also stored in
# the 'state' field of every scraped record, so the spelling here ends up in the output.
states = [
    {'abbrev': 'MN', 'name': 'Minnesota'},
    # kept without a space: the later inspection cell filters on the exact value 'NewYork'
    {'abbrev': 'NY', 'name': 'NewYork'},
    {'abbrev': 'IL', 'name': 'Illinois'},
    {'abbrev': 'IN', 'name': 'Indiana'},
    {'abbrev': 'MI', 'name': 'Michigan'},
    {'abbrev': 'OH', 'name': 'Ohio'},
    {'abbrev': 'PA', 'name': 'Pennsylvania'},
    {'abbrev': 'WI', 'name': 'Wisconsin'}
]

In [None]:
# stem of page urls
base_url = "https://ecos.fws.gov/ecp/report/species-listings-by-state"

# site root that is prepended to the link found in each table row
# (does not change between pages, so it is defined once outside the loop)
base_animal_url = "https://ecos.fws.gov"

# list used to store scrape results (one dictionary per kept table row);
# re-created here so re-running this cell does not append to earlier results
data = []

# iterate through all pages
for state in states:
    url = f"{base_url}?stateAbbrev={state['abbrev']}&stateName={state['name']}&statusCategory=Listed"

    # visit page in automated browser
    browser.visit(url)
    print("Page visited: ", url)

    # modify page so that All results show on the page without pagination
    browser.find_by_xpath(
        "//select[@name='species-listings-by-state-report_length']/option[text()='All']"
    ).click()

    # grab page html and make into soup object
    html_content = browser.html
    soup = BeautifulSoup(html_content, "html.parser")

    for table_row in soup.find_all("tr"):
        # find the link in each row; rows with no link, or whose link is already
        # absolute (starts with "http"), are not related to our goal and are skipped
        row_link = table_row.find("a", href=True)
        if row_link is None or row_link["href"].startswith("http"):
            continue

        # find the animal name cell; if the row has no such cell the name is stored
        # as None (instead of raising AttributeError) so the gap shows up in the
        # later NaN checks rather than aborting the whole scrape
        name_cell = table_row.find("td", class_="sorting_2")

        # one record per row: animal page url, animal name, and the state whose page was scraped
        data.append(
            {
                "url": base_animal_url + row_link["href"],
                "name": name_cell.text if name_cell is not None else None,
                "state": state["name"],
            }
        )

In [None]:
# load the scraped records into a pandas dataframe and preview the first 50 rows
df = pd.DataFrame(data)
df.head(n=50)

In [None]:
# check if you have all the results - inspect scraped page and dataframe to make sure everything was captured

# find total records scraped
print('total records: ', len(df))

# groupby each state and find total entries
# (printed explicitly: only the final expression of a notebook cell is displayed,
# so a bare expression in the middle of the cell would be computed and discarded)
print(df.groupby('state')['url'].count())

# view one state's records and compare with the webpage
# (this is the cell's final expression, so it is displayed as the cell output)
df[df['state'] == 'NewYork']

# There are duplicate records in New York for Piping Plover - Why?
# What else might you want to capture and put into this dataframe?

In [None]:
def extract_animal_details(soup):
    """Pull the detail fields out of one parsed animal page.

    Args:
        soup: BeautifulSoup object built from the animal page html.

    Returns:
        dict with keys 'image_url', 'endangered', 'threatened' and 'description'.
        Any element that is not found on the page is recorded as None.
    """

    def text_or_none(tag):
        # soup.find returns None when nothing matches; record that as None
        return tag.text if tag is not None else None

    image_tag = soup.find("img", class_="imageSize")
    return {
        # .get returns None when the <img> tag has no src attribute
        "image_url": image_tag.get("src") if image_tag is not None else None,
        "endangered": text_or_none(soup.find("span", class_="listingEnd")),
        "threatened": text_or_none(soup.find("span", class_="listingThreat")),
        "description": text_or_none(soup.find("div", {"id": "j-general-info"})),
    }


# details already scraped, keyed by animal page url, so a url that appears in
# several records is only requested from the server once
details_by_url = {}

# using list of dictionaries above that has a dictionary for each animal (row)
for animal in data:
    animal_url = animal["url"]

    if animal_url not in details_by_url:
        browser.visit(animal_url)
        # slow down the scrape so the server doesn't see rapid hits coming from one IP
        time.sleep(1)

        # extract browser html, create soup object and pull out the fields
        details_by_url[animal_url] = extract_animal_details(
            BeautifulSoup(browser.html, "html.parser")
        )

    # store all info for this particular page into the dictionary originally accessed (where we got the link)
    animal.update(details_by_url[animal_url])

# does the endangered/threatened content need to be collected from this page?


In [None]:
# rebuild the dataframe from the records (which now also hold the animal page fields)
# and preview the first 20 rows
df = pd.DataFrame(data)
df.head(n=20)

In [None]:
# inspect df through pandas' descriptive summary
df_summary = df.describe()
df_summary

In [None]:
# count the missing (nan) scraped values in each column
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# list the records with no image url, to check whether the image is really missing
# from the page or whether it is a code issue
image_missing = df["image_url"].isna()
df[image_missing]

In [None]:
# among the records that do have an image url, show those whose url is shared with
# another record (sorted by url) to see why there are so few unique image urls
has_image = df["image_url"].notna()
no_na_df = df[has_image]
shares_image = no_na_df["image_url"].duplicated(keep=False)
no_na_df[shares_image].sort_values(by="image_url").head(50)

In [None]:
# write the full dataset out to a csv file
csv_path = 'great_lakes_data.csv'
df.to_csv(csv_path)

In [None]:
# de-duplicate df on 'url', keeping the first record found for each url.
# Reassigned instead of using inplace=True, so the step reads as producing a new
# frame; nothing is displayed by this cell.
df = df.drop_duplicates(subset=['url'], keep='first')

In [None]:
# display df as this cell's output
df

In [None]:
# export df to a json file, written as one record per line
json_path = 'output.json'
df.to_json(json_path, orient='records', lines=True)