# Import libraries

In [17]:
import pandas as pd
import numpy as np

# for web scraping 
from bs4 import BeautifulSoup
import requests

# Access OEHHA url from species name request

First attempt: Using the species name, find the species name on the OEHHA advisory page, and return the correct link.

In [18]:
# cleaning up data

# read in the data
species = pd.read_csv("species_common_science.csv")

# Keep only the distinct (common name, scientific name) pairs and renumber the
# rows from 0. `drop=True` discards the old index instead of materialising it
# as an extra "index" column, so no second column selection is needed
# afterwards (on pandas < 3 that re-selection can flag the frame as a copy and
# make later column assignments emit SettingWithCopyWarning).
species_clean = (
    species[["CompositeCommonName", "scientific_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
    # title-case copy of the common name, since that is the casing used on the website
    .assign(name_finder=lambda frame: frame["CompositeCommonName"].str.title())
)

species_clean

Unnamed: 0,CompositeCommonName,scientific_name,name_finder
0,white croaker,Genyonemus lineatus,White Croaker
1,spotted sand bass,Paralabrax maculatofasciatus,Spotted Sand Bass
2,shiner surfperch,Cymatogaster aggregata,Shiner Surfperch
3,walleye surfperch,Hyperprosopon argenteum,Walleye Surfperch
4,barred surfperch,Amphistichus argenteus,Barred Surfperch
...,...,...,...
56,greenspotted rockfish,Sebastes chlorostictus,Greenspotted Rockfish
57,speckled rockfish,Sebastes ovalis,Speckled Rockfish
58,squarespot rockfish,Sebastes hopkinsi,Squarespot Rockfish
59,greenblotched rockfish,Sebastes rosenblatti,Greenblotched Rockfish


# Match the website link from OEHHA

The advisory link is: https://oehha.ca.gov/fish/advisories 

The list of species is: https://oehha.ca.gov/fish/species 

To access a specific species: https://oehha.ca.gov/fish/species/white-croaker

First attempt:
1. make a column with the names in the format that the url needs `species-name`
2. iterate through the names and create urls that are in the format `https://oehha.ca.gov/fish/species/species-name`

Step 1:

In [19]:
# make new column with url ready species name:
# trimmed, lower-cased, and with spaces turned into hyphens (e.g. "white-croaker").
# regex=False makes this a literal replacement on every pandas version
# (the default for `regex` changed in pandas 2.0).
species_clean['url_name'] = (
    species_clean["CompositeCommonName"]
    .str.strip()
    .str.lower()
    .str.replace(" ", "-", regex=False)
)

species_clean


Unnamed: 0,CompositeCommonName,scientific_name,name_finder,url_name
0,white croaker,Genyonemus lineatus,White Croaker,white-croaker
1,spotted sand bass,Paralabrax maculatofasciatus,Spotted Sand Bass,spotted-sand-bass
2,shiner surfperch,Cymatogaster aggregata,Shiner Surfperch,shiner-surfperch
3,walleye surfperch,Hyperprosopon argenteum,Walleye Surfperch,walleye-surfperch
4,barred surfperch,Amphistichus argenteus,Barred Surfperch,barred-surfperch
...,...,...,...,...
56,greenspotted rockfish,Sebastes chlorostictus,Greenspotted Rockfish,greenspotted-rockfish
57,speckled rockfish,Sebastes ovalis,Speckled Rockfish,speckled-rockfish
58,squarespot rockfish,Sebastes hopkinsi,Squarespot Rockfish,squarespot-rockfish
59,greenblotched rockfish,Sebastes rosenblatti,Greenblotched Rockfish,greenblotched-rockfish


Step 2:

In [20]:
# set up a base url for accessing species info
base_url = "https://oehha.ca.gov/fish/species/"

# create a new url column using the modified names
# (str + Series concatenates element-wise, so no per-row lambda is needed)
species_clean["url"] = base_url + species_clean["url_name"]

# check that there are the correct number of urls: one non-null url per row.
# A bare `.count()` in the middle of a cell is never displayed, so the check
# is written as an assertion instead.
assert species_clean["url"].count() == len(species_clean), "some species have no url"
species_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   CompositeCommonName  61 non-null     object
 1   scientific_name      61 non-null     object
 2   name_finder          61 non-null     object
 3   url_name             61 non-null     object
 4   url                  61 non-null     object
dtypes: object(5)
memory usage: 2.5+ KB


# Download the statewide advisory poster from the link

Current Problems
1. When you click each fish, it has a list of locations that pop up and are inconsistent across the species
2. Then, when we select on a location, there are a list of posters in different languages: this could be really good for accessibility and you can specify a language. 
3. The posters have advisories for many fish

Potential Solutions
1. We simply provide a link that allows people to learn more about the advisories based on species name
2. We create a folder that has custom advisories...but this would not be able to be updated automatically 

First attempt:
1. Access species link
2. find the link that corresponds to `Statewide Advisory for Eating Fish from California Coastal Locations without Site-Specific Advice`

In [16]:
# create a function that returns the advisory link for a given species name

def advisory_link(species_name, lookup=None):
    """Look up the OEHHA species-page url for a common species name.

    Parameters
    ----------
    species_name : str
        Common name of the species in any letter case; surrounding
        whitespace is ignored.
    lookup : pandas.DataFrame, optional
        Table with a title-cased ``name_finder`` column and a ``url`` column.
        Defaults to the notebook-level ``species_clean`` table.

    Returns
    -------
    pandas.Series
        The ``url`` values of every row whose ``name_finder`` equals the
        title-cased name. The Series is empty when nothing matches.
    """
    if lookup is None:
        # fall back to the table built earlier in the notebook
        lookup = species_clean

    # match the name to the lookup table syntax (title case, no stray spaces)
    name = species_name.strip().title()

    # select the url of the matching row(s) in a single .loc call
    return lookup.loc[lookup["name_finder"] == name, "url"]


# sanity check: look up one species by its common name
advisory_link(species_name="White Croaker")



0    https://oehha.ca.gov/fish/species/white-croaker
Name: url, dtype: object

# Access the html information from the links

This block contains the advisory information: `id="block-views-species-block-1"` and `class="clearfix block block-views"`  

The issue with this is that there is a captcha request, and it is blocking access to the advisories. 


In [40]:
url = "https://oehha.ca.gov/fish/species/barred-surfperch"
# a timeout keeps the cell from hanging indefinitely if the server never answers
page = requests.get(url, timeout=30)

# access the html from the page (bounded preview, to keep the cell output short)
print(page.text[:2000])

soup = BeautifulSoup(page.content, "html.parser")

# find the Advisory heading using the id
results = soup.find(id="block-views-species-block-1")

# soup.find returns None when the element is missing, e.g. when the site
# serves its Incapsula bot-check page instead of the species page
if results is None:
    if "_Incapsula_Resource" in page.text:
        print("Request was blocked by the Incapsula bot check; no advisory block in the response.")
    else:
        print(f"Advisory block not found (HTTP status {page.status_code}).")


<html style="height:100%"><head><META NAME="ROBOTS" CONTENT="NOINDEX, NOFOLLOW"><meta name="format-detection" content="telephone=no"><meta name="viewport" content="initial-scale=1.0"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><script type="text/javascript" src="/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"></script></head><body style="margin:0px;height:100%"><iframe id="main-iframe" src="/_Incapsula_Resource?SWUDNSAI=31&xinfo=14-76952401-0%200NNN%20RT%281713049526339%20165%29%20q%280%20-1%20-1%20-1%29%20r%280%20-1%29%20B12%284%2c315%2c0%29%20U18&incident_id=1445000010193676268-352648086753181774&edet=12&cinfo=04000000&rpinfo=0&cts=F7XU%2b02zbORWQYdFOGPKqT8eywE7XF2jK3tzsF1zkNJg80lljGFt%2f6Lazhq5Sbsh&mth=GET" frameborder=0 width="100%" height="100%" marginheight="0px" marginwidth="0px">Request unsuccessful. Incapsula incident ID: 1445000010193676268-352648086753181774</iframe></body></html>
