# <span style="font-size:larger;font-weight:bold">Web Scraping Data from New York State Park and Historic Site Websites</span>

In [78]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


class Park_Hist_siteScraper:
    """
    A web scraper class designed to extract information about historic sites
    and state parks in New York State from the New York State Parks website.

    The landing page given to the constructor is expected to contain a
    ``<select>`` drop-down with one ``<option>`` per site. Each option's
    ``value`` is appended to the landing-page URL to form the address of that
    site's detail page.

    Attributes:
        url (str): landing-page URL; also the prefix of every detail-page URL.
        timeout: value passed as ``timeout`` to every ``requests.get`` call.
        response: HTTP response for the landing page.
        soup: the parsed landing page.
        select_menu: the ``<select>`` element that lists the sites.
        options: every ``<option>`` tag found inside ``select_menu``.
        sites (list of dict): one record per scraped site, in scrape order.

    Methods:
        scrape_site: scrape the latitude, longitude, address, phone number,
            description, and amenities of one site and store the record.
        scrape_all_sites: loop through the drop-down options and call
            'scrape_site' for each of them.
        to_dataframe: convert the scraped records to a pandas DataFrame.
    """

    # Class-level fallback so the attribute exists even on instances that did
    # not go through __init__; __init__ overrides it per instance.
    timeout = 30

    def __init__(self, url, select_menu_id, timeout=30):
        """
        Initialize the scraper with the URL of the website and the ID of the select menu for state parks.

        Parameters:
        - url (str): the URL of the website to scrape
        - select_menu_id (str): the ID of the select menu for state parks
        - timeout: seconds to wait for each HTTP request before giving up
          (without a timeout, requests waits indefinitely on a stalled server)

        Raises:
        - AttributeError: if no ``<select>`` with the given ID exists on the page
        """
        self.url = url
        self.timeout = timeout
        # Make a request to the New York State Parks website
        self.response = requests.get(url, timeout=timeout)
        # Parse the HTML content using Beautiful Soup
        self.soup = BeautifulSoup(self.response.content, 'html.parser')
        # Find the dropdown menu for selecting state parks
        self.select_menu = self.soup.find('select', {'id': select_menu_id})
        # Get all the option tags from the dropdown menu
        self.options = self.select_menu.find_all('option')
        self.sites = []

    def scrape_site(self, option):
        """
        Scrape one site's detail page and append its record to ``self.sites``.

        Parameters:
        - option: an ``<option>`` tag from the drop-down; its ``value`` is
          appended to ``self.url`` and its text becomes ``site_name``

        Any field that cannot be located on the page is stored as None and a
        "... not found" message is printed; the record is appended regardless.
        """
        # Make a request to the park page
        site_url = self.url + option['value']
        site_response = requests.get(site_url, timeout=self.timeout)
        # Parse the HTML content using Beautiful Soup
        site_soup = BeautifulSoup(site_response.content, 'html.parser')

        site = {
            "site_name": option.text.strip(),
            "website": site_url,
        }
        site.update(self._parse_site(site_soup))
        self.sites.append(site)

    @classmethod
    def _parse_site(cls, site_soup):
        """
        Pull the page-level fields out of a parsed detail page.

        Returns a dict with the keys latitude, longitude, address, phone,
        description and amenities (in that order). A field whose markup is
        missing is reported with a printed message and stored as None.
        """
        extractors = (
            ("latitude", lambda: cls._coordinate(site_soup, "Latitude"), 'Latitude not found.'),
            ("longitude", lambda: cls._coordinate(site_soup, "Longitude"), "Longitude not found"),
            ("address", lambda: cls._address(site_soup), "Address not found"),
            ("phone", lambda: cls._phone(site_soup), "Phone not found"),
            ("description", lambda: cls._description(site_soup), "Description not found"),
            ("amenities", lambda: cls._amenities(site_soup), "Amenities not found"),
        )

        fields = {}
        for field_name, extract, missing_message in extractors:
            try:
                fields[field_name] = extract()
            except Exception:
                # `Exception`, not a bare `except`, so that Ctrl-C
                # (KeyboardInterrupt) and SystemExit still stop a long scrape.
                print(missing_message)
                fields[field_name] = None
        return fields

    @staticmethod
    def _coordinate(site_soup, label):
        """Return the text that follows the bold ``<span>`` whose text is `label`."""
        label_span = site_soup.find("span", {"style": "font-weight:bold;"}, string=label)
        return label_span.find_next_sibling(string=True).strip()

    @staticmethod
    def _address(site_soup):
        """Return the address block's text with its leading "Address" label removed."""
        address_div = site_soup.find("div", {"style": "margin-bottom:8px;"})
        return address_div.get_text(strip=True).replace("Address", "", 1).strip()

    @staticmethod
    def _phone(site_soup):
        """Return the text of the first link after the "Phone" / "General Information" label."""
        phone_label = site_soup.find("strong", string=["Phone", "General Information"])
        return phone_label.find_next("a").text.strip()

    @staticmethod
    def _description(site_soup):
        """Return the text of the first element with role="tabpanel"."""
        description_div = site_soup.find("div", {"role": "tabpanel"})
        return description_div.get_text(strip=True)

    @staticmethod
    def _amenities(site_soup):
        """Return the amenity names as one comma-separated string."""
        amenity_ul = site_soup.find("ul", {"style": "padding:0px; list-style:none;"})
        # The first <li> is skipped before reading the item texts ...
        amenity_names = [item.contents[0].strip() for item in amenity_ul.find_all("li")[1:]]
        # ... and the first remaining entry is dropped again when joining, so the
        # first two <li> items never reach the output.
        # NOTE(review): confirm both leading items are headers, not amenities.
        return ", ".join(amenity_names[1:])

    def scrape_all_sites(self):
        """Scrape every site listed in the drop-down, one request per site."""
        # Start at index 1 to skip the first "Any State Park" option
        for option in self.options[1:]:
            self.scrape_site(option)

    def to_dataframe(self):
        """Return the records collected so far as a pandas DataFrame (one row per site)."""
        return pd.DataFrame(self.sites)

In [79]:
# Landing pages to scrape, and (in the same order) the id of the <select>
# drop-down on each page that lists its sites.
urls=['https://parks.ny.gov/historic-sites/', 'https://parks.ny.gov/parks/']
scrap =['Container_lstHistoricSites', 'Container_lstStateParks']

site_frames = []
for url, select_menu_id in zip(urls, scrap):
    print(select_menu_id)
    # Create an instance of the Park_Hist_siteScraper class for this landing page
    scraper = Park_Hist_siteScraper(url, select_menu_id)
    # Visit every site listed in the drop-down
    scraper.scrape_all_sites()
    # Keep this page's records; the frames are combined once after the loop
    site_frames.append(scraper.to_dataframe())

# A single concat avoids seeding with an empty DataFrame and re-copying the
# accumulated rows on every iteration.
df = pd.concat(site_frames, ignore_index=True)

Container_lstHistoricSites
Amenities not found
Amenities not found
Container_lstStateParks
Amenities not found
Amenities not found
Phone not found
Latitude not found.
Longitude not found
Amenities not found
Latitude not found.
Longitude not found
Latitude not found.
Longitude not found
Latitude not found.
Longitude not found
Amenities not found
Phone not found


In [80]:
# Show the combined table of scraped historic sites and state parks
df

Unnamed: 0,site_name,website,latitude,longitude,address,phone,description,amenities
0,Bennington Battlefield State Historic Site,https://parks.ny.gov/historic-sites/12,42.933758,-73.304878,"5157 Route 67Walloomsac, NY 12090",(518) 860-9094,2022 Geocache challengeBennington Battlefield ...,Picnic Area
1,Clermont State Historic Site,https://parks.ny.gov/historic-sites/16,42.085087,-73.911835,"One Clermont AvenueGermantown, NY 12526",(518) 537-4240,Clermont State Historic Site was the Hudson Ri...,"Equestrian Trails, Gift Shop, Hiking, Picnic A..."
2,Clinton House State Historic Site,https://parks.ny.gov/historic-sites/1,41.700333,-73.915993,"549 Main StreetPoughkeepsie, NY 12602",(845) 471-1630,The vernacular stone house now known as Clinto...,
3,Crailo State Historic Site,https://parks.ny.gov/historic-sites/30,42.634590,-73.749496,"9 1/2 Riverside AvenueRensselaer, NY 12144",(518) 463-8738,2022 Geocache challengeCrailo is the museum of...,Visitor Center
4,Crown Point State Historic Site,https://parks.ny.gov/historic-sites/34,44.024853,-73.424377,"21 Grandview DriveCrown Point, NY 12928",(518) 597-4666,2022 Geocache challengeCrown Point State Histo...,"Hiking, Picnic Area, Snowshoeing/X-Country Ski..."
...,...,...,...,...,...,...,...,...
226,Whirlpool State Park,https://parks.ny.gov/parks/105,43.119640,-79.062302,"Niagara Scenic ParkwayNiagara Falls, NY 14303",(716) 284-4691,Know Before You Go...More InfoPet PolicyA maxi...,"Fishing, Hiking, Pavilions and Shelter Rentals..."
227,Wildwood State Park,https://parks.ny.gov/parks/68,40.962742,-72.808426,"790 Hulse Landing RoadWading River, NY 11792",(631) 929-4314,Wildwood State Park comprises 600 acres of und...,"Camper Assistance Program, Campsites, Cabins &..."
228,Wilson-Tuscarora State Park,https://parks.ny.gov/parks/69,43.307045,-78.852936,"3371 Lake RoadWilson, NY 14172",(716) 751-6361,Know Before You Go...More InfoPet PolicyA maxi...,"Disc Golf, Fishing, Hiking, Hunting, Marina, P..."
229,Wonder Lake State Park,https://parks.ny.gov/parks/190,41.476602,-73.646826,"380 Ludingtonville RoadHolmes, NY 12531",(845) 225-7207,Know Before You Go...More InfoPet PolicyDogs a...,"Hiking, Hunting"


# <span style="font-size:large;font-weight:bold">Filling the missing latitude and longitude data</span>

In [81]:
# List every site that has at least one missing field
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,site_name,website,latitude,longitude,address,phone,description,amenities
12,Jay Heritage Center,https://parks.ny.gov/historic-sites/39,40.958358,-73.705237,"210 Boston Post RoadRye, NY 10580",(914) 698-9275,The 23-acre Jay Estate in Rye is one of two fa...,
33,Stonewall Inn State Historic Site,https://parks.ny.gov/historic-sites/41,40.73379,-74.0021,"53 Christopher StreetNew York, NY 10014",(212) 668-2577,Stonewall National Monument is a new national ...,
58,Brentwood State Park,https://parks.ny.gov/parks/187,40.78977,-73.271894,"375 Crooked Hill RoadBrentwood, NY 11717",(631) 667-5055,Brentwood State Park is a 52-acre sport facili...,
100,Franklin D. Roosevelt Four Freedoms State Park,https://parks.ny.gov/parks/186,40.750419,-73.960583,"Roosevelt IslandNew York, NY 10044",(212) 308-3350,Know Before You Go...More InfoPet PolicyWe ask...,
102,Franny Reese State Park,https://parks.ny.gov/parks/192,41.700383,-73.963092,"Johnson-Iorio Park, Haviland RdHighland, NY 12528",,Franny Reese State Park provides an exciting p...,"Hiking, Snowshoeing/X-Country Skiing"
104,Genesee Valley Greenway State Park,https://parks.ny.gov/parks/189,,,Genesee Valley Greenway1 Letchworth State Park...,(585) 493-3614,Know Before You Go:Please check theDetourspage...,"Equestrian Trails, Fishing, Hiking, Snowmobili..."
126,Hudson River Islands State Park,https://parks.ny.gov/parks/98,42.3288852479,-73.7783764003,"Hudson RiverCoxsackie, NY 12192",(518) 732-0187,Know Before You Go...More InfoMooring and Camp...,
127,Hudson River Park,https://parks.ny.gov/parks/185,,,"353 West St 2nd FloorNew York, NY 10044",(212) 627-2020,"For information about the Hudson River Park, p...","Boat Launches, Boat Launches, Fishing, Fishing..."
159,Minnewaska State Park Preserve: Sam's Point Area,https://parks.ny.gov/parks/193,,,"400 Samâs Point RoadCragsmoor, NY 12420",(845) 647-7989,Know Before You Go...More InfoRestrictions-The...,"Hiking, Hunting, Snowshoeing/X-Country Skiing,..."
173,Pat McGee Trail,https://parks.ny.gov/parks/204,,,"Pat McGee TrailLittle Valley, NY 14755",(716) 354-9101,The 12.1-mile Pat McGee Trail in Cattaraugus C...,"Equestrian Trails, Hiking, Pavilions and Shelt..."


In [82]:
# Coordinates obtained separately for the sites whose pages did not provide them:
# site name -> (latitude, longitude), kept as strings like the scraped values.
missing_coordinates = {
    "Minnewaska State Park Preserve: Sam's Point Area": ("41.670636", "-74.361385"),
    "Pat McGee Trail": ("42.20820", "-78.75710"),
    "Hudson River Park": ("40.729563", "-74.012699"),
    "Genesee Valley Greenway State Park": ("43.1080", "-77.6515"),
}
for site_name, (site_latitude, site_longitude) in missing_coordinates.items():
    df.loc[df['site_name'] == site_name, "latitude"] = site_latitude
    df.loc[df['site_name'] == site_name, "longitude"] = site_longitude

In [83]:
# Write the completed table to disk, leaving out the row index
output_csv = 'Webscraped_NYS.csv'
df.to_csv(output_csv, index=False)