## Volpe Center
### Code for National Park Service sponsor project
### Code by Eric Englin
### Date: 10/23/23
<br><br>
#### Objective: scrape all websites for NPS to see if they meet the 10 essential travelers information for transportation
#### These 10 are:
<li>driving directions</li>
<li>Public transportation information</li>
<li>Bike and pedestrian information</li>
<li>Parking lot locations and accommodations</li>
<li>Parking lot peak use and availability</li>
<li>congestion information</li>
<li>travel distances and travel time to sites within the park</li>
<li>Accessibility</li>
<li>Description of transportation experience</li>
<li>Alternative fueling stations</li>

<br><br>
#### Together, these 10 measures can allow NPS to understand how their parks are providing transportation to visitors. This information can be used to evaluate each park and plan for an improved park experience in the future. 

In [1]:
# import libraries
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns 
import html
from bs4 import BeautifulSoup
import requests
import os
from selenium import webdriver
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import warnings; warnings.simplefilter('ignore')
from VE_scraper_functions import *
from chromedriver_py import binary_path # this will get you the path variable


In [2]:
#https://stackoverflow.com/questions/76727774/selenium-webdriver-chrome-115-stopped-working
from selenium.webdriver.chrome.service import Service

service = Service()
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)
driver.close()

In [3]:
#change to the location where you saved chromedriver
chromedriver_location=r'C:\Users\eric.englin\Downloads/chromedriver.exe'

In [4]:
driver = webdriver.Chrome(service=service, options=options)
driver.get('https://www.nps.gov/AGFO/planyourvisit/directions.htm')
driver.close() #close driver link

In [5]:
path = "Park Unit Scraping Information.csv"
parks = pd.read_csv(path, encoding='latin-1')


In [6]:
parks.head()

Unnamed: 0,Park Name,Type of Unit,State,Reg,Alpha,Unit
0,Abraham Lincoln Birthplace,National Historical Park,KY,SER,ABLI,1.0
1,Acadia,National Park,ME,NER,ACAD,1.0
2,Adams,National Historical Park,MA,NER,ADAM,1.0
3,African Burial Ground,National Monument,NY,NER,AFBG,1.0
4,Agate Fossil Beds,National Monument,NE,MWR,AGFO,1.0


In [7]:
len(parks)

421

In [8]:
index = []
for x in parks['Alpha']:
    y = "https://www.nps.gov/"+x+"/index.htm"
    index.append(y)

parks['index site']=index

In [9]:
## For context, here is the main site for each national park
for x in parks['index site']:
    print(x)

https://www.nps.gov/ABLI/index.htm
https://www.nps.gov/ACAD/index.htm
https://www.nps.gov/ADAM/index.htm
https://www.nps.gov/AFBG/index.htm
https://www.nps.gov/AGFO/index.htm
https://www.nps.gov/ALAG/index.htm
https://www.nps.gov/ALFL/index.htm
https://www.nps.gov/ALKA/index.htm
https://www.nps.gov/ALPO/index.htm
https://www.nps.gov/AMIS/index.htm
https://www.nps.gov/AMME/index.htm
https://www.nps.gov/ANDE/index.htm
https://www.nps.gov/ANIA/index.htm
https://www.nps.gov/ANJO/index.htm
https://www.nps.gov/ANTI/index.htm
https://www.nps.gov/APCO/index.htm
https://www.nps.gov/APIS/index.htm
https://www.nps.gov/APPA/index.htm
https://www.nps.gov/ARCH/index.htm
https://www.nps.gov/ARHO/index.htm
https://www.nps.gov/ARPO/index.htm
https://www.nps.gov/ASIS/index.htm
https://www.nps.gov/AZRU/index.htm
https://www.nps.gov/BADL/index.htm
https://www.nps.gov/BAND/index.htm
https://www.nps.gov/BELA/index.htm
https://www.nps.gov/BEOL/index.htm
https://www.nps.gov/BEPA/index.htm
https://www.nps.gov/

In [10]:
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", 
             "you", "your", "yours", "yourself", "yourselves", "he", "him", 
             "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", 
             "they", "them", "their", "theirs", "themselves", "what", "which", "who",
             "whom", "this", "that", "these", "those", "am", "is", "are", "was", 
             "were", "be", "been", "being", "have", "has", "had", "having", "do", 
             "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", 
             "because", "as", "until", "while", "of", "at", "by", "for", "with", "about",
             "against", "between", "into", "through", "during", "before", "after", "above", 
             "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", 
             "again", "further", "then", "once", "here", "there", "when", "where", "why", 
             "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", 
             "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", 
             "s", "t", "can", "will", "just", "don", "should", "now",
            "href","=","/",">","<","]","[","span","'\n'",'class',"jstcache",
            "onclick","null","jscontent"," <br/>","</span>",",",";","(",")","{","}",":","''",
            "&","'","var","+=",".","#","-","=","+","``","0","’","data.operatingHours","outputVarOperatingHours",
            ".exceptions","--","1","-1","?","class=","==","div","/div","$","li","e","!","k","/span","jQuery",
            "tabindex",'j','l']

In [11]:
park_scrape_dataset_YOSE = scrape_site('YOSE')

In [12]:
park_scrape_dataset_YOSE

Unnamed: 0,website page,content,soup,park
0,https://www.nps.gov/planyourvisit/index.htm,Plan Your Visit (U.S. National Park Service)Sk...,"[[ Content Copyright National Park Service , ...",YOSE
1,https://www.nps.gov/planyourvisit/event-search...,Event Calendar (U.S. National Park Service)Ski...,"[[ Content Copyright National Park Service , ...",YOSE
2,https://www.nps.gov/planyourvisit/passes.htm,Entrance Passes (U.S. National Park Service)Sk...,"[[ Content Copyright National Park Service , ...",YOSE
3,https://www.nps.gov/planyourvisit/trip-ideas.htm,Trip Ideas (U.S. National Park Service)Skip to...,"[[ Content Copyright National Park Service , ...",YOSE
4,https://www.nps.gov/yose/planyourvisit/basicin...,Basic Information - Yosemite National Park (U....,"[[ Content Copyright National Park Service , ...",YOSE
5,https://www.nps.gov/yose/planyourvisit/conditi...,Current Conditions - Yosemite National Park (U...,"[[ Content Copyright National Park Service , ...",YOSE
6,https://www.nps.gov/yose/planyourvisit/maps.htm,Maps - Yosemite National Park (U.S. National P...,"[[ Content Copyright National Park Service , ...",YOSE
7,https://www.nps.gov/yose/planyourvisit/calenda...,Calendar - Yosemite National Park (U.S. Nation...,"[[ Content Copyright National Park Service , ...",YOSE
8,https://www.nps.gov/yose/planyourvisit/fees.htm,Fees & Passes - Yosemite National Park (U.S. N...,"[[ Content Copyright National Park Service , ...",YOSE
9,https://www.nps.gov/yose/planyourvisit/eatings...,Eating & Sleeping - Yosemite National Park (U....,"[[ Content Copyright National Park Service , ...",YOSE


In [15]:
for x in park_scrape_dataset_YOSE[:1]['soup']:
    print(x)

<html class="js no-applicationcache cookies eventlistener geolocation history generators localstorage sessionstorage postmessage websqldatabase svg webworkers websockets webaudio hashchange audio preserve3d inlinesvg video webgl multiplebgs csspointerevents cssremunit rgba supports svgclippaths smil cssgradients canvas canvasblending todataurljpeg todataurlpng todataurlwebp canvaswinding hsla no-touchevents fontface generatedcontent siblinggeneral pointerevents textshadow csstransforms csstransforms3d csstransitions no-csspseudotransitions backgroundsize bgsizecover borderimage borderradius csscolumns csscolumns-width csscolumns-span csscolumns-fill csscolumns-gap csscolumns-rule csscolumns-rulecolor csscolumns-rulestyle csscolumns-rulewidth csscolumns-breakbefore csscolumns-breakafter csscolumns-breakinside flexbox no-flexboxtweener cssreflections cssanimations csspseudoanimations webpalpha webp webp-alpha webp-animation webp-lossless indexeddb" lang="en" style=""><!-- Content Copyrig

In [15]:
park_scrape_dataset_YELL=scrape_site('YELL')

In [18]:
park_scrape_dataset=scrape_site('YOSE')

In [20]:
park_scrape_dataset_YOSE

Unnamed: 0,website page,content,park
0,https://www.nps.gov/planyourvisit/index.htm,Plan Your Visit (U.S. National Park Service)Sk...,YOSE
1,https://www.nps.gov/planyourvisit/event-search...,Event Calendar (U.S. National Park Service)Ski...,YOSE
2,https://www.nps.gov/planyourvisit/passes.htm,Entrance Passes (U.S. National Park Service)Sk...,YOSE
3,https://www.nps.gov/planyourvisit/trip-ideas.htm,Trip Ideas (U.S. National Park Service)Skip to...,YOSE
4,https://www.nps.gov/yose/planyourvisit/basicin...,Basic Information - Yosemite National Park (U....,YOSE
5,https://www.nps.gov/yose/planyourvisit/conditi...,Current Conditions - Yosemite National Park (U...,YOSE
6,https://www.nps.gov/yose/planyourvisit/maps.htm,Maps - Yosemite National Park (U.S. National P...,YOSE
7,https://www.nps.gov/yose/planyourvisit/calenda...,Calendar - Yosemite National Park (U.S. Nation...,YOSE
8,https://www.nps.gov/yose/planyourvisit/fees.htm,Fees & Passes - Yosemite National Park (U.S. N...,YOSE
9,https://www.nps.gov/yose/planyourvisit/eatings...,Eating & Sleeping - Yosemite National Park (U....,YOSE


In [21]:
park_scrape_dataset_YELL

Unnamed: 0,website page,content,park
0,https://www.nps.gov/yell/planyourvisit/index.htm,Plan Your Visit - Yellowstone National Park (U...,YELL
1,https://www.nps.gov/yell/planyourvisit/yellows...,Take the Yellowstone Pledge - Yellowstone Nati...,YELL
2,https://www.nps.gov/yell/planyourvisit/yellows...,Yellowstone Pledge - Chinese - Yellowstone Nat...,YELL
3,https://www.nps.gov/yell/planyourvisit/yellows...,Yellowstone Pledge - Czech - Yellowstone Natio...,YELL
4,https://www.nps.gov/yell/planyourvisit/yellows...,Yellowstone Pledge - French - Yellowstone Nati...,YELL
...,...,...,...
135,https://www.nps.gov/planyourvisit/index.htm,Plan Your Visit (U.S. National Park Service)Sk...,YELL
136,https://www.nps.gov/planyourvisit/event-search...,Event Calendar (U.S. National Park Service)Ski...,YELL
137,https://www.nps.gov/planyourvisit/passes.htm,Entrance Passes (U.S. National Park Service)Sk...,YELL
138,https://www.nps.gov/planyourvisit/trip-ideas.htm,Trip Ideas (U.S. National Park Service)Skip to...,YELL


In [13]:
z=0
v=0
for x in parks['Alpha'].unique():
    v+=1
    if z==0:
        park_scrape_dataset=scrape_site(x)
        print(x)
        break
    else:
        this_park_scrape = scrape_site(x)
        park_scrape_dataset = park_scrape_dataset.append(this_park_scrape)
    if v % 25 == 0:
        print(x)
        print(len(park_scrape_dataset))
    z+=1
    #if z>50: #if want to test out
    #    break

ABLI


In [23]:
for x in park_scrape_dataset[15:16]['content']:
    print(x)

Directions - Yosemite National Park (U.S. National Park Service)Skip to global NPS navigationSkip to this park navigationSkip to the main contentSkip to this park information sectionSkip to the footer sectionNational Park ServiceSearchSearchThis SiteAll NPSOpenMenuCloseMenuExplore This ParkExplore the National Park ServicePlan Your VisitToggle submenu for Plan Your VisitFind a ParkEventsPassesTrip IdeasLearn & ExploreToggle submenu for Learn & ExploreAbout UsDiscover HistoryExplore NatureKidsEducatorsNewsPhotos & MultimediaExplore by TopicGet InvolvedToggle submenu for Get InvolvedDonatePartnerVolunteerWork for UsCommunity ResourcesSearch for parks by activity or topicExiting nps.govCancelYosemiteNational ParkCaliforniaInfoAlertsMapsCalendarFeesLoading alertsAlerts In EffectDismissmore information on current conditions...DismissView all alertsContact UsDirectionsYosemite National Park covers nearly 1,200 square miles of mountainous terrain in the Sierra Nevada of California. Yosemite i

In [None]:
# save your scraped website into another variable name so don't have to redo scrape
park_scrape_dataset2 = park_scrape_dataset

#data cleaning
park_scrape_dataset2['index1'] = park_scrape_dataset2.index
park_scrape_dataset2=park_scrape_dataset2.reset_index()


#save as excel
#note: saving as a csv won't work due to punctuation used in html code
park_scrape_dataset2.to_excel("full_park_scrape_dataset.xlsx")

In [None]:
park_scrape_dataset2.tail()

In [None]:
## Model to calculate VE fields ##

#create new sheet so with our variables for each park
park_sheet = pd.DataFrame(columns = ['park', 'Driving_Directions','Public_transportation_information',
                                     'Bike_Pedestrian_Information','Congestion_information','Accessibility',
                                           'Alternative_Fueling_Stations', 'website page count'])
z=0
tic = time.clock() #function to let us track processing time


for x in park_scrape_dataset2['park'].unique():
    z+=1
    if z % 25 == 0: 
        #function to let us track processing time
        z5 = 400-z
        toc = time.clock()
        time_diff = toc-tic
        print("Current Park: ", x, ": ", z, " checks done; ", z5, " remaining; Processing Time: ",time_diff)
        tic=toc
        
    this_park = park_scrape_dataset2[(park_scrape_dataset2['park']==x)] #filter our webscraping dataset for our park's website code
    park_final = Traveler_Info_Finder(this_park) #run function
    park_sheet = park_sheet.append({'park': park_final.get_value(0,'park'),
                        'website page count': park_final.get_value(0,'website page'),
                        'Directions_word_count': park_final.get_value(0,'Directions_count'),
                        'Directions_page_count': park_final.get_value(0,'Directions_page_count'),
                       'Driving_Directions': park_final.get_value(0,'MajorDirections_count'),
                       'Public_transportation_information': park_final.get_value(0,'Public_transportation_information'),
                       'Alternative_Fueling_Stations': park_final.get_value(0,'Alternative_Fueling_Stations'), 
                       'Bike_Pedestrian_Information': park_final.get_value(0,'Bike_Pedestrian_Information'),
                       'Congestion_information': park_final.get_value(0,'Congestion_information'),
                        'Travel_Distance_Information': park_final.get_value(0,'Travel_dist_information'),
                        'Travel_other_dist_information': park_final.get_value(0,'Travel_other_dist_information'),
                        'Accessibility': park_final.get_value(0,'Accessibility_information'),
                        'Parking_raw_information': park_final.get_value(0,'Parking_information'),
                        'Parking_experience_information': park_final.get_value(0,'Parking_experience_information'),
                        'Parking_max_on_one_site': park_final.get_value(0,'Parking_max_on_one_site')
                       },
                      ignore_index=True)
    park_sheet.loc[park_sheet.Driving_Directions > 0, 'Driving_Directions'] = 1
    park_sheet.loc[park_sheet.Alternative_Fueling_Stations > 0, 'Alternative_Fueling_Stations'] = 1
    park_sheet.loc[park_sheet.Public_transportation_information > 0, 'Public_transportation_information'] = 1
    park_sheet.loc[park_sheet.Bike_Pedestrian_Information > 0, 'Bike_Pedestrian_Information'] = 1
    park_sheet.loc[park_sheet.Congestion_information > 0, 'Congestion_information'] = 1
    park_sheet.loc[park_sheet.Accessibility > 0, 'Accessibility'] = 1
 #   park_sheet.loc[park_sheet.Parking_information > 0, 'Parking_information'] = 1
    park_sheet['Travel_Distance_Final']=np.where(
        np.logical_or(park_sheet['Travel_Distance_Information']>9, 
                     park_sheet['Travel_other_dist_information']>0),1,0)
    park_sheet['Parking_Experience_information']=np.where((
        park_sheet['Parking_raw_information']/park_sheet['website page count'])>0.25,1,0)
    park_sheet['Transportation_experience_information']=np.where((
        park_sheet['Directions_page_count']/park_sheet['website page count'])>0.65,1,0)
    park_sheet['Parking_information']=np.where(np.logical_or(
        park_sheet['Parking_Experience_information']==1,
        park_sheet['Parking_max_on_one_site']>2),1,0)


park_sheet= park_sheet.drop(columns=['website page count', 'Directions_word_count',
                        'Directions_page_count','Parking_raw_information','Parking_experience_information',
                        'Parking_max_on_one_site','Travel_Distance_Information','Travel_other_dist_information'])
    
#create csv
park_sheet.to_csv("final_park.csv") #save final csv
os.system("start EXCEL.EXE final_park.csv") #open csv file