In [80]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np

In [81]:
def get_html_content(query, year):
    """Download the twelve monthly sun pages for a location from timeanddate.com.

    A first request sends ``query`` to the site's sun page. The URL of that
    response is then requested once per month with ``month`` and ``year``
    query parameters.

    Parameters
    ----------
    query : str
        Search text identifying the location.
    year : int or str
        Year to request for every month.

    Returns
    -------
    list
        The ``requests`` response objects of the monthly requests that came
        back with status code 200, in month order. The list is empty when the
        first request fails; months whose request fails are skipped.
    """
    # URL of the page
    url = 'https://www.timeanddate.com/sun/'

    # Responses of the successful monthly requests
    responses = []

    # Send a GET request to the website with the search query parameters
    response = requests.get(url, params={'query': query})

    # Without a successful first request there is no base URL to build on
    if response.status_code != 200:
        print('Error in the first request:', response.status_code)
        return responses

    for month in range(1, 13):
        # Let requests append month/year so the URL stays well-formed even if
        # the first response's URL already carries a query string.
        new_response = requests.get(response.url, params={'month': month, 'year': year})

        if new_response.status_code == 200:
            responses.append(new_response)
            print(f'Successfully retrieved data for {year}-{month}')
        else:
            # Report the status of the failed monthly request itself; the
            # first response is known to be 200 at this point.
            print(f'Error in the request for {year}-{month}: {new_response.status_code}')

    return responses

In [82]:
def retrieve_monthly_data(content):
    """Parse the monthly sun table of one HTML page into a DataFrame.

    Parameters
    ----------
    content : str or bytes
        HTML document containing a table with id ``as-monthsun``.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has a value for every column. Columns are,
        in order: ``Date``, ``Sunrise (Time)``, ``Sunrise (Angle)``,
        ``Sunset (Time)``, ``Sunset (Angle)``, the eight renamed table columns
        at positions 3-10, ``Solar Noon (Time)``, ``Solar Noon (Angle)`` and
        the renamed last table column.

    Raises
    ------
    ValueError
        If the document has no table with id ``as-monthsun``.
    """
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(content, 'html.parser')

    # Find the table with the specified id; fail with a clear message rather
    # than an AttributeError on None when the page has no such table.
    table = soup.find('table', {'id': 'as-monthsun'})
    if table is None:
        raise ValueError("No table with id 'as-monthsun' found in the supplied HTML")

    # Extract all the rows
    rows = table.find_all('tr')

    # The first row holds the group labels, the second the per-column labels
    group_labels = [th.text.strip() for th in rows[0].find_all('th')]
    headers = [th.text.strip() for th in rows[1].find_all('th')]

    # Column 0 combines both first labels; the result is later prepended to
    # each row's first cell and parsed with the format '%Y %b %d'.
    headers[0] = group_labels[0] + ' ' + headers[0]

    # Columns 3-10 come in pairs; each pair shares one of group labels 2-5.
    for idx in range(3, 11):
        group = group_labels[2 + (idx - 3) // 2]
        headers[idx] = group + ' (' + headers[idx] + ')'

    # The last column (12) belongs to group label 6. Columns 1, 2 and 11 keep
    # their own labels ('Sunrise', 'Sunset', 'Time' are referenced below).
    headers[12] = group_labels[6] + ' (' + headers[12] + ')'

    # Extract data rows: the first three rows and the last row are skipped
    # (NOTE(review): presumably header/footer rows - confirm against the page).
    data = [
        [cell.text.strip() for cell in row.find_all(['th', 'td'])]
        for row in rows[3:-1]
    ]

    # Rows with fewer cells than headers are padded with None and dropped here
    df = pd.DataFrame(data, columns=headers).dropna()

    # Clean data: build a full date and split the "time arrow (angle)" cells
    df['Date'] = pd.to_datetime(headers[0] + ' ' + df[headers[0]], format='%Y %b %d')
    df[['Sunrise (Time)', 'Sunrise (Angle)']] = df['Sunrise'].str.extract(r'(\d+:\d+ [apm]+) ↑ \((\d+)°\)')
    df[['Sunset (Time)', 'Sunset (Angle)']] = df['Sunset'].str.extract(r'(\d+:\d+ [apm]+) ↑ \((\d+)°\)')
    df[['Solar Noon (Time)', 'Solar Noon (Angle)']] = df['Time'].str.extract(r'(\d+:\d+ [apm]+) \(([\d.]+)°\)')
    df = df.drop(columns=[headers[0], 'Sunrise', 'Sunset', 'Time'])

    # Reorder columns (plain list concatenation; no NumPy needed for strings)
    column_order = (
        ['Date', 'Sunrise (Time)', 'Sunrise (Angle)', 'Sunset (Time)', 'Sunset (Angle)']
        + headers[3:11]
        + ['Solar Noon (Time)', 'Solar Noon (Angle)', headers[-1]]
    )

    return df[column_order]

In [83]:
def retrieve_all_data(query, year=2024):
    """Collect the sun data of every retrievable month of a year for one location.

    Parameters
    ----------
    query : str
        Search text passed to ``get_html_content``; also written to the
        ``State Park`` column of the result.
    year : int or str, optional
        Year to download. Defaults to 2024, the value that was previously
        hard-coded.

    Returns
    -------
    pandas.DataFrame
        The monthly frames produced by ``retrieve_monthly_data`` stacked with
        a fresh index, with ``State Park`` inserted as the first column.

    Raises
    ------
    ValueError
        If no monthly page could be retrieved.
    """
    # Get all the responses
    responses = get_html_content(query, year)

    # pd.concat would raise a generic ValueError on an empty list; say what
    # actually went wrong instead.
    if not responses:
        raise ValueError(f'No monthly data could be retrieved for {query!r} ({year})')

    # Parse each monthly page into its own DataFrame
    monthly_frames = [retrieve_monthly_data(response.content) for response in responses]

    # Concatenate all DataFrames into a single DataFrame
    final_df = pd.concat(monthly_frames, ignore_index=True)
    final_df.insert(0, 'State Park', query)

    return final_df

In [85]:
# Location whose sun data should be downloaded
query = 'Ahjumawi Lava Springs State Park'

# Last expression of the cell, so the returned DataFrame is displayed
retrieve_all_data(query=query)

Successfully retrieved data for 2024-1
Successfully retrieved data for 2024-2
Successfully retrieved data for 2024-3
Successfully retrieved data for 2024-4
Successfully retrieved data for 2024-5
Successfully retrieved data for 2024-6
Successfully retrieved data for 2024-7
Successfully retrieved data for 2024-8
Successfully retrieved data for 2024-9
Successfully retrieved data for 2024-10
Successfully retrieved data for 2024-11
Successfully retrieved data for 2024-12


Unnamed: 0,State Park,Date,Sunrise (Time),Sunrise (Angle),Sunset (Time),Sunset (Angle),Daylength (Length),Daylength (Diff.),Astronomical Twilight (Start),Astronomical Twilight (End),Nautical Twilight (Start),Nautical Twilight (End),Civil Twilight (Start),Civil Twilight (End),Solar Noon (Time),Solar Noon (Angle),Solar Noon (Mil. mi)
0,Ahjumawi Lava Springs State Park,2024-01-01,7:30 am,120,4:47 pm,240,9:16:29,+0:41,5:51 am,6:26 pm,6:25 am,5:53 pm,6:59 am,5:18 pm,12:09 pm,25.9,91.404
1,Ahjumawi Lava Springs State Park,2024-01-02,7:31 am,120,4:48 pm,240,9:17:14,+0:45,5:52 am,6:27 pm,6:25 am,5:53 pm,7:00 am,5:19 pm,12:09 pm,26.0,91.404
2,Ahjumawi Lava Springs State Park,2024-01-03,7:31 am,120,4:49 pm,240,9:18:03,+0:49,5:52 am,6:28 pm,6:25 am,5:54 pm,7:00 am,5:20 pm,12:10 pm,26.1,91.404
3,Ahjumawi Lava Springs State Park,2024-01-04,7:31 am,120,4:50 pm,240,9:18:56,+0:53,5:52 am,6:28 pm,6:25 am,5:55 pm,7:00 am,5:21 pm,12:10 pm,26.2,91.405
4,Ahjumawi Lava Springs State Park,2024-01-05,7:31 am,120,4:51 pm,240,9:19:53,+0:56,5:52 am,6:29 pm,6:25 am,5:56 pm,7:00 am,5:21 pm,12:10 pm,26.3,91.406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,Ahjumawi Lava Springs State Park,2024-12-27,7:30 am,121,4:44 pm,239,9:14:00,+0:24,5:50 am,6:23 pm,6:24 am,5:49 pm,6:58 am,5:15 pm,12:07 pm,25.6,91.417
362,Ahjumawi Lava Springs State Park,2024-12-28,7:30 am,121,4:44 pm,239,9:14:28,+0:28,5:51 am,6:24 pm,6:24 am,5:50 pm,6:59 am,5:16 pm,12:07 pm,25.7,91.414
363,Ahjumawi Lava Springs State Park,2024-12-29,7:30 am,121,4:45 pm,239,9:15:01,+0:32,5:51 am,6:24 pm,6:24 am,5:51 pm,6:59 am,5:16 pm,12:08 pm,25.7,91.412
364,Ahjumawi Lava Springs State Park,2024-12-30,7:30 am,121,4:46 pm,239,9:15:38,+0:36,5:51 am,6:25 pm,6:25 am,5:52 pm,6:59 am,5:17 pm,12:08 pm,25.8,91.410


In [104]:
# Read the first table of the Wikipedia list of California state parks, drop
# the top level of its multi-level column header and rename the remaining
# columns. Built as one chain so the cell can be re-run without side effects.
SP_table = (
    pd.read_html('https://en.wikipedia.org/wiki/List_of_California_state_parks#List_of_parks')[0]
    .droplevel(0, axis=1)
    .rename(columns={
        'Park name': 'State Park',
        'County orcounties': 'County',
        'acres': 'Size (acres)',
        'ha': 'Size (ha)',
        'Year established[1]': 'Year established',
    })
)
SP_table


Unnamed: 0,State Park,Classification,County,Size (acres),Size (ha),Year established,Remarks
0,Admiral William Standley State Recreation Area,State recreation area,Mendocino,45.00,18,1944,Boasts redwoods plus salmon and steelhead fish...
1,Ahjumawi Lava Springs State Park,State park,Shasta,5930.00,2400,1975,Preserves a wilderness of freshwater springs a...
2,Albany State Marine Reserve,Park property,Alameda,,,1985,
3,Anderson Marsh State Historic Park,State historic park,Lake,1298.00,525,1982,Preserves a tule marsh and ancient archaeologi...
4,Andrew Molera State Park,State park,Monterey,4766.00,1929,1968,Offers a primitive walk-in campground on the B...
...,...,...,...,...,...,...,...
274,Will Rogers State Historic Park,State historic park,Los Angeles,189.00,76,1944,
275,William B. Ide Adobe State Historic Park,State historic park,Tehama,3.90,1.6,1951,
276,Woodland Opera House State Historic Park,State historic park,Yolo,0.26,0.11,1980,
277,Woodson Bridge State Recreation Area,State recreation area,Tehama,323.00,131,1959,


In [142]:
import re

def dms_to_decimal(dms_str):
    """Convert a degrees/minutes/seconds coordinate string to decimal degrees.

    The first three numbers found in the string are read as degrees, minutes
    and seconds; missing minutes or seconds count as zero. Each number may
    have a fractional part (for example ``12.8`` seconds). The last
    non-whitespace character is taken as the hemisphere: ``S`` or ``W`` (any
    case) makes the result negative.

    Parameters
    ----------
    dms_str : str
        Coordinate text such as ``41°08′10″N``.

    Returns
    -------
    float
        Decimal degrees rounded to four places.

    Raises
    ------
    ValueError
        If the string contains no number.
    """
    text = dms_str.strip()

    # Match whole decimal numbers so a fractional component like "12.8" stays
    # one value instead of being split into "12" and "8".
    numbers = [float(token) for token in re.findall(r'\d+(?:\.\d+)?', text)]
    if not numbers:
        raise ValueError(f'No numeric components found in coordinate string: {dms_str!r}')

    # Pad with zeros for absent minutes/seconds; ignore anything past seconds
    degrees, minutes, seconds = (numbers + [0.0, 0.0])[:3]

    decimal_degrees = degrees + (minutes / 60) + (seconds / 3600)

    # Southern and western hemispheres are negative
    if text[-1].upper() in ('S', 'W'):
        decimal_degrees *= -1
    return round(decimal_degrees, 4)

def convert_coordinates(lat_str, long_str):
    """Convert a latitude string and a longitude string to decimal degrees.

    Both strings are passed through ``dms_to_decimal``; the result is the
    tuple ``(latitude, longitude)``.
    """
    return dms_to_decimal(lat_str), dms_to_decimal(long_str)

In [151]:
url = 'https://en.wikipedia.org/wiki/List_of_California_state_parks#List_of_parks'

# Fetch the list page
response = requests.get(url)

# href of the first link of every park row; stays empty if the request fails
SP_links = []

if response.status_code != 200:
    print(f"Error: Failed to retrieve the page. Status code: {response.status_code}")
else:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # First table on the page carrying the 'wikitable' class
    table = soup.find('table', class_='wikitable')

    # All rows of that table except the first two
    # (NOTE(review): presumably the two header rows - confirm against the page)
    rows = table.find_all('tr')[2:]

    # The first anchor in each remaining row supplies the link
    SP_links.extend(row.find('a').get('href') for row in rows)

# Report how many links were collected
print(len(SP_links))

279


In [157]:
url = 'https://en.wikipedia.org'

# One {'Latitude': ..., 'Longitude': ...} record per entry of SP_links, in the
# same order, so the resulting frame lines up row by row with the link list.
coordinate_records = []

# Iterate over park links
for link in SP_links:
    new_url = url + link
    print(f'Getting {new_url}')

    # Default to real missing values (np.nan), not the string 'NaN', so the
    # columns stay numeric and isna()/dropna() recognise the gaps.
    latitude, longitude = np.nan, np.nan

    # Send a GET request to the URL
    response = requests.get(new_url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # First latitude and longitude spans on the page
        latitude_span = soup.find('span', class_='latitude')
        longitude_span = soup.find('span', class_='longitude')

        # Convert only when both spans are present; otherwise keep NaN
        if latitude_span and longitude_span:
            latitude, longitude = convert_coordinates(latitude_span.text, longitude_span.text)
    else:
        print(f"Error: Failed to retrieve the page {new_url}. Status code: {response.status_code}")

    coordinate_records.append({'Latitude': latitude, 'Longitude': longitude})

# Build the final DataFrame once from all records instead of concatenating
# one single-row frame per park.
final_df = pd.DataFrame(coordinate_records, columns=['Latitude', 'Longitude'])

print(final_df)

Getting https://en.wikipedia.org/wiki/Admiral_William_Standley_State_Recreation_Area
Getting https://en.wikipedia.org/wiki/Ahjumawi_Lava_Springs_State_Park
Getting https://en.wikipedia.org/wiki/Albany_State_Marine_Reserve
Getting https://en.wikipedia.org/wiki/Anderson_Marsh_State_Historic_Park
Getting https://en.wikipedia.org/wiki/Andrew_Molera_State_Park
Getting https://en.wikipedia.org/wiki/Angel_Island_(California)
Getting https://en.wikipedia.org/wiki/A%C3%B1o_Nuevo_State_Park
Getting https://en.wikipedia.org/wiki/Antelope_Valley_California_Poppy_Reserve
Getting https://en.wikipedia.org/wiki/Antelope_Valley_Indian_Museum_State_Historic_Park
Getting https://en.wikipedia.org/wiki/Anza-Borrego_Desert_State_Park
Getting https://en.wikipedia.org/wiki/Armstrong_Redwoods_State_Natural_Reserve
Getting https://en.wikipedia.org/wiki/Arthur_B._Ripley_Desert_Woodland_State_Park
Getting https://en.wikipedia.org/wiki/Asilomar_State_Beach
Getting https://en.wikipedia.org/wiki/Auburn_State_Recreat

In [161]:
# Attach the scraped coordinates to the park table. The frames are joined side
# by side on their row index, so row i of final_df must describe the park in
# row i of SP_table. A length mismatch would otherwise be padded with NaN
# silently, so it is reported explicitly.
if len(SP_table) != len(final_df):
    raise ValueError(
        f'Row count mismatch: SP_table has {len(SP_table)} rows but final_df has '
        f'{len(final_df)}; coordinates cannot be matched to parks by position'
    )

# Plain column-wise concat; adding keys only to drop that level again is not needed
final_SP_table = pd.concat([SP_table, final_df], axis=1)
final_SP_table

Unnamed: 0,State Park,Classification,County,Size (acres),Size (ha),Year established,Remarks,Latitude,Longitude
0,Admiral William Standley State Recreation Area,State recreation area,Mendocino,45.00,18,1944,Boasts redwoods plus salmon and steelhead fish...,39.6464,-123.6169
1,Ahjumawi Lava Springs State Park,State park,Shasta,5930.00,2400,1975,Preserves a wilderness of freshwater springs a...,41.1361,-121.4178
2,Albany State Marine Reserve,Park property,Alameda,,,1985,,37.8869,-122.2978
3,Anderson Marsh State Historic Park,State historic park,Lake,1298.00,525,1982,Preserves a tule marsh and ancient archaeologi...,38.9236,-122.625
4,Andrew Molera State Park,State park,Monterey,4766.00,1929,1968,Offers a primitive walk-in campground on the B...,36.2833,-121.8333
...,...,...,...,...,...,...,...,...,...
274,Will Rogers State Historic Park,State historic park,Los Angeles,189.00,76,1944,,34.0561,-118.5122
275,William B. Ide Adobe State Historic Park,State historic park,Tehama,3.90,1.6,1951,,40.1967,-122.2253
276,Woodland Opera House State Historic Park,State historic park,Yolo,0.26,0.11,1980,,38.6778,-121.7708
277,Woodson Bridge State Recreation Area,State recreation area,Tehama,323.00,131,1959,,39.9175,-122.0914
