In [2]:
import logging
import os
import re
import sys
from io import StringIO
from time import sleep
from typing import List

import dateutil
import dateutil.parser
import dateutil.tz
import pandas as pd
import pyarrow
import requests
from bs4 import BeautifulSoup
from google.cloud import bigquery
from google.oauth2 import service_account
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

In [3]:
#-------------------Extract-------------------------
# National Weather Service point-forecast pages to scrape, one per station.
urls = ["https://forecast.weather.gov/MapClick.php?lat=57.0826&lon=-135.2692#.Y-vs_9LMJkg",
        'https://forecast.weather.gov/MapClick.php?lat=45.5118&lon=-122.6756#.Y-vtHNLMJkg',
        ]

# Seconds to wait for the forecast server before giving up on a request.
REQUEST_TIMEOUT_S = 30


def scrape_current_conditions(url):
    """Scrape one forecast page into a DataFrame of raw (string) readings.

    Parameters
    ----------
    url : str
        Address of the forecast page to download.

    Returns
    -------
    pandas.DataFrame
        Columns ``temp``, ``tempmin``, ``tempmax``, one column per label of the
        first HTML table on the page, plus ``elevation_ft``, ``latitude``,
        ``longitude`` and ``weather_station``. All values are unparsed text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the location line does not contain exactly three numbers.
    """
    r = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    # Current temperature plus the first low/high found on the page.
    summary_blocks = soup.find_all(id='current_conditions-summary')
    temp_f = [item.find(class_="myforecast-current-lrg").get_text() for item in summary_blocks]
    temp_min = soup.find('p', {'class': 'temp temp-low'}).text.strip()
    temp_max = soup.find('p', {'class': 'temp temp-high'}).text.strip()
    df_temperature = pd.DataFrame({"temp": temp_f, 'tempmin': temp_min, 'tempmax': temp_max})

    # The first table holds label/value pairs; pivot them into a single wide row.
    # StringIO is used because passing literal HTML to read_html is deprecated.
    tables = soup.find_all('table')
    details = pd.read_html(StringIO(str(tables)))[0]
    details = details.pivot(columns=0, values=1).ffill().dropna().reset_index(drop=True)

    # Latitude, longitude and elevation share one line of small text.
    # NOTE(review): only digits are captured, so a hemisphere suffix is lost and
    # western longitudes come out positive (see the displayed output) — confirm
    # whether downstream consumers need signed coordinates.
    lat_lon_elev = soup.find('span', {'class': 'smallTxt'}).text.strip()
    numbers = re.findall(r'[-+]?\d*\.\d+|\d+', lat_lon_elev)
    if len(numbers) != 3:
        raise ValueError(f"expected latitude, longitude and elevation in {lat_lon_elev!r}")
    lat, lon, elev = numbers

    station = soup.find('h2', {'class': 'panel-title'}).text.strip()

    return pd.concat([df_temperature, details], axis=1).assign(
        elevation_ft=elev,
        latitude=lat,
        longitude=lon,
        weather_station=station,
    )


station_frames = [scrape_current_conditions(url) for url in urls]

# Rows are stacked in reverse URL order (last URL first). This matches the
# original prepend-in-a-loop behaviour, which later positional joins rely on.
if station_frames:
    combined_df = pd.concat(station_frames[::-1], ignore_index=True, sort=False)
else:
    combined_df = pd.DataFrame()

display(combined_df)



Unnamed: 0,temp,tempmin,tempmax,Barometer,Dewpoint,Humidity,Last update,Visibility,Wind Speed,elevation_ft,latitude,longitude,weather_station
0,33°F,Low: 30 °F,High: 45 °F,30.44 in (1030.82 mb),30°F (-1°C),88%,15 Feb 10:41 AM PST,3.00 mi,N 0 MPH,20,45.59578,122.60917,"Portland, Portland International Airport (KPDX)"
1,35°F,Low: 26 °F,High: 36 °F,29.62 in (1002.9 mb),31°F (-1°C),85%,15 Feb 9:53 am AKST,1.75 mi,Calm,13,57.05,135.36,Sitka - Sitka Airport (PASI)


In [4]:
# Weather Underground daily-history pages, one per station.
# The precip values are attached to combined_df by row position below, so this
# list must follow combined_df's row order (Portland first, then Sitka).
URLS = ["https://www.wunderground.com/history/daily/us/or/portland",
        'https://www.wunderground.com/history/daily/us/ak/sitka/PASI'
        ]

# NOTE(review): machine-specific absolute path — consider moving it to configuration.
CHROMEDRIVER_PATH = r"C:/home/reed/.cache/selenium/chromedriver/linux64/109.0.5414.74"
# Close button of the advert pop-up that sometimes covers the page.
POPUP_CLOSE_XPATH = '/html/body/div[9]/div[3]/div/div/div/a'
# Table cell holding the day's precipitation value.
PRECIP_XPATH = "/html/body/app-root/app-history/one-column-layout/wu-header/sidenav/mat-sidenav-container/mat-sidenav-content/div/section/div[2]/div[1]/div[3]/div[1]/div/lib-city-history-summary/div/div[2]/table/tbody[2]/tr/td[1]"
# Seconds to let the page finish rendering before it is inspected.
PAGE_LOAD_WAIT_S = 5

precip_values = []

for URL in URLS:

    #create selenium web driver
    driver = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        driver.get(URL)
        sleep(PAGE_LOAD_WAIT_S)

        # Best effort: dismiss the pop-up if it is present. A missing or
        # unclickable pop-up is not an error.
        try:
            driver.find_element(By.XPATH, POPUP_CLOSE_XPATH).click()
        except WebDriverException:
            pass

        # Always read the value. Previously it was only read when the pop-up
        # was absent, which left `precip` undefined or stale after a click.
        precip_values.append(driver.find_element(By.XPATH, PRECIP_XPATH).text)
    finally:
        # Quit the browser even when scraping fails, so no process is leaked.
        driver.quit()

precip_df = pd.DataFrame({'precip': precip_values})

# Positional (index-aligned) join: row i of precip_df goes with row i of combined_df.
source_df = pd.concat([combined_df, precip_df], axis=1)
display(source_df)

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,temp,tempmin,tempmax,Barometer,Dewpoint,Humidity,Last update,Visibility,Wind Speed,elevation_ft,latitude,longitude,weather_station,precip
0,33°F,Low: 30 °F,High: 45 °F,30.44 in (1030.82 mb),30°F (-1°C),88%,15 Feb 10:41 AM PST,3.00 mi,N 0 MPH,20,45.59578,122.60917,"Portland, Portland International Airport (KPDX)",0.0
1,35°F,Low: 26 °F,High: 36 °F,29.62 in (1002.9 mb),31°F (-1°C),85%,15 Feb 9:53 am AKST,1.75 mi,Calm,13,57.05,135.36,Sitka - Sitka Airport (PASI),0.02


In [5]:
#-----------Data Transformations-----------------

# First whole number in a reading, keeping a leading minus sign so sub-zero
# values such as "-4°F" are not turned into positive numbers.
SIGNED_INT_PATTERN = r'(-?\d+)'
# First whole number in a reading that can never be negative.
UNSIGNED_INT_PATTERN = r'(\d+)'
# Multiplier used to turn the barometer reading from inches into millibars.
INCHES_TO_MILLIBARS = 33.8639
# UTC offsets (in seconds) for the time-zone abbreviations in 'Last update'.
# Daylight-time abbreviations are included so summer readings parse as well.
TZ_OFFSETS = {
    "EST": -5 * 3600, "EDT": -4 * 3600,
    "CST": -6 * 3600, "CDT": -5 * 3600,
    "MST": -7 * 3600, "MDT": -6 * 3600,
    "PST": -8 * 3600, "PDT": -7 * 3600,
    "AKST": -9 * 3600, "AKDT": -8 * 3600,
    "HST": -10 * 3600,
}


def _barometer_to_millibars(reading):
    """Return the leading inches value of `reading` in millibars, or NaN if it is not an inches reading."""
    if not isinstance(reading, str) or 'in' not in reading:
        return float('nan')
    return float(reading.split()[0]) * INCHES_TO_MILLIBARS


def _to_utc(text):
    """Parse a 'Last update' string and return it as a UTC-aware datetime."""
    return dateutil.parser.parse(text, tzinfos=TZ_OFFSETS).astimezone(dateutil.tz.tzutc())


# Convert 'latitude' and 'longitude' columns to float type
source_df[['latitude', 'longitude']] = source_df[['latitude', 'longitude']].astype(float)

# Convert 'elevation_ft' column to int type
source_df['elevation_ft'] = source_df['elevation_ft'].astype(int)

# Extract the numeric part of each temperature-like string and convert it to float
for temperature_column in ['temp', 'tempmin', 'tempmax', 'Dewpoint']:
    source_df[temperature_column] = (
        source_df[temperature_column].str.extract(SIGNED_INT_PATTERN, expand=False).astype(float)
    )

# Wind chill is only reported for some stations; parse it when the column exists
if 'Wind Chill' in source_df.columns:
    source_df['Wind Chill'] = source_df['Wind Chill'].str.extract(SIGNED_INT_PATTERN, expand=False).astype(float)

# Keep the first number of the wind speed text; readings without a number (e.g. "Calm") become 0
source_df['Wind Speed'] = source_df['Wind Speed'].str.extract(UNSIGNED_INT_PATTERN, expand=False).fillna(0).astype(float)

# Convert 'Humidity' column to float type
source_df['Humidity'] = source_df['Humidity'].str.extract(UNSIGNED_INT_PATTERN, expand=False).astype(float)

# Convert 'Barometer' column to float type, and convert inches to millibars
source_df['Barometer'] = source_df['Barometer'].apply(_barometer_to_millibars).round(2)

# Convert 'Visibility' column to float type
source_df['Visibility'] = source_df['Visibility'].str.extract(r'(\d+\.\d+|\d+)', expand=False).astype(float).round(2)

# Convert 'Last update' column to UTC, then keep only the (UTC) calendar date in 'datetime'
source_df['Last update'] = source_df['Last update'].apply(_to_utc)
source_df['datetime'] = pd.to_datetime(source_df['Last update'].dt.strftime('%Y-%m-%d'))

#change precip data type to float
source_df['precip'] = source_df['precip'].astype(float)

#rename weather station column to the city
def rename_station(value):
    """Map a full weather-station label to its city name.

    Returns 'Portland' or 'Sitka' for the two known station labels and
    None for any other value.
    """
    city_by_station = {
        'Portland, Portland International Airport (KPDX)': 'Portland',
        'Sitka - Sitka Airport (PASI)': 'Sitka',
    }
    return city_by_station.get(value)

# Derive the short city name from the full station label.
source_df['name'] = source_df['weather_station'].map(rename_station)

# Column names used by the historical data set, keyed by the scraped names.
column_renames = {
    'Humidity': 'humidity',
    'Wind Speed': 'windspeed',
    'Visibility': 'visibility',
    'Wind Chill': 'windchill',
    'Dewpoint': 'dewpoint',
}

# Only these columns are kept, in this order; any that is missing from the
# frame is created and filled with NaN by reindex.
output_columns = [
    'name', 'datetime', 'tempmax', 'tempmin', 'temp', 'windchill',
    'dewpoint', 'humidity', 'precip', 'windspeed', 'visibility',
]

source_df = source_df.rename(columns=column_renames).reindex(columns=output_columns)

display(source_df)

Unnamed: 0,name,datetime,tempmax,tempmin,temp,windchill,dewpoint,humidity,precip,windspeed,visibility
0,Portland,2023-02-15,45.0,30.0,33.0,,30.0,88.0,0.0,0.0,3.0
1,Sitka,2023-02-15,36.0,26.0,35.0,,31.0,85.0,0.02,0.0,1.75


In [12]:
# Unused URLs kept for reference; none of the scraping cells above request them.
# Both lines below are bare string expressions: the first is a no-op and only
# the last one is echoed as this cell's output.
# NWS forecast page (forecast.weather.gov) for lat 40.7143, lon -74.006
'https://forecast.weather.gov/MapClick.php?lat=40.7143&lon=-74.006#.Y-vtQtLMJkg'
# Weather Underground daily-history page for station KLGA (the kind of page read with Selenium)
'https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA'

'https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA'