In [82]:
import re
from io import StringIO
from time import sleep

import dateutil
import dateutil.parser
import dateutil.tz
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By


In [83]:
#-------------------Extract-------------------------
# Scrape the current conditions of each National Weather Service page listed in
# `urls` and collect them, one row per station, in `combined_df`.
urls = ["https://forecast.weather.gov/MapClick.php?lat=57.0826&lon=-135.2692#.Y-vs_9LMJkg",
        'https://forecast.weather.gov/MapClick.php?lat=45.5118&lon=-122.6756#.Y-vtHNLMJkg',
        ]

# Seconds to wait for the server before giving up instead of hanging the kernel.
REQUEST_TIMEOUT_S = 30

# One single-row frame per station, in the order of `urls`.
station_frames = []

for url in urls:
    r = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    # Fail loudly on an HTTP error instead of trying to scrape an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    #container holding the large current-temperature reading
    item1 = soup.find_all(id='current_conditions-summary')

    #raw data
    temp_f = [item.find(class_="myforecast-current-lrg").get_text() for item in item1]
    temp_min = soup.find('p', {'class': 'temp temp-low'}).text.strip()
    temp_max = soup.find('p', {'class': 'temp temp-high'}).text.strip()

    #df of temperatures (the scalar min/max are broadcast against the temp list)
    df_temperature = pd.DataFrame({"temp": temp_f, 'tempmin': temp_min, 'tempmax': temp_max})

    #df_2 is a df of current conditions in detail
    table = soup.find_all('table')
    # read_html expects a path or file-like object; passing the markup itself as a
    # string is deprecated, so it is wrapped in StringIO.
    df_2 = pd.read_html(StringIO(str(table)))[0]
    # The table has one (label, value) pair per row. Pivoting on the label spreads the
    # values diagonally; ffill + dropna then keeps the single fully populated row.
    df_2 = df_2.pivot(columns=0, values=1).ffill().dropna().reset_index(drop=True)

    #merge both dataframes
    temp_df = pd.concat([df_temperature, df_2], axis=1)

    #scrape latitude, longitude, and elevation
    lat_lon_elev = soup.find('span', {'class': 'smallTxt'}).text.strip()
    # Unpacking raises ValueError unless exactly three numbers are found.
    # NOTE(review): only the digits are captured, so the scraped longitude comes out
    # positive even though both URLs request a negative (western) longitude.
    lat, lon, elev = re.findall(r'[-+]?\d*\.\d+|\d+', lat_lon_elev)

    #scrape name
    station = soup.find('h2', {'class': 'panel-title'}).text.strip()

    #add location, lat, long, and elev to the station's row
    temp_df['elevation_ft'] = elev
    temp_df['latitude'] = lat
    temp_df['longitude'] = lon
    temp_df['weather_station'] = station

    station_frames.append(temp_df)

# Concatenate once instead of growing a frame inside the loop. The frames are reversed
# so that the most recently scraped station comes first, which is the row order the
# rest of the notebook relies on when it joins other data by position.
if station_frames:
    combined_df = pd.concat(station_frames[::-1], ignore_index=True, sort=False)
else:
    combined_df = pd.DataFrame()

display(combined_df)



Unnamed: 0,temp,tempmin,tempmax,Barometer,Dewpoint,Humidity,Last update,Visibility,Wind Speed,elevation_ft,latitude,longitude,weather_station,Wind Chill
0,32°F,Low: 30 °F,High: 45 °F,30.43 in (1030.48 mb),30°F (-1°C),92%,15 Feb 09:20 AM PST,1.00 mi,N 0 MPH,20,45.59578,122.60917,"Portland, Portland International Airport (KPDX)",
1,39°F,Low: 26 °F,High: 36 °F,29.56 in (1000.9 mb),33°F (1°C),79%,15 Feb 7:53 am AKST,5.00 mi,S 9 G 21 mph,13,57.05,135.36,Sitka - Sitka Airport (PASI),33°F (1°C)


In [84]:
# Scrape the daily precipitation of each Weather Underground history page listed in
# `URLS` with Selenium and append it to the NWS data as the `precip` column.
URLS = ["https://www.wunderground.com/history/daily/us/or/portland",
        'https://www.wunderground.com/history/daily/us/ak/sitka/PASI'
        ]

# Absolute XPaths into the rendered page; they must be updated if the site layout changes.
POPUP_CLOSE_XPATH = '/html/body/div[9]/div[3]/div/div/div/a'
PRECIP_XPATH = "/html/body/app-root/app-history/one-column-layout/wu-header/sidenav/mat-sidenav-container/mat-sidenav-content/div/section/div[2]/div[1]/div[3]/div[1]/div/lib-city-history-summary/div/div[2]/table/tbody[2]/tr/td[1]"

# Precipitation text of each page, in the order of `URLS`.
precip_values = []

for URL in URLS:

    #create selenium web driver; with no explicit path Selenium locates the driver itself,
    #so the notebook no longer depends on a machine-specific absolute path
    driver = webdriver.Chrome()
    try:
        driver.get(URL)
        #give the JavaScript-rendered page time to load
        sleep(5)

        #close the pop up ad if it appears; its absence (or a failed click) is not an error
        try:
            driver.find_element(By.XPATH, POPUP_CLOSE_XPATH).click()
        except WebDriverException:
            pass

        #always scrape the precip value, whether or not a pop up had to be closed
        precip = driver.find_element(By.XPATH, PRECIP_XPATH).text
        precip_values.append(precip)
    finally:
        #quit the selenium driver even when scraping fails, so no browser is left running
        driver.quit()

precip_df = pd.DataFrame(precip_values, columns=['precip'])

# Rows are joined by position: combined_df must list the stations in the same order as URLS.
source_df = pd.concat([combined_df, precip_df], axis=1)
display(source_df)

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,temp,tempmin,tempmax,Barometer,Dewpoint,Humidity,Last update,Visibility,Wind Speed,elevation_ft,latitude,longitude,weather_station,Wind Chill,precip
0,32°F,Low: 30 °F,High: 45 °F,30.43 in (1030.48 mb),30°F (-1°C),92%,15 Feb 09:20 AM PST,1.00 mi,N 0 MPH,20,45.59578,122.60917,"Portland, Portland International Airport (KPDX)",,0.0
1,39°F,Low: 26 °F,High: 36 °F,29.56 in (1000.9 mb),33°F (1°C),79%,15 Feb 7:53 am AKST,5.00 mi,S 9 G 21 mph,13,57.05,135.36,Sitka - Sitka Airport (PASI),33°F (1°C),0.02


In [85]:
#-----------Data Transformations-----------------
# NOTE: every statement below overwrites a column of source_df in place, so this
# cell is meant to run once, right after the extract cells have rebuilt source_df.

# First integer in a string, keeping a leading minus sign so that sub-zero readings
# are not silently turned into positive values.
SIGNED_INT = r'(-?\d+)'
# First integer in a string, for quantities that cannot be negative.
UNSIGNED_INT = r'(\d+)'
# Factor applied to the barometer reading in inches to express it in millibars.
MB_PER_INHG = 33.8639
# UTC offsets (in seconds) of the zone abbreviations found in 'Last update'.
# Daylight-saving abbreviations are included: an abbreviation dateutil does not know
# yields a naive datetime, which would then be converted from the machine's local zone.
US_TZ_OFFSETS = {
    "EST": -5 * 3600, "EDT": -4 * 3600,
    "CST": -6 * 3600, "CDT": -5 * 3600,
    "MST": -7 * 3600, "MDT": -6 * 3600,
    "PST": -8 * 3600, "PDT": -7 * 3600,
    "AKST": -9 * 3600, "AKDT": -8 * 3600,
    "HST": -10 * 3600, "HDT": -9 * 3600,
}


def _first_match(column, pattern):
    """Return, for each row of `column`, the first capture of `pattern` (NaN when absent).

    Values are cast to str first so that missing (NaN) entries yield NaN instead of
    breaking the `.str` accessor.
    """
    return column.astype(str).str.extract(pattern, expand=False)


# Convert 'latitude' and 'longitude' columns to float type
source_df[['latitude', 'longitude']] = source_df[['latitude', 'longitude']].astype(float)

# Convert 'elevation_ft' column to int type
source_df['elevation_ft'] = source_df['elevation_ft'].astype(int)

# Extract the numeric part of each temperature-like string (e.g. "32°F", "Low: 30 °F",
# "30°F (-1°C)") and convert it to int; only the first number, the Fahrenheit value, is kept
for temperature_column in ['temp', 'tempmin', 'tempmax', 'Dewpoint']:
    source_df[temperature_column] = _first_match(source_df[temperature_column], SIGNED_INT).astype(int)

# Keep the first number of the wind speed text (the sustained speed) as int;
# readings without any number become 0
source_df['Wind Speed'] = _first_match(source_df['Wind Speed'], UNSIGNED_INT).fillna(0).astype(int)

# Convert 'Humidity' column to int type
source_df['Humidity'] = _first_match(source_df['Humidity'], UNSIGNED_INT).astype(int)

# Convert 'Barometer' column to float type, and convert inches to millibars
# (rows without an "<number> in" reading become NaN)
source_df['Barometer'] = (
    _first_match(source_df['Barometer'], r'(\d+(?:\.\d+)?)\s*in\b')
    .astype(float)
    .mul(MB_PER_INHG)
    .round(2)
)

# Convert 'Visibility' column to float type
source_df['Visibility'] = _first_match(source_df['Visibility'], r'(\d+\.\d+|\d+)').astype(float).round(2)

# Convert 'Last update' column to UTC
source_df['Last update'] = source_df['Last update'].apply(
    lambda stamp: dateutil.parser.parse(stamp, tzinfos=US_TZ_OFFSETS).astimezone(dateutil.tz.tzutc())
)
# 'datetime' is the calendar date of the update in UTC, not in the station's local time
source_df['datetime'] = pd.to_datetime(source_df['Last update'].dt.strftime('%Y-%m-%d'))

# Make wind chill a float and only keep the degrees F. The column only exists when at
# least one station reports a wind chill, so create it empty otherwise.
if 'Wind Chill' in source_df.columns:
    source_df['Wind Chill'] = _first_match(source_df['Wind Chill'], SIGNED_INT).astype(float)
else:
    source_df['Wind Chill'] = float('nan')

# Change precip data type to float
source_df['precip'] = source_df['precip'].astype(float)

#map a weather station title to the short city name used by the historical data
def rename_station(value):
    """Return the city name for a known weather station title, or None otherwise."""
    known_stations = (
        ('Portland, Portland International Airport (KPDX)', 'Portland'),
        ('Sitka - Sitka Airport (PASI)', 'Sitka'),
    )
    for station_title, city in known_stations:
        if value == station_title:
            return city
    # Unrecognised titles deliberately fall through to None.
    return None

#add the short city name derived from the weather station title
source_df['name'] = source_df['weather_station'].map(rename_station)

#change the names and order of columns to better fit the historical data;
#reindex keeps only the necessary columns, in this order
column_renames = {
    'Humidity': 'humidity',
    'Wind Speed': 'windspeed',
    'Visibility': 'visibility',
    'Wind Chill': 'windchill',
    'Dewpoint': 'dewpoint',
}
final_columns = [
    'name', 'datetime', 'tempmax', 'tempmin', 'temp', 'windchill',
    'dewpoint', 'humidity', 'precip', 'windspeed', 'visibility',
]
source_df = source_df.rename(columns=column_renames).reindex(columns=final_columns)

display(source_df)
source_df.info()

Unnamed: 0,name,datetime,tempmax,tempmin,temp,windchill,dewpoint,humidity,precip,windspeed,visibility
0,Portland,2023-02-15,45,30,32,,30,92,0.0,0,1.0
1,Sitka,2023-02-15,36,26,39,33.0,33,79,0.02,9,5.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   name        2 non-null      object        
 1   datetime    2 non-null      datetime64[ns]
 2   tempmax     2 non-null      int64         
 3   tempmin     2 non-null      int64         
 4   temp        2 non-null      int64         
 5   windchill   1 non-null      float64       
 6   dewpoint    2 non-null      int64         
 7   humidity    2 non-null      int64         
 8   precip      2 non-null      float64       
 9   windspeed   2 non-null      int64         
 10  visibility  2 non-null      float64       
dtypes: datetime64[ns](1), float64(3), int64(6), object(1)
memory usage: 304.0+ bytes


In [86]:
#load the historical data set and rename its columns to match the scraped data
historical_column_renames = {'feelslikemin': 'windchill', 'dew': 'dewpoint'}
historical_df = (
    pd.read_csv('./data/3 cities weather.csv')
    .rename(columns=historical_column_renames)
)
display(historical_df.head(3))

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,windchill,feelslike,dewpoint,humidity,...,solarenergy,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations
0,New York City,2013-02-14,46.2,33.7,38.7,43.1,26.0,35.0,27.3,65.6,...,14.5,6,,2013-02-14T06:50:56,2013-02-14T17:30:00,0.13,"Snow, Rain, Partially cloudy",Partly cloudy throughout the day with early mo...,rain,"72505394728,KEWR,KLGA,72502014734,KNYC,7250301..."
1,New York City,2013-02-15,53.3,36.9,43.7,53.3,30.8,40.5,31.1,62.0,...,14.8,6,,2013-02-15T06:49:38,2013-02-15T17:31:12,0.17,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,"72505394728,KEWR,KLGA,72502014734,KNYC,7250301..."
2,New York City,2013-02-16,41.6,32.0,35.9,33.9,22.3,29.0,25.2,66.2,...,6.5,3,,2013-02-16T06:48:19,2013-02-16T17:32:24,0.2,"Rain, Overcast",Cloudy skies throughout the day with early mor...,rain,"72505394728,KEWR,KLGA,72502014734,KNYC,7250301..."


In [None]:
#--------------------------------load to BigQuery-----------------------------


In [87]:
# Unused URLs kept for reference only. They are bare string expressions, so this cell
# just echoes the last one as its output and feeds nothing into the pipeline.
# National Weather Service page (same MapClick.php endpoint as the entries in `urls`)
'https://forecast.weather.gov/MapClick.php?lat=40.7143&lon=-74.006#.Y-vtQtLMJkg'
# Weather Underground daily-history page (same site as the entries in `URLS`, scraped with Selenium)
'https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA'

'https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA'