### NWS Forecast Data
The NWS doesn't store historical forecast data, so this notebook will just be for prototyping the scraping functions for the DAG (i.e., no preliminary scrape and data load as with the USCRN data).

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Time Libraries
import datetime as dt 
import pytz
from tzwhere import tzwhere 

First, let's save the NWS forecast URL for each station location back into `data/locations.csv` -- this will make it easier to retrieve the forecast for each station as part of our DAG.

In [3]:
def getUrl(row):
  """Build the NWS digital-forecast url for one row, using its 'latitude' and 'longitude' values"""
  lat, lon = row['latitude'], row['longitude']
  base = "https://forecast.weather.gov/MapClick.php"
  query = f"lat={lat}&lon={lon}&unit=0&lg=english&FcstType=digital&menu=1"
  return f"{base}?{query}"

# Add (or refresh) the 'nws_url' column and write the table back to the same file
locations = pd.read_csv("data/locations.csv")
locations = locations.assign(nws_url=locations.apply(getUrl, axis=1))
locations.to_csv("data/locations.csv", index=False)

In [51]:
# from tzwhere import tzwhere

# tzw = tzwhere.tzwhere()

# def local_time(row): 
#   return tzw.tzNameAt(row['latitude'], row['longitude'])
  
# locations.apply(local_time, axis=1)

  self.timezoneNamesToPolygons[tzname] = WRAP(polys)


0      America/Anchorage
1      America/Anchorage
2          America/Sitka
3           America/Nome
4      America/Anchorage
5      America/Anchorage
6      America/Anchorage
7           America/Nome
8      America/Anchorage
9         America/Juneau
10     America/Anchorage
11    America/Metlakatla
12     America/Anchorage
13     America/Anchorage
14     America/Anchorage
15     America/Anchorage
16     America/Anchorage
17     America/Anchorage
18       America/Yakutat
19     America/Anchorage
20     America/Anchorage
21          America/Nome
22     America/Anchorage
dtype: object

#### Prototyping 
Let's scrape the tabular data from one of the NWS pages and transform it into a dataframe.

In [4]:
from random import choice

# Fetch the forecast page of one randomly chosen station.
# NOTE(review): `url` actually holds the requests Response, not a url string;
# the pick is unseeded, so each run may scrape a different station.
url = requests.get(choice(list(locations['nws_url'])))
soup = BeautifulSoup(url.content, "html.parser")

# The forecast grid is taken from the sixth <table> on the page (index 5)
# NOTE(review): positional lookup -- presumably breaks if the page layout changes
tables = soup.find_all('table')
data = tables[5].find_all("tr")

# Data tables are divided with colspan elements
# The first <tr> is used as the divider; `indices` collects every position where an equal row occurs
colspan = data[0]
indices = [i for i, x in enumerate(data) if x == colspan]

# Rows between the first two dividers, and rows after the second divider
dt1 = data[indices[0]+1:indices[1]] # first 24hrs
dt2 = data[indices[1]+1:] # next 24hrs

# Combining these into one dataframe
def getDF(l1, l2):
  """Combine the <tr> rows of both table halves into a single dataframe.

  Each row is read as [label, value, value, ...] from its <font> cells. The
  label becomes a column name; values from a label seen again (second half)
  are appended to the values already collected for it (first half).

  Args:
    l1: <tr> elements of the first 24hrs
    l2: <tr> elements of the next 24hrs

  Returns:
    DataFrame with one column per row label; blank 'Date' cells are
    forward-filled with the last date seen.

  Raises:
    KeyError: if no row is labelled 'Date'.
  """
  import sys  # local import: only needed for the size printout below

  data_map = {}

  for ele in [*l1, *l2]:
    row = [x.getText() for x in ele.find_all("font")]
    if not row: # no <font> cells -> nothing to use as a label
      continue

    label, values = row[0], row[1:]
    if label not in data_map: # elements from l1
      data_map[label] = values
    else: # elements from l2
      data_map[label].extend(values)

  df = pd.DataFrame(data_map)
  # Blank the empty date cells with a single .loc assignment (chained
  # indexing may write to a temporary copy), then forward-fill
  df.loc[df['Date'] == "", 'Date'] = np.nan
  df['Date'] = df['Date'].ffill()

  # Note: getsizeof on the dict only measures the outer container, not the
  # lists it holds, so the two numbers are not directly comparable
  print(sys.getsizeof(df), sys.getsizeof(df.to_dict()))
  return df

# Preview five random rows of the combined frame (unseeded, so the rows differ between runs)
getDF(dt1, dt2).sample(5)

45470 656


Unnamed: 0,Date,Hour (AKST),Temperature (°F),Dewpoint (°F),Wind Chill (°F),Surface Wind (mph),Wind Dir,Gust,Sky Cover (%),Precipitation Potential (%),Relative Humidity (%),Rain,Thunder,Snow,Freezing Rain,Sleet
26,02/16,18,26,20,20,5,NE,,74,46,79,--,--,Chc,--,--
11,02/16,3,2,-4,2,1,E,,92,37,75,--,--,Chc,--,--
42,02/17,10,23,19,14,7,S,,96,82,85,--,--,Ocnl,--,--
14,02/16,6,2,-4,2,0,SW,,86,45,76,--,--,Chc,--,--
4,02/15,20,9,3,9,1,NW,,34,1,77,--,--,--,--,--


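Notice the apparent size penalty you get from representing data as a dataframe versus a simpler data type like a dictionary. One caveat: `sys.getsizeof` on a dictionary only counts the outer container, not the lists it refers to, whereas on a dataframe it reports the deep memory usage -- so the gap shown above is overstated. We'll still keep this in mind when we refactor our code next.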

#### Refactoring

In [1]:
# ## Old Refactor

# def getDict(lists:list):  # Refactoring from getDF to process all the tables, not just two
#   """Unnest list of tabular data and combine into one dictionary""" 
#   ls = list(itertools.chain.from_iterable(lists)) # Unnest lists 
#   # Note: each list in lists needs to have a field inserted for station_location prior to being fed into this function
#   data_map = {}
#   for ele in ls:
#     row = [x.getText() for x in ele.find_all("font")]
#     if row[0] not in data_map.keys(): # elements from l1
#       data_map[row[0]] = row[1:]
#     else: # elements from l2
#       data_map[row[0]].extend(row[1:])

#   data_map['Date'] = ffList(data_map['Date'])

#   return data_map


# def getForecast():
#   """Get data from soup objects created from NWS urls in locations.csv, then write to .csv"""
 
#   locations = pd.read_csv("data/locations.csv")
#   loc_dict = dict(zip(locations['station_location'], locations['nws_url']))
  
#   ls = []
#   for station, url in loc_dict.items():
#     result = requests.get(url)
#     soup = BeautifulSoup(result.content, "html.parser")
#     data = soup.find_all("table")[5].find_all("tr")

#     # 'data' is divided into two tables by two colspan elements
#     colspan = data[0]
#     data = [x for  x in data if x != colspan] # add station_name here 
#     ls.append(data)

#   return getDict(ls)
# # pd.DataFrame(getForecast())  

# pd.DataFrame(getForecast())

##### utils.py

In [3]:
def ffList(ls:list) -> list:
  """Forward-fill a list in place (list analogue of pandas ffill()).

  Every falsy entry except the first is overwritten with the entry just
  before it. The same list object is modified and returned.
  """
  for pos, value in enumerate(ls):
    if value or pos == 0: # keep real values; the first slot has nothing to copy from
      continue
    ls[pos] = ls[pos-1]
  return ls

def getColsFromTable(table:list, location:str):
  """Turn a list of <tr> elements into a list of columns, plus a 'location' column.

  Each <tr> is a row in the page's landscape layout and becomes one column:
  the texts of its <font> cells. A column ['location', location x 24] is
  inserted at positions 1 and 19 (the same list object both times).
  """
  # NOTE(review): 24 and 19 presumably assume 24 hourly values and a fixed
  # number of rows per table half -- confirm against the live page
  cols = []
  for tr in table:
    cols.append([font.getText() for font in tr.find_all("font")])

  location_col = ['location'] + [location for _ in range(24)]
  for position in (1, 19): # 19 -> for second table
    cols.insert(position, location_col)
  return cols

def getDict(col_list:list):
  """Merge a list of columns into one dictionary keyed by column header.

  Each column is a list whose first item is the header and whose remaining
  items are the values. Columns sharing a header are concatenated in the
  order they appear. The 'Date' values are forward-filled before returning.
  """
  data_map = {}
  for col in col_list:
    header, values = col[0], col[1:]
    if header in data_map: # header already seen (e.g. second half of a table)
      data_map[header].extend(values)
    else: # first time this header appears
      data_map[header] = values
  data_map['Date'] = ffList(data_map['Date'])
  return data_map

##### nws_dag.py

In [4]:
def getForecast():
  """Get dictionary of forecast data for next 48 hours from various points in Alaska

  Reads 'station_location' and 'nws_url' from data/locations.csv, fetches each
  station's page, and merges the table columns of all stations into one
  dictionary via getColsFromTable() and getDict().

  Returns:
    dict mapping column header -> list of values across all stations.

  Raises:
    requests.exceptions.RequestException: if a page times out or returns an HTTP error status.
    IndexError: if a page has fewer than six <table> elements.
  """
  locations = pd.read_csv("data/locations.csv")
  # Note: building a dict keeps only the last url for a repeated station_location
  loc_dict = dict(zip(locations['station_location'], locations['nws_url']))

  col_list = []
  for location, url in loc_dict.items():
    # Without a timeout a stalled connection would block the task indefinitely
    result = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page
    result.raise_for_status()
    soup = BeautifulSoup(result.content, "html.parser")
    table48 = soup.find_all("table")[5].find_all("tr") # list of <tr> elements from main data table (really two tables combined: one for each day in next 48h period)
    colspan = table48[0]  # divided into two tables by two colspan elements
    table48 = [tr for tr in table48 if tr != colspan] # remove colspan elements

    cols = getColsFromTable(table48, location)
    col_list.extend(cols)

  return getDict(col_list)

# def transformDF(myDict): 
#   """Cast dictionary from getForecast() to a dataframe, transform, and write (append) to .csv"""
#   # We want to write to .csv instead of just uploading to BigQuery because NWS does not store their forecast data.
#   # So it's a good idea to keep a local backup in case we screw up our BigQuery table, or vice versa. 


# def uploadCSV():
#   """Upload .csv created by transformDF() to BigQuery"""


In [5]:
# Scrape every station once (one HTTP request per station) and keep the raw dictionary for the transform steps below
test_dict = getForecast()

In [18]:
# Forecast dictionary -> dataframe with lower-cased column names
tdf = pd.DataFrame(test_dict).rename(columns=str.lower)
# Skip file lines 2..2,099,999: keeps the header, the first data row, and everything from line 2,100,000 on
uscrn = pd.read_csv("data/uscrn.csv", skiprows=list(range(2,2100000)))

In [19]:
# Peek at two random USCRN rows to see the column names and formats the forecast data should match
uscrn.sample(2)

Unnamed: 0,station_location,wbanno,utc_date,utc_time,lst_date,lst_time,crx_vn,longitude,latitude,t_calc,...,solarad_min_flag,sur_temp_type,sur_temp,sur_temp_flag,sur_temp_max,sur_temp_max_flag,sur_temp_min,sur_temp_min_flag,rh_hr_avg,rh_hr_avg_flag
54784,Sand_Point,25630,20230202,2100,20230202,1200,2.424,-160.47,55.35,1.8,...,0,C,1.5,0,3.1,0,1.0,0,-9999,0
8012,Tok,96404,20220831,2300,20220831,1400,2.514,-141.21,62.74,18.4,...,0,C,23.8,0,26.4,0,22.0,0,36,0


In [20]:
# First two forecast rows, for comparison with the USCRN columns
tdf.head(2)

Unnamed: 0,date,location,hour (akst),temperature (°f),dewpoint (°f),wind chill (°f),surface wind (mph),wind dir,gust,sky cover (%),precipitation potential (%),relative humidity (%),rain,thunder,snow,freezing rain,sleet
0,02/16,Fairbanks,12,7,-2,-2,5,E,,92,30,66,--,--,Chc,--,--
1,02/16,Fairbanks,13,9,0,0,5,E,,92,30,66,--,--,Chc,--,--


In [21]:
from datetime import date, datetime
import pytz
import re

# 1.) Add year to date column
# 2.) Format 'date' (YYYYMMDD) to match lst_date
# Four-digit year of the machine's current local date, as a string
cur_year = date.today().strftime("%Y")

def date_str(d, today=None):
  """Convert a scraped 'MM/DD' string to 'YYYYMMDD'.

  The scraped date has no year, so it is taken from `today`. When the
  scraped month is more than six months away from today's month, the date is
  assumed to lie on the other side of New Year and the year is shifted by
  one (e.g. '01/01' seen on Dec 31 belongs to next year).

  Args:
    d: date string such as '02/16'
    today: reference date exposing .year and .month; defaults to date.today()

  Returns:
    The year followed by `d` with every '/' removed.
  """
  if today is None:
    today = date.today()
  year = today.year

  month_match = re.match(r"\s*(\d{1,2})/", d)
  if month_match: # strings without a leading month just get today's year
    month_gap = int(month_match.group(1)) - today.month
    if month_gap > 6: # e.g. '12/31' seen in January -> previous year
      year -= 1
    elif month_gap < -6: # e.g. '01/01' seen in December -> next year
      year += 1

  d = re.sub("/","", d)
  return f"{year}{d}"
# Only convert values still in the scraped MM/DD form, so re-running this cell
# does not prefix the year a second time
tdf['date'] = tdf['date'].map(lambda d: date_str(d) if "/" in d else d)

# Distinct forecast dates after conversion
tdf['date'].drop_duplicates()

0     20230216
12    20230217
36    20230218
Name: date, dtype: object

In [31]:
# 3.) Format 'hour (akst)' to match 'lst_time' (HH00 -- no leading 0 for hour)
# Inspect both representations first: the first two lst_time values versus two random distinct forecast hours
print(uscrn['lst_time'].iloc[0:2])
print(tdf['hour (akst)'].drop_duplicates().sample(2))

0    1300
1    1900
Name: lst_time, dtype: int64
9     21
16    04
Name: hour (akst), dtype: object


In [36]:
# Check that 'utc_date' parses as YYYYMMDD; the result is only displayed, not stored
pd.to_datetime(uscrn['utc_date'], format="%Y%m%d")

0       2002-08-09
1       2022-10-02
2       2022-10-02
3       2022-10-02
4       2022-10-02
           ...    
62940   2023-02-16
62941   2023-02-16
62942   2023-02-16
62943   2023-02-16
62944   2023-02-16
Name: utc_date, Length: 62945, dtype: datetime64[ns]

In [None]:
# 4.) Calculate UTC datetime from date and hour columns 

# # find tz string: [tz for tz in pytz.all_timezones if tz.find("Alaska") != -1]
# tz = pytz.timezone('US/Alaska') 


# def dateUTC(row): 
#   dt = datetime.strptime(row, "%m/%d/%y")
#   tz.utcoffset(dt)



# tdf['date_hour'] = date_hour.map(dateUTC)
# 5.) Split UTC datetime into 'utc_date' and 'utc_time' columns, matching formatting (YYYYMMDD and HHmm)
