### NWS Forecast Data
The NWS doesn't store historical forecast data, so this notebook will just be for prototyping the scraping functions for the DAG (i.e., no preliminary scrape and data load as with the USCRN data).

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sys

First, let's save the relevant URL for each station location -- this will make it easier to retrieve the results for each station as part of our DAG.

In [3]:
def getUrl(row):
  """Build the NWS digital-forecast URL for one location record.

  `row` is looked up by the keys 'latitude' and 'longitude'; both values are
  interpolated into the query string exactly as they format.
  """
  lat, lon = row['latitude'], row['longitude']
  return f"https://forecast.weather.gov/MapClick.php?lat={lat}&lon={lon}&unit=0&lg=english&FcstType=digital&menu=1"

# Read the station table, attach each station's forecast URL as 'nws_url',
# and write the augmented table back to the same file.
locations = pd.read_csv("data/locations.csv").assign(
  nws_url=lambda frame: frame.apply(getUrl, axis=1)
)
locations.to_csv("data/locations.csv", index=False)

#### Prototyping 
Let's scrape the tabular data from one of the NWS pages and transform it into a dataframe.

In [4]:
from random import choice

# Fetch the forecast page for one randomly chosen station.
# The timeout keeps this cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() reports an HTTP error here rather than as an
# unrelated-looking failure when the parsed page lacks the expected table.
url = requests.get(choice(list(locations['nws_url'])), timeout=30)
url.raise_for_status()
soup = BeautifulSoup(url.content, "html.parser")

tables = soup.find_all('table')
# Index 5 is where this notebook expects the forecast grid to be
# (assumes the page layout stays the same -- TODO confirm if the scrape breaks).
data = tables[5].find_all("tr")

# Data tables are divided with colspan elements
colspan = data[0]
# Positions of every row equal to the divider row; used to split the two tables
indices = [i for i, x in enumerate(data) if x == colspan]

dt1 = data[indices[0]+1:indices[1]] # first 24hrs
dt2 = data[indices[1]+1:] # next 24hrs

# Combining these into one dataframe
def getDF(l1, l2):
  """Combine the rows of the two 24-hour tables into a single DataFrame.

  Every element of `l1` and `l2` must offer `find_all("font")`, returning
  cells whose `getText()` gives a field name followed by that field's values.
  A field that occurs in both inputs has its `l2` values appended after its
  `l1` values. Neither input list is modified.

  Parameters
  ----------
  l1, l2 : iterable
    Row elements for the first and the second table.

  Returns
  -------
  pandas.DataFrame
    One column per field name; empty-string 'Date' cells are replaced by the
    nearest preceding non-empty date.

  Side effects
  ------------
  Prints `sys.getsizeof` of the frame and of `df.to_dict()`.
  """
  data_map = {}

  for ele in [*l1, *l2]:
    row = [x.getText() for x in ele.find_all("font")]
    # First sighting of a field creates its list; later sightings append to it
    data_map.setdefault(row[0], []).extend(row[1:])

  df = pd.DataFrame(data_map)
  # Blank dates become missing values and then inherit the previous date.
  # Assigning the result back to the column avoids chained / in-place
  # assignment on a column selection (which may not write through to `df`),
  # and avoids the `np.NaN` alias that NumPy 2.0 removed.
  df['Date'] = df['Date'].mask(df['Date'] == "").ffill()

  print(sys.getsizeof(df), sys.getsizeof(df.to_dict()))
  return df

# Spot-check five randomly sampled rows of the combined frame
getDF(dt1, dt2).sample(5)

45470 656


Unnamed: 0,Date,Hour (AKST),Temperature (°F),Dewpoint (°F),Wind Chill (°F),Surface Wind (mph),Wind Dir,Gust,Sky Cover (%),Precipitation Potential (%),Relative Humidity (%),Rain,Thunder,Snow,Freezing Rain,Sleet
26,02/16,18,26,20,20,5,NE,,74,46,79,--,--,Chc,--,--
11,02/16,3,2,-4,2,1,E,,92,37,75,--,--,Chc,--,--
42,02/17,10,23,19,14,7,S,,96,82,85,--,--,Ocnl,--,--
14,02/16,6,2,-4,2,0,SW,,86,45,76,--,--,Chc,--,--
4,02/15,20,9,3,9,1,NW,,34,1,77,--,--,--,--,--


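Notice the apparent size penalty you get from representing data as a dataframe versus a simpler data type like a dictionary. (Caveat: `sys.getsizeof` on a dictionary counts only the dictionary object itself, not the lists and strings it refers to, so the dictionary figure understates its real footprint.) We'll keep this in mind when we refactor our code next.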

#### Refactoring

In [168]:
## Save this block in utils.py
import pandas as pd 
import itertools

def ffList(ls:list) -> list:
  """Forward-fill a list in place, in the spirit of pandas' ffill().

  Every falsy element after the first is overwritten with the element just
  before it, so runs of blanks inherit the last filled value. A falsy first
  element is left untouched. The same list object is returned.
  """
  for pos in range(1, len(ls)):
    if ls[pos]:
      continue
    ls[pos] = ls[pos - 1]
  return ls

def getColsFromTable(table:list, station:str, hours_per_table:int=24, second_table_index:int=19):
  """Get cols from a list of <tr> elements and add station-name columns.

  Each <tr> becomes a list of the text of its <font> cells (these are rows in
  the table's landscape orientation, i.e. the columns of the final data).
  Two 'station_location' columns are then inserted, one per table.

  Parameters
  ----------
  table : list
    <tr> elements; each must offer `find_all("font")`.
  station : str
    Station name repeated in the inserted columns.
  hours_per_table : int, default 24
    Number of station values in each inserted column.
  second_table_index : int, default 19
    List position at which the second station column is inserted. The default
    is the position previously hard-coded for the second table (assumes the
    usual number of rows per table -- TODO confirm against the page layout).

  Returns
  -------
  list
    List of columns, each a list whose first element is the field name.
  """
  cols = [[ele.getText() for ele in tr.find_all("font")] for tr in table]

  def _station_col():
    # Fresh list per insertion so the two columns never share one list object
    return ['station_location', *([station] * hours_per_table)]

  cols.insert(1, _station_col())
  cols.insert(second_table_index, _station_col()) # for second table
  return cols

def getDict(col_list:list):
  """Merge a list of columns into one dictionary keyed by field name.

  The first element of each column is its field name and the rest are its
  values. Columns that repeat a field name have their values appended in the
  order encountered. Finally, blank entries under 'Date' are forward-filled
  with ffList.
  """
  merged = {}
  for col in col_list:
    key, values = col[0], col[1:]
    if key in merged:
      # field already seen: append this column's values
      merged[key].extend(values)
    else:
      # first occurrence of the field
      merged[key] = values
  merged['Date'] = ffList(merged['Date'])
  return merged

In [24]:
# ## Old Refactor in DAG

# def getDict(lists:list):  # Refactoring from getDF to process all the tables, not just two
#   """Unnest list of tabular data and combine into one dictionary""" 
#   ls = list(itertools.chain.from_iterable(lists)) # Unnest lists 
#   # Note: each list in lists needs to have a field inserted for station_location prior to being fed into this function
#   data_map = {}
#   for ele in ls:
#     row = [x.getText() for x in ele.find_all("font")]
#     if row[0] not in data_map.keys(): # elements from l1
#       data_map[row[0]] = row[1:]
#     else: # elements from l2
#       data_map[row[0]].extend(row[1:])

#   data_map['Date'] = ffList(data_map['Date'])

#   return data_map


# def getForecast():
#   """Get data from soup objects created from NWS urls in locations.csv, then write to .csv"""
 
#   locations = pd.read_csv("data/locations.csv")
#   loc_dict = dict(zip(locations['station_location'], locations['nws_url']))
  
#   ls = []
#   for station, url in loc_dict.items():
#     result = requests.get(url)
#     soup = BeautifulSoup(result.content, "html.parser")
#     data = soup.find_all("table")[5].find_all("tr")

#     # 'data' is divided into two tables by two colspan elements
#     colspan = data[0]
#     data = [x for  x in data if x != colspan] # add station_name here 
#     ls.append(data)

#   return getDict(ls)
# # pd.DataFrame(getForecast())  

# pd.DataFrame(getForecast())

Unnamed: 0,Date,Hour (AKST),Temperature (°F),Dewpoint (°F),Wind Chill (°F),Surface Wind (mph),Wind Dir,Gust,Sky Cover (%),Precipitation Potential (%),Relative Humidity (%),Rain,Thunder,Snow,Freezing Rain,Sleet
0,02/15,16,8,1,8,2,W,,73,14,74,--,--,--,--,--
1,02/15,17,4,-1,5,2,W,,73,14,79,--,--,--,--,--
2,02/15,18,2,-3,-8,5,SW,,70,9,82,--,--,--,--,--
3,02/15,19,0,-4,-10,5,SW,,70,9,83,--,--,--,--,--
4,02/15,20,-1,-5,-12,5,SW,,70,9,83,--,--,--,--,--
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,02/17,11,22,18,5,25,SW,37,94,56,85,--,--,Lkly,--,--
1100,02/17,12,25,21,9,28,SW,39,94,44,84,--,--,Chc,--,--
1101,02/17,13,28,24,13,28,SW,39,94,44,84,--,--,Chc,--,--
1102,02/17,14,31,26,17,28,SW,39,94,44,83,--,--,Chc,--,--


In [166]:
def getForecast(locations_path:str="data/locations.csv", timeout:float=30):
  """Scrape the NWS forecast table of every station into one dictionary.

  Reads the station table, downloads each station's 'nws_url' page, pulls the
  rows of the main data table, tags them with the station name via
  getColsFromTable, and merges everything with getDict.

  Parameters
  ----------
  locations_path : str, default "data/locations.csv"
    CSV with 'station_location' and 'nws_url' columns.
  timeout : float, default 30
    Seconds passed to `requests.get` so that an unresponsive server cannot
    stall the scrape indefinitely.

  Returns
  -------
  dict
    The dictionary produced by getDict: field name -> list of values
    accumulated across all stations.

  Raises
  ------
  requests.HTTPError
    If any station page responds with an error status.
  """
  locations = pd.read_csv(locations_path)
  loc_dict = dict(zip(locations['station_location'], locations['nws_url']))

  col_list = []
  for station, url in loc_dict.items():
    result = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of on a missing table further down
    result.raise_for_status()
    soup = BeautifulSoup(result.content, "html.parser")
    # list of <tr> elements from main data table (really two tables combined:
    # one for each day in next 48h period)
    table48 = soup.find_all("table")[5].find_all("tr")
    colspan = table48[0]  # divided into two tables by two colspan elements
    table48 = [tr for tr in table48 if tr != colspan] # remove colspan elements

    col_list.extend(getColsFromTable(table48, station))

  return getDict(col_list)

In [169]:
# Run the full scrape and view the resulting dictionary as a DataFrame
pd.DataFrame(getForecast())

Unnamed: 0,Date,station_location,Hour (AKST),Temperature (°F),Dewpoint (°F),Wind Chill (°F),Surface Wind (mph),Wind Dir,Gust,Sky Cover (%),Precipitation Potential (%),Relative Humidity (%),Rain,Thunder,Snow,Freezing Rain,Sleet
0,02/15,Fairbanks,20,-1,-5,-12,5,SW,,70,9,83,--,--,--,--,--
1,02/15,Fairbanks,21,-2,-6,-10,3,S,,65,5,82,--,--,--,--,--
2,02/15,Fairbanks,22,-2,-6,-10,3,S,,65,5,82,--,--,--,--,--
3,02/15,Fairbanks,23,-2,-6,-10,3,S,,65,5,83,--,--,--,--,--
4,02/16,Fairbanks,00,-2,-6,-2,2,SE,,56,2,82,--,--,--,--,--
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,02/17,Aleknagik,15,35,30,23,24,SW,43,53,30,82,--,--,Chc,--,--
1100,02/17,Aleknagik,16,34,30,22,24,SW,43,53,30,83,--,--,Chc,--,--
1101,02/17,Aleknagik,17,33,29,20,24,SW,43,53,30,84,--,--,Chc,--,--
1102,02/17,Aleknagik,18,31,27,18,22,SW,37,54,22,85,--,--,SChc,--,--
