### NWS Forecast Data
The NWS doesn't store historical forecast data, so this notebook will just be for prototyping the scraping functions for the DAG (i.e., no preliminary scrape and data load as with the USCRN data).

In [62]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sys

But first, let's save the relevant URLs for each station location -- this will make it easier to retrieve the results for each station as part of our DAG.

In [63]:
def getUrl(row):
  """Build the NWS digital-forecast URL for one station.

  `row` is anything indexable by the keys 'latitude' and 'longitude'
  (e.g. a DataFrame row); both values are interpolated into the query
  string as-is.
  """
  return f"https://forecast.weather.gov/MapClick.php?lat={row['latitude']}&lon={row['longitude']}&unit=0&lg=english&FcstType=digital&menu=1"

locations = pd.read_csv("data/locations.csv")
# axis=1 hands getUrl one row at a time, so it can read that row's
# 'latitude' and 'longitude' fields. (An unknown keyword would instead be
# forwarded to getUrl itself and raise a TypeError.)
locations['nws_url'] = locations.apply(getUrl, axis=1)
# Writes back to the same file that was read, now including the nws_url column.
locations.to_csv("data/locations.csv", index=False)

#### Prototyping 
Let's scrape the tabular data from one of the NWS pages and transform it into a dataframe.

In [115]:
from random import choice

# Fetch one randomly chosen station page. `url` holds the Response object
# (not a URL string). The timeout keeps the cell from hanging indefinitely on
# an unresponsive server, and raise_for_status() surfaces HTTP errors here
# instead of letting an error page fail later during table lookup.
url = requests.get(choice(list(locations['nws_url'])), timeout=30)
url.raise_for_status()
soup = BeautifulSoup(url.content, "html.parser")

tables = soup.find_all('table')
# Assumes the sixth <table> on the page holds the forecast grid -- TODO confirm
# whenever the page layout changes.
data = tables[5].find_all("tr")

# Data tables are divided with colspan elements
colspan = data[0]
# Positions of every row equal to the first row; these act as the dividers.
indices = [i for i, x in enumerate(data) if x == colspan]

dt1 = data[indices[0]+1:indices[1]] # first 24hrs
dt2 = data[indices[1]+1:] # next 24hrs

# Combining these into one dataframe
def getDF(l1, l2): 
  """Merge two lists of forecast table rows into a single DataFrame.

  Each element of `l1` and `l2` must provide `find_all("font")`, whose
  results provide `getText()`. Within a row, the first cell text is used as
  the column name and the remaining cell texts as that column's values; when
  a name appears again (rows from `l2`), its values are appended to the
  existing column. Rows without any cells are skipped.

  Empty-string entries in the 'Date' column are replaced by the closest
  preceding non-empty date.

  Side effect: prints `sys.getsizeof` of the DataFrame and of its
  `to_dict()` form.

  Raises:
    KeyError: if no row labelled 'Date' was found.
    ValueError: if the collected columns have different lengths.
  """
  data_map = {}

  # Unpacking builds a new list, so neither argument is modified.
  for ele in [*l1, *l2]:
    row = [x.getText() for x in ele.find_all("font")]
    if not row:  # no cells -> no label to file the values under
      continue
    data_map.setdefault(row[0], []).extend(row[1:])

  df = pd.DataFrame(data_map)
  # Use .loc and reassignment rather than chained indexing / inplace ffill on a
  # column selection: those may operate on a temporary copy and leave df
  # unchanged. np.nan is the spelling that exists in every NumPy version.
  df.loc[df['Date'] == "", 'Date'] = np.nan
  df['Date'] = df['Date'].ffill()

  # sys.getsizeof only counts the dict object itself, not the objects it
  # refers to, so the second number is a lower bound for the dictionary.
  print(sys.getsizeof(df), sys.getsizeof(df.to_dict())) 
  return df

# Preview five randomly chosen rows; no random_state is passed, so the rows shown change between runs.
getDF(dt1, dt2).sample(5)

45508 656


Unnamed: 0,Date,Hour (AKST),Temperature (°F),Dewpoint (°F),Wind Chill (°F),Surface Wind (mph),Wind Dir,Gust,Sky Cover (%),Precipitation Potential (%),Relative Humidity (%),Rain,Thunder,Snow,Freezing Rain,Sleet
27,02/16,17,24,20,24,1,E,,70,27,87,--,--,Chc,--,--
37,02/17,3,21,18,13,7,N,,95,68,86,--,--,Lkly,--,--
43,02/17,9,23,21,23,2,W,,88,50,90,--,--,Chc,--,--
41,02/17,7,22,20,16,5,N,,99,75,89,--,--,Ocnl,--,--
3,02/15,17,24,22,24,1,S,,87,56,90,--,--,Lkly,--,--


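Notice the size penalty you get from representing data as a dataframe versus a simpler data type like a dictionary. (Caveat: `sys.getsizeof` counts only the dictionary object itself, not the lists and strings it refers to, so the 656-byte figure understates the dictionary's true footprint and the real gap is smaller than these two numbers suggest.) We'll keep this in mind when we refactor our code next.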

#### Refactoring

In [145]:
## Save this block in utils.py
import pandas as pd 
import itertools

def ffList(ls:list) -> list:
  """Forward-fill a list in place, in the spirit of pandas' ffill().

  Every falsy entry (e.g. "" or None) after the first position is
  overwritten with the value immediately before it, so runs of blanks all
  take the last truthy value. The first entry is never changed. The same
  list object that was passed in is returned.
  """
  for pos, current in enumerate(ls):
    if current or pos == 0:
      continue  # keep real values; nothing precedes the first slot
    ls[pos] = ls[pos - 1]
  return ls

def getDict(*lists):  # Refactoring from getDF
  """Combine nested list of tabular data into one dictionary

  Each positional argument is an iterable of row elements that provide
  `find_all("font")`, whose results provide `getText()`. Within a row, the
  first cell text becomes the dictionary key and the remaining cell texts
  its values; when a key appears again in a later row, its values are
  appended to the existing list. Rows without any cells are skipped.

  If a 'Date' key was collected, its blank entries are forward-filled with
  ffList. With no rows at all an empty dictionary is returned.
  """
  # Note: each list in lists needs to have a field inserted for station_location prior to being fed into this function

  data_map = {}
  # chain.from_iterable walks every row of every argument without first
  # building a flattened copy.
  for ele in itertools.chain.from_iterable(lists):
    row = [x.getText() for x in ele.find_all("font")]
    if not row:  # no cells -> no key to file the values under
      continue
    data_map.setdefault(row[0], []).extend(row[1:])

  # Only forward-fill when a Date row was actually seen, so empty input or a
  # table without dates does not fail with KeyError.
  if 'Date' in data_map:
    data_map['Date'] = ffList(data_map['Date'])

  return data_map

{'Date': ['02/15',
  '02/15',
  '02/15',
  '02/15',
  '02/15',
  '02/15',
  '02/15',
  '02/15',
  '02/15',
  '02/15',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/16',
  '02/17',
  '02/17',
  '02/17',
  '02/17',
  '02/17',
  '02/17',
  '02/17',
  '02/17',
  '02/17',
  '02/17',
  '02/17',
  '02/17',
  '02/17',
  '02/17'],
 'Hour (AKST)': ['14',
  '15',
  '16',
  '17',
  '18',
  '19',
  '20',
  '21',
  '22',
  '23',
  '00',
  '01',
  '02',
  '03',
  '04',
  '05',
  '06',
  '07',
  '08',
  '09',
  '10',
  '11',
  '12',
  '13',
  '14',
  '15',
  '16',
  '17',
  '18',
  '19',
  '20',
  '21',
  '22',
  '23',
  '00',
  '01',
  '02',
  '03',
  '04',
  '05',
  '06',
  '07',
  '08',
  '09',
  '10',
  '11',
  '12',
  '13'],
 'Temperature (°F)': ['25',
  '26',
  '25',
  '24',
  '22',
  '

In [None]:
## Task in DAG
def getForecast():
  """Fetch each station's NWS forecast page listed in data/locations.csv and parse it with getDF.

  For every value in the 'nws_url' column the page is downloaded, the rows of
  its sixth <table> are split at the rows equal to the first row, and the two
  resulting row lists are passed to getDF.

  NOTE(review): in the lines visible here `df` is reassigned on every
  iteration and is neither returned nor written to a .csv -- the export step
  appears to be still TODO.
  """
 
  locations = pd.read_csv("data/locations.csv")
  
  for url in locations['nws_url']:
    # NOTE(review): no timeout or status check on this request -- confirm how the DAG task should behave on a slow or failing page.
    result = requests.get(url)
    soup = BeautifulSoup(result.content, "html.parser")
    data = soup.find_all("table")[5].find_all("tr")

    # 'data' is divided into two tables by two colspan elements
    cspans = [i for i, x in enumerate(data) if x == data[0]] # first colspan at index 0

    table1 = data[cspans[0]+1:cspans[1]] # first 24hrs
    table2 = data[cspans[1]+1:] # next 24hrs

    # NOTE(review): the utils.py cell in this notebook defines getDict/ffList, not getDF -- confirm which helper is meant here.
    df = getDF(table1, table2) # import from utils.py module

    
