### USCRN Data: High-Octane Scraping

In [1]:
import itertools
import os
import re
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup

# Load the data-source URLs used throughout this notebook (indexed below as sources['USCRN'][...]).
# safe_load only builds plain Python containers/scalars, which is all a URL config file needs,
# and it cannot be coaxed into constructing arbitrary objects.
with open("sources.yaml", "r") as yaml_file:
  sources = yaml.safe_load(yaml_file)

##### 1.) Scrape Column Headers and Descriptions 

In [11]:
header_url = sources['USCRN']['headers']
header_response = requests.get(header_url)
header_response.raise_for_status() # fail loudly instead of parsing an HTTP error page as if it were the headers file
header_soup = BeautifulSoup(header_response.content, "html.parser")
header_lines = str(header_soup).split("\n")

# The second line of the headers file carries the column names.
# split() with no argument tolerates repeated/trailing whitespace, so no empty names can slip in.
columns = [name.lower() for name in header_lines[1].split()]
columns.insert(0,'station_location') # extra leading column that this notebook adds to every row

descrip_text = header_lines[2] # raw text block containing column descriptions
descrip_text

"The station WBAN number. The UTC date of the observation. The UTC time of the observation. Time is the end of the observed hour, so the 0000 hour is actually the last hour of the previous day's observation (starting just after 11:00 PM through midnight). The Local Standard Time (LST) date of the observation. The Local Standard Time (LST) time of the observation. Time is the end of the observed hour (see UTC_TIME description). The version number of the station datalogger program that was in effect at the time of the observation. Note: This field should be treated as text (i.e. string). Station longitude, using WGS-84. Station latitude, using WGS-84. Average air temperature, in degrees C, during the last 5 minutes of the hour. See Note F. Average air temperature, in degrees C, for the entire hour. See Note F. Maximum air temperature, in degrees C, during the hour. See Note F. Minimum air temperature, in degrees C, during the hour. See Note F. Total amount of precipitation, in mm, record

The descriptions of the columns are quite the mess, as there is no standard separator used. We will have to work our way through it step by step: 

In [4]:
# Splitting on "). " removes the closing parenthesis from each chunk,
# so put a ")" back on every chunk that ends inside an unclosed "(".
first_split = []
for chunk in descrip_text.split("). "):
  first_split.append(re.sub(r'(\([^)]*)$', r"\1)", chunk))

# Drop every " See Note X." cross-reference from the description text.
no_notes = []
for chunk in first_split:
  no_notes.append(re.sub(r' See Note [A-Z]\.', "", chunk))

The third entry in `no_notes` is ready. The last set of descriptions in `no_notes` can be split on `". "`, but the first two sets need special attention. We will pop the last set out and split it, then pop the third set out, and then address the first two sets. At that point we will recombine everything into one list while preserving the original order. 

In [5]:
# pop() removes from the END of the list, so these two calls take the final chunk first
# and then the chunk that preceded it; `no_notes` is left holding only the earlier chunks.
last_set = no_notes.pop().strip().split(". ") # list of individual descriptions
third_set = no_notes.pop() # Note: just a string

In [6]:
def flatten(ls:list):
  """Collapse one level of nesting: return a single list holding the items of every iterable in `ls`, in order."""
  return [item for inner in ls for item in inner]

# Merge each follow-up sentence that starts with "Time is" into the description before it
# ("... of the observation. Time is the end ..." -> "... of the observation at the end ...").
# The period is escaped so that only a literal "." matches, not any character.
no_notes = [re.sub(r"\. Time is", " at", s) for s in no_notes]
first_second = flatten([s.split(". ") for s in no_notes])

# Finally: stitch the pieces back together in their original order
descriptions = flatten([["Location name for USCRN station"], first_second, [third_set], last_set]) # Description added for "station_location" 

In [7]:
# Column metadata table: one entry per column, in the same order as `columns`.
# pd.DataFrame requires the three lists to have equal length, so `units` must stay
# aligned with `columns` and `descriptions` (the first entry belongs to the added 'station_location' column).
header_info = {
  'col_name': columns,
  'description': descriptions, 
  'units': ["X...(Various Lengths)","XXXXX", "YYYYMMDD", "HHmm", "YYYYMMDD", "HHmm", "XXXXXX", "Decimal_degrees", "Decimal_degrees", "Celsius", "Celsius", "Celsius", "Celsius", "mm", "W/m^2", "X", "W/m^2", "X", "W/m^2", "X", "X", "Celsius", "X", "Celsius", "X", "Celsius", "X", "%", "X", "m^3/m^3", "m^3/m^3", "m^3/m^3", "m^3/m^3", "m^3/m^3", "Celsius", "Celsius", "Celsius", "Celsius", "Celsius"]
}

header_df = pd.DataFrame(header_info)
# header_df.to_csv("data/column_descriptions.csv", index=False)

##### 2.) Scrape Core Data Files (>2 million rows)

In [3]:
# Fetch and parse the index page that links to the per-year directories
base_url = sources["USCRN"]["index"]
base_response = requests.get(base_url)
base_soup = BeautifulSoup(base_response.content, "html.parser")

In [4]:
links = base_soup.find_all("a") # 'links' in this notebook will refer to <a> elements, not urls

# Candidate year-directory names, from 2000 through the current year.
# Only directories that actually appear on the index page are kept below.
years = {str(year) for year in range(2000, datetime.now().year + 1)}
# .get() with a default: an <a> element without an href must not raise KeyError
year_links = [link for link in links if link.get('href', '').rstrip('/') in years]

file_urls = []
for year_link in year_links: 
  year_url = base_url + year_link.get("href")
  response = requests.get(year_url) 
  response.raise_for_status() # a failed listing request should stop the crawl, not be parsed as an empty year
  soup = BeautifulSoup(response.content, 'html.parser')
  file_links = soup.find_all('a', href=re.compile(r'AK.*\.txt'))
  # Build each URL from the href -- the same attribute the filter above matched on --
  # rather than from the anchor's display text, which is not guaranteed to equal the file name.
  file_urls.extend(year_url + link.get("href") for link in file_links)

In [5]:
rows = []
# Matches from the station name through the ".txt" suffix, e.g. "St._Paul_4_NE.txt"
regex = r"([St.]*[A-Z][a-z]+_*[A-Za-z]*).*\.txt" 
for url in file_urls:
  # Get location from url -- it is prepended to every data line in the next step
  match = re.search(regex, url)
  if match is None:
    raise ValueError(f"Could not parse a station name from {url}")
  file_name = match.group(0)
  # Strip the "_formerly_Barrow..." suffix or the trailing "_<digit>..." part, leaving only the place name
  station_location = re.sub("(_formerly_Barrow.*|_[0-9].*)", "", file_name)
  # Get results 
  response = requests.get(url)
  response.raise_for_status() # otherwise an HTTP error page would be split into bogus data rows
  soup = BeautifulSoup(response.content,'html.parser')
  soup_lines = [station_location + " " + line for line in str(soup).strip().split("\n")]
  # str.split() splits on runs of whitespace and ignores trailing whitespace,
  # so a stray "\r" or space cannot create an extra empty field
  new_rows = [row.split() for row in soup_lines]
  # Add to list
  rows.extend(new_rows)

In [6]:
df = pd.DataFrame(rows, columns=columns) 

(This dataframe is huge and keeps crashing the kernel when I try to work with it. Save it as a .csv first, restart your kernel, and read it back in before continuing).

In [2]:
# df.to_csv("data/uscrn.csv", index=False)
# df = pd.read_csv("data/uscrn.csv")

_From the original data source [README](https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/readme.txt):_  

_"Missing data are indicated by the lowest possible integer for a given column format, such as -9999.0 for 7-character fields with one decimal place or -99.000 for 7-character fields with three decimal places."_

We can find these missing value indicators by getting the min of each column.

In [151]:
def minMap(df):
    """Return a dict mapping each column label of `df` to the minimum value found in that column."""
    return {col: df[col].min() for col in df.columns}

print(minMap(df))

{'station_location': 'Aleknagik', 'wbanno': '23583', 'utc_date': '20230217', 'utc_time': '0000', 'lst_date': '20230217', 'lst_time': '1100', 'crx_vn': '2.424', 'longitude': '-131.59', 'latitude': '55.05', 't_calc': '-0.1', 't_hr_avg': '-0.3', 't_max': '-0.1', 't_min': '-0.1', 'p_calc': '-9999.0', 'solarad': '0', 'solarad_flag': '0', 'solarad_max': '0', 'solarad_max_flag': '0', 'solarad_min': '0', 'solarad_min_flag': '0', 'sur_temp_type': 'C', 'sur_temp': '-0.1', 'sur_temp_flag': '0', 'sur_temp_max': '-0.1', 'sur_temp_max_flag': '0', 'sur_temp_min': '-0.1', 'sur_temp_min_flag': '0', 'rh_hr_avg': '15', 'rh_hr_avg_flag': '0', 'soil_moisture_5': '-99.000', 'soil_moisture_10': '-99.000', 'soil_moisture_20': '-99.000', 'soil_moisture_50': '-99.000', 'soil_moisture_100': '-99.000', 'soil_temp_5': '-0.1', 'soil_temp_10': '-0.1', 'soil_temp_20': '-9999.0', 'soil_temp_50': '-9999.0', 'soil_temp_100': '-9999.0'}


We will replace these values with `NaNs`, but we need to be careful: since the source does not normally have empty records, any `NaNs` entering our pipeline on read will likely come either from errors in the data source or errors in our attempts to read from it. When writing our update DAG, before we replace any values with `NaNs` we'll need to check for `NaNs` and log an alert if any are found. 

In [5]:
df = (
  df.replace([-99999,-9999], np.nan) # Can safely assume these are always missing values in every column they appear in
    .filter(regex="^((?!soil).)*$") # vast majority of soil columns have missing data
    .replace({'crx_vn':{-9:np.nan}}) # -9 is treated as missing in the crx_vn column only
)

Next, let's convert the date and time columns to `datetime` objects and reorder our columns

In [6]:
# Build one datetime column per time zone from its separate date and time columns.
# The time is zero-padded to four digits because the integer cast drops leading zeros (e.g. 100 -> "0100").
for zone in ('utc', 'lst'):
  date_part = df[f'{zone}_date'].astype(int).astype(str)
  time_part = df[f'{zone}_time'].astype(int).astype(str).str.zfill(4)
  df[f'{zone}_datetime'] = pd.to_datetime(date_part + time_part, format='%Y%m%d%H%M')

In [7]:
# the separate date and time columns are no longer needed once the combined datetime columns exist
df = df.drop(columns=['utc_date', 'utc_time', 'lst_date', 'lst_time'])

In [8]:
# reorder columns: identifiers and timestamps first, every remaining column after them in its current order.
# Selecting by name (instead of slicing by position) gives the same result and stays correct if this cell is re-run.
lead_cols = ['station_location','wbanno','crx_vn','utc_datetime','lst_datetime']
cols = lead_cols + [c for c in df.columns if c not in lead_cols]
df = df[cols]

Lastly, let's add a `date_added_utc` column: 

In [10]:
# Naive UTC timestamp shared by every row of this load.
# datetime.utcnow() is deprecated; now(timezone.utc) with the tzinfo removed yields the same naive UTC value.
df['date_added_utc'] = datetime.now(timezone.utc).replace(tzinfo=None)

In [11]:
df.sample(5)

Unnamed: 0,station_location,wbanno,crx_vn,utc_datetime,lst_datetime,longitude,latitude,t_calc,t_hr_avg,t_max,...,sur_temp_type,sur_temp,sur_temp_flag,sur_temp_max,sur_temp_max_flag,sur_temp_min,sur_temp_min_flag,rh_hr_avg,rh_hr_avg_flag,date_added_utc
2040214,Metlakatla,25381.0,2.424,2022-12-05 03:00:00,2022-12-04 18:00:00,-131.59,55.05,0.2,0.2,0.3,...,C,-0.8,0.0,-0.7,0.0,-0.9,0.0,74.0,0.0,2023-02-17 22:33:04.077133
1989977,Glennallen,56401.0,2.515,2022-03-11 22:00:00,2022-03-11 13:00:00,-145.5,63.03,-6.6,-6.4,-6.2,...,C,-4.8,0.0,-4.8,0.0,-4.8,0.0,78.0,0.0,2023-02-17 22:33:04.077133
2155364,Sand_Point,25630.0,2.424,2023-02-10 15:00:00,2023-02-10 06:00:00,-160.47,55.35,2.6,2.6,2.8,...,C,2.0,0.0,2.1,0.0,1.8,0.0,,0.0,2023-02-17 22:33:04.077133
131917,Utqiagvik,27516.0,1.301,2007-06-26 03:00:00,2007-06-25 18:00:00,-156.61,71.32,2.2,2.4,2.8,...,R,8.7,0.0,10.2,0.0,6.8,0.0,,0.0,2023-02-17 22:33:04.077133
1492562,St._Paul,25711.0,2.424,2019-06-22 07:00:00,2019-06-21 22:00:00,-170.21,57.16,8.5,8.7,9.0,...,C,9.1,0.0,9.5,0.0,8.7,0.0,89.0,0.0,2023-02-17 22:33:04.077133


In [12]:
# df.to_csv("data/uscrn.csv", index=False)

Let's also make a table for our various station locations. This will be useful when searching for the four-day forecasts in the NWS notebook. 

In [1]:
locations = df[['station_location', 'wbanno', 'longitude', 'latitude']].drop_duplicates()
# locations.to_csv("data/locations.csv", index=False)

##### 3.) Upload Core Data to BigQuery 

In [3]:
# df=pd.read_csv("data/uscrn.csv")

In [79]:
%%bash
bq mk -d --location=us-east4 team-week3:alaska

Dataset 'team-week3:alaska' successfully created.


Core Data: 

In [4]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Setting certain numeric columns (e.g. crx_vn, the flag columns) as strings will indicate that they are not meant to have arithmetic calculations done on them
# Field order follows the column order of `df` after the reordering step, ending with 'date_added_utc'.
# NOTE(review): the df.sample() output earlier in this notebook shows wbanno and the *_flag columns
# rendered as floats (e.g. 25381.0, 0.0) -- confirm what text ends up in these STRING fields after the load.
schema = [
  bigquery.SchemaField("station_location", "STRING", mode="REQUIRED"), 
  bigquery.SchemaField("wbanno", "STRING", mode="REQUIRED"), 
  bigquery.SchemaField("crx_vn", "STRING", mode="NULLABLE"), 
  bigquery.SchemaField("utc_datetime", "DATETIME", mode="REQUIRED"), 
  bigquery.SchemaField("lst_datetime", "DATETIME", mode="REQUIRED"), 
  bigquery.SchemaField("longitude", "FLOAT", mode="REQUIRED"), 
  bigquery.SchemaField("latitude", "FLOAT", mode="REQUIRED"), 
  bigquery.SchemaField("t_calc", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("t_hr_avg", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("t_max", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("t_min", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("p_calc", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("solarad", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("solarad_flag", "STRING", mode="NULLABLE"), 
  bigquery.SchemaField("solarad_max", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("solarad_max_flag", "STRING", mode="NULLABLE"), 
  bigquery.SchemaField("solarad_min", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("solarad_min_flag", "STRING", mode="NULLABLE"), 
  bigquery.SchemaField("sur_temp_type", "STRING", mode="NULLABLE"), 
  bigquery.SchemaField("sur_temp", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("sur_temp_flag", "STRING", mode="NULLABLE"), 
  bigquery.SchemaField("sur_temp_max", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("sur_temp_max_flag", "STRING", mode="NULLABLE"), 
  bigquery.SchemaField("sur_temp_min", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("sur_temp_min_flag", "STRING", mode="NULLABLE"), 
  bigquery.SchemaField("rh_hr_avg", "FLOAT", mode="NULLABLE"), 
  bigquery.SchemaField("rh_hr_avg_flag", "STRING", mode="NULLABLE"), 
  bigquery.SchemaField("date_added_utc", "DATETIME", mode="REQUIRED")
]

In [5]:
# Service-account key location: taken from the GOOGLE_APPLICATION_CREDENTIALS environment variable when it is set,
# so the notebook can run on other machines; falls back to the original local path otherwise.
key_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "/home/alex/.creds/alex-sa-tw3.json")
credentials = service_account.Credentials.from_service_account_file(
   key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# Destination table: <project>.alaska.uscrn
table_id = f"{credentials.project_id}.alaska.uscrn"

# Explicit schema (no autodetect); WRITE_TRUNCATE replaces any existing table contents with this load.
jc = bigquery.LoadJobConfig(
   source_format = bigquery.SourceFormat.CSV,
   autodetect=False,
   schema=schema,
   create_disposition="CREATE_IF_NEEDED",
   write_disposition="WRITE_TRUNCATE", 
   destination_table_description="Historical weather data from USCRN stations in Alaska"
)

job = client.load_table_from_dataframe(df, table_id, job_config=jc)

# Block until the load job finishes (raises if the job failed)
job.result()

LoadJob<project=team-week3, location=us-east4, id=300f1e8c-284e-42be-b54a-e8551c540ac4>

##### 4.) DAG Tasks for Updating Dataset  

In this section I sketch out tasks for an Airflow DAG to scrape, transform, and upload USCRN data. Note that the last three functions (`getUpdates`, `transformDF`, and `uploadBQ`) borrow heavily from the code above in sections 2 & 3.  

In [57]:
from collections import deque
from io import StringIO

def lastAdded(csv_path:str="data/uscrn.csv") -> datetime: 
  """Read the 'date_added_utc' value from the last row of a .csv and return it shifted by -5 hours.

  Args:
    csv_path: file whose final line is a data row with the timestamp in its last field.
      Defaults to the main dataset file.

  Returns:
    A naive datetime: the stored UTC timestamp minus five hours.

  Raises:
    ValueError: if the file is empty, ends in a blank line, or its last field is not a timestamp.
  """
  with open(csv_path, 'r') as fp:
    q = deque(fp, 1) # a deque of length 1 keeps only the final line without holding the whole file in memory
  if not q or not q[0].strip():
    raise ValueError(f"{csv_path} has no data row to read 'date_added_utc' from")

  # Parse the single line as CSV so quoted fields containing commas are handled; the timestamp is the last field
  last_added = pd.read_csv(StringIO(q[0]), header=None).iloc[0,-1]
  # pd.to_datetime accepts the timestamp with or without fractional seconds
  # (a value written at exactly zero microseconds has no ".ffffff" part)
  last_added = pd.to_datetime(str(last_added)).to_pydatetime()
  # Convert to EST from UTC -- 'Last modified' field in getNewFileURLs() is given in EST
  # NOTE(review): a fixed 5-hour offset does not follow daylight saving time -- confirm how the listing reports times in summer
  last_added = last_added - timedelta(hours=5)

  return last_added

Unnamed: 0,Name,Last modified,Size,Description
0,CRN60H0203-202301010100.txt,2022-12-31 20:47,37K,
1,CRN60H0203-202301010200.txt,2022-12-31 21:47,37K,
2,CRN60H0203-202301010300.txt,2022-12-31 22:53,37K,
3,CRN60H0203-202301010400.txt,2022-12-31 23:54,37K,
4,CRN60H0203-202301010500.txt,2023-01-01 00:49,37K,
...,...,...,...,...
1147,CRN60H0203-202302172000.txt,2023-02-17 15:47,36K,
1148,CRN60H0203-202302172100.txt,2023-02-17 16:47,37K,
1149,CRN60H0203-202302172200.txt,2023-02-17 17:47,36K,
1150,CRN60H0203-202302172300.txt,2023-02-17 18:47,38K,


Pandas has a neat function for reading HTML tables to dataframes (`pd.read_html`). It wouldn't work for our earlier task because those tables were stored as loose text within a body element -- `pd.read_html` relies on explicit HTML table syntax to work. It also wouldn't be ideal for iterating through lots of HTML pages like our task called for: iteratively creating and appending dataframes is very slow given the size of dataframe objects. But it's useful for reading a single table object:   

In [None]:

def getNewFileURLs(last_added:datetime) -> list: 
  """Check/obtain updates from USCRN updates page.

  Reads the file listing for the current (UTC) year and keeps the files whose
  'Last modified' time is later than `last_added`.

  Args:
    last_added: naive datetime, compared directly against the listing's 'Last modified' values.

  Returns:
    List of URLs for the new files; an empty list when nothing is newer than `last_added`.
  """
  now = datetime.now(timezone.utc) # timezone-aware replacement for the deprecated datetime.utcnow()
  updates_url = sources['USCRN']['updates'] + str(now.year)

  listing = pd.read_html(updates_url, skiprows=[1,2])[0]
  # Keep only the name and timestamp columns, then discard rows missing either value
  listing = listing.drop(["Size", "Description"], axis=1).dropna()
  listing['Last modified'] = pd.to_datetime(listing['Last modified'])

  listing = listing[listing['Last modified'] > last_added]
  if listing.empty:
    # No new files: min()/max() below would fail on an empty selection
    return []

  # Earliest/latest modification time among the new files.
  # TODO: push to XCOM -- currently computed but neither pushed nor returned
  update_range = (listing['Last modified'].min(), listing['Last modified'].max())

  new_file_urls = list(updates_url + "/" + listing['Name'])

  return new_file_urls

In [None]:

def getUpdates(new_file_urls:list) -> list: 
  """Scrape data from list of new urls, store and return as list of lists.

  Args:
    new_file_urls: URLs of update files to download.

  Returns:
    One list of string fields per data line whose first five characters are a
    wbanno listed in data/locations.csv.

  Raises:
    requests.HTTPError: if any download returns an error status.
  """

  locations = pd.read_csv("data/locations.csv")[['station_location', 'wbanno']]
  # Cast via int so a float-typed column ("23583.0") still yields the bare station number as text
  locations['wbanno'] = locations['wbanno'].astype(int).astype(str)
  wbs = set(locations['wbanno'])

  rows = []
  for url in new_file_urls:
    response = requests.get(url)
    response.raise_for_status() # an error page must not be silently treated as a file with no matching rows
    soup = BeautifulSoup(response.content,'html.parser')
    # Skip the first three lines -- presumably header lines; confirm against an update file
    soup_lines = str(soup).strip().split("\n")[3:]
    # Keep only lines that begin with a known station number; split() breaks them on runs of whitespace
    ak_rows = [line.split() for line in soup_lines if line[0:5] in wbs]
    rows.extend(ak_rows)

  return rows


def _to_numeric_if_possible(col):
  """Return `col` converted by pd.to_numeric, or `col` unchanged if it holds non-numeric values."""
  try:
    return pd.to_numeric(col)
  except (ValueError, TypeError):
    return col


def transformDF(rows:list, update_range:tuple=None): 
  """Read rows from getUpdates(), cast to dataframe, transform, write to csv.

  Args:
    rows: list of rows, each a list of string fields in the order of the column
      names stored in data/column_descriptions.csv (minus 'station_location').
    update_range: optional (start, end) pair used to name the output file.
      When omitted, the earliest and latest 'utc_datetime' of the rows are used.

  Returns:
    None. Writes data/updates/upd-<start>-<end>.csv; nothing is written when `rows` is empty.
  """
  if not rows:
    return

  # Get column headers 
  columns = list(pd.read_csv("data/column_descriptions.csv")['col_name'])

  # Get locations
  locations = pd.read_csv("data/locations.csv")[['station_location', 'wbanno']]
  locations['wbanno'] = locations['wbanno'].astype(int).astype(str) 
  # One name per station number: a repeated wbanno in the lookup would duplicate data rows in the left merge below
  locations = locations.drop_duplicates(subset="wbanno", keep="last").set_index("wbanno")

  # Create dataframe (the first stored column name is 'station_location', which the raw rows do not contain)
  df = pd.DataFrame(rows, columns=columns[1:])

  # Merge locations (done while wbanno is still text on both sides)
  df = df.merge(locations, how="left", left_on="wbanno", right_index=True)

  # Change datatypes: numeric where every value parses, otherwise left as-is
  df = df.apply(_to_numeric_if_possible)

  # Replace missing value designators with NaN
  df = df.replace([-99999,-9999], np.nan)
  df = df.replace({'crx_vn':{-9:np.nan}})
  df = df.filter(regex="^((?!soil).)*$") # almost all missing values

  # Create datetime columns (time is zero-padded because the integer cast drops leading zeros)
  for zone in ('utc', 'lst'):
    date_part = df[f'{zone}_date'].astype(int).astype(str)
    time_part = df[f'{zone}_time'].astype(int).astype(str).str.zfill(4)
    df[f'{zone}_datetime'] = pd.to_datetime(date_part + time_part, format='%Y%m%d%H%M')

  # Drop old date and time columns 
  df = df.drop(['utc_date', 'utc_time', 'lst_date', 'lst_time'], axis=1)

  # Reorder columns by name: identifiers and timestamps first, the rest in their existing order
  lead_cols = ['station_location','wbanno','crx_vn','utc_datetime','lst_datetime']
  df = df[lead_cols + [c for c in df.columns if c not in lead_cols]]

  # Add date-added column (naive UTC)
  df['date_added_utc'] = datetime.now(timezone.utc).replace(tzinfo=None)

  # Write to .csv
  # `update_range` is meant to come from XCOM (created by getNewFileURLs()); fall back to the data's own time span
  if update_range is None:
    stamps = df['utc_datetime']
    update_range = (stamps.min().strftime("%Y%m%dT%H%M"), stamps.max().strftime("%Y%m%dT%H%M"))
  # index=False keeps the layout consistent with how data/uscrn.csv is written elsewhere in this notebook
  df.to_csv(f"data/updates/upd-{update_range[0]}-{update_range[1]}.csv", index=False)

def uploadBQ():
  """Upload latest update file to BigQuery"""
  # TODO: not implemented yet -- the body is only the docstring, so calling this currently does nothing.
  
  

##### 5.) **TO-DO**: _Remove Temporary Section_ 
This section is for testing the above functions for the DAG. 

In [145]:
# Exercise lastAdded() itself rather than a pasted copy of its body, so this check cannot drift from the function
last_added = lastAdded()
last_added

datetime.datetime(2023, 2, 17, 17, 33, 4, 77133)

In [146]:
# Exercise getNewFileURLs() itself rather than a pasted copy of its body
new_file_urls = getNewFileURLs(last_added)
new_file_urls

['https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/updates/2023/CRN60H0203-202302172200.txt',
 'https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/updates/2023/CRN60H0203-202302172300.txt',
 'https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/updates/2023/CRN60H0203-202302180000.txt',
 'https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/updates/2023/CRN60H0203-202302180100.txt']

In [194]:
# Exercise getUpdates() itself rather than a pasted copy of its body
rows = getUpdates(new_file_urls)
print(rows[0])


['23583', '20230217', '2100', '20230217', '1200', '2.514', '-158.61', '59.28', '2.2', '2.1', '2.3', '1.7', '0.0', '161', '0', '331', '0', '38', '0', 'C', '-1.3', '0', '0.0', '0', '-2.7', '0', '70', '0', '-99.000', '-99.000', '-99.000', '-99.000', '-99.000', '-9999.0', '-9999.0', '-9999.0', '-9999.0', '-9999.0']


In [199]:
# def transformDF(rows:list): 
#   """Read rows from getUpdates(), cast to dataframe, transform, write to csv"""
# Inline copy of the transformDF() steps, kept at cell level so the resulting `df` can be inspected.
# Unlike the function, this cell stops before writing the .csv.

# Get column headers 
columns = list(pd.read_csv("data/column_descriptions.csv")['col_name'])

# Get locations
locations = pd.read_csv("data/locations.csv")[['station_location', 'wbanno']]
locations['wbanno'] = locations['wbanno'].astype(int).astype(str) 
locations.set_index("wbanno", inplace=True)

# Create dataframe
df = pd.DataFrame(rows, columns=columns[1:])

# Merge locations
df = df.merge(locations, how="left", left_on="wbanno", right_index=True)

# Reorder columns 
columns = ['station_location'] + list(df.columns)[:-1]
df = df[columns]
# Change datatypes
df = df.apply(pd.to_numeric, errors='ignore')

# Replace missing value designators with NaN
df.replace([-99999,-9999], np.nan, inplace=True) 
df.replace({'crx_vn':{-9:np.nan}}, inplace=True)
df = df.filter(regex="^((?!soil).)*$") # almost all missing values

# Create datetime columns
df['utc_datetime'] = pd.to_datetime(df['utc_date'].astype(int).astype(str) + df['utc_time'].astype(int).astype(str).str.zfill(4), format='%Y%m%d%H%M')
df['lst_datetime'] = pd.to_datetime(df['lst_date'].astype(int).astype(str) + df['lst_time'].astype(int).astype(str).str.zfill(4), format='%Y%m%d%H%M')

# Drop old date and time columns 
df.drop(['utc_date', 'utc_time', 'lst_date', 'lst_time'], axis=1, inplace=True)

# Reorder columns 
# (positional slice: skips the three leading id columns and the two datetime columns appended at the end)
cols = ['station_location','wbanno','crx_vn','utc_datetime','lst_datetime'] + list(df.columns)[3:-2]
df = df[cols]

# Add date-added column (utc)
df['date_added_utc'] = datetime.utcnow() 