#### USCRN Data
 
##### Scraping

In [15]:
import re

import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup

# Load the data-source configuration. Later cells index it as
# sources[<source name>][<key>] to look up URLs.
with open("sources.yaml", "r") as yaml_file:
  # safe_load only builds plain Python objects, and unlike a bare yaml.load()
  # it does not need an explicit Loader argument (mandatory since PyYAML 6).
  # The stream is parsed exactly once: parsing it a second time would read
  # from an exhausted file handle and replace `sources` with None.
  sources = yaml.safe_load(yaml_file)


In [16]:
# Download the USCRN header file and derive the DataFrame column names from it.
header_url = sources['USCRN']['headers']
header_response = requests.get(header_url)
# Fail loudly on an HTTP error instead of turning an error page into column names.
header_response.raise_for_status()
header_soup = BeautifulSoup(header_response.content, "html.parser")

# The second line of the header file carries the space-separated field names.
header_line = str(header_soup).split("\n")[1]
columns = [name.lower() for name in header_line.strip(" ").split(" ")]
# The scraping step prepends the station name to every data row, so the
# column list needs a matching leading entry.
columns.insert(0, 'station_location')

In [17]:
# Fetch and parse the USCRN index page; its links are filtered in the next cell.
base_url = sources["USCRN"]["index"]
base_response = requests.get(base_url)
# Stop here on an HTTP error rather than parsing an error page as the index.
base_response.raise_for_status()
base_soup = BeautifulSoup(base_response.content, "html.parser")

In [18]:
# Collect the URL of every Alaska ("AK...txt") station file, one year directory at a time.

links = base_soup.find_all("a") # 'links' in this notebook will refer to <a> elements, not urls
years = [str(year) for year in range(2000, 2024)]
# .get() with a default tolerates <a> elements that carry no href attribute.
# Year directories are matched with any trailing slash removed from the href.
year_links = [link for link in links if link.get('href', '').rstrip('/') in years]

# Compiled once; reused for every year directory.
alaska_file_pattern = re.compile(r'AK.*\.txt')

file_urls = []
for year_link in year_links:
  year_url = base_url + year_link.get("href")
  response = requests.get(year_url)
  # Abort on an HTTP error instead of silently collecting no files for a year.
  response.raise_for_status()
  soup = BeautifulSoup(response.content, 'html.parser')
  file_links = soup.find_all('a', href=alaska_file_pattern)
  # Build each URL from the href, the same attribute the filter matched on;
  # the link text is only a display label. Extending with an empty result
  # is a no-op, so no emptiness check is needed.
  file_urls.extend(year_url + link.get("href") for link in file_links)

In [146]:
# Download every station file and split it into whitespace-delimited rows,
# each prefixed with the station name parsed from the file's URL.

# Matches from the station name through the ".txt" suffix of the file name.
regex = r"([St.]*[A-Z][a-z]+_*[A-Za-z]*).*.txt"

rows = []
for url in file_urls:
  # Get location from url -- will add to BS results in next step
  match = re.search(regex, url)
  if match is None:
    # Name the offending URL instead of failing with an opaque AttributeError.
    raise ValueError(f"Could not find a station name in {url!r}")
  file_name = match.group(0)
  # Strip the "_formerly_Barrow..." / "_<digit>..." tail, leaving only the station name.
  station_location = re.sub("(_formerly_Barrow.*|_[0-9].*)", "", file_name)
  # Get results
  response = requests.get(url)
  # Abort on an HTTP error rather than parsing an error page into data rows.
  response.raise_for_status()
  soup = BeautifulSoup(response.content,'html.parser')
  soup_lines = [station_location + " " + line for line in str(soup).split("\n")] # <-- Before modification (see explanation below)
  # soup_lines = [station_location + " " + line for line in str(soup).strip().split("\n")] # <-- Correct code 
  # Raw string: '\s' in a normal string literal is an invalid escape sequence.
  new_rows = [re.split(r'\s+', row) for row in soup_lines]
  rows.extend(new_rows)

In [148]:
# Build the full table from the scraped rows and persist it.
# Passing `columns` (derived from the USCRN header file, with
# 'station_location' prepended) gives the frame and the CSV named headers
# instead of the integer labels 0..N. Rows shorter than the column list are
# padded with missing values.
df = pd.DataFrame(rows, columns=columns)
df.to_csv("data/uscrn.csv")

In [24]:
# Display every row that has a missing value in at least one column.
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,station_location,wbanno,utc_date,utc_time,lst_date,lst_time,crx_vn,longitude,latitude,t_calc,...,soil_moisture_5,soil_moisture_10,soil_moisture_20,soil_moisture_50,soil_moisture_100,soil_temp_5,soil_temp_10,soil_temp_20,soil_temp_50,soil_temp_100
3459,Fairbanks,,,,,,,,,,...,,,,,,,,,,
6941,Utqiagvik,,,,,,,,,,...,,,,,,,,,,
15702,Fairbanks,,,,,,,,,,...,,,,,,,,,,
24463,Utqiagvik,,,,,,,,,,...,,,,,,,,,,
33248,Fairbanks,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2157609,St._Paul,,,,,,,,,,...,,,,,,,,,,
2158670,Tok,,,,,,,,,,...,,,,,,,,,,
2159731,Toolik_Lake,,,,,,,,,,...,,,,,,,,,,
2160793,Utqiagvik,,,,,,,,,,...,,,,,,,,,,


Are these NaN rows being generated by my code, or are they present in the original data? Note that there are 281 NaN rows and 281 URLs in `file_urls`, which suggests that exactly one NaN row is being created per file. Let's run the same code on a single URL to see what's happening.


In [26]:
# Reproduce the scraping steps on a single file to see where the NaN rows come from.
test_url = "https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/2003/CRNH0203-2003-AK_Utqiagvik_formerly_Barrow_4_ENE.txt"

# Matches from the station name through the ".txt" suffix of the file name.
regex = r"([St.]*[A-Z][a-z]+_*[A-Za-z]*).*.txt" 
match = re.search(regex, test_url)
if match is None:
  # Name the offending URL instead of failing with an opaque AttributeError.
  raise ValueError(f"Could not find a station name in {test_url!r}")
file_name = match.group(0)
station_location = re.sub("(_formerly_Barrow.*|_[0-9].*)", "", file_name)

response = requests.get(test_url)
# Abort on an HTTP error rather than parsing an error page into data rows.
response.raise_for_status()
soup = BeautifulSoup(response.content,'html.parser')
# Deliberately split without strip(): this cell reproduces the row artifact being investigated.
soup_lines = [station_location + " " + line for line in str(soup).split("\n")]
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
rows = [re.split(r'\s+', row) for row in soup_lines]

In [46]:
# Inspect the final parsed row on its own, then render the single-file table.
last_row = rows[-1]
print(last_row)
single_file_df = pd.DataFrame(rows, columns=columns)
display(single_file_df)

['Utqiagvik', '']


Unnamed: 0,station_location,wbanno,utc_date,utc_time,lst_date,lst_time,crx_vn,longitude,latitude,t_calc,...,soil_moisture_5,soil_moisture_10,soil_moisture_20,soil_moisture_50,soil_moisture_100,soil_temp_5,soil_temp_10,soil_temp_20,soil_temp_50,soil_temp_100
0,Utqiagvik,27516,20030101,0100,20021231,1600,1.001,-156.61,71.32,-20.9,...,-99.000,-99.000,-99.000,-99.000,-99.000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
1,Utqiagvik,27516,20030101,0200,20021231,1700,1.001,-156.61,71.32,-22.5,...,-99.000,-99.000,-99.000,-99.000,-99.000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
2,Utqiagvik,27516,20030101,0300,20021231,1800,1.001,-156.61,71.32,-23.9,...,-99.000,-99.000,-99.000,-99.000,-99.000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
3,Utqiagvik,27516,20030101,0400,20021231,1900,1.001,-156.61,71.32,-24.9,...,-99.000,-99.000,-99.000,-99.000,-99.000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
4,Utqiagvik,27516,20030101,0500,20021231,2000,1.001,-156.61,71.32,-25.7,...,-99.000,-99.000,-99.000,-99.000,-99.000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8756,Utqiagvik,27516,20031231,2100,20031231,1200,1.200,-156.61,71.32,-14.7,...,-99.000,-99.000,-99.000,-99.000,-99.000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
8757,Utqiagvik,27516,20031231,2200,20031231,1300,1.200,-156.61,71.32,-13.8,...,-99.000,-99.000,-99.000,-99.000,-99.000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
8758,Utqiagvik,27516,20031231,2300,20031231,1400,1.200,-156.61,71.32,-14.9,...,-99.000,-99.000,-99.000,-99.000,-99.000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0
8759,Utqiagvik,27516,20040101,0000,20031231,1500,1.200,-156.61,71.32,-18.2,...,-99.000,-99.000,-99.000,-99.000,-99.000,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0


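Each text file ends with trailing whitespace (most likely a final newline), so splitting on `\n` produces one last empty string, which becomes a row holding only the station name. We could fix this by calling `strip()` on the text before splitting it on newlines: 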
```python 
[station_location + " " + line for line in str(soup).strip().split("\n")]
```
Since it took almost 20 minutes to successfully download all 2 million+ rows of data, we won't re-run that code with the corrected line. 

Instead, we'll edit the dataframe and csv we have to remove those NaN rows.

In [60]:
# Remove the per-file artifact rows, i.e. rows that hold nothing but the
# station name, then re-export the cleaned table.
# A freshly scraped artifact row looks like [station, '', None, None, ...].
# The empty string is not NaN, so dropna(thresh=2) would count two values and
# keep the row. Count only cells that are neither NaN nor '' instead; for a
# frame whose blanks are already NaN this is the same as dropna(thresh=2).
MIN_POPULATED_CELLS = 2  # station_location plus at least one real field
populated_cells = (df.notna() & df.ne("")).sum(axis=1)
# Dropped in place, as before, so `df` stays the same object.
df.drop(index=df.index[populated_cells < MIN_POPULATED_CELLS], inplace=True)
df.to_csv("data/uscrn.csv")

##### Uploading 