Uses a list of tile IDs to scrape the corresponding LAZ files from the USGS database into a GCS bucket. Used for Tennessee, West Virginia, and Virginia lidar acquisition projects.

Inputs:
- List of tile IDs that intersect the study region for each lidar acquisition project (e.g. lidar_data/tile_IDs/TN_27_County_B1.csv)

Outputs:
- LAZ files in the corresponding state directory of the GCS bucket (e.g. lidar_data/tn/TN_27_County_B1_2248661NE.laz)

In [None]:
import pandas as pd
import requests
from multiprocessing import Pool
import time
from google.colab import auth
from google.cloud import storage

In [None]:
# Google Cloud project and bucket that hold the tile-ID lists and receive the LAZ files
PROJECT_ID = 'skytruth-tech'
GCLOUD_BUCKET = 'mountaintop_mining'

# Layout inside the bucket: everything lives under MAIN_DIR, tile-ID CSVs under TILE_IDS_DIR
MAIN_DIR = 'lidar_data'
TILE_IDS_DIR = f'{MAIN_DIR}/tile_IDs'

# Root URL under which each lidar project's file_path is resolved
DATABASE_URL = 'https://rockyweb.usgs.gov/vdelivery/Datasets/Staged/Elevation/LPC/Projects'

In [None]:
# Select lidar acquisition project and set info
#
# PROJECTS maps each project name to the settings the scraper needs:
#   state       - subdirectory of MAIN_DIR in the bucket where the LAZ files are written
#   file_path   - project directory plus file-name stem, appended to DATABASE_URL
#   file_suffix - text that sits between the tile ID and '.laz' in the source file name
# The project name itself is used as the tile-ID CSV name and as the prefix of
# every uploaded LAZ file.
PROJECTS = {
  'TN_27_County_B1': {
    'state': 'tn',
    'file_path': 'USGS_LPC_TN_27County_blk1_2015_LAS_2018/laz/USGS_LPC_TN_27County_blk1_2015',
    'file_suffix': '_LAS_2018',
  },
  # NOTE(review): B2 uses the same 'blk3' directory as B3 below - confirm this
  # is intended and not a copy-paste of the B3 path.
  'TN_27_County_B2': {
    'state': 'tn',
    'file_path': 'TN_27_County_QL2_LiDAR_Cumberland_Plateau_BAA/TN_27County_blk3_2015/LAZ/USGS_LPC_TN_27_County_QL2_LiDAR_Cumberland_Plateau_BAA',
    'file_suffix': '',
  },
  'TN_27_County_B3': {
    'state': 'tn',
    'file_path': 'TN_27_County_QL2_LiDAR_Cumberland_Plateau_BAA/TN_27County_blk3_2015/LAZ/USGS_LPC_TN_27_County_QL2_LiDAR_Cumberland_Plateau_BAA',
    'file_suffix': '',
  },
  'VA_R3_Southwest_A': {
    'state': 'va',
    'file_path': 'USGS_LPC_VA_FEMA_R3_Southwest_A_2016_LAS_2018/laz/USGS_LPC_VA_FEMA_R3_Southwest_A_2016',
    'file_suffix': '_LAS_2018',
  },
  # NOTE(review): a VA project stored under the 'wv' directory - confirm the
  # state is intentional.
  'VA_NRCS_South_Central_B1': {
    'state': 'wv',
    'file_path': 'VA_FEMA-NRCS_SouthCentral_2017_D17/VA_South_Central_B1_2017/LAZ/USGS_LPC_VA_FEMA-NRCS_SouthCentral_2017_D17',
    'file_suffix': '',
  },
  'WV_HQ_B1': {
    'state': 'wv',
    'file_path': 'WV_FEMAHQ_2018_D18/WV_FEMAHQ_B1_2018/LAZ/USGS_LPC_WV_FEMAHQ_2018_D18',
    'file_suffix': '',
  },
  'WV_HQ_B2': {
    'state': 'wv',
    'file_path': 'WV_FEMAHQ_2018_D18/WV_FEMAHQ_B2_2018/LAZ/USGS_LPC_WV_FEMAHQ_2018_D18',
    'file_suffix': '',
  },
  'WV_HQ_B3': {
    'state': 'wv',
    'file_path': 'WV_FEMAHQ_2018_D18/WV_FEMAHQ_B3_2018/LAZ/USGS_LPC_WV_FEMAHQ_2018_D18',
    'file_suffix': '',
  },
  'WV_R3_East': {
    'state': 'wv',
    'file_path': 'WV_FEMA_R3_East_Lidar_2016_D16/WV_FEMA_R3_East_2016/LAZ/USGS_LPC_WV_FEMA_R3_East_Lidar_2016_D16',
    'file_suffix': '',
  },
  'WV_R3_Southcentral_B1': {
    'state': 'wv',
    'file_path': 'WV_FEMAR3_Southcentral_2018_D19/WV_FEMAR3_Southcentral_B1_2018/LAZ/USGS_LPC_WV_FEMAR3_Southcentral_2018_D19',
    'file_suffix': '',
  },
  'WV_R3_Southcentral_B3': {
    'state': 'wv',
    'file_path': 'WV_FEMAR3_Southcentral_2018_D19/WV_FEMAR3_Southcentral_B3_2018/LAZ/USGS_LPC_WV_FEMAR3_Southcentral_2018_D19',
    'file_suffix': '',
  },
  'WV_R3_Southcentral_B4': {
    'state': 'wv',
    'file_path': 'WV_FEMAR3_Southcentral_2018_D19/WV_FEMAR3_Southcentral_B4_2018/LAZ/USGS_LPC_WV_FEMAR3_Southcentral_2018_D19',
    'file_suffix': '',
  },
}

# Change this one value to switch projects; it must be a key of PROJECTS.
SELECTED_PROJECT = 'TN_27_County_B1'

if SELECTED_PROJECT not in PROJECTS:
  raise KeyError(f'Unknown project {SELECTED_PROJECT!r}; choose one of {sorted(PROJECTS)}')

# Module-level names read by the later cells (CSV download and scrape).
project_name = SELECTED_PROJECT
state = PROJECTS[project_name]['state']
file_path = PROJECTS[project_name]['file_path']
file_suffix = PROJECTS[project_name]['file_suffix']

In [None]:
# Authenticate GCS
# Signs the notebook user in through google.colab.auth; this must run before the
# Cloud Storage client is created so that client can act with the user's credentials.
auth.authenticate_user()

In [None]:
# Initialize Google Cloud Storage client and access bucket
# `client` is created for PROJECT_ID; `bucket` is the handle the later cells use
# to read the tile-ID CSV and to write the scraped LAZ files.
client = storage.Client(project=PROJECT_ID)
bucket = client.get_bucket(GCLOUD_BUCKET)

In [None]:
# Fetch the selected project's tile-ID CSV from the bucket and load the IDs to scrape
csv_name = f'{project_name}.csv'
local_csv_path = f'/content/{csv_name}'  # local scratch copy of the CSV
csv = bucket.blob(f'{TILE_IDS_DIR}/{csv_name}')
csv.download_to_filename(local_csv_path)

# The first row is treated as the header; tile IDs are read from the first column
df = pd.read_csv(local_csv_path, header=0)
tile_IDs = df.iloc[:, 0].tolist()

# Report how many tiles will be scraped
print(len(tile_IDs))

In [None]:
# Scraper function
def scrape(tile_ID):
  """Stream one LAZ tile from the source server into the GCS bucket.

  The source URL is built from DATABASE_URL, file_path, tile_ID and
  file_suffix; the destination object is
  {MAIN_DIR}/{state}/{project_name}_{tile_ID}.laz. A tile that already exists
  in the bucket is skipped, so the scrape can be re-run after an interruption.

  Up to three attempts are made. Connection errors, HTTP 429 and HTTP 5xx
  responses are retried after a 5 second wait; any other non-200 status (e.g.
  404 for a tile that is not on the server) is reported and not retried.

  Args:
    tile_ID: Tile identifier, as listed in the project's tile-ID CSV.

  Returns:
    True if the tile was already in the bucket or was uploaded by this call,
    False if it could not be downloaded (a message is printed in that case).

  Raises:
    Only requests.exceptions.RequestException is handled here. Any other
    exception, including errors raised by the storage client during the
    upload, propagates to the caller.
  """
  url = f'{DATABASE_URL}/{file_path}_{tile_ID}{file_suffix}.laz'
  uploaded_file_name = f'{MAIN_DIR}/{state}/{project_name}_{tile_ID}.laz'
  uploaded_file = bucket.blob(uploaded_file_name)
  if uploaded_file.exists():
    return True

  max_retries = 3
  retry_wait_sec = 5
  # (connect, read) timeouts so a stalled server cannot hang a worker forever
  timeout_sec = (10, 60)
  failure = None

  for attempt in range(1, max_retries + 1):
    retryable = True
    try:
      # stream=True lets the body be piped straight into the upload, so the
      # file is never written to local disk
      with requests.get(url, stream=True, timeout=timeout_sec) as response:
        status = response.status_code
        if status == 200:
          uploaded_file.upload_from_file(response.raw)
          return True
        failure = f'HTTP {status}'
        # Only throttling and server-side errors are worth another attempt
        retryable = status == 429 or status >= 500
    except requests.exceptions.RequestException as e:
      # Connection-level problem: try again after waiting
      failure = repr(e)

    if not retryable:
      break
    if attempt < max_retries:  # no point sleeping after the last attempt
      time.sleep(retry_wait_sec)

  print(f'Failed to scrape {url}: {failure}')
  return False

In [None]:
# Parallelize for speed
# Number of worker processes; each worker runs scrape() on one tile ID at a time
num_processes = 10

# pool.map blocks until scrape() has been called for every entry of tile_IDs.
# The values returned by scrape() are not kept, and an exception raised inside
# any scrape() call is re-raised here by pool.map.
with Pool(num_processes) as pool:
        pool.map(scrape, tile_IDs)