Uses a list of tile IDs to scrape the corresponding LAZ files from Kentucky’s LiDAR database into a GCS bucket.

In [None]:
import pandas as pd
import requests
from multiprocessing import Pool
import time
import math
from google.colab import auth
from google.cloud import storage

In [None]:
# Google Cloud project that the storage client is created for.
PROJECT_ID = 'skytruth-tech'
# Name of the GCS bucket that holds the tile index and receives the LAZ files.
GCLOUD_BUCKET = 'mountaintop_mining'
# Top-level folder inside the bucket for everything this notebook reads/writes.
MAIN_DIR = 'lidar_data'
# Folder inside the bucket that contains the tile index CSV.
TILE_IDS_DIR = f'{MAIN_DIR}/tile_IDs'
# File name of the tile index CSV (tile IDs plus Phase 1/2 download links).
TILE_INDEX = 'KY_tile_index_intersect.csv'

In [None]:
# Authenticate the current Colab user. The Google Cloud Storage client used by
# the rest of the notebook presumably relies on these credentials, so this
# cell has to run before any bucket access.
auth.authenticate_user()

In [None]:
# Create a Google Cloud Storage client for PROJECT_ID and look up the bucket
# named GCLOUD_BUCKET. `bucket` is the module-level handle the later cells use
# to read the tile index and to upload the scraped LAZ files.
client = storage.Client(project=PROJECT_ID)
bucket = client.get_bucket(GCLOUD_BUCKET)

In [None]:
# Copy the tile index CSV (tile IDs with Phase 1 and Phase 2 download links)
# from the bucket to the Colab filesystem and load it with pandas.
local_index_path = f'/content/{TILE_INDEX}'
csv = bucket.blob(f'{TILE_IDS_DIR}/{TILE_INDEX}')
csv.download_to_filename(local_index_path)
df = pd.read_csv(local_index_path)
# One dict per CSV row; this is the work list handed to the scraper.
rows = df.to_dict(orient='records')
# Show how many distinct tiles the index contains.
print(df['Tile_ID'].nunique())

In [None]:
def is_nan(value):
    """Return True if ``value`` is a float NaN or the string 'nan' (any case).

    The scraper uses this to detect a missing Phase 2 download URL in a
    tile-index row. Any value that is neither a float nor a string
    (including ``None``) yields False.
    """
    if isinstance(value, float):
        return math.isnan(value)
    if isinstance(value, str):
        return value.lower() == 'nan'
    return False

In [None]:
def scrape(row, max_retries=3, retry_delay=5, timeout=60):
    """Stream one tile's LAZ file from its download URL into the GCS bucket.

    The Phase 2 URL and year are used unless the Phase 2 URL is missing
    (see ``is_nan``), in which case the Phase 1 URL and year are used. The
    file is uploaded to ``{MAIN_DIR}/ky/KY_{year}_{tile_ID}.laz``. If that
    blob already exists nothing is downloaded, so re-running only fetches
    tiles that are still missing.

    Args:
        row: Mapping with the keys ``'Tile_ID'``, ``'Phase1_download_url'``,
            ``'Phase1_year'``, ``'Phase2_download_url'`` and
            ``'Phase2_year'``.
        max_retries: Maximum number of download attempts for the tile.
        retry_delay: Seconds to wait between failed attempts.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            connection cannot block a worker forever.

    Returns:
        None. A message is printed if every attempt fails. Only
        ``requests`` exceptions are retried; any other error raised while
        uploading propagates to the caller.
    """
    tile_ID = row['Tile_ID']
    if is_nan(row['Phase2_download_url']):
        url = row['Phase1_download_url']
        year = row['Phase1_year']
    else:
        url = row['Phase2_download_url']
        year = row['Phase2_year']

    # NOTE(review): if pandas parsed the year column as float, this renders
    # as e.g. '2019.0' in the blob name -- confirm against the CSV.
    uploaded_file_name = f'{MAIN_DIR}/ky/KY_{year}_{tile_ID}.laz'
    uploaded_file = bucket.blob(uploaded_file_name)
    if uploaded_file.exists():
        return

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            # stream=True pipes the body straight into the upload instead of
            # buffering the whole file in memory or on local disk.
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code == 200:
                    # Let urllib3 undo any Content-Encoding (e.g. gzip) so the
                    # stored bytes are the file itself, not a compressed
                    # transfer body.
                    response.raw.decode_content = True
                    uploaded_file.upload_from_file(response.raw)
                    return
                last_error = f'HTTP {response.status_code}'
        except requests.exceptions.RequestException as e:
            last_error = repr(e)
        # Back off before the next attempt, but not after the final one.
        if attempt < max_retries:
            time.sleep(retry_delay)

    # Report the failure instead of dropping the tile silently.
    print(f'Failed to download tile {tile_ID} from {url} '
          f'after {max_retries} attempts: {last_error}')

In [None]:
# Run the scraper over all tile rows in parallel worker processes for speed.
num_processes = 8

with Pool(processes=num_processes) as pool:
    pool.map(scrape, rows)