# GRIDMET Data Ingestion Script

This notebook uses the Google Earth Engine API to pull data from GRIDMET: University of Idaho Gridded Surface Meteorological Dataset: https://developers.google.com/earth-engine/datasets/catalog/IDAHO_EPSCOR_GRIDMET

This notebook runs end to end, from reading an input CSV through writing an output CSV with the following metrics by day: Total Precipitation, Max Temp, Min Temp, Wind Direction, and Wind Max Velocity.

The script is currently set up to download the input CSV from a Google Cloud Storage bucket and read it into a dataframe.

The columns expected in the input are a key/unique ID (named "cell_id" in the code below), the geometry of a polygon ("geometry"), and a date ("date").
The geometry column is converted to a geopandas geometry as part of the script. The date column is expected to be a string in YYYY-MM-DD format; the function parses it to build the date range used for each query.


### Install Required Libraries etc.

In [None]:
import os
from os import listdir
from os.path import isfile, join

import imagecodecs
from geotiff import GeoTiff
import geopy
import geopy.distance as distance
from shapely.geometry import Polygon

import re
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

from pystac_client import Client
import planetary_computer
import xarray
import dask.dataframe as dd
import xrspatial
from datashader.transfer_functions import shade, stack
from datashader.colors import Elevation
from datashader.utils import export_image

from sklearn.neighbors import BallTree

import shutil
import requests
import ee

import time 
import signal

#import rioxarray
import rasterio
import rasterio.features
import shapely
from shapely import wkt


class TimeoutException(Exception):
    """Custom exception type raised by ``timeout_handler`` below."""

def timeout_handler(signum, frame):
    """Custom signal handler: always raises ``TimeoutException``.

    The ``signum`` and ``frame`` arguments are accepted but not used.
    """
    raise TimeoutException()

### Connect to Google Storage Bucket

If your input data is stored somewhere else, change/update this cell to use your own access method.

In [None]:
import csv
from io import StringIO

from google.cloud import storage

#Name of your main storage bucket - insert your own bucket name here
BUCKET_NAME = 'GOOGLE_BUCKET_NAME_HERE'

#Create the Google Cloud Storage client, then look the bucket up through it
storage_client = storage.Client()
bucket = storage_client.get_bucket(BUCKET_NAME)

### Define Main Function

This function pulls a single day of data for the geometry in each row of the input dataframe.  
All outputs are rounded to two decimal places.

Current script includes options for writing a temporary dataframe for batching, in the event connection to the cloud is unstable and large amounts of data are being pulled. 

In [None]:
def pull_GRIDMET(df, file_name, progress_interval=1000, batch_save=False):
    """Pull GRIDMET values for every row of ``df`` and write them to a CSV.

    For each row, a polygon is built from the exterior ring of the row's
    geometry and the ``IDAHO_EPSCOR/GRIDMET`` collection is filtered to that
    polygon and to the window starting one day before the row's date and
    ending at the row's date (Earth Engine's ``filterDate`` excludes the end
    date). The filtered images are averaged over time, sampled over the
    polygon with a scale argument of 1000, and the values of the first
    sampled point are kept, rounded to two decimal places.

    Rows for which the lookup fails for any reason (no image, no sampled
    point, missing band, API error) are still written, with the string
    ``'NULL'`` in all five value columns.

    Args:
        df: DataFrame/GeoDataFrame with ``cell_id``, ``geometry`` (polygons)
            and ``date`` (strings formatted ``YYYY-MM-DD``) columns.
        file_name: Path of the CSV file to write.
        progress_interval: Print progress every this many rows. ``0`` or
            ``None`` disables progress output (and batch saving).
        batch_save: When true, also write the rows collected so far to
            ``file_name`` at every progress interval, so partial results
            survive an unstable connection.

    Returns:
        None. The results are written to ``file_name``.

    Raises:
        ValueError: If a date string does not match ``%Y-%m-%d``.
    """
    columns = ['cell_id', 'geometry', 'date', 'precip_daily', 'wind_dir_avg',
               'temp_min', 'temp_max', 'wind_vel']
    #GRIDMET band names, listed in the same order as the five value columns above
    bands = ['pr', 'th', 'tmmn', 'tmmx', 'vs']

    #Rows are collected in a plain list and turned into a dataframe only when writing;
    #growing a dataframe one row at a time gets slower as it grows
    rows = []
    failures = 0
    first_error = None

    #Start time is used to report the elapsed time of each progress interval
    start_time = time.time()

    #Iterate positionally so the function also works when df does not have a default 0..n-1 index
    records = zip(df['cell_id'], df.geometry, df['date'])
    for count, (cell_id, geometry, date) in enumerate(records, start=1):

        #Define area of interest by the coordinates of the polygon's exterior ring
        aoi = ee.Geometry.Polygon(list(geometry.exterior.coords))

        #Query window: from one day before the row's date up to the row's date
        end_date = datetime.strptime(date, '%Y-%m-%d')
        start_date = end_date - timedelta(days=1)

        #If no image is available for the given day/geometry (or the request fails),
        #the except branch below fills the row with NULLs.
        #Before running the full script, it's advised to check that the dataset covers your area and timeframe
        try:
            #Access image collection for the defined date range and area, keeping the bands of interest only
            lst = ee.ImageCollection('IDAHO_EPSCOR/GRIDMET')\
                .filterDate(start_date, end_date)\
                .filterBounds(aoi)\
                .select(*bands)

            #Fetch the first sampled point once and read all five bands from it
            #(one round trip to Earth Engine instead of one per band)
            sampled = lst.mean().sample(aoi, 1000).first().getInfo()
            properties = sampled['properties']
            values = [round(properties[band], 2) for band in bands]

        except Exception as e:
            #Best effort: keep the row, but remember that (and why) it failed
            failures += 1
            if first_error is None:
                first_error = e
            values = ['NULL'] * len(bands)

        rows.append([cell_id, geometry, date] + values)

        #Progress printer
        if progress_interval and count % progress_interval == 0:
            end_time = time.time()

            #BATCH SAVING - write everything collected so far
            if batch_save:
                pd.DataFrame(rows, columns=columns).to_csv(file_name, index=False)

            print(count, "files complete")
            print("--- %s seconds ---" % (end_time - start_time))
            start_time = end_time

    #Write final dataframe to CSV
    pd.DataFrame(rows, columns=columns).to_csv(file_name, index=False)

    #Surface failures instead of hiding them entirely behind NULL values
    if failures:
        print("%d of %d rows could not be retrieved and were written as NULL (first error: %r)"
              % (failures, len(rows), first_error))


### Read in CSV from Google Cloud Storage

In [None]:
##Location of the input CSV within the Google Cloud Storage bucket defined above
csv_blob_path = 'SUBFOLDER_NAME/CSV_NAME_OF_FILE.csv'

#Download the object, decode the downloaded bytes as UTF-8 text, and wrap the text in a file-like buffer
#NOTE(review): newer google-cloud-storage releases appear to deprecate download_as_string in favour of download_as_bytes - confirm before upgrading
blob = StringIO(bucket.blob(csv_blob_path).download_as_string().decode('utf-8'))

#Read the buffer as CSV and check your data structure. You should have the following columns: cell_id, geometry (in polygon form), and date.
df = pd.read_csv(blob)
df.head()

In [None]:
#Parse the WKT strings of the geometry column into geopandas geometries
parsed_geometry = gpd.GeoSeries.from_wkt(df['geometry'])
df['geometry'] = parsed_geometry

#Build the GeoDataFrame on that column and confirm its datatype is geometry
gdf = gpd.GeoDataFrame(df, geometry='geometry')
gdf.dtypes

In [None]:
##Authenticate against and initialize the Google Earth Engine API. You must have an account.
ee.Authenticate()
ee.Initialize()

#Name of the output CSV
output_csv = 'OUTPUT_CSV_NAME_HERE.csv'

#Run the function over your file
pull_GRIDMET(gdf, file_name=output_csv)

In [None]:
#Write your csv to the Google Cloud Storage Bucket
#The leading "!" makes the notebook run this line as a shell command: gsutil copies the local
#output CSV into the given bucket folder. Replace the placeholder file, bucket and subfolder names first.
!gsutil cp 'OUTPUT_CSV_NAME_HERE.csv' 'gs://GOOGLE_BUCKET_NAME_HERE/SUBFOLDER_WRITING_TO/'