# Sentinel-1 and Sentinel-2 Data Ingestion Script

This notebook uses the Google Earth Engine API to pull data from two Sentinel Satellite Image Repositories

Sentinel-1 SAR GRD: C-band Synthetic Aperture Radar Ground Range Detected, log scaling: https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S1_GRD?hl=en

Sentinel-2 MSI: MultiSpectral Instrument, Level-1C: https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S2?hl=en

This script reads a CSV from a Google Cloud Storage bucket and, for each row, writes a single JPEG image built from three bands (encoded as RGB) into a subfolder of a Google Cloud Storage bucket.

Columns expected in the input are a key/unique ID (here named "cell_id"), the geometry of a polygon ("geometry"), and a date ("date").
The geometry column is converted to geopandas geometry as part of the script. The date column is expected to be a string; the script parses it into a datetime and derives a year + day-of-year string from it that is used in the output filenames.

## Loading Packages and Required Libraries

In [None]:
%%capture
!sudo apt-get update -y
!sudo apt-get upgrade -y
!pip3 install numpy==1.21
# Install GDAL and Geopandas
!sudo apt-get install libgdal-dev -y
!sudo apt install gdal-bin python-gdal python3-gdal --quiet -y
!sudo apt install python3-rtree --quiet -y
!pip3 install git+git://github.com/geopandas/geopandas.git --quiet

!pip3 install -U tornado

In [None]:
%%capture
%pip install geemap

In [None]:
%%capture
%pip install "dask[complete]"
%pip install "dask[complete]" --upgrade

In [None]:
%%capture
%pip install pystac_client planetary_computer rasterio xarray-spatial

In [None]:
%%capture
!pip3 install matplotlib datetime pystac_client planetary_computer xarray datashader xarray-spatial 

In [None]:
%%capture
!pip3 install rasterio geotiff geopy shapely imagecodecs

In [None]:
! pip install geopandas

In [None]:
import os
from os import listdir
from os.path import isfile, join

import imagecodecs
from geotiff import GeoTiff
import geopy
import geopy.distance as distance
from shapely.geometry import Polygon

import re
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

from pystac_client import Client
import planetary_computer
import xarray
import dask.dataframe as dd
import xrspatial
from datashader.transfer_functions import shade, stack
from datashader.colors import Elevation
from datashader.utils import export_image

from sklearn.neighbors import BallTree

import shutil
import requests
import ee

import time 
import signal

#import rioxarray
import rasterio
import rasterio.features
import shapely
from shapely import wkt

from datetime import datetime, timedelta
import urllib.request


class TimeoutException(Exception):
    """Custom exception class; raised by ``timeout_handler``."""

def timeout_handler(signum, frame):
    """Custom signal handler that always raises ``TimeoutException``.

    ``signum`` and ``frame`` are accepted to match the handler signature but
    are not used.
    """
    raise TimeoutException()

## Connect to Google Storage Bucket

For other access options, change or update the cell below accordingly.

In [None]:
import csv
from io import StringIO

from google.cloud import storage

# Create a Cloud Storage client and open the bucket holding the input CSV.
# NOTE(review): presumably relies on credentials already configured in the
# environment -- confirm for your setup.
storage_client = storage.Client()
# Replace the placeholder with the name of your bucket.
bucket = storage_client.get_bucket('GOOGLE_BUCKET_NAME_HERE')

## Pull in CSV, read to a dataframe, and prepare formats for all Sentinel Functions

CSV should have columns: cell_id, date, geometry (in Polygon form)

In [None]:
# Download the input CSV from the bucket and decode the raw bytes as UTF-8 text.
csv_bytes = bucket.blob('NAME_OF_CSV_HERE.csv').download_as_string()
csv_text = csv_bytes.decode('utf-8')

# Wrap the decoded text in a file-like buffer so pandas can read it.
blob = StringIO(csv_text)

# Load the CSV and preview the first rows to confirm it was read correctly.
df = pd.read_csv(blob)
df.head()

In [None]:
# Parse the WKT text in the geometry column into geometry objects, then build
# a GeoDataFrame that uses that column as its active geometry.
parsed_geometry = gpd.GeoSeries.from_wkt(df['geometry'])
df['geometry'] = parsed_geometry
gdf = gpd.GeoDataFrame(df, geometry='geometry')

In [None]:
# Keep only rows dated on or after 2016-01-01 and renumber the index from 0.
from datetime import datetime, timedelta

cutoff_date = datetime.strptime("2016-01-01", "%Y-%m-%d")
row_dates = pd.to_datetime(gdf.date)
traindf = gdf.loc[row_dates >= cutoff_date].reset_index(drop = True)

In [None]:
# Store the date column as pandas datetimes.
traindf["date"] = pd.to_datetime(traindf["date"])


def _year_and_day_of_year(d):
    """Return the year followed by the zero-padded day of year, e.g. '2020032'."""
    return str(d.year) + d.strftime('%j')


# String version of the date, used as part of the image filenames in the pull functions.
traindf["datestring"] = traindf["date"].map(_year_and_day_of_year)

## Sentinel-1 Main Function

Pulls bands VV and VH, creates a third band from the ratio VV/VH, and writes the result as a JPEG to a defined Google Cloud Storage bucket subfolder.

In [None]:
def pull_Sentinel1(traindf, overwrite=False, names_only=False,
                   bucket_name='GOOGLE_BUCKET_NAME_HERE',
                   subfolder='INSERT_SUBFOLDER_NAME_HERE'):
    """Write one Sentinel-1 JPEG per row of ``traindf`` to Google Cloud Storage.

    For every row, the most recent ``COPERNICUS/S1_GRD`` image (VV and VH
    polarisations present, HV and HH absent) between 80 days before and 1 day
    after the row's date that intersects the row's polygon is rendered as a
    three-band thumbnail (VV, VH, VV/VH) and uploaded to
    ``gs://<bucket_name>/<subfolder>/<cell_id>_sentinel1_poly<datestring>.jpg``.

    Args:
        traindf: Dataframe with ``cell_id`` (str), ``datestring`` (str),
            ``date`` (datetime) and ``geometry`` (polygon) columns.
        overwrite: When False, rows whose JPEG already exists in the bucket
            are not downloaded again.
        names_only: When True, only the target file names are recorded; no
            storage or Earth Engine calls are made.
        bucket_name: Name of the destination bucket.
        subfolder: Folder inside the bucket that receives the JPEGs.

    Returns:
        None. ``traindf`` is modified in place: a ``sentinel1_filelocation``
        column is added holding the ``gs://`` path of each image, or the
        string ``"NULL"`` for rows where no image could be written.
    """
    # Exactly one entry is appended per row so the list always lines up with
    # the dataframe when it is assigned as a column at the end.
    filelocations = []

    # The bucket is only needed when files may be checked or written.
    bucket = None
    if not names_only:
        bucket = storage.Client().get_bucket(bucket_name)

    batch_start = time.time()

    # Iterate by position so the result does not depend on the index labels.
    rows = zip(traindf['cell_id'], traindf['datestring'],
               traindf['date'], traindf['geometry'])
    for x, (cell_id, datestring, date, geometry) in enumerate(rows, start=1):

        # File name built from cell_id and the string date column.
        pict_name = cell_id + '_sentinel1_poly' + datestring + '.jpg'
        filename = 'gs://' + bucket_name + '/' + subfolder + '/' + pict_name
        blob = None if names_only else bucket.blob(subfolder + '/' + pict_name)

        # blob.exists() asks the bucket itself; a local os.path check cannot
        # see objects behind a gs:// path.
        if names_only or (not overwrite and blob.exists()):
            filelocations.append(filename)
            if x % 100 == 0:
                print(f'{x} files already exist')
        else:
            location = "NULL"
            try:
                # Area of interest from the row's polygon.
                aoi = ee.Geometry.Polygon(list(geometry.exterior.coords))

                # A wide window is searched because acquisitions are irregular;
                # sorting newest-first and taking the first image selects the
                # most recent one in the window.
                start_date = date - timedelta(days=80)
                end_date = date + timedelta(days=1)

                polarisation = 'transmitterReceiverPolarisation'
                se1 = (ee.ImageCollection('COPERNICUS/S1_GRD')
                       .filter(ee.Filter.listContains(polarisation, 'VV'))
                       .filter(ee.Filter.listContains(polarisation, 'VH'))
                       .filter(ee.Filter.listContains(polarisation, 'HV').Not())
                       .filter(ee.Filter.listContains(polarisation, 'HH').Not())
                       .filterDate(start_date, end_date)
                       .filterBounds(aoi)
                       .sort('system:time_start', False)
                       .first())

                # Ratio band VV/VH, added to the image as 'VVVH'.
                vvvh = se1.select('VV').divide(se1.select('VH'))
                se1 = se1.addBands(vvvh.rename('VVVH'))

                # RGB image from the two original bands and the ratio band.
                rgb = ['VV', 'VH', 'VVVH']
                url = se1.select(rgb).clip(aoi).getThumbURL(
                    {'min': -50, 'max': 1, 'region': aoi, 'format': 'jpg'})

                with urllib.request.urlopen(url) as response:
                    content_type = response.info().get_content_type()
                    # Only upload when the URL actually returned an image.
                    if content_type.startswith("image"):
                        blob.upload_from_string(response.read(),
                                                content_type=content_type)
                        location = filename
                    else:
                        print("Could not upload image. No image data type in URL", filename)
            except Exception as e:
                # Best effort per row: report the failure and record "NULL"
                # instead of aborting the whole run.
                print("Could not export image", filename, repr(e))
            filelocations.append(location)

        # Progress report with the time taken by the last 1000 rows.
        if x % 1000 == 0:
            print(x, "files complete")
            print(f"It took {time.time() - batch_start} seconds")
            batch_start = time.time()

    # Write all image locations to the main dataframe.
    traindf['sentinel1_filelocation'] = filelocations

In [None]:
# Authenticate with and initialise the Google Earth Engine API.
# A Google Earth Engine account is required.
ee.Authenticate()
ee.Initialize()

In [None]:
# Run the Sentinel-1 export over the dataframe; this adds a
# 'sentinel1_filelocation' column to traindf.
pull_Sentinel1(traindf)

In [None]:
# Preview the first rows to confirm the 'sentinel1_filelocation' column was written.
traindf.head()

## Sentinel-2a Main Function

Pulls bands B8A, B11, B12, and outputs B11, B12 and a ratio of (B8A-B11)/(B8A+B11). Writes to a jpeg in a defined Google Cloud Storage Bucket Subfolder.

In [None]:
def pull_Sentinel2a(traindf, overwrite=False, names_only=False,
                    bucket_name='GOOGLE_BUCKET_NAME_HERE',
                    subfolder='INSERT_SUBFOLDER_NAME_HERE'):
    """Write one Sentinel-2 (B8A/B11/B12) JPEG per row to Google Cloud Storage.

    For every row, the most recent ``COPERNICUS/S2`` image with
    ``CLOUDY_PIXEL_PERCENTAGE`` <= 10 between 80 days before and 1 day after
    the row's date that intersects the row's polygon is rendered as a
    three-band thumbnail (``((B8A - B11)/(B8A + B11))*100``, B11, B12) and
    uploaded to
    ``gs://<bucket_name>/<subfolder>/<cell_id>_sentinel2a_poly<datestring>.jpg``.

    Args:
        traindf: Dataframe with ``cell_id`` (str), ``datestring`` (str),
            ``date`` (datetime) and ``geometry`` (polygon) columns.
        overwrite: When False, rows whose JPEG already exists in the bucket
            are not downloaded again.
        names_only: When True, only the target file names are recorded; no
            storage or Earth Engine calls are made.
        bucket_name: Name of the destination bucket.
        subfolder: Folder inside the bucket that receives the JPEGs.

    Returns:
        None. ``traindf`` is modified in place: a ``sentinel2a_filelocation``
        column is added holding the ``gs://`` path of each image, or the
        string ``"NULL"`` for rows where no image could be written.
    """
    # Exactly one entry is appended per row. Appending the name before the
    # download and "NULL" again on failure would make the list longer than
    # the dataframe and break the column assignment at the end.
    filelocations = []

    # The bucket is only needed when files may be checked or written.
    bucket = None
    if not names_only:
        bucket = storage.Client().get_bucket(bucket_name)

    # Iterate by position so the result does not depend on the index labels.
    rows = zip(traindf['cell_id'], traindf['datestring'],
               traindf['date'], traindf['geometry'])
    for x, (cell_id, datestring, date, geometry) in enumerate(rows, start=1):

        # File name built from cell_id and the string date column.
        pict_name = cell_id + '_sentinel2a_poly' + datestring + '.jpg'
        filename = 'gs://' + bucket_name + '/' + subfolder + '/' + pict_name
        blob = None if names_only else bucket.blob(subfolder + '/' + pict_name)

        # blob.exists() asks the bucket itself; a local os.path check cannot
        # see objects behind a gs:// path.
        if names_only or (not overwrite and blob.exists()):
            filelocations.append(filename)
            if x % 5000 == 0:
                print(f'{x} files already exist')
            continue

        location = "NULL"
        try:
            # Area of interest from the row's polygon.
            aoi = ee.Geometry.Polygon(list(geometry.exterior.coords))

            # A wide window is searched because acquisitions are irregular;
            # sorting newest-first and taking the first image selects the
            # most recent one in the window.
            start_date = date - timedelta(days=80)
            end_date = date + timedelta(days=1)

            se2 = (ee.ImageCollection('COPERNICUS/S2')
                   .filterDate(start_date, end_date)
                   .filter(ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', 10))
                   .filterBounds(aoi)
                   .select(['B8A', 'B11', 'B12'])
                   .sort('system:time_start', False)
                   .first())

            # Ratio band (B8A - B11)/(B8A + B11), scaled by 100; the expression
            # variable 'B8' is bound to band B8A.
            BRatio = se2.expression(
                '((B8 - B11)/(B8 + B11))*100', {
                    'B8': se2.select('B8A'),
                    'B11': se2.select('B11')
                })
            se2 = se2.addBands(BRatio.rename('BRatio'))

            # RGB image from the ratio band and the two original bands.
            rgb = ['BRatio', 'B11', 'B12']
            url = se2.select(rgb).clip(aoi).getThumbURL(
                {'min': -500, 'max': 500, 'region': aoi, 'format': 'jpg'})

            with urllib.request.urlopen(url) as response:
                content_type = response.info().get_content_type()
                # Only upload when the URL actually returned an image.
                if content_type.startswith("image"):
                    blob.upload_from_string(response.read(),
                                            content_type=content_type)
                    location = filename
                else:
                    print("Could not upload image. No image data type in URL", filename)
        except Exception as e:
            # Best effort per row: report the failure and record "NULL"
            # instead of aborting the whole run.
            print("Could not export image", filename, repr(e))
        filelocations.append(location)

        # Progress report; change the interval as you see fit.
        if x % 5000 == 0:
            print(x, "files complete")

    # Write all image locations to the main dataframe.
    traindf['sentinel2a_filelocation'] = filelocations

In [None]:
# Authenticate with and initialise the Google Earth Engine API.
# A Google Earth Engine account is required.
# NOTE(review): the same two calls also appear in an earlier cell; repeating
# them here is presumably only needed when cells are run out of order.
ee.Authenticate()
ee.Initialize()

# Run the Sentinel-2a export over the dataframe; this adds a
# 'sentinel2a_filelocation' column to traindf.
pull_Sentinel2a(traindf)

In [None]:
# Preview the first rows to confirm the 'sentinel2a_filelocation' column was written.
traindf.head()

## Sentinel-2b Main Function

Pulls bands B8, B4, and B2, and outputs B2, B4 and a ratio of (B8-B4)/(B8+B4). Writes to a jpeg in a defined Google Cloud Storage Bucket Subfolder.

In [None]:
def pull_Sentinel2b(traindf, overwrite=False, names_only=False,
                    bucket_name='GOOGLE_BUCKET_NAME_HERE',
                    subfolder='INSERT_SUBFOLDER_NAME_HERE'):
    """Write one Sentinel-2 (B8/B4/B2) JPEG per row to Google Cloud Storage.

    For every row, the most recent ``COPERNICUS/S2`` image with
    ``CLOUDY_PIXEL_PERCENTAGE`` <= 10 between 80 days before and 1 day after
    the row's date that intersects the row's polygon is rendered as a
    three-band thumbnail (``((B8 - B4)/(B8 + B4))*10000``, B2, B4) and
    uploaded to
    ``gs://<bucket_name>/<subfolder>/<cell_id>_sentinel2b_poly<datestring>.jpg``.

    Args:
        traindf: Dataframe with ``cell_id`` (str), ``datestring`` (str),
            ``date`` (datetime) and ``geometry`` (polygon) columns.
        overwrite: When False, rows whose JPEG already exists in the bucket
            are not downloaded again.
        names_only: When True, only the target file names are recorded; no
            storage or Earth Engine calls are made.
        bucket_name: Name of the destination bucket.
        subfolder: Folder inside the bucket that receives the JPEGs.

    Returns:
        None. ``traindf`` is modified in place: a ``sentinel2b_filelocation``
        column is added holding the ``gs://`` path of each image, or the
        string ``"NULL"`` for rows where no image could be written.
    """
    # Exactly one entry is appended per row. Appending the name before the
    # download and "NULL" again on failure would make the list longer than
    # the dataframe and break the column assignment at the end.
    filelocations = []

    # The bucket is only needed when files may be checked or written.
    bucket = None
    if not names_only:
        bucket = storage.Client().get_bucket(bucket_name)

    # Iterate by position so the result does not depend on the index labels.
    rows = zip(traindf['cell_id'], traindf['datestring'],
               traindf['date'], traindf['geometry'])
    for x, (cell_id, datestring, date, geometry) in enumerate(rows, start=1):

        # File name built from cell_id and the string date column.
        pict_name = cell_id + '_sentinel2b_poly' + datestring + '.jpg'
        filename = 'gs://' + bucket_name + '/' + subfolder + '/' + pict_name
        blob = None if names_only else bucket.blob(subfolder + '/' + pict_name)

        # blob.exists() asks the bucket itself; a local os.path check cannot
        # see objects behind a gs:// path.
        if names_only or (not overwrite and blob.exists()):
            filelocations.append(filename)
            if x % 5000 == 0:
                print(f'{x} files already exist')
            continue

        location = "NULL"
        try:
            # Area of interest from the row's polygon.
            aoi = ee.Geometry.Polygon(list(geometry.exterior.coords))

            # A wide window is searched because acquisitions are irregular;
            # sorting newest-first and taking the first image selects the
            # most recent one in the window.
            start_date = date - timedelta(days=80)
            end_date = date + timedelta(days=1)

            se2 = (ee.ImageCollection('COPERNICUS/S2')
                   .filterDate(start_date, end_date)
                   .filter(ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', 10))
                   .filterBounds(aoi)
                   .select(['B8', 'B4', 'B2'])
                   .sort('system:time_start', False)
                   .first())

            # Ratio band (B8 - B4)/(B8 + B4), scaled by 10000.
            BRatio = se2.expression(
                '((B8 - B4)/(B8 + B4))*10000', {
                    'B8': se2.select('B8'),
                    'B4': se2.select('B4')
                })
            se2 = se2.addBands(BRatio.rename('BRatio'))

            # RGB image from the ratio band and two of the original bands.
            rgb = ['BRatio', 'B2', 'B4']
            url = se2.select(rgb).clip(aoi).getThumbURL(
                {'min': -500, 'max': 500, 'region': aoi, 'format': 'jpg'})

            with urllib.request.urlopen(url) as response:
                content_type = response.info().get_content_type()
                # Only upload when the URL actually returned an image.
                if content_type.startswith("image"):
                    blob.upload_from_string(response.read(),
                                            content_type=content_type)
                    location = filename
                else:
                    print("Could not upload image. No image data type in URL", filename)
        except Exception as e:
            # Best effort per row: report the failure and record "NULL"
            # instead of aborting the whole run.
            print("Could not export image", filename, repr(e))
        filelocations.append(location)

        # Progress report; change the interval as you see fit.
        if x % 5000 == 0:
            print(x, "files complete")

    # Write all image locations to the main dataframe.
    traindf['sentinel2b_filelocation'] = filelocations

In [None]:
# Authenticate with and initialise the Google Earth Engine API.
# A Google Earth Engine account is required.
# NOTE(review): the same two calls also appear in earlier cells; repeating
# them here is presumably only needed when cells are run out of order.
ee.Authenticate()
ee.Initialize()

# Run the Sentinel-2b export over the dataframe; this adds a
# 'sentinel2b_filelocation' column to traindf.
pull_Sentinel2b(traindf)

In [None]:
# Preview the first rows to confirm the 'sentinel2b_filelocation' column was written.
traindf.head()