This notebook contains the code I used to process the entire DRC area of interest with the s2cloudless algorithm and to export the results to Google Cloud Storage.

In [1]:
## Initialize the Earth Engine library ##

import ee
import pandas as pd
import json

# Only have to Authenticate once
#ee.Authenticate()

# Initialize the library.
ee.Initialize()

In [2]:
# Grid of squares covering the DRC area of interest
# (the geojson was obtained by gridding out that area).
path_to_file = 'DRC_squares_3.geojson'

import geojson

# Parse the file; the handle is closed as soon as the `with` block exits.
with open(path_to_file) as geojson_file:
    gj = geojson.load(geojson_file)

# `f` holds the list of GeoJSON features used by the cells below.
f = gj['features']

In [3]:
# Check which container type the loaded features come in.
type(f)

list

In [4]:
# Look at one feature to see how its geometry and properties are laid out.
f[1]

{"geometry": {"coordinates": [[[[16.197896, -2.192624], [16.293549, -2.192933], [16.292962, -2.373263], [16.184161, -2.372883], [16.184918, -2.325177], [16.173495, -2.248936], [16.197896, -2.192624]]]], "type": "MultiPolygon"}, "properties": {"FID": 0, "bottom": 9737083.691892697, "id": 42, "left": -43865.407990429085, "right": -23865.407990429085, "top": 9757083.691892697}, "type": "Feature"}

In [5]:
# Number of features (grid squares) loaded from the geojson.
len(f)

3194

In [7]:
# Keep the first entry of each feature's geometry coordinates
# (for a MultiPolygon geometry this is its first polygon).
polys = [feature['geometry']['coordinates'][0] for feature in f]
len(polys)

3194

In [6]:
# Spot-check the coordinate structure of one extracted polygon.
polys[100]

[[[17.368392, -2.737686],
  [17.547986, -2.73822],
  [17.547452, -2.918832],
  [17.36783, -2.918264],
  [17.368392, -2.737686]]]

## S2 Cloudless functions

In [44]:
def get_s2_sr_cld_col(aoi, start_date, end_date):
    """
    Builds the Sentinel-2 surface reflectance collection for an AOI and date
    range and joins the matching s2cloudless image onto each scene.
    Arguments:
    aoi: The geometry used to spatially filter both collections
    start_date: Start of the date range passed to filterDate
    end_date: End of the date range passed to filterDate
    Returns:
    An ee.ImageCollection of surface reflectance images; the joined cloud
    probability image is stored under the 's2cloudless' property.
    Note: reads the notebook-level CLOUD_FILTER setting.
    """
    # Surface reflectance scenes whose CLOUDY_PIXEL_PERCENTAGE is at most CLOUD_FILTER.
    sr_scenes = ee.ImageCollection('COPERNICUS/S2_SR')
    sr_scenes = sr_scenes.filterBounds(aoi)
    sr_scenes = sr_scenes.filterDate(start_date, end_date)
    sr_scenes = sr_scenes.filter(ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', CLOUD_FILTER))

    # Cloud probability scenes for the same area and dates.
    cloud_prob_scenes = ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY')
    cloud_prob_scenes = cloud_prob_scenes.filterBounds(aoi)
    cloud_prob_scenes = cloud_prob_scenes.filterDate(start_date, end_date)

    # Pair the two collections on their shared 'system:index' property.
    same_index = ee.Filter.equals(leftField='system:index', rightField='system:index')
    joined = ee.Join.saveFirst('s2cloudless').apply(
        primary=sr_scenes,
        secondary=cloud_prob_scenes,
        condition=same_index,
    )
    return ee.ImageCollection(joined)

In [9]:
def add_cloud_bands(img):
    """
    Adds the cloud probability band and a thresholded cloud flag to an image.
    Arguments:
    img: An image carrying its s2cloudless image in the 's2cloudless' property
    Returns:
    The image with two extra bands: 'probability' and 'clouds', where 'clouds'
    marks pixels whose probability is greater than CLD_PRB_THRESH.
    Note: reads the notebook-level CLD_PRB_THRESH setting.
    """
    # The s2cloudless image attached to this scene, reduced to its probability band.
    s2cloudless_img = ee.Image(img.get('s2cloudless'))
    probability = s2cloudless_img.select('probability')

    # Flag pixels whose cloud probability exceeds the threshold.
    cloud_flag = probability.gt(CLD_PRB_THRESH).rename('clouds')

    # Attach both layers to the input image.
    cloud_bands = ee.Image([probability, cloud_flag])
    return img.addBands(cloud_bands)

In [10]:
def add_shadow_bands(img):
    """
    Adds cloud shadow component bands to an image that already has a 'clouds' band.
    Arguments:
    img: An image with 'SCL', 'B8' and 'clouds' bands and a
    'MEAN_SOLAR_AZIMUTH_ANGLE' property
    Returns:
    The image with three extra bands: 'dark_pixels', 'cloud_transform' and 'shadows'.
    Note: reads the notebook-level NIR_DRK_THRESH and CLD_PRJ_DIST settings.
    """
    SR_BAND_SCALE = 1e4
    nir_cutoff = NIR_DRK_THRESH*SR_BAND_SCALE

    # Water pixels (SCL value 6) are excluded from the shadow candidates.
    not_water = img.select('SCL').neq(6)

    # Potential shadow pixels: dark in NIR (B8 below the cutoff) and not water.
    dark_pixels = img.select('B8').lt(nir_cutoff).multiply(not_water).rename('dark_pixels')

    # Direction in which to project shadows from clouds (assumes UTM projection).
    solar_azimuth = ee.Number(img.get('MEAN_SOLAR_AZIMUTH_ANGLE'))
    shadow_azimuth = ee.Number(90).subtract(solar_azimuth)

    # Project the cloud band along that direction for the distance given by CLD_PRJ_DIST.
    distance = img.select('clouds').directionalDistanceTransform(shadow_azimuth, CLD_PRJ_DIST*10)
    distance = distance.reproject(crs=img.select(0).projection(), scale=100)
    cld_proj = distance.select('distance').mask().rename('cloud_transform')

    # Shadows are the dark pixels that fall inside the cloud projection.
    shadows = cld_proj.multiply(dark_pixels).rename('shadows')

    return img.addBands(ee.Image([dark_pixels, cld_proj, shadows]))

In [11]:
def add_cld_shdw_mask(img):
    """
    Builds the final cloud/shadow mask for an image and adds it as the 'cloudmask' band.
    Arguments:
    img: An image accepted by add_cloud_bands
    Returns:
    The image with the cloud and shadow component bands plus 'cloudmask',
    in which cloud or shadow pixels are 1 and all others 0.
    Note: reads the notebook-level BUFFER setting.
    """
    # Cloud component bands first, then the shadow bands that depend on them.
    with_clouds = add_cloud_bands(img)
    with_shadows = add_shadow_bands(with_clouds)

    # A pixel is masked if it is flagged as cloud or as shadow.
    clouds = with_shadows.select('clouds')
    shadows = with_shadows.select('shadows')
    combined = clouds.add(shadows).gt(0)

    # Remove small cloud-shadow patches, then dilate what remains by the BUFFER input.
    # 20 m scale is for speed, and assumes clouds don't require 10 m precision.
    dilation_radius = BUFFER*2/20
    cleaned = combined.focal_min(2).focal_max(dilation_radius)
    cleaned = cleaned.reproject(crs=img.select([0]).projection(), scale=20)
    cloudmask = cleaned.rename('cloudmask')

    return with_shadows.addBands(cloudmask)

In [12]:
def apply_cld_shdw_mask_all_bands(img):
    """
    Masks out cloud and shadow pixels in every band of an image.
    Arguments:
    img: An image that has a 'cloudmask' band (1 = cloud/shadow, 0 = clear)
    Returns:
    The image with its mask updated so that cloud/shadow pixels are masked.
    """
    # Invert the mask so clear pixels are 1, then apply it to the whole image.
    return img.updateMask(img.select('cloudmask').Not())

## Process images

In [13]:
def add_ndvi(img):
    """
    Adds an NDVI band to an Earth Engine image.
    Arguments:
    img: An image with 'B8' and 'B4' bands
    Returns:
    The image with an extra 'NDVI' band (normalized difference of B8 and B4),
    with all bands cast to float.
    """
    ndvi_band = img.normalizedDifference(['B8', 'B4']).rename('NDVI')
    return img.addBands(ndvi_band).float()

In [40]:
# Build a single Earth Engine polygon so the pipeline can be tried on one AOI
# before it is run over every grid square.
AOI = ee.Geometry.Polygon(polys[10]) # test with one AOI first

In [50]:
## Example values ##
# These module-level settings are read as globals by the S2 Cloudless functions above.
# NOTE(review): CLD_PRB_THRESH = 100 and NIR_DRK_THRESH = 100 look like placeholders;
# presumably they make the cloud flag never trigger and every B8 pixel count as dark —
# confirm against the value ranges of the 'probability' and 'B8' bands.

START_DATE = '2019-01-01'  # start of the date range passed to filterDate
END_DATE = '2019-12-31'    # end of the date range passed to filterDate
CLOUD_FILTER = 100         # keep scenes with CLOUDY_PIXEL_PERCENTAGE <= this value
CLD_PRB_THRESH = 100       # cloud probability above this value is flagged as cloud
NIR_DRK_THRESH = 100       # B8 below this value * 1e4 counts as a dark pixel
CLD_PRJ_DIST = 2           # multiplied by 10 and used as the shadow projection distance
BUFFER = 50                # sets the dilation of the final mask (focal_max(BUFFER*2/20))

In [45]:
# Build the joined SR + s2cloudless collection for the single test AOI.
s2_sr_cld_col = get_s2_sr_cld_col(AOI, START_DATE, END_DATE)

In [19]:
# Add the cloud/shadow mask to every image, apply it, then take the
# per-pixel median of what is left to get a cloud-free composite.
s2_sr_median = (s2_sr_cld_col.map(add_cld_shdw_mask)
                             .map(apply_cld_shdw_mask_all_bands)
                             .median())

In [20]:
# Append the NDVI band to the composite.
# Note: this reassigns s2_sr_median, so re-running this cell adds NDVI to an
# image that already has it.
s2_sr_median = add_ndvi(s2_sr_median)

In [17]:
def export_to_gcs(s2_sr_median,AOI,polygon_id,date_range,band_list,
                  description=None,
                  bucket='project-canopy-temp-2',
                  folder='S2_CloudFree/s2cloudless_drc_2017'):
    """
    Exports an Earth Engine image to Google Cloud Storage
    Arguments:
    s2_sr_median: The image you want to export
    AOI: The AOI you want to export (must be an Earth Engine Polygon)
    polygon_id: The id number of your polygon; will be put into the image name
    date_range: The date range you used to make the image; will be put into the image name
    band_list: The list of bands you want to export
    description: Optional task description. When omitted, a unique one is
    built from date_range and polygon_id so that tasks can be told apart in
    the Earth Engine task list (previously every task was named 'test').
    bucket: The Cloud Storage bucket to write to
    folder: The path prefix inside the bucket; the file name
    '{date_range}_{polygon_id}' is appended to it. The default keeps the
    original output location, whose name mentions 2017 even though it is
    used for other years as well.
    Returns:
    The started export task.
    """
    import re  # local import: only needed to build a valid task description

    if description is None:
        raw_description = f's2cloudless_{date_range}_{polygon_id}'
        # Earth Engine restricts task descriptions to a small character set and
        # a maximum length, so replace anything else and truncate.
        description = re.sub(r'[^A-Za-z0-9.,:;_-]', '_', raw_description)[:100]

    export = ee.batch.Export.image.toCloudStorage(
        image=s2_sr_median.select(band_list),
        description=description,
        scale=10,
        region=AOI,
        fileNamePrefix=f'{folder}/{date_range}_{polygon_id}',
        bucket=bucket,
        maxPixels=1e13
    )
    export.start()

    return export

In [15]:
# Alternative, wider band selection kept for reference:
# band_list = ['B1','B2','B3','B4','B5','B6','B7','B8','B8A','B9','B11','B12','TCI_R','TCI_G','TCI_B','AOT','WVP']

# Bands that are exported; 'NDVI' is the band added by add_ndvi.
band_list = ['B2','B3','B4','B8','B11','B12','TCI_R','TCI_G','TCI_B', 'NDVI']

In [37]:
def s2cloudless_download(dates_dict,polygons,band_list,add_ndvi=True):
    """
    Applies the S2 Cloudless algorithm to each polygon in "polygons,"
    pulling images from within the date range defined by the dates_dict.
    Then exports the resulting images to Google Cloud Storage.
    Arguments:
    dates_dict: A dictionary mapping a label to the date range you want to pull
    images from when making the cloudfree composite, e.g.
    {'2021': {'START_DATE': '2020-12-15', 'END_DATE': '2021-02-12'}}.
    The label is put into the exported file name.
    polygons: A list of polygon coordinate lists defining the AOIs you want to
    get images from; each one is passed to ee.Geometry.Polygon. Polygon ids
    start at 1 and follow the order of this list.
    band_list: The list of bands you want to export.
    add_ndvi: If True, adds an NDVI band to the image before downloading it.
    Returns:
    None. One export task is started per polygon and date range.
    Note: the helper functions read the notebook-level CLOUD_FILTER,
    CLD_PRB_THRESH, NIR_DRK_THRESH, CLD_PRJ_DIST and BUFFER settings.
    """
    total = len(polygons)

    for date_range, dates in dates_dict.items():
        for polygon_id,polygon in enumerate(polygons,1):
            print(f"processing {polygon_id} of {total} for {date_range} range", end='\r', flush=True)
            AOI = ee.Geometry.Polygon(polygon)
            s2_sr_cld_col = get_s2_sr_cld_col(AOI, dates["START_DATE"], dates["END_DATE"])
            s2_sr_median = (s2_sr_cld_col.map(add_cld_shdw_mask)
                             .map(apply_cld_shdw_mask_all_bands)
                             .median())

            if add_ndvi:
                # The `add_ndvi` flag shadows the notebook-level add_ndvi()
                # function inside this body, so calling add_ndvi(...) here
                # would call a bool. Compute the NDVI band inline instead.
                ndvi = s2_sr_median.normalizedDifference(['B8', 'B4']).rename('NDVI')
                s2_sr_median = s2_sr_median.addBands(ndvi).float()

            export_to_gcs(s2_sr_median,AOI,polygon_id,date_range,band_list)


In [52]:
## These values were found by Lorenzo Garbagna and Holly Burrows to offer the best quality for 2021 pulls ##
# They overwrite the module-level settings read by the S2 Cloudless functions,
# so this cell must be run before starting the 2021 download.

START_DATE = '2020-12-15'
END_DATE = '2021-02-12'
CLOUD_FILTER = 20
CLD_PRB_THRESH = 20
NIR_DRK_THRESH = 0.15
CLD_PRJ_DIST = 1
BUFFER = 25

In [94]:
# One entry per pull; the key ('2021') is used as the label in the exported file names.
dates_dict = {'2021':{'START_DATE':START_DATE,'END_DATE':END_DATE}}

In [95]:
# Start one export task per grid square for the 2021 date range.
s2cloudless_download(dates_dict,polys,band_list)

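## Earth Engine allows at most 3,000 tasks in the queue at once, which is why this run errored out. Resume with s2cloudless_download_cont, starting from the polygon id shown in the progress output. ##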

processing 3067 of 3194 for 2021 range

EEException: Too many tasks already in the queue (3000). Please wait for some of them to complete.

In [20]:
def s2cloudless_download_cont(dates_dict,polygons,band_list, start_id):
    """
    Resumes an S2Cloudless download from a given polygon id.
    Intended for when an "s2cloudless_download" pull gets interrupted,
    usually because there were already 3000 tasks in the queue.
    Arguments:
    dates_dict: Same as s2cloudless_download
    polygons: Same as s2cloudless_download
    band_list: Same as s2cloudless_download
    start_id: The id number of the first polygon to process; ids are 1-based
    positions in "polygons" and every polygon with a smaller id is skipped
    Returns:
    None. One export task is started per processed polygon and date range;
    an NDVI band is always added before export.
    """
    n_polygons = len(polygons)

    for date_range, window in dates_dict.items():
        for polygon_id, polygon in enumerate(polygons, 1):
            # Skip everything that was already submitted by the earlier run.
            if polygon_id < start_id:
                continue

            print(f"processing {polygon_id} of {n_polygons} for {date_range} range", end='\r', flush=True)

            AOI = ee.Geometry.Polygon(polygon)
            scenes = get_s2_sr_cld_col(AOI, window["START_DATE"], window["END_DATE"])
            masked = scenes.map(add_cld_shdw_mask).map(apply_cld_shdw_mask_all_bands)
            composite = add_ndvi(masked.median())

            export_to_gcs(composite, AOI, polygon_id, date_range, band_list)

In [97]:
# Resume the 2021 pull at polygon 3067, the id shown when the first run stopped.
s2cloudless_download_cont(dates_dict,polys,band_list,3067)

processing 3194 of 3194 for 2021 range

In [16]:
## These values were found by Lorenzo Garbagna and Holly Burrows to offer the best quality for 2019 pulls ##
# They overwrite the module-level settings read by the S2 Cloudless functions,
# so this cell must be run before starting the 2019 download.

START_DATE = '2018-12-15'
END_DATE = '2019-02-12'
CLOUD_FILTER = 20
CLD_PRB_THRESH = 12
NIR_DRK_THRESH = 0.08
CLD_PRJ_DIST = 0.5
BUFFER = 50

In [17]:
# One entry per pull; the key ('2019') is used as the label in the exported file names.
dates_dict = {'2019':{'START_DATE':START_DATE,'END_DATE':END_DATE}}

In [18]:
# Start one export task per grid square for the 2019 date range.
s2cloudless_download(dates_dict,polys,band_list)

processing 3020 of 3194 for 2019 range

EEException: Too many tasks already in the queue (3000). Please wait for some of them to complete.

In [21]:
# Resume the 2019 pull at polygon 3020, the id shown when the first run stopped.
s2cloudless_download_cont(dates_dict,polys,band_list,3020)

processing 3194 of 3194 for 2019 range

In [21]:
# Midpoint of the two NIR_DRK_THRESH values used above (0.15 for 2021, 0.08 for 2019).
# NOTE(review): presumably a candidate compromise value; it is not used anywhere below — confirm.
(0.15 + 0.08) / 2

0.11499999999999999

In [26]:
## These values were found by Lorenzo Garbagna and Holly Burrows to offer the best quality for 2017 pulls.
## However, I was unable to download these images because no matching satellite images were found for this 2017 date range.
## The cause has not been established; this should be a key question to tackle.
## NOTE(review): one thing to check is whether the 'COPERNICUS/S2_SR' collection queried by
## get_s2_sr_cld_col has any coverage of this area for these dates — TODO confirm. ##

START_DATE = '2017-03-15'
END_DATE = '2017-06-15'
CLOUD_FILTER = 20
CLD_PRB_THRESH = 12
NIR_DRK_THRESH = 0.08
CLD_PRJ_DIST = 0.5
BUFFER = 50

# Same selection as the earlier band_list but without the TCI_R/TCI_G/TCI_B bands.
band_list = ['B2','B3','B4','B8','B11','B12', 'NDVI']

# One entry per pull; the key ('2017') is used as the label in the exported file names.
dates_dict = {'2017':{'START_DATE':START_DATE,'END_DATE':END_DATE}}